Пример #1
0
    def runningMachinesCount(self):
        """Return dictionary with number of machines running at Freiburg.

        Depending on the config file this may account for draining slots
        (claimed|retiring = working vs. claimed|idle = offline).

        If the ``configIgnoreDrainingMachines`` option is ``True``, the base class'
        ``runningMachinesCount`` is returned unchanged. Otherwise the drained slots (idle and not
        accepting new jobs -> not usable) of all machines of a type are summed up, converted into
        a number of whole machines (rounded up) using the configured ``cores`` per machine and
        subtracted from the number of running machines of that type.

        :return {machine_type: integer, ...}:
        """
        # fall back to base method if required
        if self.getConfig(self.configIgnoreDrainingMachines) is True:
            return super(FreiburgSiteAdapter, self).runningMachinesCount

        runningMachines = self.runningMachines
        # The machine configuration is the same for every machine type: look it up only once.
        machineConfig = self.getConfig(self.ConfigMachines)
        runningMachinesCount = dict()
        for machineType in runningMachines:
            machines = runningMachines[machineType]
            # Number of drained slots; the first element of calcDrainStatus' result is summed up.
            nDrainedSlots = sum(HTCondor.calcDrainStatus(mid)[0] for mid in machines)
            nCores = machineConfig[machineType]["cores"]
            nMachines = len(machines)
            # Little trick: floor division of the negated value rounds the number of lost
            # machines up (-9 // 4 == -3), so a partially drained machine counts as lost.
            runningMachinesCount[machineType] = nMachines + (-nDrainedSlots) // nCores
            if nDrainedSlots != 0:
                # Log the real (positive) number of drained slots, not the negated helper value.
                self.logger.debug("%s: running: %d, drained slots: %d"
                                  " -> recalculated running machines count: %s"
                                  % (machineType, nMachines, nDrainedSlots,
                                     runningMachinesCount[machineType]))
        return runningMachinesCount
Пример #2
0
    def runningMachinesCount(self):
        """Return dictionary with number of machines running at Freiburg.

        Depending on the config file this may account for draining slots
        (claimed|retiring = working vs. claimed|idle = offline).

        If the ``configIgnoreDrainingMachines`` option is ``True``, the base class'
        ``runningMachinesCount`` is returned unchanged. Otherwise the drained slots (idle and not
        accepting new jobs -> not usable) of all machines of a type are summed up, converted into
        a number of whole machines (rounded up) using the configured ``cores`` per machine and
        subtracted from the number of running machines of that type.

        :return {machine_type: integer, ...}:
        """
        # fall back to base method if required
        if self.getConfig(self.configIgnoreDrainingMachines) is True:
            return super(FreiburgSiteAdapter, self).runningMachinesCount

        runningMachines = self.runningMachines
        # The machine configuration is the same for every machine type: look it up only once.
        machineConfig = self.getConfig(self.ConfigMachines)
        runningMachinesCount = dict()
        for machineType in runningMachines:
            machines = runningMachines[machineType]
            # Number of drained slots; the first element of calcDrainStatus' result is summed up.
            nDrainedSlots = sum(HTCondor.calcDrainStatus(mid)[0] for mid in machines)
            nCores = machineConfig[machineType]["cores"]
            nMachines = len(machines)
            # Little trick: floor division of the negated value rounds the number of lost
            # machines up (-9 // 4 == -3), so a partially drained machine counts as lost.
            runningMachinesCount[machineType] = nMachines + (-nDrainedSlots) // nCores
            if nDrainedSlots != 0:
                # Log the real (positive) number of drained slots, not the negated helper value.
                self.logger.debug("%s: running: %d, drained slots: %d"
                                  " -> recalculated running machines count: %s"
                                  % (machineType, nMachines, nDrainedSlots,
                                     runningMachinesCount[machineType]))
        return runningMachinesCount
Пример #3
0
    def manage(self):
        # type: () -> None
        """Update the status of the site's machines from the Freiburg batch jobs.

        Booting = Freiburg batch job for machine was submitted
        Up      = Freiburg batch job is running, VM is booting;
                  HTCondorIntegrationAdapter switches this to "integrating" and "working".
        Down    = batch job is listed as completed, or a booting machine whose batch job is
                  neither idle, running nor completed

        Machines in status Down are removed once their batch job is listed as completed or their
        last state change exceeds one day (job history purge time). If the batch job of a Down
        machine is still running, it is cancelled after a grace period of 5 minutes.

        HTCondorIntegrationAdapter is responsible for Integrating, Working, PendingDisintegration
        and Disintegrating; machines in these states are skipped while their batch job is running.

        Running batch jobs without a matching machine are registered as new machines in status
        Up. Finally, machine counters are written to the JSON log.
        """
        # Fetch the three job lists. A ValueError raised while reading one of them, as well as
        # a value of None, results in an empty dictionary.
        try:
            running_jobs = self.__runningJobs
        except ValueError:
            running_jobs = None
        if running_jobs is None:
            running_jobs = {}

        try:
            completed_jobs = self.__completedJobs
        except ValueError:
            completed_jobs = None
        if completed_jobs is None:
            completed_jobs = {}

        try:
            idle_jobs = self.__idleJobs
        except ValueError:
            idle_jobs = None
        if idle_jobs is None:
            idle_jobs = {}

        # States which are handled by the Integration Adapter
        integration_states = (self.mr.statusIntegrating, self.mr.statusWorking,
                              self.mr.statusPendingDisintegration,
                              self.mr.statusDisintegrating)

        site_machines = self.getSiteMachines()
        for mid in site_machines:
            job_id = site_machines[mid][self.regMachineJobId]
            status = site_machines[mid][self.mr.regStatus]

            if status in integration_states:
                try:
                    running_jobs.pop(job_id)
                except (KeyError, AttributeError):
                    # AttributeError: running_jobs is empty
                    # KeyError: job_id not in running_jobs
                    pass
                else:
                    # Batch job is running: nothing to do for this machine.
                    continue

            if status == self.mr.statusDown:
                # Remove machines, which are:
                # 1. finished in ROCED & Freiburg
                # 2. finished for more than 1 day [= job history purge time]
                if job_id in completed_jobs or self.mr.calcLastStateChange(mid) > 24 * 60 * 60:
                    self.mr.removeMachine(mid)
                    continue
                # ROCED machine down, but job still running
                if job_id in running_jobs:
                    running_jobs.pop(job_id)
                    if self.mr.calcLastStateChange(mid) > 5 * 60:
                        self.__cancelFreiburgMachines(job_id)
                    continue
            elif job_id in completed_jobs:
                # Machines which failed to boot/died/got canceled (return code != 0) -> down
                # A machine MAY fail to boot with return code 0 or we just missed some states
                # -> regular shutdown
                if status == self.mr.statusBooting:
                    self.logger.info("VM (%s) failed to boot!" % job_id)
                elif completed_jobs[job_id] != "0":
                    self.logger.info("VM (%s) died!" % job_id)
                else:
                    self.logger.debug("VM (%s) died with status 0!" % job_id)
                self.mr.updateMachineStatus(mid, self.mr.statusDown)

            # The status is read again on purpose: it may have been updated just above.
            if site_machines[mid][self.mr.regStatus] != self.mr.statusBooting:
                continue
            if job_id in running_jobs:
                # batch job running: machine -> up
                self.mr.updateMachineStatus(mid, self.mr.statusUp)
                running_jobs.pop(job_id)
            elif not (job_id in idle_jobs or job_id in completed_jobs):
                # Machine disappeared. If the machine later appears again, it will be added
                # automatically.
                self.mr.updateMachineStatus(mid, self.mr.statusDown)

        # All remaining unaccounted batch jobs become new machines
        for job_id in running_jobs:
            mid = self.mr.newMachine()
            # TODO: try to identify machine type, using cores & wall-time
            entry = self.mr.machines[mid]
            entry[self.mr.regSite] = self.siteName
            entry[self.mr.regSiteType] = self.siteType
            entry[self.mr.regMachineType] = self.__default_machine
            entry[self.regMachineJobId] = job_id
            entry[self.reg_site_server_condor_name] = self.__getCondorName(job_id)
            self.mr.updateMachineStatus(mid, self.mr.statusUp)

        self.logger.info("Machines using resources (Freiburg): %d"
                         % self.cloudOccupyingMachinesCount)

        with JsonLog() as json_log:
            n_working = len(self.getSiteMachines(status=self.mr.statusWorking))
            json_log.addItem(self.siteName, "condor_nodes", n_working)

            # Machines pending disintegration for which calcDrainStatus' second element is True
            pending = self.getSiteMachines(status=self.mr.statusPendingDisintegration)
            n_draining = sum(1 for machine_id in pending
                             if HTCondor.calcDrainStatus(machine_id)[1] is True)
            json_log.addItem(self.siteName, "condor_nodes_draining", n_draining)

            requested_states = (self.mr.statusBooting, self.mr.statusUp,
                                self.mr.statusIntegrating)
            n_requested = sum(len(self.getSiteMachines(status=state))
                              for state in requested_states)
            json_log.addItem(self.siteName, "machines_requested", n_requested)
Пример #4
0
    def terminateMachines(self, machineType, count):
        """Terminate (or drain) up to ``count`` machines of type ``machineType`` in Freiburg.

        Booting machines are cancelled immediately, most recently updated first. Working machines
        are untouched by default; if the ``configDrainWorkingMachines`` option is ``True``,
        machines in status integrating, working or pending disintegration are appended to the
        candidate list and put into drain mode instead. Candidates for which the second element
        of ``HTCondor.calcDrainStatus`` is ``True`` are skipped (presumably draining already).
        Machines whose batch job ids are returned by the cancel request (removed or invalidated)
        are set to status down.

        :param machineType: machine type the candidates are selected from
        :param count: maximum number of machines to terminate/drain
        :return: None
        """
        # booting machines, sorted by request time (newest first).
        bootingMachines = self.getSiteMachines(self.mr.statusBooting, machineType)
        try:
            bootingMachines = sorted(bootingMachines.items(),
                                     key=lambda machine_: machine_[1][self.mr.regStatusLastUpdate],
                                     reverse=True)
        except KeyError:
            bootingMachines = []

        # The option is read once and reused below instead of being queried for every machine.
        drainConfig = self.getConfig(self.configDrainWorkingMachines)

        # Running machines, sorted by HTCondor.calcMachineLoad in descending order (original
        # note: "idle first" - NOTE(review): confirm the meaning of the load value).
        # These machines are put into drain mode.
        if drainConfig is True:
            workingMachines = merge_dicts(
                self.getSiteMachines(self.mr.statusIntegrating, machineType),
                self.getSiteMachines(self.mr.statusWorking, machineType),
                self.getSiteMachines(self.mr.statusPendingDisintegration, machineType))
            try:
                workingMachines = sorted(workingMachines.items(),
                                         key=lambda machine_: HTCondor.calcMachineLoad(machine_[0]),
                                         reverse=True)
            except KeyError:
                workingMachines = []
            # Merge lists: booting machines are always picked before working machines
            machinesToRemove = bootingMachines + workingMachines
        else:
            machinesToRemove = bootingMachines

        # needed amount of machines
        machinesToRemove = machinesToRemove[0:count]

        # lists of batch job ids to terminate/drain
        idsToTerminate = []
        idsToDrain = []
        for mid, machine in machinesToRemove:
            if machine[self.mr.regStatus] == self.mr.statusBooting:
                # booting machines can be terminated immediately
                idsToTerminate.append(machine[self.regMachineJobId])
            elif drainConfig:
                if HTCondor.calcDrainStatus(mid)[1] is True:
                    continue
                # working machines should be set to drain mode
                idsToDrain.append(machine[self.regMachineJobId])

        self.logger.debug("Machines to terminate (%d): %s"
                          % (len(idsToTerminate), ", ".join(idsToTerminate)))
        idsRemoved = []
        idsInvalidated = []
        if idsToTerminate:
            idsRemoved, idsInvalidated = self.__cancelFreiburgMachines(idsToTerminate)

        self.logger.debug("Machines to drain (%d): %s" % (len(idsToDrain), ", ".join(idsToDrain)))
        if idsToDrain:
            # Plain loop: the calls are made for their side effect only.
            for mid, machine in self.getSiteMachines().items():
                if machine[self.regMachineJobId] in idsToDrain:
                    HTCondor.drainMachine(mid)

        # Concatenate once instead of once per machine.
        cancelledIds = idsRemoved + idsInvalidated
        if len(cancelledIds) > 0:
            # update status
            for mid, machine in self.getSiteMachines().items():
                if machine[self.regMachineJobId] in cancelledIds:
                    self.mr.updateMachineStatus(mid, self.mr.statusDown)
Пример #5
0
    def manage(self):
        # type: () -> None
        """Update the status of the site's machines from the Freiburg batch jobs.

        Booting = Freiburg batch job for machine was submitted
        Up      = Freiburg batch job is running, VM is booting;
                  HTCondorIntegrationAdapter switches this to "integrating" and "working".
        Down    = batch job is listed as completed, or a booting machine whose batch job is
                  neither idle, running nor completed

        Machines in status Down are removed once their batch job is listed as completed or their
        last state change exceeds one day (job history purge time). If the batch job of a Down
        machine is still running, it is cancelled after a grace period of 5 minutes.

        HTCondorIntegrationAdapter is responsible for Integrating, Working, PendingDisintegration
        and Disintegrating; machines in these states are skipped while their batch job is running.

        Running batch jobs without a matching machine are registered as new machines in status
        Up. Finally, machine counters are written to the JSON log.
        """
        # Fetch the three job lists. A ValueError raised while reading one of them, as well as
        # a value of None, results in an empty dictionary.
        try:
            running_jobs = self.__runningJobs
        except ValueError:
            running_jobs = None
        if running_jobs is None:
            running_jobs = {}

        try:
            completed_jobs = self.__completedJobs
        except ValueError:
            completed_jobs = None
        if completed_jobs is None:
            completed_jobs = {}

        try:
            idle_jobs = self.__idleJobs
        except ValueError:
            idle_jobs = None
        if idle_jobs is None:
            idle_jobs = {}

        # States which are handled by the Integration Adapter
        integration_states = (self.mr.statusIntegrating, self.mr.statusWorking,
                              self.mr.statusPendingDisintegration,
                              self.mr.statusDisintegrating)

        site_machines = self.getSiteMachines()
        for mid in site_machines:
            job_id = site_machines[mid][self.regMachineJobId]
            status = site_machines[mid][self.mr.regStatus]

            if status in integration_states:
                try:
                    running_jobs.pop(job_id)
                except (KeyError, AttributeError):
                    # AttributeError: running_jobs is empty
                    # KeyError: job_id not in running_jobs
                    pass
                else:
                    # Batch job is running: nothing to do for this machine.
                    continue

            if status == self.mr.statusDown:
                # Remove machines, which are:
                # 1. finished in ROCED & Freiburg
                # 2. finished for more than 1 day [= job history purge time]
                if job_id in completed_jobs or self.mr.calcLastStateChange(mid) > 24 * 60 * 60:
                    self.mr.removeMachine(mid)
                    continue
                # ROCED machine down, but job still running
                if job_id in running_jobs:
                    running_jobs.pop(job_id)
                    if self.mr.calcLastStateChange(mid) > 5 * 60:
                        self.__cancelFreiburgMachines(job_id)
                    continue
            elif job_id in completed_jobs:
                # Machines which failed to boot/died/got canceled (return code != 0) -> down
                # A machine MAY fail to boot with return code 0 or we just missed some states
                # -> regular shutdown
                if status == self.mr.statusBooting:
                    self.logger.info("VM (%s) failed to boot!" % job_id)
                elif completed_jobs[job_id] != "0":
                    self.logger.info("VM (%s) died!" % job_id)
                else:
                    self.logger.debug("VM (%s) died with status 0!" % job_id)
                self.mr.updateMachineStatus(mid, self.mr.statusDown)

            # The status is read again on purpose: it may have been updated just above.
            if site_machines[mid][self.mr.regStatus] != self.mr.statusBooting:
                continue
            if job_id in running_jobs:
                # batch job running: machine -> up
                self.mr.updateMachineStatus(mid, self.mr.statusUp)
                running_jobs.pop(job_id)
            elif not (job_id in idle_jobs or job_id in completed_jobs):
                # Machine disappeared. If the machine later appears again, it will be added
                # automatically.
                self.mr.updateMachineStatus(mid, self.mr.statusDown)

        # All remaining unaccounted batch jobs become new machines
        for job_id in running_jobs:
            mid = self.mr.newMachine()
            # TODO: try to identify machine type, using cores & wall-time
            entry = self.mr.machines[mid]
            entry[self.mr.regSite] = self.siteName
            entry[self.mr.regSiteType] = self.siteType
            entry[self.mr.regMachineType] = self.__default_machine
            entry[self.regMachineJobId] = job_id
            entry[self.reg_site_server_condor_name] = self.__getCondorName(job_id)
            self.mr.updateMachineStatus(mid, self.mr.statusUp)

        self.logger.info("Machines using resources (Freiburg): %d"
                         % self.cloudOccupyingMachinesCount)

        with JsonLog() as json_log:
            n_working = len(self.getSiteMachines(status=self.mr.statusWorking))
            json_log.addItem(self.siteName, "condor_nodes", n_working)

            # Machines pending disintegration for which calcDrainStatus' second element is True
            pending = self.getSiteMachines(status=self.mr.statusPendingDisintegration)
            n_draining = sum(1 for machine_id in pending
                             if HTCondor.calcDrainStatus(machine_id)[1] is True)
            json_log.addItem(self.siteName, "condor_nodes_draining", n_draining)

            requested_states = (self.mr.statusBooting, self.mr.statusUp,
                                self.mr.statusIntegrating)
            n_requested = sum(len(self.getSiteMachines(status=state))
                              for state in requested_states)
            json_log.addItem(self.siteName, "machines_requested", n_requested)
Пример #6
0
    def terminateMachines(self, machineType, count):
        """Terminate (or drain) up to ``count`` machines of type ``machineType`` in Freiburg.

        Booting machines are cancelled immediately, most recently updated first. Working machines
        are untouched by default; if the ``configDrainWorkingMachines`` option is ``True``,
        machines in status integrating, working or pending disintegration are appended to the
        candidate list and put into drain mode instead. Candidates for which the second element
        of ``HTCondor.calcDrainStatus`` is ``True`` are skipped (presumably draining already).
        Machines whose batch job ids are returned by the cancel request (removed or invalidated)
        are set to status down.

        :param machineType: machine type the candidates are selected from
        :param count: maximum number of machines to terminate/drain
        :return: None
        """
        # booting machines, sorted by request time (newest first).
        bootingMachines = self.getSiteMachines(self.mr.statusBooting, machineType)
        try:
            bootingMachines = sorted(bootingMachines.items(),
                                     key=lambda machine_: machine_[1][self.mr.regStatusLastUpdate],
                                     reverse=True)
        except KeyError:
            bootingMachines = []

        # The option is read once and reused below instead of being queried for every machine.
        drainConfig = self.getConfig(self.configDrainWorkingMachines)

        # Running machines, sorted by HTCondor.calcMachineLoad in descending order (original
        # note: "idle first" - NOTE(review): confirm the meaning of the load value).
        # These machines are put into drain mode.
        if drainConfig is True:
            workingMachines = merge_dicts(
                self.getSiteMachines(self.mr.statusIntegrating, machineType),
                self.getSiteMachines(self.mr.statusWorking, machineType),
                self.getSiteMachines(self.mr.statusPendingDisintegration, machineType))
            try:
                workingMachines = sorted(workingMachines.items(),
                                         key=lambda machine_: HTCondor.calcMachineLoad(machine_[0]),
                                         reverse=True)
            except KeyError:
                workingMachines = []
            # Merge lists: booting machines are always picked before working machines
            machinesToRemove = bootingMachines + workingMachines
        else:
            machinesToRemove = bootingMachines

        # needed amount of machines
        machinesToRemove = machinesToRemove[0:count]

        # lists of batch job ids to terminate/drain
        idsToTerminate = []
        idsToDrain = []
        for mid, machine in machinesToRemove:
            if machine[self.mr.regStatus] == self.mr.statusBooting:
                # booting machines can be terminated immediately
                idsToTerminate.append(machine[self.regMachineJobId])
            elif drainConfig:
                if HTCondor.calcDrainStatus(mid)[1] is True:
                    continue
                # working machines should be set to drain mode
                idsToDrain.append(machine[self.regMachineJobId])

        self.logger.debug("Machines to terminate (%d): %s"
                          % (len(idsToTerminate), ", ".join(idsToTerminate)))
        idsRemoved = []
        idsInvalidated = []
        if idsToTerminate:
            idsRemoved, idsInvalidated = self.__cancelFreiburgMachines(idsToTerminate)

        self.logger.debug("Machines to drain (%d): %s" % (len(idsToDrain), ", ".join(idsToDrain)))
        if idsToDrain:
            # Plain loop: the calls are made for their side effect only.
            for mid, machine in self.getSiteMachines().items():
                if machine[self.regMachineJobId] in idsToDrain:
                    HTCondor.drainMachine(mid)

        # Concatenate once instead of once per machine.
        cancelledIds = idsRemoved + idsInvalidated
        if len(cancelledIds) > 0:
            # update status
            for mid, machine in self.getSiteMachines().items():
                if machine[self.regMachineJobId] in cancelledIds:
                    self.mr.updateMachineStatus(mid, self.mr.statusDown)