Example #1
    def create_workers(self, worker_spec_list):
        """
        Creates a collection of workers in APFMon
        """
        start_time = time.time()
        tmp_log = core_utils.make_logger(_base_logger,
                                         'harvester_id={0}'.format(
                                             self.harvester_id),
                                         method_name='create_workers')

        if not self.__active:
            tmp_log.debug('APFMon reporting not enabled')
            return

        try:
            tmp_log.debug('start')

            url = '{0}/jobs'.format(self.base_url)

            for worker_spec_shard in core_utils.create_shards(
                    worker_spec_list, 20):
                apfmon_workers = []
                for worker_spec in worker_spec_shard:
                    batch_id = worker_spec.batchID
                    worker_id = worker_spec.workerID
                    if not batch_id:
                        tmp_log.debug(
                            'no batchID found for workerID {0}... skipping'.
                            format(worker_id))
                        continue
                    factory = self.harvester_id
                    computingsite = worker_spec.computingSite
                    try:
                        ce = clean_ce(worker_spec.computingElement)
                    except AttributeError:
                        tmp_log.debug(
                            'no CE found for workerID {0} batchID {1}'.format(
                                worker_id, batch_id))
                        ce = NO_CE

                    # extract the log URLs
                    stdout_url = ''
                    stderr_url = ''
                    log_url = ''
                    jdl_url = ''

                    work_attribs = worker_spec.workAttributes
                    if work_attribs:
                        if 'stdOut' in work_attribs:
                            stdout_url = work_attribs['stdOut']
                            # jdl_url = '{0}.jdl'.format(stdout_url[:-4])
                        if 'stdErr' in work_attribs:
                            stderr_url = work_attribs['stdErr']
                        if 'batchLog' in work_attribs:
                            log_url = work_attribs['batchLog']
                        if 'jdl' in work_attribs:
                            jdl_url = work_attribs['jdl']

                    apfmon_worker = {
                        'cid': batch_id,
                        'factory': factory,
                        'label': '{0}-{1}'.format(computingsite, ce),
                        'jdlurl': jdl_url,
                        'stdouturl': stdout_url,
                        'stderrurl': stderr_url,
                        'logurl': log_url
                    }
                    tmp_log.debug('packed worker: {0}'.format(apfmon_worker))
                    apfmon_workers.append(apfmon_worker)

                payload = json.dumps(apfmon_workers)

                try:
                    r = requests.put(url,
                                     data=payload,
                                     timeout=self.__worker_timeout)
                    tmp_log.debug(
                        'worker creation for {0} ended with {1} {2}'.format(
                            apfmon_workers, r.status_code, r.text))
                except Exception:
                    tmp_log.debug(
                        'worker creation for {0} failed with {1}'.format(
                            apfmon_workers, traceback.format_exc()))

            end_time = time.time()
            tmp_log.debug('done (took {0})'.format(end_time - start_time))
        except Exception:
            tmp_log.error('Excepted with: {0}'.format(traceback.format_exc()))
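
Each example in this set shards its input before calling the APFMon or panda APIs. A minimal sketch of the contract assumed here for core_utils.create_shards, shown only to make the batching concrete (the real helper may differ in detail):

def create_shards(input_list, size):
    """Yield successive chunks of at most `size` items from input_list.

    Hypothetical stand-in for core_utils.create_shards.
    """
    for i in range(0, len(input_list), size):
        yield input_list[i:i + size]

# e.g. list(create_shards(list(range(5)), 2)) -> [[0, 1], [2, 3], [4]]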
Example #2
    def run(self):
        """
        main
        """
        main_log = core_utils.make_logger(_logger,
                                          'id={0}'.format(self.ident),
                                          method_name='run')
        bulk_size = harvester_config.commandmanager.commands_bulk_size
        locked = self.db_proxy.get_process_lock(
            'commandmanager', self.get_pid(),
            harvester_config.commandmanager.sleepTime)
        if locked:
            # send the list of commands this instance expects to receive
            siteNames = set()
            commandList = []
            for queueName, queueConfig in iteritems(
                    self.queueConfigMapper.get_active_queues()):
                if queueConfig is None or queueConfig.runMode != 'slave':
                    continue
                # one command for all queues in one site
                if queueConfig.siteName not in siteNames:
                    commandItem = {
                        'command': CommandSpec.COM_reportWorkerStats,
                        'computingSite': queueConfig.siteName,
                        'resourceType': queueConfig.resourceType
                    }
                    commandList.append(commandItem)
                siteNames.add(queueConfig.siteName)
                # one command for each queue
                commandItem = {
                    'command': CommandSpec.COM_setNWorkers,
                    'computingSite': queueConfig.siteName,
                    'resourceType': queueConfig.resourceType
                }
                commandList.append(commandItem)
            if len(commandList) > 0:
                main_log.debug('sending command list to receive')
                self.communicator.is_alive({
                    'startTime': datetime.datetime.utcnow(),
                    'commands': commandList
                })

        # main loop
        while True:
            # get lock
            locked = self.db_proxy.get_process_lock(
                'commandmanager', self.get_pid(),
                harvester_config.commandmanager.sleepTime)
            if locked or self.singleMode:

                main_log.debug('polling commands loop')

                # send heartbeat
                if self.lastHeartbeat is None \
                        or self.lastHeartbeat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                    self.lastHeartbeat = datetime.datetime.utcnow()
                    self.communicator.is_alive(
                        {'startTime': datetime.datetime.utcnow()})

                continuous_loop = True  # as long as there are commands, retrieve them

                while continuous_loop:

                    # get commands from panda server for this harvester instance
                    commands = self.communicator.get_commands(bulk_size)
                    main_log.debug('got {0} commands (bulk size: {1})'.format(
                        len(commands), bulk_size))
                    command_specs = self.convert_to_command_specs(commands)

                    # cache commands in internal DB
                    self.db_proxy.store_commands(command_specs)
                    main_log.debug('cached {0} commands in internal DB'.format(
                        len(command_specs)))

                    # retrieve processed commands from harvester cache
                    command_ids_ack = self.db_proxy.get_commands_ack()

                    for shard in core_utils.create_shards(
                            command_ids_ack, bulk_size):
                        # post acknowledgements to panda server
                        self.communicator.ack_commands(shard)
                        main_log.debug(
                            'acknowledged {0} commands to panda server'.format(
                                len(shard)))

                        # clean acknowledged commands
                        self.db_proxy.clean_commands_by_id(shard)

                    # clean commands that have been processed and do not need acknowledgement
                    self.db_proxy.clean_processed_commands()

                    # if we didn't collect the full bulk, give panda server a break
                    if len(commands) < bulk_size:
                        continuous_loop = False

            # check if being terminated
            if self.terminated(harvester_config.commandmanager.sleepTime,
                               randomize=False):
                main_log.debug('terminated')
                return
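
The inner while loop is a bulk-draining pattern: keep fetching command batches until a partial batch signals the server-side queue has run dry. A standalone sketch of that pattern, with illustrative names that are not part of the harvester API:

def drain_in_bulks(fetch, bulk_size):
    """Call fetch(bulk_size) repeatedly, yielding each batch, and stop
    after the first batch smaller than bulk_size (queue exhausted)."""
    while True:
        batch = fetch(bulk_size)
        yield batch
        if len(batch) < bulk_size:
            break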
Example #3
    def create_labels(self):
        """
        Creates or updates a collection of labels (=panda queue+CE)
        """
        start_time = time.time()
        tmp_log = core_utils.make_logger(_base_logger,
                                         'harvester_id={0}'.format(
                                             self.harvester_id),
                                         method_name='create_labels')

        if not self.__active:
            tmp_log.debug('APFMon reporting not enabled')
            return

        try:
            tmp_log.debug('start')

            url = '{0}/labels'.format(self.base_url)

            # get the active queues from the config mapper
            all_sites = self.queue_config_mapper.get_active_queues().keys()
            panda_queues_dict = PandaQueuesDict()

            # publish the active queues to APF mon in shards
            for sites in core_utils.create_shards(all_sites, 20):
                labels = []
                for site in sites:
                    try:
                        site_info = panda_queues_dict.get(site, dict())
                        if not site_info:
                            tmp_log.warning(
                                'No site info for {0}'.format(site))
                            continue

                        # when no CEs are associated with a queue (e.g. P1, HPCs, etc.),
                        # check whether there is something in the local configuration,
                        # otherwise set it to a dummy value
                        try:
                            ce = self.queue_config_mapper.queueConfig[
                                site].submitter['ceEndpoint']
                            queues = [{'ce_endpoint': ce}]
                        except KeyError:
                            if site_info['queues']:
                                queues = site_info['queues']
                            else:
                                queues = [{'ce_endpoint': NO_CE}]

                        for queue in queues:
                            try:
                                ce = clean_ce(queue['ce_endpoint'])
                            except Exception:
                                ce = ''

                            try:
                                ce_queue_id = queue['ce_queue_id']
                            except KeyError:
                                ce_queue_id = 0

                            labels.append({
                                'name': '{0}-{1}'.format(site, ce),
                                'wmsqueue': site,
                                'ce_queue_id': ce_queue_id,
                                'factory': self.harvester_id
                            })
                    except Exception:
                        tmp_log.error('Excepted for site {0} with: {1}'.format(
                            site, traceback.format_exc()))
                        continue

                payload = json.dumps(labels)

                r = requests.put(url,
                                 data=payload,
                                 timeout=self.__label_timeout)
                tmp_log.debug(
                    'label creation for {0} ended with {1} {2}'.format(
                        sites, r.status_code, r.text))

            end_time = time.time()
            tmp_log.debug('done (took {0})'.format(end_time - start_time))
        except Exception:
            tmp_log.error('Excepted with: {0}'.format(traceback.format_exc()))
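
The label name above is built as '{site}-{ce}', where ce comes from clean_ce. A hedged guess at that helper, shown only to make the label names concrete; the real normalization in this module may differ:

def clean_ce(computing_element):
    """Hypothetical sketch: reduce a CE endpoint to a short label token,
    e.g. 'httpg://ce01.example.org:9619' -> 'ce01'."""
    return computing_element.split('://')[-1].split(':')[0].split('.')[0]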
Example #4
    def run(self):
        """
        main
        """
        main_log = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
        bulk_size = harvester_config.commandmanager.commands_bulk_size
        locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                harvester_config.commandmanager.sleepTime)
        if locked:
            # send the list of commands this instance expects to receive
            siteNames = set()
            commandList = []
            for queueName, queueConfig in iteritems(self.queueConfigMapper.get_active_queues()):
                if queueConfig is None or queueConfig.runMode != 'slave':
                    continue
                # one command for all queues in one site
                if queueConfig.siteName not in siteNames:
                    commandItem = {'command': CommandSpec.COM_reportWorkerStats,
                                   'computingSite': queueConfig.siteName,
                                   'resourceType': queueConfig.resourceType
                                   }
                    commandList.append(commandItem)
                siteNames.add(queueConfig.siteName)
                # one command for each queue
                commandItem = {'command': CommandSpec.COM_setNWorkers,
                               'computingSite': queueConfig.siteName,
                               'resourceType': queueConfig.resourceType
                               }
                commandList.append(commandItem)
            data = {'startTime': datetime.datetime.utcnow(),
                    'sw_version': panda_pkg_info.release_version,
                    'commit_stamp': commit_timestamp.timestamp}
            if len(commandList) > 0:
                main_log.debug('sending command list to receive')
                data['commands'] = commandList
            self.communicator.is_alive(data)

        # main loop
        while True:
            # get lock
            locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                    harvester_config.commandmanager.sleepTime)
            if locked or self.singleMode:

                main_log.debug('polling commands loop')

                # send heartbeat
                if self.lastHeartbeat is None \
                        or self.lastHeartbeat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                    self.lastHeartbeat = datetime.datetime.utcnow()
                    self.communicator.is_alive({})

                continuous_loop = True  # as long as there are commands, retrieve them

                while continuous_loop:

                    # get commands from panda server for this harvester instance
                    commands = self.communicator.get_commands(bulk_size)
                    main_log.debug('got {0} commands (bulk size: {1})'.format(len(commands), bulk_size))
                    command_specs = self.convert_to_command_specs(commands)

                    # cache commands in internal DB
                    self.db_proxy.store_commands(command_specs)
                    main_log.debug('cached {0} commands in internal DB'.format(len(command_specs)))

                    # retrieve processed commands from harvester cache
                    command_ids_ack = self.db_proxy.get_commands_ack()

                    for shard in core_utils.create_shards(command_ids_ack, bulk_size):
                        # post acknowledgements to panda server
                        self.communicator.ack_commands(shard)
                        main_log.debug('acknowledged {0} commands to panda server'.format(len(shard)))

                        # clean acknowledged commands
                        self.db_proxy.clean_commands_by_id(shard)

                    # clean commands that have been processed and do not need acknowledgement
                    self.db_proxy.clean_processed_commands()

                    # if we didn't collect the full bulk, give panda server a break
                    if len(commands) < bulk_size:
                        continuous_loop = False

            # check if being terminated
            if self.terminated(harvester_config.commandmanager.sleepTime, randomize=False):
                main_log.debug('terminated')
                return
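
Example #4 is a later revision of Example #2: the initial is_alive payload additionally carries sw_version and commit_stamp, and the periodic heartbeat sends an empty dict. The ten-minute heartbeat gating is identical in both; a self-contained sketch of that check:

import datetime

def heartbeat_due(last_heartbeat, interval_minutes=10):
    """Return True when no heartbeat was sent within the last interval,
    mirroring the lastHeartbeat check in both run() variants."""
    now = datetime.datetime.utcnow()
    return (last_heartbeat is None
            or last_heartbeat < now - datetime.timedelta(minutes=interval_minutes))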