def create_workers(self, worker_spec_list):
    """
    Report a list of workers to APFMon.

    The specs are split with ``core_utils.create_shards(..., 20)`` and each
    shard is sent with a single HTTP PUT to ``<base_url>/jobs``. Workers
    without a batchID are skipped. Reporting is best effort: a failed PUT is
    logged and the next shard is still attempted, and any other error is
    logged instead of being raised to the caller.

    :param worker_spec_list: iterable of worker specs; this method reads
        batchID, workerID, computingSite, computingElement and workAttributes
        from each of them
    :return: None
    """
    start_time = time.time()
    tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id),
                                     method_name='create_workers')

    if not self.__active:
        tmp_log.debug('APFMon reporting not enabled')
        return

    try:
        tmp_log.debug('start')

        url = '{0}/jobs'.format(self.base_url)

        for worker_spec_shard in core_utils.create_shards(worker_spec_list, 20):
            apfmon_workers = []
            for worker_spec in worker_spec_shard:
                batch_id = worker_spec.batchID
                worker_id = worker_spec.workerID
                if not batch_id:
                    tmp_log.debug('no batchID found for workerID {0}... skipping'.format(worker_id))
                    continue

                factory = self.harvester_id
                computingsite = worker_spec.computingSite
                try:
                    ce = clean_ce(worker_spec.computingElement)
                except AttributeError:
                    tmp_log.debug('no CE found for workerID {0} batchID {1}'.format(worker_id, batch_id))
                    ce = NO_CE

                # extract the log URLs; a missing attribute is reported as an empty string
                work_attribs = worker_spec.workAttributes or {}
                stdout_url = work_attribs.get('stdOut', '')
                stderr_url = work_attribs.get('stdErr', '')
                log_url = work_attribs.get('batchLog', '')
                jdl_url = work_attribs.get('jdl', '')

                apfmon_worker = {'cid': batch_id,
                                 'factory': factory,
                                 'label': '{0}-{1}'.format(computingsite, ce),
                                 'jdlurl': jdl_url,
                                 'stdouturl': stdout_url,
                                 'stderrurl': stderr_url,
                                 'logurl': log_url
                                 }
                tmp_log.debug('packed worker: {0}'.format(apfmon_worker))
                apfmon_workers.append(apfmon_worker)

            payload = json.dumps(apfmon_workers)

            # a failing shard must not prevent the remaining shards from being reported
            try:
                r = requests.put(url, data=payload, timeout=self.__worker_timeout)
                tmp_log.debug('worker creation for {0} ended with {1} {2}'.format(apfmon_workers,
                                                                                   r.status_code, r.text))
            except Exception:
                # the message previously had no placeholder for the traceback, so it was never logged
                tmp_log.debug('worker creation for {0} failed with {1}'.format(apfmon_workers,
                                                                               traceback.format_exc()))

        end_time = time.time()
        tmp_log.debug('done (took {0})'.format(end_time - start_time))
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt/SystemExit still propagate
        tmp_log.error('Excepted with: {0}'.format(traceback.format_exc()))
def run(self):
    """
    Main loop of the command manager.

    When the 'commandmanager' process lock is obtained at start-up, the
    commands to be received for the active queues in 'slave' run mode are
    announced through ``communicator.is_alive``. Afterwards the method loops
    until ``self.terminated(...)`` reports termination: while holding the lock
    (or in single mode) it sends a heartbeat at most every 10 minutes, fetches
    commands in bulks, stores them in the internal DB, acknowledges processed
    commands and cleans them up.

    :return: None
    """
    main_log = core_utils.make_logger(_logger, 'id={0}'.format(self.ident), method_name='run')
    bulk_size = harvester_config.commandmanager.commands_bulk_size

    has_lock = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                              harvester_config.commandmanager.sleepTime)
    if has_lock:
        # build the list of commands to be received
        announced_sites = set()
        wanted_commands = []
        active_queues = self.queueConfigMapper.get_active_queues()
        for _queue_name, queue_config in iteritems(active_queues):
            if queue_config is None:
                continue
            if queue_config.runMode != 'slave':
                continue
            site_name = queue_config.siteName
            resource_type = queue_config.resourceType
            # a single worker-stats command covers all queues of a site
            if site_name not in announced_sites:
                wanted_commands.append({'command': CommandSpec.COM_reportWorkerStats,
                                        'computingSite': site_name,
                                        'resourceType': resource_type})
                announced_sites.add(site_name)
            # while every queue gets its own setNWorkers command
            wanted_commands.append({'command': CommandSpec.COM_setNWorkers,
                                    'computingSite': site_name,
                                    'resourceType': resource_type})
        if wanted_commands:
            main_log.debug('sending command list to receive')
            self.communicator.is_alive({'startTime': datetime.datetime.utcnow(),
                                        'commands': wanted_commands})

    # main loop
    while True:
        has_lock = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                  harvester_config.commandmanager.sleepTime)
        if has_lock or self.singleMode:
            main_log.debug('polling commands loop')

            # heartbeat, at most once every 10 minutes
            last_beat = self.lastHeartbeat
            if last_beat is None \
                    or last_beat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                self.lastHeartbeat = datetime.datetime.utcnow()
                self.communicator.is_alive({'startTime': datetime.datetime.utcnow()})

            # keep retrieving for as long as full bulks come back
            while True:
                # commands from the panda server for this harvester instance
                commands = self.communicator.get_commands(bulk_size)
                main_log.debug('got {0} commands (bulk size: {1})'.format(len(commands), bulk_size))
                command_specs = self.convert_to_command_specs(commands)

                # cache them in the internal DB
                self.db_proxy.store_commands(command_specs)
                main_log.debug('cached {0} commands in internal DB'.format(len(command_specs)))

                # acknowledge the processed commands shard by shard, then drop them
                processed_ids = self.db_proxy.get_commands_ack()
                for id_shard in core_utils.create_shards(processed_ids, bulk_size):
                    self.communicator.ack_commands(id_shard)
                    main_log.debug('acknowledged {0} commands to panda server'.format(len(id_shard)))
                    self.db_proxy.clean_commands_by_id(id_shard)

                # processed commands that need no acknowledgement are dropped as well
                self.db_proxy.clean_processed_commands()

                # an incomplete bulk means nothing more is pending: give the panda server a break
                if len(commands) < bulk_size:
                    break

        # check if being terminated
        if self.terminated(harvester_config.commandmanager.sleepTime, randomize=False):
            main_log.debug('terminated')
            return
def create_labels(self):
    """
    Create or update the APFMon labels (= panda queue + CE) of the active queues.

    The names of the active queues are split with
    ``core_utils.create_shards(..., 20)`` and the labels of each shard are
    sent with a single HTTP PUT to ``<base_url>/labels``. For every queue the
    CE endpoint is taken from the local submitter configuration
    (``ceEndpoint``) when present, otherwise from the queues listed in the
    PandaQueuesDict entry, otherwise the NO_CE placeholder is used.

    Reporting is best effort: a queue that cannot be processed is logged and
    skipped, a failed PUT is logged and the next shard is still attempted,
    and nothing is raised to the caller.

    :return: None
    """
    start_time = time.time()
    tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id),
                                     method_name='create_labels')

    if not self.__active:
        tmp_log.debug('APFMon reporting not enabled')
        return

    try:
        tmp_log.debug('start')

        url = '{0}/labels'.format(self.base_url)

        # Snapshot the active queue names: the shards are consumed between HTTP
        # requests, so this avoids iterating a live dict view for that whole time.
        all_sites = list(self.queue_config_mapper.get_active_queues().keys())
        panda_queues_dict = PandaQueuesDict()

        # publish the active queues to APF mon in shards
        for sites in core_utils.create_shards(all_sites, 20):
            labels = []
            for site in sites:
                try:
                    site_info = panda_queues_dict.get(site, dict())
                    if not site_info:
                        tmp_log.warning('No site info for {0}'.format(site))
                        continue

                    # Prefer a CE endpoint from the local configuration. Without one, use the
                    # queues known for the site, and for queues with no associated CEs
                    # (e.g. P1, HPCs, etc.) fall back to a dummy value.
                    try:
                        ce = self.queue_config_mapper.queueConfig[site].submitter['ceEndpoint']
                        queues = [{'ce_endpoint': ce}]
                    except KeyError:
                        if site_info['queues']:
                            queues = site_info['queues']
                        else:
                            queues = [{'ce_endpoint': NO_CE}]

                    for queue in queues:
                        try:
                            ce = clean_ce(queue['ce_endpoint'])
                        except Exception:
                            # a missing or unparsable endpoint is reported as an empty CE
                            ce = ''

                        labels.append({'name': '{0}-{1}'.format(site, ce),
                                       'wmsqueue': site,
                                       'ce_queue_id': queue.get('ce_queue_id', 0),
                                       'factory': self.harvester_id
                                       })
                except Exception:
                    tmp_log.error('Excepted for site {0} with: {1}'.format(site, traceback.format_exc()))
                    continue

            payload = json.dumps(labels)

            # As in create_workers, a failing shard must not prevent the remaining
            # shards from being published.
            try:
                r = requests.put(url, data=payload, timeout=self.__label_timeout)
                tmp_log.debug('label creation for {0} ended with {1} {2}'.format(sites, r.status_code, r.text))
            except Exception:
                tmp_log.error('label creation for {0} failed with: {1}'.format(sites, traceback.format_exc()))

        end_time = time.time()
        tmp_log.debug('done (took {0})'.format(end_time - start_time))
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt/SystemExit still propagate
        tmp_log.error('Excepted with: {0}'.format(traceback.format_exc()))
def run(self):
    """
    Main loop of the command manager.

    When the 'commandmanager' process lock is obtained at start-up, an
    is_alive message carrying the start time, the software version and the
    commit stamp is sent; it also lists the commands to be received for the
    active queues in 'slave' run mode when there are any. Afterwards the
    method loops until ``self.terminated(...)`` reports termination: while
    holding the lock (or in single mode) it sends a heartbeat at most every
    10 minutes, fetches commands in bulks, stores them in the internal DB,
    acknowledges processed commands and cleans them up.

    :return: None
    """
    main_log = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
    bulk_size = harvester_config.commandmanager.commands_bulk_size

    has_lock = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                              harvester_config.commandmanager.sleepTime)
    if has_lock:
        # build the list of commands to be received
        announced_sites = set()
        wanted_commands = []
        active_queues = self.queueConfigMapper.get_active_queues()
        for _queue_name, queue_config in iteritems(active_queues):
            if queue_config is None:
                continue
            if queue_config.runMode != 'slave':
                continue
            site_name = queue_config.siteName
            resource_type = queue_config.resourceType
            # a single worker-stats command covers all queues of a site
            if site_name not in announced_sites:
                wanted_commands.append({'command': CommandSpec.COM_reportWorkerStats,
                                        'computingSite': site_name,
                                        'resourceType': resource_type})
                announced_sites.add(site_name)
            # while every queue gets its own setNWorkers command
            wanted_commands.append({'command': CommandSpec.COM_setNWorkers,
                                    'computingSite': site_name,
                                    'resourceType': resource_type})

        # the start-up message is sent even when there is no command to announce
        alive_data = {'startTime': datetime.datetime.utcnow(),
                      'sw_version': panda_pkg_info.release_version,
                      'commit_stamp': commit_timestamp.timestamp}
        if wanted_commands:
            main_log.debug('sending command list to receive')
            alive_data['commands'] = wanted_commands
        self.communicator.is_alive(alive_data)

    # main loop
    while True:
        has_lock = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                  harvester_config.commandmanager.sleepTime)
        if has_lock or self.singleMode:
            main_log.debug('polling commands loop')

            # heartbeat, at most once every 10 minutes
            last_beat = self.lastHeartbeat
            if last_beat is None \
                    or last_beat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                self.lastHeartbeat = datetime.datetime.utcnow()
                self.communicator.is_alive({})

            # keep retrieving for as long as full bulks come back
            while True:
                # commands from the panda server for this harvester instance
                commands = self.communicator.get_commands(bulk_size)
                main_log.debug('got {0} commands (bulk size: {1})'.format(len(commands), bulk_size))
                command_specs = self.convert_to_command_specs(commands)

                # cache them in the internal DB
                self.db_proxy.store_commands(command_specs)
                main_log.debug('cached {0} commands in internal DB'.format(len(command_specs)))

                # acknowledge the processed commands shard by shard, then drop them
                processed_ids = self.db_proxy.get_commands_ack()
                for id_shard in core_utils.create_shards(processed_ids, bulk_size):
                    self.communicator.ack_commands(id_shard)
                    main_log.debug('acknowledged {0} commands to panda server'.format(len(id_shard)))
                    self.db_proxy.clean_commands_by_id(id_shard)

                # processed commands that need no acknowledgement are dropped as well
                self.db_proxy.clean_processed_commands()

                # an incomplete bulk means nothing more is pending: give the panda server a break
                if len(commands) < bulk_size:
                    break

        # check if being terminated
        if self.terminated(harvester_config.commandmanager.sleepTime, randomize=False):
            main_log.debug('terminated')
            return