def stager(once=False, rses=None, mock=False, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.

    Repeatedly fetches STAGEIN requests, groups them into FTS jobs and submits
    them, until ``graceful_stop`` is set (or after one pass when ``once``).

    :param once:            Run a single iteration of the main loop and exit.
    :param rses:            List of RSE dicts (only their ``id`` is used) to restrict the selection; None means all.
    :param mock:            Passed through to __get_stagein_transfers (presumably enables mock sources — confirm in that helper).
    :param bulk:            Maximum number of transfers fetched per iteration.
    :param group_bulk:      Job grouping size; fewer transfers than this triggers the per-activity sleep.
    :param group_policy:    Grouping policy forwarded to bulk_group_transfer.
    :param source_strategy: Source selection strategy forwarded to bulk_group_transfer.
    :param activities:      Optional list of activities to process; None processes all (as a single [None] bucket).
    :param sleep_time:      Seconds to wait per activity when the queue is (nearly) empty.
    :param retry_other_fts: Forwarded to __get_stagein_transfers.
    """
    # --- configuration, each option falling back to a default when absent ---
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200  # 12 hours
    try:
        # Option format: "activity1:timelife1,activity2:timelife2,..."
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168  # presumably hours (one week) — confirm against bulk_group_transfer
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)

    # Next allowed execution time per activity; defaults to "now" on first access.
    activity_next_exe_time = defaultdict(time.time)
    executable = 'conveyor-stager'
    if activities:
        activities.sort()
        # NOTE(review): no space before '--activities' — the executable string becomes
        # 'conveyor-stager--activities [...]'; kept as-is since heartbeats key on it.
        executable += '--activities ' + str(activities)

    # --- heartbeat registration ---
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-stager[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Stager starting with bring_online %s seconds' % (bring_online))

    time.sleep(10)  # To prevent running on the same partition if all the poller restart at the same time

    # Re-acquire the heartbeat after the stagger delay so the worker slot is current.
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-stager[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Stager started')

    while not graceful_stop.is_set():
        try:
            # Refresh heartbeat each iteration: thread assignment may change as workers join/leave.
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
            prefix = 'conveyor-stager[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            if activities is None:
                activities = [None]  # a single catch-all bucket
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                # Skip this activity until its scheduled next execution time.
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                logger(logging.INFO, 'Starting to get stagein transfers for %s' % (activity))
                start_time = time.time()
                transfers = __get_stagein_transfers(total_workers=heart_beat['nr_threads'],
                                                    worker_number=heart_beat['assign_thread'],
                                                    failover_schemes=failover_scheme,
                                                    limit=bulk,
                                                    activity=activity,
                                                    rses=rse_ids,
                                                    mock=mock,
                                                    schemes=scheme,
                                                    bring_online=bring_online,
                                                    retry_other_fts=retry_other_fts,
                                                    logger=logger)
                record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.stager.get_stagein_transfers', len(transfers))
                record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', len(transfers))
                logger(logging.INFO, 'Got %s stagein transfers for %s' % (len(transfers), activity))

                # group transfers
                logger(logging.INFO, 'Starting to group transfers for %s' % (activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logger(logging.INFO, 'Starting to submit transfers for %s' % (activity))
                # submit transfers, one job at a time per external host
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logger=logger)

                # Near-empty queue: back off this activity for sleep_time seconds.
                if len(transfers) < group_bulk:
                    logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            # Keep the daemon alive on unexpected errors; full traceback goes to the log.
            logger(logging.CRITICAL, "Exception", exc_info=True)
        if once:
            break

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
def run_once(bulk, group_bulk, rse_ids, scheme, failover_scheme, transfertool_kwargs, heartbeat_handler, activity):
    """
    Execute a single stager pass for one activity.

    Fetches up to ``bulk`` STAGEIN transfer paths, groups them per transfertool
    into submission jobs and submits each job.

    :returns: True when fewer transfers than ``group_bulk`` were found (queue
              considered empty), False otherwise.
    """
    worker_number, total_workers, logger = heartbeat_handler.live()

    fetch_started_at = time.time()
    transfers = next_transfers_to_submit(
        total_workers=total_workers,
        worker_number=worker_number,
        failover_schemes=failover_scheme,
        limit=bulk,
        activity=activity,
        rses=rse_ids,
        schemes=scheme,
        transfertool_classes=[FTS3Transfertool],
        older_than=None,
        request_type=RequestType.STAGEIN,
        logger=logger,
    )
    # Total number of individual hops across every fetched path.
    total_transfers = sum(len(path) for paths in transfers.values() for path in paths)

    elapsed_ms = (time.time() - fetch_started_at) * 1000
    record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', elapsed_ms / (total_transfers if transfers else 1))
    record_counter('daemons.conveyor.stager.get_stagein_transfers', total_transfers)
    record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', total_transfers)
    logger(logging.INFO, 'Got %s stagein transfers for %s' % (total_transfers, activity))

    for builder, transfer_paths in transfers.items():
        extra_kwargs = transfertool_kwargs.get(builder.transfertool_class, {})
        transfertool_obj = builder.make_transfertool(logger=logger, **extra_kwargs)

        logger(logging.INFO, 'Starting to group transfers for %s (%s)' % (activity, transfertool_obj))
        group_started_at = time.time()
        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
        record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - group_started_at) * 1000 / (len(transfer_paths) or 1))

        logger(logging.INFO, 'Starting to submit transfers for %s (%s)' % (activity, transfertool_obj))
        for job in grouped_jobs:
            # Refresh the heartbeat between submissions: jobs can take a while.
            worker_number, total_workers, logger = heartbeat_handler.live()
            submit_transfer(transfertool_obj=transfertool_obj,
                            transfers=job['transfers'],
                            job_params=job['job_params'],
                            submitter='transfer_submitter',
                            logger=logger)

    queue_empty = total_transfers < group_bulk
    if queue_empty:
        logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s' % (total_transfers, activity, group_bulk))
    return queue_empty
def submitter(once=False, rses=None, mock=False, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None,
              activities=None, sleep_time=600, max_sources=4, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.

    Fetches transfer requests per activity, groups them into jobs and submits
    them via the configured transfertool (FTS3/mock or Globus), until
    ``graceful_stop`` is set (or after one pass when ``once``).

    :param once:            Run a single iteration of the main loop and exit.
    :param rses:            List of RSE dicts (only their ``id`` is used) to restrict the selection; None means all.
    :param mock:            Forwarded to __get_transfers.
    :param bulk:            Maximum number of transfers fetched per iteration.
    :param group_bulk:      Job grouping size; fewer transfers than this triggers the per-activity sleep.
    :param group_policy:    Grouping policy forwarded to bulk_group_transfer.
    :param source_strategy: Source selection strategy forwarded to bulk_group_transfer.
    :param activities:      Optional list of activities to process; None processes all (as a single [None] bucket).
    :param sleep_time:      Seconds to wait per activity when the queue is (nearly) empty.
    :param max_sources:     Maximum number of sources per transfer, forwarded to __get_transfers.
    :param retry_other_fts: Forwarded to __get_transfers.
    """
    # --- configuration, each option falling back to a default when absent ---
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200  # 12 hours
    try:
        # Option format: "activity1:timelife1,activity2:timelife2,..."
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168  # presumably hours (one week) — confirm against bulk_group_transfer
    logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue)

    # Next allowed execution time per activity; defaults to "now" on first access.
    activity_next_exe_time = defaultdict(time.time)
    executable = "conveyor-submitter"
    if activities:
        activities.sort()
        # NOTE(review): no space before '--activities'; kept as-is since heartbeats key on this string.
        executable += '--activities ' + str(activities)

    # --- heartbeat registration ---
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Submitter starting with timeout %s', prepend_str, timeout)

    time.sleep(10)  # To prevent running on the same partition if all the poller restart at the same time

    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Transfer submitter started', prepend_str)

    while not graceful_stop.is_set():
        if activities is None:
            activities = [None]  # a single catch-all bucket
        if rses:
            rse_ids = [rse['id'] for rse in rses]
        else:
            rse_ids = None
        for activity in activities:
            # Per-activity try: a failure in one activity must not block the others.
            try:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
                prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

                # CMS-specific user transfer mode (USER_ACTIVITY/USER_TRANSFERS are module globals).
                user_transfer = False
                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info('%s CMS user transfer activity', prepend_str)
                    user_transfer = True

                logging.info('%s Starting to get transfer transfers for %s', prepend_str, activity)
                start_time = time.time()
                transfers = __get_transfers(total_workers=heart_beat['nr_threads'],
                                            worker_number=heart_beat['assign_thread'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logging.info('%s Got %s transfers for %s in %s seconds', prepend_str, len(transfers), activity, time.time() - start_time)

                # group transfers
                logging.info('%s Starting to group transfers for %s', prepend_str, activity)
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logging.info('%s Starting to submit transfers for %s', prepend_str, activity)
                if TRANSFER_TOOL in ['fts3', 'mock']:
                    for external_host in grouped_jobs:
                        if not user_transfer:
                            for job in grouped_jobs[external_host]:
                                # submit transfers
                                submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                        else:
                            # User-transfer mode: jobs are nested one level deeper (keyed per user).
                            for _, jobs in iteritems(grouped_jobs[external_host]):
                                # submit transfers
                                for job in jobs:
                                    submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout, user_transfer_job=user_transfer)
                elif TRANSFER_TOOL == 'globus':
                    if TRANSFER_TYPE == 'bulk':
                        # build bulk job file list per external host to send to submit_transfer
                        for external_host in grouped_jobs:
                            # pad the job with job_params; irrelevant for globus but needed for further rucio parsing
                            # NOTE(review): job_params are read from grouped_jobs[''] regardless of external_host —
                            # presumably the globus grouping keys everything under ''; verify against bulk_group_transfer.
                            submitjob = {'files': [], 'job_params': grouped_jobs[''][0].get('job_params')}
                            for job in grouped_jobs[external_host]:
                                submitjob.get('files').append(job.get('files')[0])
                            logging.debug('submitjob: %s' % submitjob)
                            submit_transfer(external_host=external_host, job=submitjob, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                    else:
                        # build single job files and individually send to submit_transfer
                        job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs else None
                        for external_host in grouped_jobs:
                            for job in grouped_jobs[external_host]:
                                for file in job['files']:
                                    singlejob = {'files': [file], 'job_params': job_params}
                                    logging.debug('singlejob: %s' % singlejob)
                                    submit_transfer(external_host=external_host, job=singlejob, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                else:
                    logging.error(prepend_str + 'Unknown transfer tool')

                # Near-empty queue: back off this activity for sleep_time seconds.
                if len(transfers) < group_bulk:
                    logging.info('%s Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', prepend_str, len(transfers), activity, group_bulk, sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                # Keep the daemon alive; traceback goes to the log.
                logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
        if once:
            break

    logging.info('%s Graceful stop requested', prepend_str)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop done', prepend_str)
    return
def stager(once=False, rses=None, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600):
    """
    Main loop to submit a new STAGEIN transfer primitive to a transfertool.

    Fetches STAGEIN requests per activity, groups them per transfertool into
    submission jobs and submits them, until ``graceful_stop`` is set (or after
    one pass when ``once``).

    :param once:            Run a single iteration of the main loop and exit.
    :param rses:            List of RSE dicts (only their ``id`` is used) to restrict the selection; None means all.
    :param bulk:            Maximum number of transfers fetched per iteration.
    :param group_bulk:      Job grouping size; fewer transfers than this triggers the per-activity sleep.
    :param group_policy:    Grouping policy forwarded to the FTS3 transfertool.
    :param source_strategy: Source selection strategy forwarded to the FTS3 transfertool.
    :param activities:      Optional list of activities to process; None processes all (as a single [None] bucket).
    :param sleep_time:      Seconds to wait per activity when the queue is (nearly) empty.
    """
    # --- configuration, each option falling back to a default when absent ---
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200  # 12 hours
    try:
        # Option format: "activity1:timelife1,activity2:timelife2,..."
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168  # presumably hours (one week) — confirm in the transfertool
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)

    # Next allowed execution time per activity; defaults to "now" on first access.
    activity_next_exe_time = defaultdict(time.time)
    logger_prefix = executable = 'conveyor-stager'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)

    with HeartbeatHandler(executable=executable, logger_prefix=logger_prefix) as heartbeat_handler:
        logger = heartbeat_handler.logger
        logger(logging.INFO, 'Stager starting with bring_online %s seconds' % (bring_online))

        while not graceful_stop.is_set():
            try:
                heart_beat, logger = heartbeat_handler.live()
                if activities is None:
                    activities = [None]  # a single catch-all bucket
                if rses:
                    rse_ids = [rse['id'] for rse in rses]
                else:
                    rse_ids = None
                for activity in activities:
                    # Skip this activity until its scheduled next execution time.
                    if activity_next_exe_time[activity] > time.time():
                        graceful_stop.wait(1)
                        continue

                    logger(logging.INFO, 'Starting to get stagein transfers for %s' % (activity))
                    start_time = time.time()
                    transfertool_kwargs = {
                        FTS3Transfertool: {
                            'group_policy': group_policy,
                            'group_bulk': group_bulk,
                            'source_strategy': source_strategy,
                            'max_time_in_queue': max_time_in_queue,
                            'bring_online': bring_online,
                            'default_lifetime': -1,
                        }
                    }
                    transfers = transfer_core.next_transfers_to_submit(
                        total_workers=heart_beat['nr_threads'],
                        worker_number=heart_beat['assign_thread'],
                        failover_schemes=failover_scheme,
                        limit=bulk,
                        activity=activity,
                        rses=rse_ids,
                        schemes=scheme,
                        transfertools_by_name={'fts3': FTS3Transfertool},
                        older_than=None,
                        request_type=RequestType.STAGEIN,
                        logger=logger,
                    )
                    # Total number of individual hops across every fetched path.
                    total_transfers = len(list(hop for paths in transfers.values() for path in paths for hop in path))
                    record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (total_transfers if transfers else 1))
                    record_counter('daemons.conveyor.stager.get_stagein_transfers', total_transfers)
                    record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', total_transfers)
                    logger(logging.INFO, 'Got %s stagein transfers for %s' % (total_transfers, activity))

                    for builder, transfer_paths in transfers.items():
                        transfertool_obj = builder.make_transfertool(logger=logger, **transfertool_kwargs.get(builder.transfertool_class, {}))
                        logger(logging.INFO, 'Starting to group transfers for %s (%s)' % (activity, transfertool_obj))
                        start_time = time.time()
                        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
                        record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfer_paths) or 1))

                        logger(logging.INFO, 'Starting to submit transfers for %s (%s)' % (activity, transfertool_obj))
                        for job in grouped_jobs:
                            submit_transfer(transfertool_obj=transfertool_obj, transfers=job['transfers'], job_params=job['job_params'], submitter='transfer_submitter', logger=logger)

                    # Near-empty queue: back off this activity for sleep_time seconds.
                    if total_transfers < group_bulk:
                        logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (total_transfers, activity, group_bulk, sleep_time))
                        if activity_next_exe_time[activity] < time.time():
                            activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                # FIX: the original did `raise` here, which made the try/except a no-op and
                # killed the daemon on any transient error. Log and keep running, consistent
                # with the other main loops in this file; still propagate in single-pass mode.
                logger(logging.CRITICAL, 'Exception', exc_info=True)
                if once:
                    raise
            if once:
                break
def submitter(once=False, rses=None, mock=False, bulk=100, group_bulk=1, group_policy='rule', fts_source_strategy='auto',
              activities=None, sleep_time=600, max_sources=4, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.

    Fetches transfer requests per activity, groups them into FTS jobs and
    submits them, until ``graceful_stop`` is set (or after one pass when
    ``once``).

    :param once:                Run a single iteration of the main loop and exit.
    :param rses:                List of RSE dicts (only their ``id`` is used) to restrict the selection; None means all.
    :param mock:                Forwarded to __get_transfers.
    :param bulk:                Maximum number of transfers fetched per iteration.
    :param group_bulk:          Job grouping size; fewer transfers than this triggers the per-activity sleep.
    :param group_policy:        Grouping policy forwarded to bulk_group_transfer.
    :param fts_source_strategy: Source selection strategy forwarded to bulk_group_transfer.
    :param activities:          Optional list of activities to process; None processes all (as a single [None] bucket).
    :param sleep_time:          Seconds to wait per activity when the queue is (nearly) empty.
    :param max_sources:         Maximum number of sources per transfer, forwarded to __get_transfers.
    :param retry_other_fts:     Forwarded to __get_transfers.
    """
    # --- configuration, each option falling back to a default when absent ---
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200  # 12 hours
    try:
        # Option format: "activity1:timelife1,activity2:timelife2,..."
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168  # presumably hours (one week) — confirm against bulk_group_transfer
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)

    # Next allowed execution time per activity; defaults to "now" on first access.
    activity_next_exe_time = defaultdict(time.time)
    executable = sys.argv[0]
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)

    # --- heartbeat registration ---
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Submitter starting with timeout %s' % (timeout))

    time.sleep(10)  # To prevent running on the same partition if all the poller restart at the same time

    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Transfer submitter started')

    while not graceful_stop.is_set():
        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])

            if activities is None:
                activities = [None]  # a single catch-all bucket
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                # Skip this activity until its scheduled next execution time.
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                # CMS-specific user transfer mode (USER_ACTIVITY/USER_TRANSFERS are module globals).
                user_transfer = False
                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info(prepend_str + "CMS user transfer activity")
                    user_transfer = True

                logging.info(prepend_str + 'Starting to get transfer transfers for %s' % (activity))
                start_time = time.time()
                transfers = __get_transfers(total_workers=heart_beat['nr_threads'] - 1,
                                            worker_number=heart_beat['assign_thread'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logging.info(prepend_str + 'Got %s transfers for %s in %s seconds' % (len(transfers), activity, time.time() - start_time))

                # group transfers
                logging.info(prepend_str + 'Starting to group transfers for %s' % (activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, fts_source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logging.info(prepend_str + 'Starting to submit transfers for %s' % (activity))
                for external_host in grouped_jobs:
                    if not user_transfer:
                        for job in grouped_jobs[external_host]:
                            # submit transfers
                            submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                    else:
                        # FIX: was `grouped_jobs[external_host].iteritems()`, a Python-2-only dict
                        # method that raises AttributeError on Python 3; `.items()` behaves
                        # identically for iteration on both versions.
                        for _, jobs in grouped_jobs[external_host].items():
                            # submit transfers
                            for job in jobs:
                                submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout, user_transfer_job=user_transfer)

                # Near-empty queue: back off this activity for sleep_time seconds.
                if len(transfers) < group_bulk:
                    logging.info(prepend_str + 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            # Keep the daemon alive; traceback goes to the log.
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))
        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
    return
def submitter(once=False, rses=None, partition_wait_time=10, bulk=100, group_bulk=1, group_policy='rule',
              source_strategy=None, activities=None, sleep_time=600, max_sources=4,
              archive_timeout_override=None, filter_transfertool=FILTER_TRANSFERTOOL, transfertool=TRANSFER_TOOL,
              transfertype=TRANSFER_TYPE, ignore_availability=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.

    Schedules activities through a priority queue keyed by their next execution
    time, fetches transfer paths, groups them per transfertool and submits the
    resulting jobs, until ``graceful_stop`` is set.

    :param once:                     Process each activity exactly once (pop instead of reschedule) and raise on error.
    :param rses:                     List of RSE dicts (only their ``id`` is used) to restrict the selection; None means all.
    :param partition_wait_time:      Initial wait so restarting workers spread across partitions.
    :param bulk:                     Maximum number of transfers fetched per iteration.
    :param group_bulk:               Job grouping size; fewer transfers than this triggers the per-activity sleep.
    :param group_policy:             FTS3 grouping policy.
    :param source_strategy:          FTS3 source selection strategy.
    :param activities:               Optional list of activities to process; None processes all (as a single [None] bucket).
    :param sleep_time:               Seconds to delay an activity whose queue is (nearly) empty.
    :param max_sources:              NOTE(review): accepted but not used in this body — possibly kept for CLI compatibility.
    :param archive_timeout_override: Forwarded to the FTS3 transfertool.
    :param filter_transfertool:      Only pick up transfers assigned to this transfertool.
    :param transfertool:             Transfertool name used to resolve the implementation class.
    :param transfertype:             Globus grouping policy.
    :param ignore_availability:      Forwarded to next_transfers_to_submit.
    """
    # --- configuration, each option falling back to a default when absent ---
    try:
        partition_hash_var = config_get('conveyor', 'partition_hash_var')
    except NoOptionError:
        partition_hash_var = None
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200  # 12 hours
    try:
        # Option format: "activity1:timelife1,activity2:timelife2,..."
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168  # presumably hours (one week) — confirm in the transfertool
    logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue)

    # NOTE(review): dead assignment — this defaultdict is replaced by a PriorityQueue below
    # before ever being read; likely a leftover from an earlier version of the loop.
    activity_next_exe_time = defaultdict(time.time)
    logger_prefix = executable = "conveyor-submitter"
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if filter_transfertool:
        executable += ' --filter-transfertool ' + filter_transfertool

    if activities is None:
        activities = [None]  # a single catch-all bucket
    if rses:
        rse_ids = [rse['id'] for rse in rses]
    else:
        rse_ids = None

    with HeartbeatHandler(executable=executable, logger_prefix=logger_prefix) as heartbeat_handler:
        logger = heartbeat_handler.logger
        logger(logging.INFO, 'Submitter starting with timeout %s', timeout)

        if partition_wait_time:
            graceful_stop.wait(partition_wait_time)

        # Priority queue of activities keyed by next execution time (project PriorityQueue:
        # supports top()/pop() plus dict-style priority assignment — confirm its API).
        activity_next_exe_time = PriorityQueue()
        for activity in activities:
            activity_next_exe_time[activity] = time.time()

        # In `once` mode activities are popped, so the queue draining ends the loop.
        while not graceful_stop.is_set() and activity_next_exe_time:
            try:
                time_to_sleep = 0
                if once:
                    activity = activity_next_exe_time.pop()
                else:
                    activity = activity_next_exe_time.top()
                    time_to_sleep = activity_next_exe_time[activity] - time.time()
                    # Bump by one second so another pass doesn't immediately re-pick it.
                    activity_next_exe_time[activity] = time.time() + 1
                if time_to_sleep > 0:
                    logger(logging.DEBUG, 'Switching to activity %s and sleeping %s seconds', activity, time_to_sleep)
                    graceful_stop.wait(time_to_sleep)
                else:
                    logger(logging.DEBUG, 'Switching to activity %s', activity)

                heart_beat, logger = heartbeat_handler.live(older_than=3600)

                start_time = time.time()
                # Per-transfertool constructor arguments, keyed by transfertool class.
                transfertool_kwargs = {
                    FTS3Transfertool: {
                        'group_policy': group_policy,
                        'group_bulk': group_bulk,
                        'source_strategy': source_strategy,
                        'max_time_in_queue': max_time_in_queue,
                        'bring_online': bring_online,
                        'default_lifetime': 172800,
                        'archive_timeout_override': archive_timeout_override,
                    },
                    GlobusTransferTool: {
                        'group_policy': transfertype,
                        'group_bulk': group_bulk,
                    },
                }
                transfers = transfer_core.next_transfers_to_submit(
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    partition_hash_var=partition_hash_var,
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    schemes=scheme,
                    filter_transfertool=filter_transfertool,
                    transfertools_by_name={transfertool: TRANSFERTOOL_CLASSES_BY_NAME[transfertool]},
                    older_than=None,
                    request_type=RequestType.TRANSFER,
                    ignore_availability=ignore_availability,
                    logger=logger,
                )
                # Total number of individual hops across every fetched path.
                total_transfers = len(list(hop for paths in transfers.values() for path in paths for hop in path))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (total_transfers or 1))
                GET_TRANSFERS_COUNTER.inc(total_transfers)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', total_transfers)
                logger(logging.INFO, '%sGot %s transfers for %s in %s seconds', 'Slept %s seconds, then ' % time_to_sleep if time_to_sleep > 0 else '', total_transfers, activity, time.time() - start_time)

                for builder, transfer_paths in transfers.items():
                    transfertool_obj = builder.make_transfertool(logger=logger, **transfertool_kwargs.get(builder.transfertool_class, {}))
                    start_time = time.time()
                    logger(logging.DEBUG, 'Starting to group transfers for %s (%s)', activity, transfertool_obj)
                    grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
                    record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfer_paths) or 1))

                    logger(logging.DEBUG, 'Starting to submit transfers for %s (%s)', activity, transfertool_obj)
                    for job in grouped_jobs:
                        logger(logging.DEBUG, 'submitjob: transfers=%s, job_params=%s' % ([str(t) for t in job['transfers']], job['job_params']))
                        submit_transfer(transfertool_obj=transfertool_obj, transfers=job['transfers'], job_params=job['job_params'], submitter='transfer_submitter', timeout=timeout, logger=logger)

                # Near-empty queue: reschedule this activity sleep_time seconds in the future.
                if not once and total_transfers < group_bulk:
                    logger(logging.DEBUG, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', total_transfers, activity, group_bulk, sleep_time)
                    activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                # Keep the daemon alive on unexpected errors; propagate in single-pass mode.
                logger(logging.CRITICAL, 'Exception', exc_info=True)
                if once:
                    raise
def submitter(once=False, rses=None, partition_wait_time=10, bulk=100, group_bulk=1, group_policy='rule',
              source_strategy=None, activities=None, sleep_time=600, max_sources=4, retry_other_fts=False,
              archive_timeout_override=None, filter_transfertool=FILTER_TRANSFERTOOL, transfertool=TRANSFER_TOOL, transfertype=TRANSFER_TYPE):
    """
    Main loop to submit a new transfer primitive to a transfertool.

    Fetches transfer requests per activity, groups them for the configured
    transfertool (fts3/mock or globus) and submits the resulting jobs, until
    ``graceful_stop`` is set (or after one pass when ``once``).

    :param once:                     Run a single iteration of the main loop and exit.
    :param rses:                     List of RSE dicts (only their ``id`` is used) to restrict the selection; None means all.
    :param partition_wait_time:      Seconds to sleep before starting, so restarting workers spread across partitions.
    :param bulk:                     Maximum number of transfers fetched per iteration.
    :param group_bulk:               Job grouping size; fewer transfers than this triggers the per-activity sleep.
    :param group_policy:             FTS grouping policy.
    :param source_strategy:          FTS source selection strategy.
    :param activities:               Optional list of activities to process; None processes all (as a single [None] bucket).
    :param sleep_time:               Seconds to wait per activity when the queue is (nearly) empty.
    :param max_sources:              Maximum number of sources per transfer, forwarded to __get_transfers.
    :param retry_other_fts:          Forwarded to __get_transfers.
    :param archive_timeout_override: Forwarded to bulk_group_transfers_for_fts.
    :param filter_transfertool:      Only pick up transfers assigned to this transfertool.
    :param transfertool:             Transfertool used for grouping/submission ('fts3', 'mock' or 'globus').
    :param transfertype:             Globus grouping policy ('bulk' or single).
    """
    # --- configuration, each option falling back to a default when absent ---
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200  # 12 hours
    try:
        # Option format: "activity1:timelife1,activity2:timelife2,..."
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168  # presumably hours (one week) — confirm against bulk_group_transfers_for_fts
    logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue)

    # Next allowed execution time per activity; defaults to "now" on first access.
    activity_next_exe_time = defaultdict(time.time)
    executable = "conveyor-submitter"
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if filter_transfertool:
        executable += ' --filter-transfertool ' + filter_transfertool

    # --- heartbeat registration ---
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Submitter starting with timeout %s', timeout)

    if partition_wait_time:
        time.sleep(partition_wait_time)  # To prevent running on the same partition if all the poller restart at the same time

    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Transfer submitter started')

    while not graceful_stop.is_set():
        if activities is None:
            activities = [None]  # a single catch-all bucket
        if rses:
            rse_ids = [rse['id'] for rse in rses]
        else:
            rse_ids = None
        for activity in activities:
            # Per-activity try: a failure in one activity must not block the others.
            try:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
                prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
                logger = formatted_logger(logging.log, prefix + '%s')

                logger(logging.INFO, 'Starting to get transfer transfers for %s', activity)
                start_time = time.time()
                transfers = __get_transfers(total_workers=heart_beat['nr_threads'],
                                            worker_number=heart_beat['assign_thread'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts,
                                            transfertool=filter_transfertool,
                                            logger=logger)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logger(logging.INFO, 'Got %s transfers for %s in %s seconds', len(transfers), activity, time.time() - start_time)

                logger(logging.INFO, 'Starting to group transfers for %s', activity)
                start_time = time.time()
                grouped_jobs = {}
                if transfertool in ['fts3', 'mock']:
                    # bulk_group_transfers_for_fts expects single hop transfers in parameter. Split multihop ones
                    single_hop_transfers = {}
                    for transfer_path in transfers.values():
                        for hop in transfer_path:
                            single_hop_transfers[hop.rws.request_id] = hop
                    transfers = single_hop_transfers
                    grouped_jobs = bulk_group_transfers_for_fts(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue, archive_timeout_override=archive_timeout_override)
                elif transfertool == 'globus':
                    grouped_jobs = bulk_group_transfers_for_globus(transfers, transfertype, group_bulk)
                else:
                    # Unknown tool: grouped_jobs stays empty, so nothing is submitted below.
                    logger(logging.ERROR, 'Unknown transfer tool')
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logger(logging.INFO, 'Starting to submit transfers for %s', activity)
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        logger(logging.DEBUG, 'submitjob: %s' % job)
                        submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', timeout=timeout, logger=logger, transfertool=transfertool)

                # Near-empty queue: back off this activity for sleep_time seconds.
                if len(transfers) < group_bulk:
                    logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', len(transfers), activity, group_bulk, sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                # Keep the daemon alive; traceback goes to the log.
                logger(logging.CRITICAL, 'Exception', exc_info=True)
        if once:
            break

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
    return
def run_once(bulk, group_bulk, filter_transfertool, transfertool, ignore_availability, rse_ids, scheme, failover_scheme, partition_hash_var, timeout, transfertool_kwargs, heartbeat_handler, activity):
    """
    Execute a single submitter pass for one activity.

    Fetches up to ``bulk`` transfer paths, groups them per transfertool into
    submission jobs and submits each job.

    :returns: True when fewer transfers than ``group_bulk`` were found (queue
              considered empty), False otherwise.
    """
    worker_number, total_workers, logger = heartbeat_handler.live()

    fetch_started_at = time.time()
    transfers = next_transfers_to_submit(
        total_workers=total_workers,
        worker_number=worker_number,
        partition_hash_var=partition_hash_var,
        failover_schemes=failover_scheme,
        limit=bulk,
        activity=activity,
        rses=rse_ids,
        schemes=scheme,
        filter_transfertool=filter_transfertool,
        transfertools_by_name={transfertool: TRANSFERTOOL_CLASSES_BY_NAME[transfertool]},
        older_than=None,
        request_type=RequestType.TRANSFER,
        ignore_availability=ignore_availability,
        logger=logger,
    )
    # Total number of individual hops across every fetched path.
    total_transfers = sum(len(path) for paths in transfers.values() for path in paths)

    fetch_elapsed = time.time() - fetch_started_at
    record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', fetch_elapsed * 1000 / (total_transfers or 1))
    GET_TRANSFERS_COUNTER.inc(total_transfers)
    record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', total_transfers)
    logger(logging.INFO, 'Got %s transfers for %s in %s seconds', total_transfers, activity, time.time() - fetch_started_at)

    for builder, transfer_paths in transfers.items():
        extra_kwargs = transfertool_kwargs.get(builder.transfertool_class, {})
        transfertool_obj = builder.make_transfertool(logger=logger, **extra_kwargs)

        group_started_at = time.time()
        logger(logging.DEBUG, 'Starting to group transfers for %s (%s)', activity, transfertool_obj)
        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
        record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - group_started_at) * 1000 / (len(transfer_paths) or 1))

        logger(logging.DEBUG, 'Starting to submit transfers for %s (%s)', activity, transfertool_obj)
        for job in grouped_jobs:
            # Refresh the heartbeat between submissions: jobs can take a while.
            worker_number, total_workers, logger = heartbeat_handler.live()
            logger(logging.DEBUG, 'submitjob: transfers=%s, job_params=%s' % ([str(t) for t in job['transfers']], job['job_params']))
            submit_transfer(transfertool_obj=transfertool_obj,
                            transfers=job['transfers'],
                            job_params=job['job_params'],
                            submitter='transfer_submitter',
                            timeout=timeout,
                            logger=logger)

    queue_empty = total_transfers < group_bulk
    if queue_empty:
        logger(logging.DEBUG, 'Only %s transfers for %s which is less than group bulk %s', total_transfers, activity, group_bulk)
    return queue_empty