def submitter(once=False, rses=None, mock=False, process=0, total_processes=1, total_threads=1, bulk=100, group_bulk=1, group_policy='rule', fts_source_strategy='auto', activities=None, sleep_time=600, max_sources=4, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.

    :param once: If True, execute a single pass of the main loop and return.
    :param rses: List of RSE dicts (each providing an 'id' key) restricting the
                 transfer query; None or an empty list means no restriction.
                 (Default changed from the shared mutable ``[]`` to ``None``;
                 behaviour is identical since both are falsy.)
    :param mock: If True, operate on mock sources.
    :param process: Index of this process among total_processes.
    :param total_processes: Total number of submitter processes.
    :param total_threads: Number of worker threads in the submission thread pool.
    :param bulk: Maximum number of transfers fetched per activity and iteration.
    :param group_bulk: Maximum number of transfers grouped into a single job.
    :param group_policy: Grouping policy passed to bulk_group_transfer (e.g. 'rule').
    :param fts_source_strategy: Source selection strategy passed to bulk_group_transfer.
    :param activities: List of activities to process; None means one unrestricted pass.
    :param sleep_time: Seconds to back an activity off when fewer than group_bulk
                       transfers were found.
    :param max_sources: Maximum number of sources per transfer.
    :param retry_other_fts: Whether transfers may be retried on another FTS server.
    """
    logging.info('Transfer submitter starting - process (%i/%i) threads (%i)' % (process, total_processes, total_threads))

    # Optional settings from the [conveyor] config section; each falls back to a
    # default when the option is absent.
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    # max_time_in_queue is configured as "activity1:hours1,activity2:hours2,...".
    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)

    # Register this worker with the heartbeat system.
    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)

    logging.info('Transfer submitter started - process (%i/%i) threads (%i/%i) timeout (%s)' % (process, total_processes, hb['assign_thread'], hb['nr_threads'], timeout))

    threadPool = ThreadPool(total_threads)
    # Per-activity timestamp before which the activity is skipped (back-off).
    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():
        try:
            # Renew the heartbeat so this worker keeps its partition assignment.
            hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                # Honour the per-activity back-off set at the end of the loop.
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                user_transfer = False
                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info("CMS user transfer activity")
                    user_transfer = True

                logging.info("%s:%s Starting to get transfer transfers for %s" % (process, hb['assign_thread'], activity))
                ts = time.time()
                transfers = __get_transfers(process=process,
                                            total_processes=total_processes,
                                            thread=hb['assign_thread'],
                                            total_threads=hb['nr_threads'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logging.info("%s:%s Got %s transfers for %s" % (process, hb['assign_thread'], len(transfers), activity))

                # Group the fetched transfers into submission jobs.
                logging.info("%s:%s Starting to group transfers for %s" % (process, hb['assign_thread'], activity))
                ts = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, fts_source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1))

                logging.info("%s:%s Starting to submit transfers for %s" % (process, hb['assign_thread'], activity))
                for external_host in grouped_jobs:
                    if not user_transfer:
                        for job in grouped_jobs[external_host]:
                            # submit transfers via the thread pool
                            job_requests = makeRequests(submit_transfer, args_list=[((), {'external_host': external_host, 'job': job, 'submitter': 'transfer_submitter', 'process': process, 'thread': hb['assign_thread'], 'timeout': timeout})])
                            for job_req in job_requests:
                                threadPool.putRequest(job_req)
                    else:
                        # User transfers are additionally keyed per user.
                        # .items() instead of the Python-2-only .iteritems().
                        for user, jobs in grouped_jobs[external_host].items():
                            # submit transfers via the thread pool
                            for job in jobs:
                                job_requests = makeRequests(submit_transfer, args_list=[((), {'external_host': external_host, 'job': job, 'submitter': 'transfer_submitter', 'process': process, 'thread': hb['assign_thread'], 'timeout': timeout, 'user_transfer_job': user_transfer})])
                                for job_req in job_requests:
                                    threadPool.putRequest(job_req)
                threadPool.wait()

                # Nothing much to do for this activity: back it off for sleep_time.
                if len(transfers) < group_bulk:
                    logging.info('%i:%i - only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (process, hb['assign_thread'], len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            # 'except Exception' instead of a bare 'except:' so SystemExit and
            # KeyboardInterrupt can still terminate the daemon.
            logging.critical('%s:%s %s' % (process, hb['assign_thread'], traceback.format_exc()))

        if once:
            break

    logging.info('%s:%s graceful stop requested' % (process, hb['assign_thread']))
    threadPool.dismissWorkers(total_threads, do_join=True)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s:%s graceful stop done' % (process, hb['assign_thread']))
    return
def stager(once=False, rses=None, mock=False, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600, retry_other_fts=False):
    """
    Main loop to submit new stage-in transfer primitives to a transfertool.

    :param once: If True, execute a single pass of the main loop and return.
    :param rses: List of RSE dicts (each providing an 'id' key) restricting the
                 transfer query; None means no restriction.
    :param mock: If True, operate on mock sources.
    :param bulk: Maximum number of transfers fetched per activity and iteration.
    :param group_bulk: Maximum number of transfers grouped into a single job.
    :param group_policy: Grouping policy passed to bulk_group_transfer (e.g. 'rule').
    :param source_strategy: Source selection strategy passed to bulk_group_transfer.
    :param activities: List of activities to process; None means one unrestricted pass.
    :param sleep_time: Seconds to back an activity off when fewer than group_bulk
                       transfers were found.
    :param retry_other_fts: Whether transfers may be retried on another FTS server.
    """
    # Optional settings from the [conveyor] config section; each falls back to a
    # default when the option is absent.
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200
    # max_time_in_queue is configured as "activity1:hours1,activity2:hours2,...".
    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)
    # Per-activity timestamp before which the activity is skipped (back-off).
    activity_next_exe_time = defaultdict(time.time)
    # Register this worker with the heartbeat system.
    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Stager starting with bring_online %s seconds' % (bring_online))
    time.sleep(10)  # To prevent running on the same partition if all the poller restart at the same time
    # Refresh the heartbeat after the stagger sleep, since the thread
    # assignment may have changed while other workers registered.
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Stager started')
    while not graceful_stop.is_set():
        try:
            # Renew the heartbeat every iteration and rebuild the log prefix
            # with the (possibly updated) thread assignment.
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                # Honour the per-activity back-off set at the end of the loop.
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue
                logging.info(prepend_str + 'Starting to get stagein transfers for %s' % (activity))
                start_time = time.time()
                transfers = __get_stagein_transfers(total_workers=heart_beat['nr_threads'],
                                                    worker_number=heart_beat['assign_thread'],
                                                    failover_schemes=failover_scheme,
                                                    limit=bulk,
                                                    activity=activity,
                                                    rses=rse_ids,
                                                    mock=mock,
                                                    schemes=scheme,
                                                    bring_online=bring_online,
                                                    retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.stager.get_stagein_transfers', len(transfers))
                record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', len(transfers))
                logging.info(prepend_str + 'Got %s stagein transfers for %s' % (len(transfers), activity))
                # group transfers into submission jobs
                logging.info(prepend_str + 'Starting to group transfers for %s' % (activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                logging.info(prepend_str + 'Starting to submit transfers for %s' % (activity))
                # submit transfers, one job at a time per external host
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str)
                # Nothing much to do for this activity: back it off for sleep_time.
                if len(transfers) < group_bulk:
                    logging.info(prepend_str + 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            # Keep the daemon alive on unexpected errors; log the full traceback.
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))
        if once:
            break
    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def submitter(once=False, rses=None, mock=False, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600, max_sources=4, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.

    :param once: If True, execute a single pass of the main loop and return.
    :param rses: List of RSE dicts (each providing an 'id' key) restricting the
                 transfer query; None means no restriction.
    :param mock: If True, operate on mock sources.
    :param bulk: Maximum number of transfers fetched per activity and iteration.
    :param group_bulk: Maximum number of transfers grouped into a single job.
    :param group_policy: Grouping policy passed to bulk_group_transfer (e.g. 'rule').
    :param source_strategy: Source selection strategy passed to bulk_group_transfer.
    :param activities: List of activities to process; None means one unrestricted pass.
    :param sleep_time: Seconds to back an activity off when fewer than group_bulk
                       transfers were found.
    :param max_sources: Maximum number of sources per transfer.
    :param retry_other_fts: Whether transfers may be retried on another FTS server.
    """
    # Optional settings from the [conveyor] config section; each falls back to a
    # default when the option is absent.
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None
    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    # max_time_in_queue is configured as "activity1:hours1,activity2:hours2,...".
    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue)

    # Per-activity timestamp before which the activity is skipped (back-off).
    activity_next_exe_time = defaultdict(time.time)

    # Heartbeat identity: workers restricted to specific activities register
    # under a distinct executable string so they get their own partitioning.
    executable = sys.argv[0]
    if activities:
        # sorted() copy instead of in-place sort: do not mutate the caller's list.
        activities = sorted(activities)
        # Separating space added (was missing) so the identity string reads
        # "<binary> --activities [...]" rather than "<binary>--activities [...]".
        executable += ' --activities ' + str(activities)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Submitter starting with timeout %s', prepend_str, timeout)

    time.sleep(10)  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Transfer submitter started', prepend_str)

    while not graceful_stop.is_set():
        try:
            # Renew the heartbeat every iteration and rebuild the log prefix.
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                # Honour the per-activity back-off set at the end of the loop.
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                user_transfer = False
                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info('%s CMS user transfer activity', prepend_str)
                    user_transfer = True

                logging.info('%s Starting to get transfer transfers for %s', prepend_str, activity)
                start_time = time.time()
                transfers = __get_transfers(total_workers=heart_beat['nr_threads'] - 1,  # NOTE(review): stager passes nr_threads without the -1 — confirm which is intended
                                            worker_number=heart_beat['assign_thread'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logging.info('%s Got %s transfers for %s in %s seconds', prepend_str, len(transfers), activity, time.time() - start_time)

                # Group the fetched transfers into submission jobs.
                logging.info('%s Starting to group transfers for %s', prepend_str, activity)
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logging.info('%s Starting to submit transfers for %s', prepend_str, activity)

                if TRANSFER_TOOL == 'fts3':
                    for external_host in grouped_jobs:
                        if not user_transfer:
                            for job in grouped_jobs[external_host]:
                                # submit transfers
                                submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                        else:
                            # User transfers are additionally keyed per user.
                            for _, jobs in iteritems(grouped_jobs[external_host]):
                                # submit transfers
                                for job in jobs:
                                    submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout, user_transfer_job=user_transfer)
                elif TRANSFER_TOOL == 'globus':
                    if TRANSFER_TYPE == 'bulk':
                        # build bulk job file list per external host to send to submit_transfer
                        # job_params comes from the first ''-keyed job; guarded so an
                        # unexpected grouping result cannot raise KeyError/IndexError
                        # (the single-job branch below already guards similarly).
                        bulk_job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs.get('') else None
                        for external_host in grouped_jobs:
                            # pad the job with job_params; irrelevant for globus but needed for further rucio parsing
                            submitjob = {'files': [], 'job_params': bulk_job_params}
                            for job in grouped_jobs[external_host]:
                                submitjob.get('files').append(job.get('files')[0])
                            logging.debug('submitjob: %s', submitjob)
                            submit_transfer(external_host=external_host, job=submitjob, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                    else:
                        # build single job files and individually send to submit_transfer
                        job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs.get('') else None
                        for external_host in grouped_jobs:
                            for job in grouped_jobs[external_host]:
                                # 'transfer_file' instead of 'file' to avoid shadowing the builtin
                                for transfer_file in job['files']:
                                    singlejob = {'files': [transfer_file], 'job_params': job_params}
                                    logging.debug('singlejob: %s', singlejob)
                                    submit_transfer(external_host=external_host, job=singlejob, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                else:
                    logging.error(prepend_str + 'Unknown transfer tool')

                # Nothing much to do for this activity: back it off for sleep_time.
                if len(transfers) < group_bulk:
                    logging.info('%s Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', prepend_str, len(transfers), activity, group_bulk, sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            # Keep the daemon alive on unexpected errors; log the full traceback.
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        if once:
            break

    logging.info('%s Graceful stop requested', prepend_str)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop done', prepend_str)
    return