Example #1
def stager(once=False,
           rses=None,
           mock=False,
           bulk=100,
           group_bulk=1,
           group_policy='rule',
           source_strategy=None,
           activities=None,
           sleep_time=600,
           retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None

    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" %
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = 'conveyor-stager'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-stager[%i/%i] : ' % (heart_beat['assign_thread'],
                                            heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO,
           'Stager starting with bring_online %s seconds' % (bring_online))

    time.sleep(10)  # To prevent running on the same partition if all the pollers restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-stager[%i/%i] : ' % (heart_beat['assign_thread'],
                                            heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Stager started')

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
            prefix = 'conveyor-stager[%i/%i] : ' % (
                heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None

            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                logger(logging.INFO,
                       'Starting to get stagein transfers for %s' % (activity))
                start_time = time.time()
                transfers = __get_stagein_transfers(
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    mock=mock,
                    schemes=scheme,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts,
                    logger=logger)
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.stager.get_stagein_transfers',
                               len(transfers))
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.transfers',
                    len(transfers))
                logger(
                    logging.INFO, 'Got %s stagein transfers for %s' %
                    (len(transfers), activity))

                # group transfers
                logger(logging.INFO,
                       'Starting to group transfers for %s' % (activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk, source_strategy,
                                                   max_time_in_queue)
                record_timer('daemons.conveyor.stager.bulk_group_transfer',
                             (time.time() - start_time) * 1000 /
                             (len(transfers) if transfers else 1))

                logger(logging.INFO,
                       'Starting to submit transfers for %s' % (activity))
                # submit transfers
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        submit_transfer(external_host=external_host,
                                        job=job,
                                        submitter='transfer_submitter',
                                        logger=logger)

                if len(transfers) < group_bulk:
                    logger(
                        logging.INFO,
                        'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds'
                        % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logger(logging.CRITICAL, "Exception", exc_info=True)

        if once:
            break

    logger(logging.INFO, 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
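
Every example on this page parses the conveyor max_time_in_queue option the same way: a comma-separated list of activity:hours pairs, with a catch-all default of 168 hours. Below is a minimal standalone sketch of that parsing; the helper name and the sample value are illustrative, not part of Rucio.

# Standalone sketch of the max_time_in_queue parsing shared by these examples.
# parse_max_time_in_queue and the sample string are illustrative only.
def parse_max_time_in_queue(timelife_conf, default_hours=168):
    """Parse 'activity:hours,activity:hours' into a dict of int hours."""
    max_time_in_queue = {}
    for conf in timelife_conf.split(","):
        act, timelife = conf.split(":")
        max_time_in_queue[act.strip()] = int(timelife.strip())
    max_time_in_queue.setdefault('default', default_hours)
    return max_time_in_queue

print(parse_max_time_in_queue("User Subscriptions:24,Production:72"))
# {'User Subscriptions': 24, 'Production': 72, 'default': 168}
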
Example #2
def run_once(bulk, group_bulk, rse_ids, scheme, failover_scheme,
             transfertool_kwargs, heartbeat_handler, activity):
    worker_number, total_workers, logger = heartbeat_handler.live()

    start_time = time.time()
    transfers = next_transfers_to_submit(
        total_workers=total_workers,
        worker_number=worker_number,
        failover_schemes=failover_scheme,
        limit=bulk,
        activity=activity,
        rses=rse_ids,
        schemes=scheme,
        transfertool_classes=[FTS3Transfertool],
        older_than=None,
        request_type=RequestType.STAGEIN,
        logger=logger,
    )
    total_transfers = len(
        list(hop for paths in transfers.values() for path in paths
             for hop in path))
    record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer',
                 (time.time() - start_time) * 1000 /
                 (total_transfers if transfers else 1))
    record_counter('daemons.conveyor.stager.get_stagein_transfers',
                   total_transfers)
    record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers',
                 total_transfers)
    logger(logging.INFO,
           'Got %s stagein transfers for %s' % (total_transfers, activity))

    for builder, transfer_paths in transfers.items():
        transfertool_obj = builder.make_transfertool(
            logger=logger,
            **transfertool_kwargs.get(builder.transfertool_class, {}))
        logger(
            logging.INFO, 'Starting to group transfers for %s (%s)' %
            (activity, transfertool_obj))
        start_time = time.time()
        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
        record_timer('daemons.conveyor.stager.bulk_group_transfer',
                     (time.time() - start_time) * 1000 /
                     (len(transfer_paths) or 1))

        logger(
            logging.INFO, 'Starting to submit transfers for %s (%s)' %
            (activity, transfertool_obj))
        for job in grouped_jobs:
            worker_number, total_workers, logger = heartbeat_handler.live()
            submit_transfer(transfertool_obj=transfertool_obj,
                            transfers=job['transfers'],
                            job_params=job['job_params'],
                            submitter='transfer_submitter',
                            logger=logger)

    queue_empty = False
    if total_transfers < group_bulk:
        queue_empty = True
        logger(
            logging.INFO,
            'Only %s transfers for %s which is less than group bulk %s' %
            (total_transfers, activity, group_bulk))
    return queue_empty
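
run_once reports queue_empty instead of sleeping itself, leaving the back-off policy to the caller. A hedged sketch of such a driver loop follows; graceful_stop, sleep_time and the keyword arguments are assumptions for this sketch, not taken from Rucio.

# Illustrative driver around a run_once-style worker. The stop event,
# sleep_time and kwargs are assumptions, not Rucio's actual plumbing.
import threading

graceful_stop = threading.Event()

def daemon_loop(run_once_fn, sleep_time=600, **kwargs):
    while not graceful_stop.is_set():
        queue_empty = run_once_fn(**kwargs)
        # Back off only when the last cycle found fewer transfers than group_bulk.
        if queue_empty:
            graceful_stop.wait(sleep_time)
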
Example #3
File: submitter.py Project: ricsxn/rucio
def submitter(once=False, rses=None, mock=False,
              bulk=100, group_bulk=1, group_policy='rule', source_strategy=None,
              activities=None, sleep_time=600, max_sources=4, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = "conveyor-submitter"
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Submitter starting with timeout %s', prepend_str, timeout)

    time.sleep(10)  # To prevent running on the same partition if all the pollers restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Transfer submitter started', prepend_str)

    while not graceful_stop.is_set():
        if activities is None:
            activities = [None]
        if rses:
            rse_ids = [rse['id'] for rse in rses]
        else:
            rse_ids = None
        for activity in activities:
            try:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
                prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info('%s CMS user transfer activity', prepend_str)
                    user_transfer = True

                logging.info('%s Starting to get transfer transfers for %s', prepend_str, activity)
                start_time = time.time()
                transfers = __get_transfers(total_workers=heart_beat['nr_threads'],
                                            worker_number=heart_beat['assign_thread'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logging.info('%s Got %s transfers for %s in %s seconds', prepend_str, len(transfers), activity, time.time() - start_time)

                # group transfers
                logging.info('%s Starting to group transfers for %s', prepend_str, activity)
                start_time = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logging.info('%s Starting to submit transfers for %s', prepend_str, activity)

                if TRANSFER_TOOL in ['fts3', 'mock']:
                    for external_host in grouped_jobs:
                        if not user_transfer:
                            for job in grouped_jobs[external_host]:
                                # submit transfers
                                submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                                logging_prepend_str=prepend_str, timeout=timeout)
                        else:
                            for _, jobs in iteritems(grouped_jobs[external_host]):
                                # submit transfers
                                for job in jobs:
                                    submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                                    logging_prepend_str=prepend_str, timeout=timeout, user_transfer_job=user_transfer)
                elif TRANSFER_TOOL == 'globus':
                    if TRANSFER_TYPE == 'bulk':
                        # build bulk job file list per external host to send to submit_transfer
                        for external_host in grouped_jobs:
                            # pad the job with job_params; irrelevant for globus but needed for further rucio parsing
                            submitjob = {'files': [], 'job_params': grouped_jobs[''][0].get('job_params')}
                            for job in grouped_jobs[external_host]:
                                submitjob.get('files').append(job.get('files')[0])
                            logging.debug('submitjob: %s' % submitjob)
                            submit_transfer(external_host=external_host, job=submitjob, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                    else:
                        # build single job files and individually send to submit_transfer
                        job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs else None
                        for external_host in grouped_jobs:
                            for job in grouped_jobs[external_host]:
                                for file in job['files']:
                                    singlejob = {'files': [file], 'job_params': job_params}
                                    logging.debug('singlejob: %s' % singlejob)
                                    submit_transfer(external_host=external_host, job=singlejob, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout)
                else:
                    logging.error(prepend_str + 'Unknown transfer tool')

                if len(transfers) < group_bulk:
                    logging.info('%s Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', prepend_str, len(transfers), activity, group_bulk, sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        if once:
            break

    logging.info('%s Graceful stop requested', prepend_str)

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s Graceful stop done', prepend_str)
    return
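
activity_next_exe_time = defaultdict(time.time) is the throttling trick used throughout these examples: the factory runs at first lookup, so a newly seen activity is due immediately, and a quiet cycle then pushes it sleep_time seconds into the future. A self-contained demonstration (the activity name is made up):

from collections import defaultdict
import time

activity_next_exe_time = defaultdict(time.time)

# The factory is called on first access, so a new activity is due right away.
assert activity_next_exe_time['User Subscriptions'] <= time.time()

# After a quiet cycle the daemon pushes the activity into the future.
activity_next_exe_time['User Subscriptions'] = time.time() + 600
assert activity_next_exe_time['User Subscriptions'] > time.time()
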
Example #4
File: stager.py Project: mlassnig/rucio
def stager(once=False, rses=None, bulk=100, group_bulk=1, group_policy='rule',
           source_strategy=None, activities=None, sleep_time=600):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None

    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    logger_prefix = executable = 'conveyor-stager'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    with HeartbeatHandler(executable=executable, logger_prefix=logger_prefix) as heartbeat_handler:
        logger = heartbeat_handler.logger
        logger(logging.INFO, 'Stager starting with bring_online %s seconds' % (bring_online))

        while not graceful_stop.is_set():

            try:
                heart_beat, logger = heartbeat_handler.live()

                if activities is None:
                    activities = [None]
                if rses:
                    rse_ids = [rse['id'] for rse in rses]
                else:
                    rse_ids = None

                for activity in activities:
                    if activity_next_exe_time[activity] > time.time():
                        graceful_stop.wait(1)
                        continue

                    logger(logging.INFO, 'Starting to get stagein transfers for %s' % (activity))
                    start_time = time.time()

                    transfertool_kwargs = {
                        FTS3Transfertool: {
                            'group_policy': group_policy,
                            'group_bulk': group_bulk,
                            'source_strategy': source_strategy,
                            'max_time_in_queue': max_time_in_queue,
                            'bring_online': bring_online,
                            'default_lifetime': -1,
                        }
                    }
                    transfers = transfer_core.next_transfers_to_submit(
                        total_workers=heart_beat['nr_threads'],
                        worker_number=heart_beat['assign_thread'],
                        failover_schemes=failover_scheme,
                        limit=bulk,
                        activity=activity,
                        rses=rse_ids,
                        schemes=scheme,
                        transfertools_by_name={'fts3': FTS3Transfertool},
                        older_than=None,
                        request_type=RequestType.STAGEIN,
                        logger=logger,
                    )
                    total_transfers = len(list(hop for paths in transfers.values() for path in paths for hop in path))
                    record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (total_transfers if transfers else 1))
                    record_counter('daemons.conveyor.stager.get_stagein_transfers', total_transfers)
                    record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', total_transfers)
                    logger(logging.INFO, 'Got %s stagein transfers for %s' % (total_transfers, activity))

                    for builder, transfer_paths in transfers.items():
                        transfertool_obj = builder.make_transfertool(logger=logger, **transfertool_kwargs.get(builder.transfertool_class, {}))
                        logger(logging.INFO, 'Starting to group transfers for %s (%s)' % (activity, transfertool_obj))
                        start_time = time.time()
                        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
                        record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfer_paths) or 1))

                        logger(logging.INFO, 'Starting to submit transfers for %s (%s)' % (activity, transfertool_obj))
                        for job in grouped_jobs:
                            submit_transfer(transfertool_obj=transfertool_obj, transfers=job['transfers'], job_params=job['job_params'], submitter='transfer_submitter', logger=logger)

                    if total_transfers < group_bulk:
                        logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (total_transfers, activity, group_bulk, sleep_time))
                        if activity_next_exe_time[activity] < time.time():
                            activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                raise

            if once:
                break
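
Keying transfertool_kwargs by class lets a single generic call, transfertool_kwargs.get(builder.transfertool_class, {}), forward tool-specific options. A minimal sketch of that dispatch with dummy classes; none of these names are Rucio's.

# Dummy stand-ins for the transfertool classes; only the class-keyed
# kwargs dispatch is the point of this sketch.
class DummyFTS3:
    def __init__(self, logger=None, bring_online=None, **kwargs):
        self.bring_online = bring_online

class DummyGlobus:
    def __init__(self, logger=None, **kwargs):
        pass

transfertool_kwargs = {DummyFTS3: {'bring_online': 43200}}

for cls in (DummyFTS3, DummyGlobus):
    # A class without an entry falls back to an empty kwargs dict.
    obj = cls(logger=print, **transfertool_kwargs.get(cls, {}))
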
Example #5
File: submitter.py Project: sartiran/rucio
def submitter(once=False,
              rses=None,
              mock=False,
              bulk=100,
              group_bulk=1,
              group_policy='rule',
              fts_source_strategy='auto',
              activities=None,
              sleep_time=600,
              max_sources=4,
              retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" %
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = sys.argv[0]
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Submitter starting with timeout %s' %
                 (timeout))

    time.sleep(10)  # To prevent running on the same partition if all the pollers restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Transfer submitter started')

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] +
                                                 1, heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info(prepend_str + "CMS user transfer activity")
                    user_transfer = True

                logging.info(prepend_str +
                             'Starting to get transfer transfers for %s' %
                             (activity))
                start_time = time.time()
                transfers = __get_transfers(
                    total_workers=heart_beat['nr_threads'] - 1,
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    schemes=scheme,
                    mock=mock,
                    max_sources=max_sources,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts)
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter(
                    'daemons.conveyor.transfer_submitter.get_transfers',
                    len(transfers))
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.transfers',
                    len(transfers))
                logging.info(
                    prepend_str + 'Got %s transfers for %s in %s seconds' %
                    (len(transfers), activity, time.time() - start_time))

                # group transfers
                logging.info(prepend_str +
                             'Starting to group transfers for %s' % (activity))
                start_time = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk,
                                                   fts_source_strategy,
                                                   max_time_in_queue)
                record_timer(
                    'daemons.conveyor.transfer_submitter.bulk_group_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))

                logging.info(prepend_str +
                             'Starting to submit transfers for %s' %
                             (activity))

                for external_host in grouped_jobs:
                    if not user_transfer:
                        for job in grouped_jobs[external_host]:
                            # submit transfers
                            submit_transfer(external_host=external_host,
                                            job=job,
                                            submitter='transfer_submitter',
                                            logging_prepend_str=prepend_str,
                                            timeout=timeout)
                    else:
                        for _, jobs in grouped_jobs[external_host].iteritems():
                            # submit transfers
                            for job in jobs:
                                submit_transfer(
                                    external_host=external_host,
                                    job=job,
                                    submitter='transfer_submitter',
                                    logging_prepend_str=prepend_str,
                                    timeout=timeout,
                                    user_transfer_job=user_transfer)

                if len(transfers) < group_bulk:
                    logging.info(
                        prepend_str +
                        'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds'
                        % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
    return
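
This variant is Python 2 code: note grouped_jobs[external_host].iteritems() and the assign_thread + 1 numbering. On Python 3 the nested iteration would use dict.items() instead; a toy equivalent:

# Python 3 replacement for the dict.iteritems() call above; items()
# returns a view with the same iteration behaviour. Toy data only.
grouped_jobs = {'fts3.example.org': {'rule_1': ['job_1', 'job_2']}}

for external_host, jobs_by_rule in grouped_jobs.items():
    for _, jobs in jobs_by_rule.items():  # was .iteritems() on Python 2
        for job in jobs:
            print(external_host, job)
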
Example #6
def submitter(once=False,
              rses=None,
              partition_wait_time=10,
              bulk=100,
              group_bulk=1,
              group_policy='rule',
              source_strategy=None,
              activities=None,
              sleep_time=600,
              max_sources=4,
              archive_timeout_override=None,
              filter_transfertool=FILTER_TRANSFERTOOL,
              transfertool=TRANSFER_TOOL,
              transfertype=TRANSFER_TYPE,
              ignore_availability=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        partition_hash_var = config_get('conveyor', 'partition_hash_var')
    except NoOptionError:
        partition_hash_var = None
    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s",
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    logger_prefix = executable = "conveyor-submitter"
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if filter_transfertool:
        executable += ' --filter-transfertool ' + filter_transfertool

    if activities is None:
        activities = [None]
    if rses:
        rse_ids = [rse['id'] for rse in rses]
    else:
        rse_ids = None

    with HeartbeatHandler(executable=executable,
                          logger_prefix=logger_prefix) as heartbeat_handler:
        logger = heartbeat_handler.logger
        logger(logging.INFO, 'Submitter starting with timeout %s', timeout)

        if partition_wait_time:
            graceful_stop.wait(partition_wait_time)

        activity_next_exe_time = PriorityQueue()
        for activity in activities:
            activity_next_exe_time[activity] = time.time()

        while not graceful_stop.is_set() and activity_next_exe_time:
            try:
                time_to_sleep = 0
                if once:
                    activity = activity_next_exe_time.pop()
                else:
                    activity = activity_next_exe_time.top()
                    time_to_sleep = activity_next_exe_time[activity] - time.time()
                    activity_next_exe_time[activity] = time.time() + 1
                if time_to_sleep > 0:
                    logger(logging.DEBUG,
                           'Switching to activity %s and sleeping %s seconds',
                           activity, time_to_sleep)
                    graceful_stop.wait(time_to_sleep)
                else:
                    logger(logging.DEBUG, 'Switching to activity %s', activity)

                heart_beat, logger = heartbeat_handler.live(older_than=3600)

                start_time = time.time()

                transfertool_kwargs = {
                    FTS3Transfertool: {
                        'group_policy': group_policy,
                        'group_bulk': group_bulk,
                        'source_strategy': source_strategy,
                        'max_time_in_queue': max_time_in_queue,
                        'bring_online': bring_online,
                        'default_lifetime': 172800,
                        'archive_timeout_override': archive_timeout_override,
                    },
                    GlobusTransferTool: {
                        'group_policy': transfertype,
                        'group_bulk': group_bulk,
                    },
                }
                transfers = transfer_core.next_transfers_to_submit(
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    partition_hash_var=partition_hash_var,
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    schemes=scheme,
                    filter_transfertool=filter_transfertool,
                    transfertools_by_name={
                        transfertool:
                        TRANSFERTOOL_CLASSES_BY_NAME[transfertool]
                    },
                    older_than=None,
                    request_type=RequestType.TRANSFER,
                    ignore_availability=ignore_availability,
                    logger=logger,
                )
                total_transfers = len(
                    list(hop for paths in transfers.values() for path in paths
                         for hop in path))

                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.per_transfer',
                    (time.time() - start_time) * 1000 / (total_transfers or 1))
                GET_TRANSFERS_COUNTER.inc(total_transfers)
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.transfers',
                    total_transfers)
                logger(logging.INFO, '%sGot %s transfers for %s in %s seconds',
                       'Slept %s seconds, then ' % time_to_sleep if time_to_sleep > 0 else '',
                       total_transfers, activity, time.time() - start_time)

                for builder, transfer_paths in transfers.items():
                    transfertool_obj = builder.make_transfertool(
                        logger=logger,
                        **transfertool_kwargs.get(builder.transfertool_class,
                                                  {}))
                    start_time = time.time()
                    logger(logging.DEBUG,
                           'Starting to group transfers for %s (%s)', activity,
                           transfertool_obj)
                    grouped_jobs = transfertool_obj.group_into_submit_jobs(
                        transfer_paths)
                    record_timer(
                        'daemons.conveyor.transfer_submitter.bulk_group_transfer',
                        (time.time() - start_time) * 1000 /
                        (len(transfer_paths) or 1))

                    logger(logging.DEBUG,
                           'Starting to submit transfers for %s (%s)',
                           activity, transfertool_obj)
                    for job in grouped_jobs:
                        logger(
                            logging.DEBUG,
                            'submitjob: transfers=%s, job_params=%s' %
                            ([str(t)
                              for t in job['transfers']], job['job_params']))
                        submit_transfer(transfertool_obj=transfertool_obj,
                                        transfers=job['transfers'],
                                        job_params=job['job_params'],
                                        submitter='transfer_submitter',
                                        timeout=timeout,
                                        logger=logger)

                if not once and total_transfers < group_bulk:
                    logger(
                        logging.DEBUG,
                        'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds',
                        total_transfers, activity, group_bulk, sleep_time)
                    activity_next_exe_time[activity] = time.time() + sleep_time

            except Exception:
                logger(logging.CRITICAL, 'Exception', exc_info=True)
                if once:
                    raise
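
The PriorityQueue used here is Rucio's own helper with dict-style assignment plus top() and pop(); the standard library has no drop-in equivalent. A rough sketch of the same scheduling idea on plain heapq, with illustrative names only:

import heapq
import time

# (next_execution_time, activity) pairs, earliest-due activity first.
heap = [(time.time(), 'default'), (time.time(), 'User Subscriptions')]
heapq.heapify(heap)

next_time, activity = heap[0]             # top(): peek at the most due activity
time_to_sleep = next_time - time.time()   # may be <= 0 if it is already due
heapq.heapreplace(heap, (time.time() + 600, activity))  # reschedule it later
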
Example #7
def submitter(once=False,
              rses=None,
              partition_wait_time=10,
              bulk=100,
              group_bulk=1,
              group_policy='rule',
              source_strategy=None,
              activities=None,
              sleep_time=600,
              max_sources=4,
              retry_other_fts=False,
              archive_timeout_override=None,
              filter_transfertool=FILTER_TRANSFERTOOL,
              transfertool=TRANSFER_TOOL,
              transfertype=TRANSFER_TYPE):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s",
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = "conveyor-submitter"
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if filter_transfertool:
        executable += ' --filter-transfertool ' + filter_transfertool

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'],
                                               heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Submitter starting with timeout %s', timeout)

    if partition_wait_time:
        time.sleep(partition_wait_time)  # To prevent running on the same partition if all the pollers restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'],
                                               heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Transfer submitter started')

    while not graceful_stop.is_set():
        if activities is None:
            activities = [None]
        if rses:
            rse_ids = [rse['id'] for rse in rses]
        else:
            rse_ids = None
        for activity in activities:
            try:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                heart_beat = heartbeat.live(executable,
                                            hostname,
                                            pid,
                                            hb_thread,
                                            older_than=3600)
                prefix = 'conveyor-submitter[%i/%i] : ' % (
                    heart_beat['assign_thread'], heart_beat['nr_threads'])
                logger = formatted_logger(logging.log, prefix + '%s')

                logger(logging.INFO,
                       'Starting to get transfer transfers for %s', activity)
                start_time = time.time()
                transfers = __get_transfers(
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    schemes=scheme,
                    max_sources=max_sources,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts,
                    transfertool=filter_transfertool,
                    logger=logger)

                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter(
                    'daemons.conveyor.transfer_submitter.get_transfers',
                    len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.transfers',
                    len(transfers))
                logger(logging.INFO, 'Got %s transfers for %s in %s seconds',
                       len(transfers), activity,
                       time.time() - start_time)

                logger(logging.INFO, 'Starting to group transfers for %s',
                       activity)
                start_time = time.time()
                grouped_jobs = {}
                if transfertool in ['fts3', 'mock']:
                    # bulk_group_transfers_for_fts expects single hop transfers in parameter. Split multihop ones
                    single_hop_transfers = {}
                    for transfer_path in transfers.values():
                        for hop in transfer_path:
                            single_hop_transfers[hop.rws.request_id] = hop
                    transfers = single_hop_transfers
                    grouped_jobs = bulk_group_transfers_for_fts(
                        transfers,
                        group_policy,
                        group_bulk,
                        source_strategy,
                        max_time_in_queue,
                        archive_timeout_override=archive_timeout_override)
                elif transfertool == 'globus':
                    grouped_jobs = bulk_group_transfers_for_globus(
                        transfers, transfertype, group_bulk)
                else:
                    logger(logging.ERROR, 'Unknown transfer tool')
                record_timer(
                    'daemons.conveyor.transfer_submitter.bulk_group_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))

                logger(logging.INFO, 'Starting to submit transfers for %s',
                       activity)
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        logger(logging.DEBUG, 'submitjob: %s' % job)
                        submit_transfer(external_host=external_host,
                                        job=job,
                                        submitter='transfer_submitter',
                                        timeout=timeout,
                                        logger=logger,
                                        transfertool=transfertool)

                if len(transfers) < group_bulk:
                    logger(
                        logging.INFO,
                        'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds',
                        len(transfers), activity, group_bulk, sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                logger(logging.CRITICAL, 'Exception', exc_info=True)

        if once:
            break

    logger(logging.INFO, 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
    return
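
Because bulk_group_transfers_for_fts expects single hops, multihop paths are first flattened into a dict keyed by request id. The same flattening on plain data, where the dicts stand in for the real transfer objects and their rws.request_id:

# Each value is a multihop path, i.e. a list of hops, and every hop
# carries its own request id; flattening keys the hops by that id.
transfers = {
    'path_1': [{'request_id': 'req_a'}, {'request_id': 'req_b'}],  # two hops
    'path_2': [{'request_id': 'req_c'}],                           # single hop
}

single_hop_transfers = {}
for transfer_path in transfers.values():
    for hop in transfer_path:
        single_hop_transfers[hop['request_id']] = hop

print(sorted(single_hop_transfers))  # ['req_a', 'req_b', 'req_c']
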
Example #8
def run_once(bulk, group_bulk, filter_transfertool, transfertool,
             ignore_availability, rse_ids, scheme, failover_scheme,
             partition_hash_var, timeout, transfertool_kwargs,
             heartbeat_handler, activity):
    worker_number, total_workers, logger = heartbeat_handler.live()

    start_time = time.time()
    transfers = next_transfers_to_submit(
        total_workers=total_workers,
        worker_number=worker_number,
        partition_hash_var=partition_hash_var,
        failover_schemes=failover_scheme,
        limit=bulk,
        activity=activity,
        rses=rse_ids,
        schemes=scheme,
        filter_transfertool=filter_transfertool,
        transfertools_by_name={
            transfertool: TRANSFERTOOL_CLASSES_BY_NAME[transfertool]
        },
        older_than=None,
        request_type=RequestType.TRANSFER,
        ignore_availability=ignore_availability,
        logger=logger,
    )
    total_transfers = len(
        list(hop for paths in transfers.values() for path in paths
             for hop in path))

    record_timer(
        'daemons.conveyor.transfer_submitter.get_transfers.per_transfer',
        (time.time() - start_time) * 1000 / (total_transfers or 1))
    GET_TRANSFERS_COUNTER.inc(total_transfers)
    record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers',
                 total_transfers)
    logger(logging.INFO, 'Got %s transfers for %s in %s seconds',
           total_transfers, activity,
           time.time() - start_time)

    for builder, transfer_paths in transfers.items():
        transfertool_obj = builder.make_transfertool(
            logger=logger,
            **transfertool_kwargs.get(builder.transfertool_class, {}))
        start_time = time.time()
        logger(logging.DEBUG, 'Starting to group transfers for %s (%s)',
               activity, transfertool_obj)
        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
        record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer',
                     (time.time() - start_time) * 1000 /
                     (len(transfer_paths) or 1))

        logger(logging.DEBUG, 'Starting to submit transfers for %s (%s)',
               activity, transfertool_obj)
        for job in grouped_jobs:
            worker_number, total_workers, logger = heartbeat_handler.live()
            logger(
                logging.DEBUG, 'submitjob: transfers=%s, job_params=%s' %
                ([str(t) for t in job['transfers']], job['job_params']))
            submit_transfer(transfertool_obj=transfertool_obj,
                            transfers=job['transfers'],
                            job_params=job['job_params'],
                            submitter='transfer_submitter',
                            timeout=timeout,
                            logger=logger)

    queue_empty = False
    if total_transfers < group_bulk:
        queue_empty = True
        logger(logging.DEBUG,
               'Only %s transfers for %s which is less than group bulk %s',
               total_transfers, activity, group_bulk)
    return queue_empty
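
The hop count in run_once builds a throwaway list inside len(); a sum() over path lengths counts the same hops without it. A quick check on toy data shaped like the return value of next_transfers_to_submit (builder to list of paths, each path a list of hops):

# Toy data: one builder with a two-hop path and a single-hop path.
transfers = {'builder': [['hop_1', 'hop_2'], ['hop_3']]}

total_transfers = len(list(hop for paths in transfers.values()
                           for path in paths for hop in path))
# Equivalent count without materialising the intermediate list.
assert total_transfers == sum(len(path) for paths in transfers.values()
                              for path in paths) == 3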