Example #1
    def revert(self):
        """
        Reverts the changes from previous tuning, this is to avoid recursively tuning with no reference point,
        the manually configured attributes or the default attributes will stay as the reference point
        Before each cycle, all tunings will be reverted to the original reference point.
        :returns: bool indicating if revert was successful or not.
        """
        try:
            cycle_file = config_get('conveyor', 'fts_throttler_cycle')
        except Exception:
            logging.warning(
                'could not get the cycle file, cannot revert cycle changes, therefore no tuning either'
            )
            return False

        with open(cycle_file) as cycle_info:
            cycle_info_dict = json.load(cycle_info)
            storages = cycle_info_dict['storages']
            for storage in storages:
                t = FTS3Transfertool(storage['fts-host'])
                logging.info('storage information: %s', storage)
                t.set_se_config(
                    storage['storage'],
                    inbound_max_active=storage['inbound_max_active'],
                    outbound_max_active=storage['outbound_max_active'])
                logging.info(
                    'on storage %s outbound_max_active reverted from %s to %s, inbound_max_active reverted from %s to %s',
                    storage['storage'],
                    storage['tuned_outbound_max_active'],
                    storage['outbound_max_active'],
                    storage['tuned_inbound_max_active'],
                    storage['inbound_max_active'])
            logging.info('revert performed')
        return True
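
For reference, revert() reads a cycle file whose JSON payload looks roughly like the following sketch, inferred from the keys accessed above (hostnames and values are hypothetical; the file itself is written by tune() in Example #6):

{
    "storages": [
        {
            "storage": "gsiftp://storage.example.org",
            "fts-host": "https://fts3.example.org:8446",
            "inbound_max_active": 100,
            "outbound_max_active": 100,
            "tuned_inbound_max_active": 80,
            "tuned_outbound_max_active": 80,
            "failure_ratio": 25,
            "time": "2023-01-01 12:00:00"
        }
    ]
}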
Example #2
def query_latest(external_host, state, last_nhours=1):
    """
    Query the latest transfers in last n hours with state.

    :param external_host:  FTS host name as a string.
    :param state:          FTS job state as a string or a dictionary.
    :param last_nhours:    Latest n hours as an integer.
    :returns:              Request status information as a list of dictionaries.
    """

    record_counter('core.request.query_latest')

    start_time = time.time()
    resps = FTS3Transfertool(external_host=external_host).query_latest(state=state, last_nhours=last_nhours)
    record_timer('core.request.query_latest_fts3.%s.%s_hours' % (external_host, last_nhours), (time.time() - start_time) * 1000)

    if not resps:
        return

    ret_resps = []
    for resp in resps:
        if 'job_metadata' not in resp or resp['job_metadata'] is None or 'issuer' not in resp['job_metadata'] or resp['job_metadata']['issuer'] != 'rucio':
            continue

        if 'request_id' not in resp['job_metadata']:
            # submitted by the new submitter: refresh its updated_at instead of returning it
            try:
                logging.debug("Transfer %s on %s is %s, decrease its updated_at.", resp['job_id'], external_host, resp['job_state'])
                set_transfer_update_time(external_host, resp['job_id'], datetime.datetime.utcnow() - datetime.timedelta(hours=24))
            except Exception as error:
                logging.debug("Exception happened when updating transfer update time: %s", str(error).replace('\n', ''))
            continue

        # responses carrying a request_id are returned to the caller
        ret_resps.append(resp)

    return ret_resps
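
A hedged usage sketch (the FTS endpoint is a placeholder): query the jobs that reached a given state within the last hour and inspect them.

for resp in query_latest('https://fts3.example.org:8446', state='FAILED', last_nhours=1) or []:
    print(resp['job_id'], resp['job_state'])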
Example #3
def submit_bulk_transfers(external_host, files, transfertool='fts3', job_params={}, timeout=None, user_transfer_job=False):
    """
    Submit transfer request to a transfertool.

    :param external_host:  External host name as string
    :param files:          List of Dictionary containing request file.
    :param transfertool:   Transfertool as a string.
    :param job_params:     Metadata key/value pairs for all files as a dictionary.
    :returns:              Transfertool external ID.
    """

    record_counter('core.request.submit_transfer')

    transfer_id = None

    if transfertool == 'fts3':
        start_time = time.time()
        job_files = []
        for file in files:
            job_file = {}
            for key in file:
                if key == 'sources':
                    # convert sources from (src_rse, url, src_rse_id, rank) to url
                    job_file[key] = []
                    for source in file[key]:
                        job_file[key].append(source[1])
                else:
                    job_file[key] = file[key]
            job_files.append(job_file)
        if not user_transfer_job:
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        elif USER_TRANSFERS == "cms":
            transfer_id = FTS3MyProxyTransfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        else:
            # no valid user-transfer case, so fall back to the standard submission
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        record_timer('core.request.submit_transfers_fts3', (time.time() - start_time) * 1000 / len(files))
    return transfer_id
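
To illustrate the sources conversion above, here is a minimal sketch of the files input (the endpoint and all field names besides sources are illustrative); each sources entry is a (src_rse, url, src_rse_id, rank) tuple, of which only the URL is forwarded to FTS3:

files = [{
    'sources': [('MOCK_RSE', 'gsiftp://source.example.org:2811/path/file', 'rse-id-1', 1)],
    'destinations': ['gsiftp://dest.example.org:2811/path/file'],
    'filesize': 1048576,
}]
transfer_id = submit_bulk_transfers('https://fts3.example.org:8446', files,
                                    job_params={'overwrite': True})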
Example #4
    def testread(self, tuning_ratio=25):
        """
        Read the failure ratio of storages without tuning
        :returns: filtered JSON response from Elastic search.
        :param tuning_ratio: integer lower bound for what failing storages you want to read.
        """
        result = self.request_timeout_data()
        if result is not None:

            rses = result['aggregations']['rse']['buckets']
            for rse in rses:
                # if an RSE has a failure ratio above the tuning ratio, we read it.
                if rse['failure_ratio'].get('value') > tuning_ratio:

                    # rse_info holds the storage name (index 0) and the FTS host server (index 1)
                    rse_info = rse['key'].split()
                    t = FTS3Transfertool(rse_info[1])

                    # extract FTS storage from dst-url
                    tmp = rse['destination']['hits']['hits'][0]['_source'][
                        'payload']['dst-url'].split(':', 2)
                    url = tmp[0] + ':' + tmp[1]
                    logging.info(
                        '\033[91m RSE \033[0m%s\033[91m on FTS host \033[0m%s\033[91m has failure ratio \033[0m%s\033[91m on storage \033[0m%s',
                        rse_info[0], rse_info[1],
                        rse['failure_ratio'].get('value'), url)

                    try:
                        se = t.get_se_config(url)
                        self.logger(logging.INFO, 'storage settings: %s', se)
                    except KeyError:
                        self.logger(
                            logging.WARNING,
                            'configuration for storage element was not found')
                    except Exception as error:
                        self.logger(
                            logging.WARNING,
                            'an error occurred when trying to get the storage configuration'
                        )
                        self.logger(logging.WARNING, str(error))
                        continue

            return rses
        else:
            self.logger(
                logging.WARNING,
                'Could not retrieve timeout data from Elasticsearch, trying again next cycle'
            )
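
testread() assumes the Elasticsearch result is an aggregation response shaped roughly as follows (a sketch reconstructed from the fields accessed above; names and values are hypothetical):

{
    "aggregations": {
        "rse": {
            "buckets": [{
                "key": "MOCK_RSE https://fts3.example.org:8446",
                "failure_ratio": {"value": 30},
                "destination": {"hits": {"hits": [{
                    "_source": {"payload": {"dst-url": "gsiftp://storage.example.org:2811/path"}}
                }]}}
            }]
        }
    }
}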
Example #5
def bulk_query_transfers(request_host, transfer_ids, transfertool='fts3', timeout=None):
    """
    Query the status of a transfer.

    :param request_host:  Name of the external host.
    :param transfer_ids:  List of (External-ID as a 32 character hex string)
    :param transfertool:  Transfertool name as a string.
    :returns:             Request status information as a dictionary.
    """

    record_counter('core.request.bulk_query_transfers')

    if transfertool == 'fts3':
        start_time = time.time()
        fts_resps = FTS3Transfertool(external_host=request_host).bulk_query(transfer_ids=transfer_ids, timeout=timeout)
        record_timer('core.request.bulk_query_transfers', (time.time() - start_time) * 1000 / len(transfer_ids))

        for transfer_id in transfer_ids:
            if transfer_id not in fts_resps:
                fts_resps[transfer_id] = Exception("Transfer id %s is not returned" % transfer_id)
            if fts_resps[transfer_id] and not isinstance(fts_resps[transfer_id], Exception):
                for request_id in fts_resps[transfer_id]:
                    if fts_resps[transfer_id][request_id]['file_state'] in (str(FTSState.FAILED),
                                                                            str(FTSState.FINISHEDDIRTY),
                                                                            str(FTSState.CANCELED)):
                        fts_resps[transfer_id][request_id]['new_state'] = RequestState.FAILED
                    elif fts_resps[transfer_id][request_id]['file_state'] == str(FTSState.FINISHED):
                        fts_resps[transfer_id][request_id]['new_state'] = RequestState.DONE
        return fts_resps
    else:
        raise NotImplementedError
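
A hedged usage sketch (host and transfer ID are placeholders), showing how the per-request new_state mapping can be consumed:

resps = bulk_query_transfers('https://fts3.example.org:8446',
                             ['9f1e2d3c4b5a69788796a5b4c3d2e1f0'])
for transfer_id, resp in resps.items():
    if isinstance(resp, Exception):
        continue  # this transfer id was not returned by FTS
    for request_id, status in resp.items():
        print(request_id, status.get('new_state'))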
Example #6
    def tune(self):
        """
        tune the configuration settings
        """
        result = self.request_timeout_data()
        if result is not None:

            try:
                cycle_file = config_get('conveyor', 'fts_throttler_cycle')
            except Exception:
                logging.warning(
                    'could not get the cycle file; cannot perform tuning this cycle without it, returning'
                )
                return

            try:
                tuning_ratio = config_get('conveyor',
                                          'fts_throttler_tuning_ratio')
            except Exception:
                logging.warning(
                    'could not get the tuning ratio from config, returning')
                return

            rses = result['aggregations']['rse']['buckets']
            cycle_info_dict = {'storages': []}
            for rse in rses:
                # if an RSE has a failure ratio above the tuning ratio (percentage), we tune it.
                if rse['failure_ratio'].get('value') > int(tuning_ratio):

                    # rse_info holds the storage name (index 0) and the FTS host server (index 1)
                    rse_info = rse['key'].split()

                    # Tapes might have other reasons for timeouts which should be treated differently; therefore they are ignored and not tuned for now.
                    if rse['storage_type']['hits']['hits'][0]['_source'][
                            'payload']['dst-type'] == 'TAPE':
                        logging.info(
                            '%s is a tape storage type, it will not be tuned',
                            rse_info[0])
                        continue
                    # instantiate transfertool for access to get_se_config and set_se_config.
                    t = FTS3Transfertool(rse_info[1])

                    # extract FTS storage from dst-url
                    tmp = rse['destination']['hits']['hits'][0]['_source'][
                        'payload']['dst-url'].split(':', 2)
                    url = tmp[0] + ':' + tmp[1]

                    n = rse['failure_ratio'].get('value')

                    logging.info('RSE %s on FTS host %s has failure ratio %s on storage %s',
                                 rse_info[0], rse_info[1],
                                 rse['failure_ratio'].get('value'), url)

                    try:
                        se = t.get_se_config(url)
                        logging.info('storage settings: %s', se)
                    except KeyError:
                        logging.warning(
                            'configuration for storage element was not found, config will be set from default values'
                        )
                        # all FTS host servers have a default reference storage named '*' that holds the default values for all storages that aren't listed yet.
                        default_storage = t.get_se_config('*')
                        t.set_se_config(
                            url,
                            inbound_max_active=int(
                                (100 / (100 + n)) * default_storage['se_info']
                                ['inbound_max_active']),
                            outbound_max_active=int(
                                (100 / (100 + n)) * default_storage['se_info']
                                ['outbound_max_active']))

                        logging.info(
                            '%s inbound_max_active changed from %s to %s, outbound_max_active changed from %s to %s',
                            url,
                            default_storage['se_info']['inbound_max_active'],
                            int((100 / (100 + n)) * default_storage['se_info']['inbound_max_active']),
                            default_storage['se_info']['outbound_max_active'],
                            int((100 / (100 + n)) * default_storage['se_info']['outbound_max_active']))

                        # cycle_info_dict is used to write changes down to the cycle file.
                        cycle_info_dict['storages'].append({
                            'storage': url,
                            'inbound_max_active': default_storage['se_info']['inbound_max_active'],
                            'outbound_max_active': default_storage['se_info']['outbound_max_active'],
                            'failure_ratio': n,
                            'tuned_inbound_max_active': int((100 / (100 + n)) * default_storage['se_info']['inbound_max_active']),
                            'tuned_outbound_max_active': int((100 / (100 + n)) * default_storage['se_info']['outbound_max_active']),
                            'fts-host': rse_info[1],
                            'time': str(datetime.datetime.now())
                        })
                        continue
                    except Exception as error:
                        logging.warning(
                            'an error occurred when trying to get the storage configuration'
                        )
                        logging.warning(str(error))
                        continue

                    # Even though we could read the config, we still need to know if the important attributes are empty.
                    if se['se_info']['inbound_max_active'] is None:
                        try:
                            default_storage = t.get_se_config('*')
                        except Exception:
                            raise Exception(
                                'Could not retrieve the default storage information'
                            )
                        ima = default_storage['se_info']['inbound_max_active']
                    else:
                        ima = se['se_info']['inbound_max_active']

                    if se['se_info']['outbound_max_active'] is None:
                        try:
                            default_storage = t.get_se_config('*')
                        except Exception:
                            raise Exception(
                                'Could not retrieve the default storage information'
                            )
                        oma = default_storage['se_info']['outbound_max_active']
                    else:
                        oma = se['se_info']['outbound_max_active']

                    # append existing information to dict and write to file.
                    cycle_info_dict['storages'].append({
                        'storage': url,
                        'inbound_max_active': ima,
                        'outbound_max_active': oma,
                        'failure_ratio': n,
                        'tuned_inbound_max_active': int((100 / (100 + n)) * ima),
                        'tuned_outbound_max_active': int((100 / (100 + n)) * oma),
                        'fts-host': rse_info[1],
                        'time': str(datetime.datetime.now())
                    })

                    # tune down the configuration of a storage relative to the failure ratio (n) and the existing configuration.
                    t.set_se_config(url,
                                    inbound_max_active=int(
                                        (100 / (100 + n)) * ima),
                                    outbound_max_active=int(
                                        (100 / (100 + n)) * oma))

                    logging.info(
                        '%s inbound_max_active changed from %s to %s, outbound_max_active changed from %s to %s',
                        url, ima, int((100 / (100 + n)) * ima),
                        oma, int((100 / (100 + n)) * oma))

            if not cycle_info_dict['storages']:
                logging.info(
                    'no storages are failing significantly due to timeout errors, therefore no tuning happened.'
                )

            with open(cycle_file, 'w') as outfile:
                json.dump(cycle_info_dict, outfile)
        else:
            logging.warning(
                'could not retrieve timeout data from Elasticsearch, trying again next cycle'
            )
        return
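
The scaling factor 100 / (100 + n) shrinks the active-transfer limits in proportion to the failure ratio n. For example, with n = 25 and a current inbound_max_active of 100, the tuned value is int((100 / 125) * 100) = 80, a 20% reduction; a storage failing at n = 100 would be cut to half of its previous limit.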
Example #7
def run_once(fts_bulk, db_bulk, older_than, activity_shares, multi_vo, timeout,
             activity, heartbeat_handler):
    worker_number, total_workers, logger = heartbeat_handler.live()

    start_time = time.time()
    logger(
        logging.DEBUG,
        'Start to poll transfers older than %i seconds for activity %s using transfer tool: %s'
        % (older_than, activity, FILTER_TRANSFERTOOL))
    transfs = request_core.get_next(
        request_type=[
            RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT
        ],
        state=[RequestState.SUBMITTED],
        limit=db_bulk,
        older_than=datetime.datetime.utcnow() -
        datetime.timedelta(seconds=older_than) if older_than else None,
        total_workers=total_workers,
        worker_number=worker_number,
        mode_all=True,
        hash_variable='id',
        activity=activity,
        activity_shares=activity_shares,
        transfertool=FILTER_TRANSFERTOOL)

    record_timer('daemons.conveyor.poller.get_next',
                 (time.time() - start_time) * 1000)

    if TRANSFER_TOOL and not FILTER_TRANSFERTOOL:
        # only keep transfers which don't have any transfertool set, or have one equal to TRANSFER_TOOL
        transfs_tmp = [
            t for t in transfs
            if not t['transfertool'] or t['transfertool'] == TRANSFER_TOOL
        ]
        if len(transfs_tmp) != len(transfs):
            logger(
                logging.INFO,
                'Skipping %i transfers because of mismatched transfertool',
                len(transfs) - len(transfs_tmp))
        transfs = transfs_tmp

    if transfs:
        logger(
            logging.DEBUG,
            'Polling %i transfers for activity %s' % (len(transfs), activity))

    transfs.sort(key=lambda t: (t['external_host'] or '',
                                t['scope'].vo if multi_vo else '',
                                t['external_id'] or '',
                                t['request_id'] or ''))
    for (external_host, vo), transfers_for_host in groupby(
            transfs,
            key=lambda t: (t['external_host'], t['scope'].vo if multi_vo else None)):
        transfers_by_eid = {}
        for external_id, xfers in groupby(transfers_for_host,
                                          key=lambda t: t['external_id']):
            transfers_by_eid[external_id] = {t['request_id']: t for t in xfers}

        for chunk in dict_chunks(transfers_by_eid, fts_bulk):
            try:
                if TRANSFER_TOOL == 'globus':
                    transfertool_obj = GlobusTransferTool(external_host=None)
                else:
                    transfertool_obj = FTS3Transfertool(
                        external_host=external_host, vo=vo)
                worker_number, total_workers, logger = heartbeat_handler.live()
                poll_transfers(transfertool_obj=transfertool_obj,
                               transfers_by_eid=chunk,
                               timeout=timeout,
                               logger=logger)
            except Exception:
                logger(logging.ERROR, 'Exception', exc_info=True)

    queue_empty = False
    if len(transfs) < fts_bulk / 2:
        logger(
            logging.INFO,
            "Only %s transfers for activity %s, which is less than half of the bulk %s"
            % (len(transfs), activity, fts_bulk))
        queue_empty = True

    return queue_empty
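
run_once() relies on a dict_chunks helper to split one FTS host's transfers into bulks of at most fts_bulk entries. A minimal sketch of such a helper, assuming it simply slices the dictionary (the actual Rucio utility may differ):

from itertools import islice

def dict_chunks(dict_, n):
    # yield successive sub-dictionaries of dict_ with at most n items each
    # (hypothetical implementation; only the call signature is taken from the code above)
    it = iter(dict_)
    for _ in range(0, len(dict_), n):
        yield {key: dict_[key] for key in islice(it, n)}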