Пример #1
0
class CRABAccessHistory(object):
    """
    Sets two attrs:
      global_usage_rank:  float value
      num_access: integer
      last_access: timestamp
    """

    produces = ['global_usage_rank', 'num_access', 'last_access']

    _default_config = None

    @staticmethod
    def set_default(config):
        CRABAccessHistory._default_config = Configuration(config)

    def __init__(self, config = None):
        if config is None:
            config = CRABAccessHistory._default_config

        self._history = HistoryDatabase(config.get('history', None))
        self._popdb = PopDB(config.get('popdb', None))

        self.max_back_query = config.get('max_back_query', 7)

        self.included_sites = list(config.get('include_sites', []))
        self.excluded_sites = list(config.get('exclude_sites', []))

        self.set_read_only(config.get('read_only', False))

    def set_read_only(self, value = True):
        self._read_only = value
        self._history.set_read_only(value)

    def load(self, inventory):
        records = self._get_stored_records(inventory)
        self._compute(inventory, records)

    def _get_stored_records(self, inventory):
        """
        Get the replica access data from DB.
        @param inventory  DynamoInventory
        @return  {dataset: [(date, number of access)]}
        """

        # pick up all accesses that are less than 2 years old
        # old accesses will be removed automatically next time the access information is saved from memory
        sql = 'SELECT d.`name`, UNIX_TIMESTAMP(a.`date`), a.`num_accesses` FROM `dataset_accesses` AS a'
        sql += ' INNER JOIN `datasets` AS d ON d.`id` = a.`dataset_id`'
        sql += ' WHERE a.`date` > DATE_SUB(NOW(), INTERVAL 2 YEAR) ORDER BY d.`id`, a.`date`'

        all_accesses = {}
        num_records = 0

        # little speedup by not repeating lookups for the same replica
        current_dataset_name = ''
        dataset_exists = True
        replica = None
        for dataset_name, timestamp, num_accesses in self._history.db.xquery(sql):
            num_records += 1

            if dataset_name == current_dataset_name:
                if not dataset_exists:
                    continue
            else:
                current_dataset_name = dataset_name

                try:
                    dataset = inventory.datasets[dataset_name]
                except KeyError:
                    dataset_exists = False
                    continue
                else:
                    dataset_exists = True

                accesses = all_accesses[dataset] = []

            accesses.append((timestamp, num_accesses))

        try:
            last_update = self._history.db.query('SELECT UNIX_TIMESTAMP(`dataset_accesses_last_update`) FROM `popularity_last_update`')[0]
        except IndexError:
            last_update = 0

        LOG.info('Loaded %d replica access data. Last update on %s UTC', num_records, time.strftime('%Y-%m-%d', time.gmtime(last_update)))

        return all_accesses

    def _compute(self, inventory, all_accesses):
        """
        Set the dataset usage rank based on access list.
        nAccessed is NACC normalized by size (in GB).
        @param inventory   DynamoInventory
        @param all_accesses {dataset: [(date, number of access)]} (time ordered)
        """

        now = time.time()
        today = datetime.datetime.utcfromtimestamp(now).date()

        for dataset in inventory.datasets.itervalues():
            last_access = 0
            num_access = 0
            norm_access = 0.

            try:
                accesses = all_accesses[dataset]
            except KeyError:
                pass
            else:
                last_access = accesses[-1][0]
                num_access = sum(e[1] for e in accesses)
                if dataset.size != 0:
                    norm_access = float(num_access) / (dataset.size * 1.e-9)

            try:
                last_block_created = max(r.last_block_created() for r in dataset.replicas)
            except ValueError: # empty sequence
                last_block_created = 0

            last_change = max(last_access, dataset.last_update, last_block_created)

            rank = (now - last_change) / (24. * 3600.) - norm_access

            dataset.attr['global_usage_rank'] = rank
            dataset.attr['num_access'] = num_access
            dataset.attr['last_access'] = max(last_access, dataset.last_update)

    def update(self, inventory):
        try:
            try:
                last_update = self._history.db.query('SELECT UNIX_TIMESTAMP(`dataset_accesses_last_update`) FROM `popularity_last_update`')[0]
            except IndexError:
                last_update = time.time() - 3600 * 24 # just go back by a day
                if not self._read_only:
                    self._history.db.query('INSERT INTO `popularity_last_update` VALUES ()')

            if not self._read_only:
                self._history.db.query('UPDATE `popularity_last_update` SET `dataset_accesses_last_update` = NOW()', retries = 0, silent = True)

        except MySQLdb.OperationalError:
            # We have a read-only config
            self._read_only = True
            LOG.info('Cannot write to DB. Switching to read_only.')

        start_time = max(last_update, (time.time() - 3600 * 24 * self.max_back_query))
        start_date = datetime.date(*time.gmtime(start_time)[:3])

        source_records = self._get_source_records(inventory, start_date)

        if not self._read_only:
            self._save_records(source_records)
            # remove old entries
            self._history.db.query('DELETE FROM `dataset_accesses` WHERE `date` < DATE_SUB(NOW(), INTERVAL 2 YEAR)')
            self._history.db.query('UPDATE `popularity_last_update` SET `dataset_accesses_last_update` = NOW()')

    def _get_source_records(self, inventory, start_date):
        """
        Get the replica access data from PopDB from start_date to today.
        @param inventory      DynamoInventory
        @param start_date     Query start date (datetime.datetime)
        @return  {replica: {date: (number of access, total cpu time)}}
        """

        days_to_query = []

        utctoday = datetime.date(*time.gmtime()[:3])
        date = start_date
        while date <= utctoday: # get records up to today
            days_to_query.append(date)
            date += datetime.timedelta(1) # one day

        LOG.info('Updating dataset access info from %s to %s', start_date.strftime('%Y-%m-%d'), utctoday.strftime('%Y-%m-%d'))

        all_accesses = {}

        arg_pool = []
        for site in inventory.sites.itervalues():
            matched = (len(self.included_sites) == 0)

            for pattern in self.included_sites:
                if fnmatch.fnmatch(site.name, pattern):
                    matched = True
                    break

            for pattern in self.excluded_sites:
                if fnmatch.fnmatch(site.name, pattern):
                    matched = False
                    break

            if matched:
                for date in days_to_query:
                    arg_pool.append((site, inventory, date))

        mapper = Map()
        mapper.logger = LOG

        records = mapper.execute(self._get_site_record, arg_pool)

        for site_record in records:
            for replica, date, naccess, cputime in site_record:
                if replica not in all_accesses:
                    all_accesses[replica] = {}

                all_accesses[replica][date] = (naccess, cputime)

        return all_accesses

    def _get_site_record(self, site, inventory, date):
        """
        Get the replica access data on a single site from PopDB.
        @param site       Site
        @param inventory  Inventory
        @param date       datetime.date
        @return [(replica, number of access, total cpu time)]
        """

        if site.name.startswith('T0'):
            return []
        elif site.name.startswith('T1') and site.name.count('_') > 2:
            nameparts = site.name.split('_')
            sitename = '_'.join(nameparts[:3])
            service = 'popularity/DSStatInTimeWindow/' # the trailing slash is apparently important
        elif site.name == 'T2_CH_CERN':
            sitename = site.name
            service = 'xrdpopularity/DSStatInTimeWindow'
        else:
            sitename = site.name
            service = 'popularity/DSStatInTimeWindow/'

        datestr = date.strftime('%Y-%m-%d')
        result = self._popdb.make_request(service, ['sitename=' + sitename, 'tstart=' + datestr, 'tstop=' + datestr])

        records = []
        
        for ds_entry in result:
            try:
                dataset = inventory.datasets[ds_entry['COLLNAME']]
            except KeyError:
                continue

            replica = site.find_dataset_replica(dataset)
            if replica is None:
                continue

            records.append((replica, date, int(ds_entry['NACC']), float(ds_entry['TOTCPU'])))

        return records

    def _save_records(self, records):
        """
        Save the newly fetched access records.
        @param records  {replica: {date: (number of access, total cpu time)}}
        """

        site_names = set(r.site.name for r in records.iterkeys())
        self._history.save_sites(site_names)
        site_id_map = dict(self._history.db.select_many('sites', ('name', 'id'), 'name', site_names))

        dataset_names = set(r.dataset.name for r in records.iterkeys())
        self._history.save_datasets(dataset_names)
        dataset_id_map = dict(self._history.db.select_many('datasets', ('name', 'id'), 'name', dataset_names))

        fields = ('dataset_id', 'site_id', 'date', 'access_type', 'num_accesses', 'cputime')

        data = []
        for replica, entries in records.iteritems():
            dataset_id = dataset_id_map[replica.dataset.name]
            site_id = site_id_map[replica.site.name]

            for date, (num_accesses, cputime) in entries.iteritems():
                data.append((dataset_id, site_id, date.strftime('%Y-%m-%d'), 'local', num_accesses, cputime))

        self._history.db.insert_many('dataset_accesses', fields, None, data, do_update = True)
Пример #2
0
class GlobalQueueRequestHistory(object):
    """
    Sets one attr:
      request_weight:  float value
    """

    produces = ['request_weight']

    _default_config = None

    @staticmethod
    def set_default(config):
        GlobalQueueRequestHistory._default_config = Configuration(config)

    def __init__(self, config=None):
        if config is None:
            config = GlobalQueueRequestHistory._default_config

        self._history = HistoryDatabase(config.get('history', None))
        self._htcondor = HTCondor(config.get('htcondor', None))

        # Weight computation halflife constant (given in days in config)
        self.weight_halflife = config.get('weight_halflife', 4) * 3600. * 24.

        self.set_read_only(config.get('read_only', False))

    def set_read_only(self, value=True):
        self._read_only = value

    def load(self, inventory):
        records = self._get_stored_records(inventory)
        self._compute(inventory, records)

    def _get_stored_records(self, inventory):
        """
        Get the dataset request data from DB.
        @param inventory  DynamoInventory
        @return  {dataset: {jobid: GlobalQueueJob}}
        """

        # pick up requests that are less than 1 year old
        # old requests will be removed automatically next time the access information is saved from memory
        sql = 'SELECT d.`name`, r.`id`, UNIX_TIMESTAMP(r.`queue_time`), UNIX_TIMESTAMP(r.`completion_time`),'
        sql += ' r.`nodes_total`, r.`nodes_done`, r.`nodes_failed`, r.`nodes_queued` FROM `dataset_requests` AS r'
        sql += ' INNER JOIN `datasets` AS d ON d.`id` = r.`dataset_id`'
        sql += ' WHERE r.`queue_time` > DATE_SUB(NOW(), INTERVAL 1 YEAR) ORDER BY d.`id`, r.`queue_time`'

        all_requests = {}
        num_records = 0

        # little speedup by not repeating lookups for the same dataset
        current_dataset_name = ''
        dataset_exists = True
        for dataset_name, job_id, queue_time, completion_time, nodes_total, nodes_done, nodes_failed, nodes_queued in self._history.db.xquery(
                sql):
            num_records += 1

            if dataset_name == current_dataset_name:
                if not dataset_exists:
                    continue
            else:
                current_dataset_name = dataset_name

                try:
                    dataset = inventory.datasets[dataset_name]
                except KeyError:
                    dataset_exists = False
                    continue
                else:
                    dataset_exists = True

                requests = all_requests[dataset] = {}

            requests[job_id] = GlobalQueueJob(queue_time, completion_time,
                                              nodes_total, nodes_done,
                                              nodes_failed, nodes_queued)

        try:
            last_update = self._history.db.query(
                'SELECT UNIX_TIMESTAMP(`dataset_requests_last_update`) FROM `popularity_last_update`',
                retries=1)[0]
        except IndexError:
            last_update = 0

        LOG.info('Loaded %d dataset request data. Last update at %s UTC',
                 num_records,
                 time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(last_update)))

        return all_requests

    def _compute(self, inventory, all_requests):
        """
        Set the dataset request weight based on request list. Formula:
          w = Sum(exp(-t_i/T))
        where t_i is the time distance of the ith request from now. T is defined in the configuration.
        @param inventory     DynamoInventory
        @param all_requests  {dataset: {jobid: GlobalQueueJob}}
        """

        now = time.time()
        decay_constant = self.weight_halflife / math.log(2.)

        for dataset in inventory.datasets.itervalues():
            try:
                requests = all_requests[dataset]
            except KeyError:
                dataset.attr['request_weight'] = 0.
                continue

            weight = 0.
            for job in requests.itervalues():
                # first element of reqdata tuple is the queue time
                weight += math.exp((job.queue_time - now) / decay_constant)

            dataset.attr['request_weight'] = weight

    def update(self, inventory):
        try:
            try:
                last_update = self._history.db.query(
                    'SELECT UNIX_TIMESTAMP(`dataset_requests_last_update`) FROM `popularity_last_update`',
                    retries=1)[0]
            except IndexError:
                last_update = time.time() - 3600 * 24  # just go back by a day
                if not self._read_only:
                    self._history.db.query(
                        'INSERT INTO `popularity_last_update` VALUES ()')

            if not self._read_only:
                self._history.db.query(
                    'UPDATE `popularity_last_update` SET `dataset_requests_last_update` = NOW()',
                    retries=0,
                    silent=True)

        except MySQLdb.OperationalError:
            # We have a read-only config
            self._read_only = True
            LOG.info('Cannot write to DB. Switching to self._read_only.')

        source_records = self._get_source_records(inventory, last_update)

        if not self._read_only:
            self._save_records(source_records)
            # remove old entries
            self._history.db.query(
                'DELETE FROM `dataset_requests` WHERE `queue_time` < DATE_SUB(NOW(), INTERVAL 1 YEAR)'
            )
            self._history.db.query(
                'UPDATE `popularity_last_update` SET `dataset_requests_last_update` = NOW()'
            )

    def _get_source_records(self, inventory, last_update):
        """
        Get the dataset request data from Global Queue schedd.
        @param inventory    DynamoInventory
        @param last_update  UNIX timestamp
        @return {dataset: {jobid: GlobalQueueJob}}
        """

        constraint = 'TaskType=?="ROOT" && !isUndefined(DESIRED_CMSDataset) && (QDate > {last_update} || CompletionDate > {last_update})'.format(
            last_update=last_update)

        attributes = [
            'DESIRED_CMSDataset', 'GlobalJobId', 'QDate', 'CompletionDate',
            'DAG_NodesTotal', 'DAG_NodesDone', 'DAG_NodesFailed',
            'DAG_NodesQueued'
        ]

        job_ads = self._htcondor.find_jobs(constraint=constraint,
                                           attributes=attributes)

        job_ads.sort(key=lambda a: a['DESIRED_CMSDataset'])

        all_requests = {}

        for ad in job_ads:
            dataset_name = ad['DESIRED_CMSDataset']

            try:
                dataset = inventory.datasets[dataset_name]
            except KeyError:
                continue

            if dataset not in all_requests:
                all_requests[dataset] = {}

            try:
                nodes_total = ad['DAG_NodesTotal']
                nodes_done = ad['DAG_NodesDone']
                nodes_failed = ad['DAG_NodesFailed']
                nodes_queued = ad['DAG_NodesQueued']
            except KeyError:
                nodes_total = 0
                nodes_done = 0
                nodes_failed = 0
                nodes_queued = 0

            all_requests[dataset][ad['GlobalJobId']] = GlobalQueueJob(
                ad['QDate'], ad['CompletionDate'], nodes_total, nodes_done,
                nodes_failed, nodes_queued)

        return all_requests

    def _save_records(self, records):
        """
        Save the newly fetched request records.
        @param records  {dataset: {jobid: GlobalQueueJob}}
        """

        dataset_names = [d.name for d in records.iterkeys()]

        self._history.save_datasets(dataset_names)
        dataset_id_map = dict(
            self._history.db.select_many('datasets', ('name', 'id'), 'name',
                                         dataset_names))

        fields = ('id', 'dataset_id', 'queue_time', 'completion_time',
                  'nodes_total', 'nodes_done', 'nodes_failed', 'nodes_queued')

        data = []
        for dataset, dataset_request_list in records.iteritems():
            dataset_id = dataset_id_map[dataset.name]

            for job_id, (queue_time, completion_time, nodes_total, nodes_done,
                         nodes_failed,
                         nodes_queued) in dataset_request_list.iteritems():
                data.append(
                    (job_id, dataset_id,
                     time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(queue_time)),
                     time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(completion_time))
                     if completion_time > 0 else '0000-00-00 00:00:00',
                     nodes_total, nodes_done, nodes_failed, nodes_queued))

        self._history.db.insert_many('dataset_requests',
                                     fields,
                                     None,
                                     data,
                                     do_update=True)
Пример #3
0
class RequestManager(object):
    """
    Manager for external copy and deletion requests made through the web interface.
    Requests are written in registry when they are in new and activated states.
    When moving to terminal states (completed, rejected, cancelled) the records are migrated to history.
    This is a MySQL-specific implementation, but the interface is generic. It should be straightforward
    to abstractify the class if necessary.
    """

    # default config
    _config = df.Configuration()

    @staticmethod
    def set_default(config):
        RequestManager._config = df.Configuration(config)

    def __init__(self, optype, config=None):
        """
        @param optype  'copy' or 'deletion'.
        """
        if config is None:
            config = RequestManager._config

        self.registry = RegistryDatabase(config.get('registry', None))
        self.history = HistoryDatabase(config.get('history', None))

        # we'll be using temporary tables
        self.registry.db.reuse_connection = True
        self.history.db.reuse_connection = True

        self.optype = optype

        self.set_read_only(config.get('read_only', False))

    def set_read_only(self, value=True):
        self._read_only = value

    def lock(self):
        """
        Lock the registry table for lookup + update workflows.
        """
        if not self._read_only:
            self.registry.db.lock_tables()

    def unlock(self):
        if not self._read_only:
            self.registry.db.unlock_tables()

    def _save_items(self, items):
        """
        Save the items into history.
        @param items          List of dataset and block names.

        @return [dataset id], [block id]
        """
        dataset_names = []
        block_names = []

        for item in items:
            # names are validated already
            try:
                dataset_name, block_name = df.Block.from_full_name(item)
            except df.ObjectError:
                dataset_names.append(item)
            else:
                block_names.append(
                    (dataset_name, df.Block.to_real_name(block_name)))

        dataset_ids = self.history.save_datasets(dataset_names, get_ids=True)
        block_ids = self.history.save_blocks(block_names, get_ids=True)

        return dataset_ids, block_ids

    def _get_saved_item_ids(self, items):
        """
        Get the history dataset and block ids from the items list.
        @param items          List of dataset and block names.

        @return [dataset id], [block id]
        """
        dataset_names = []
        block_names = []

        for item in items:
            # names are validated already
            try:
                dataset_name, block_name = df.Block.from_full_name(item)
            except df.ObjectError:
                dataset_names.append(item)
            else:
                block_names.append(
                    (dataset_name, df.Block.to_real_name(block_name)))

        dataset_ids = self.history.db.select_many('datasets', 'id', 'name',
                                                  dataset_names)
        block_ids = self.history.db.select_many('blocks', 'id', 'name',
                                                block_names)

        return dataset_ids, block_ids

    def _make_temp_registry_tables(self, items, sites):
        """
        Make temporary tables to be used to constrain request search.
        @param items   List of dataset and block names.
        @param sites   List of site names.
        """

        # Make temporary tables and fill copy_ids_tmp with ids of requests whose item and site lists fully cover the provided list of items and sites.
        columns = [
            '`item` varchar(512) CHARACTER SET latin1 COLLATE latin1_general_cs NOT NULL'
        ]
        self.registry.db.create_tmp_table('items_tmp', columns)
        columns = [
            '`site` varchar(32) CHARACTER SET latin1 COLLATE latin1_general_cs NOT NULL'
        ]
        self.registry.db.create_tmp_table('sites_tmp', columns)

        if items is not None:
            self.registry.db.insert_many('items_tmp', ('item', ),
                                         MySQL.make_tuple,
                                         items,
                                         db=self.registry.db.scratch_db)
        if sites is not None:
            self.registry.db.insert_many('sites_tmp', ('site', ),
                                         MySQL.make_tuple,
                                         sites,
                                         db=self.registry.db.scratch_db)

        columns = [
            '`id` int(10) unsigned NOT NULL AUTO_INCREMENT',
            'PRIMARY KEY (`id`)'
        ]
        self.registry.db.create_tmp_table('ids_tmp', columns)

        sql = 'INSERT INTO `{db}`.`ids_tmp`'
        sql += ' SELECT r.`id` FROM `{op}_requests` AS r WHERE'
        sql += ' 0 NOT IN (SELECT (`site` IN (SELECT `site` FROM `{op}_request_sites` AS s WHERE s.`request_id` = r.`id`)) FROM `{db}`.`sites_tmp`)'
        sql += ' AND '
        sql += ' 0 NOT IN (SELECT (`item` IN (SELECT `item` FROM `{op}_request_items` AS i WHERE i.`request_id` = r.`id`)) FROM `{db}`.`items_tmp`)'
        self.registry.db.query(
            sql.format(db=self.registry.db.scratch_db, op=self.optype))

        self.registry.db.drop_tmp_table('items_tmp')
        self.registry.db.drop_tmp_table('sites_tmp')

        return '`{db}`.`ids_tmp`'.format(db=self.registry.db.scratch_db)

    def _make_temp_history_tables(self, dataset_ids, block_ids, site_ids):
        """
        Make temporary tables to be used to constrain request search.
        @param dataset_ids   List of dataset ids.
        @param block_ids     List of block ids.
        @param site_ids      List of site ids.
        """

        columns = [
            '`id` int(10) unsigned NOT NULL AUTO_INCREMENT',
            'PRIMARY KEY (`id`)'
        ]
        self.history.db.create_tmp_table('ids_tmp', columns)

        tmp_table_name = '`{db}`.`ids_tmp`'.format(
            db=self.history.db.scratch_db)

        if (dataset_ids is not None and len(dataset_ids) == 0) or \
                (block_ids is not None and len(block_ids) == 0) or \
                (site_ids is not None and len(site_ids) == 0):
            # temp table must be empty
            return tmp_table_name

        # Make temporary tables and fill ids_tmp with ids of requests whose item and site lists fully cover the provided list of items and sites.
        columns = ['`id` int(10) unsigned NOT NULL']
        self.history.db.create_tmp_table('datasets_tmp', columns)
        columns = ['`id` bigint(20) unsigned NOT NULL']
        self.history.db.create_tmp_table('blocks_tmp', columns)
        columns = ['`id` int(10) unsigned NOT NULL']
        self.history.db.create_tmp_table('sites_tmp', columns)

        if dataset_ids is not None:
            self.history.db.insert_many('datasets_tmp', ('id', ),
                                        MySQL.make_tuple,
                                        dataset_ids,
                                        db=self.history.db.scratch_db)
        if block_ids is not None:
            self.history.db.insert_many('blocks_tmp', ('id', ),
                                        MySQL.make_tuple,
                                        block_ids,
                                        db=self.history.db.scratch_db)
        if site_ids is not None:
            self.history.db.insert_many('sites_tmp', ('id', ),
                                        MySQL.make_tuple,
                                        site_ids,
                                        db=self.history.db.scratch_db)

        # Explaining the query outwards:
        # SELECT `X_id` FROM `{op}_request_X` WHERE `request_id` = r.`id` -> Full list of X for the request
        # `id` IN (SELECT `X_id` ...) -> 0 or 1
        # SELECT (`id` IN (SELECT `X_id` ...)) FROM tmp.`X_tmp` -> 0s and 1s for all entries in X_tmp
        # 0 NOT IN (SELECT ... FROM tmp.`X_tmp`) -> All entries in X_tmp are contained in {op}_request_X for the specific request

        sql = 'INSERT INTO `{db}`.`ids_tmp`'
        sql += ' SELECT r.`id` FROM `{op}_requests` AS r WHERE'
        sql += ' 0 NOT IN (SELECT (`id` IN (SELECT `site_id` FROM `{op}_request_sites` AS s WHERE s.`request_id` = r.`id`)) FROM `{db}`.`sites_tmp`)'
        sql += ' AND '
        sql += ' 0 NOT IN (SELECT (`id` IN (SELECT `dataset_id` FROM `{op}_request_datasets` AS d WHERE d.`request_id` = r.`id`)) FROM `{db}`.`datasets_tmp`)'
        sql += ' AND '
        sql += ' 0 NOT IN (SELECT (`id` IN (SELECT `block_id` FROM `{op}_request_blocks` AS b WHERE b.`request_id` = r.`id`)) FROM `{db}`.`blocks_tmp`)'
        self.history.db.query(
            sql.format(db=self.history.db.scratch_db, op=self.optype))

        self.history.db.drop_tmp_table('datasets_tmp')
        self.history.db.drop_tmp_table('blocks_tmp')
        self.history.db.drop_tmp_table('sites_tmp')

        return tmp_table_name

    def _make_registry_constraints(self, request_id, statuses, users, items,
                                   sites):
        constraints = []

        if request_id is not None:
            constraints.append('r.`id` = %d' % request_id)

        if statuses is not None:
            constraints.append('r.`status` IN ' +
                               MySQL.stringify_sequence(statuses))

        if users is not None:
            constraints.append('r.`user` IN ' +
                               MySQL.stringify_sequence(users))

        if items is not None or sites is not None:
            temp_table = self._make_temp_registry_tables(items, sites)
            constraints.append(
                'r.`id` IN (SELECT `id` FROM {0})'.format(temp_table))

        if len(constraints) != 0:
            return ' WHERE ' + ' AND '.join(constraints)
        else:
            return ''

    def _make_history_constraints(self, request_id, statuses, users, items,
                                  sites):
        if users is not None:
            history_user_ids = self.history.db.select_many(
                'users', 'id', 'name', users)
        else:
            history_user_ids = None

        if items is not None:
            history_dataset_ids, history_block_ids = self._get_saved_item_ids(
                items)
        else:
            history_dataset_ids = None
            history_block_ids = None

        if sites is not None:
            history_site_ids = self.history.db.select_many(
                'sites', 'id', 'name', sites)
        else:
            history_site_ids = None

        constraints = []

        if request_id is not None:
            constraints.append('r.`id` = %d' % request_id)

        if statuses is not None:
            constraints.append('r.`status` IN ' +
                               MySQL.stringify_sequence(statuses))

        if users is not None:
            constraints.append('r.`user_id` IN ' +
                               MySQL.stringify_sequence(history_user_ids))

        if items is not None or sites is not None:
            temp_table = self._make_temp_history_tables(
                history_dataset_ids, history_block_ids, history_site_ids)
            constraints.append(
                'r.`id` IN (SELECT `id` FROM {0})'.format(temp_table))

        if len(constraints) != 0:
            return ' WHERE ' + ' AND '.join(constraints)
        else:
            return ''
Пример #4
0
class RequestManager(object):
    """
    Manager for external copy and deletion requests made through the web interface.
    Requests are written in registry when they are in new and activated states.
    When moving to terminal states (completed, rejected, cancelled) the records are migrated to history.
    This is a MySQL-specific implementation, but the interface is generic. It should be straightforward
    to abstractify the class if necessary.
    """

    # default config
    _config = df.Configuration()

    @staticmethod
    def set_default(config):
        RequestManager._config = df.Configuration(config)

    def __init__(self, optype, config = None):
        """
        @param optype  'copy' or 'deletion'.
        """
        if config is None:
            config = RequestManager._config

        self.registry = RegistryDatabase(config.get('registry', None))
        self.history = HistoryDatabase(config.get('history', None))
        #self.cache = CacheDatabase(config.get('cache', None))

        # we'll be using temporary tables
        self.registry.db.reuse_connection = True
        self.history.db.reuse_connection = True
        #self.cache.db.reuse_connection = True

        self.optype = optype

        self.set_read_only(config.get('read_only', False))

    def set_read_only(self, value = True):
        self._read_only = value

    def lock(self):
        """
        Lock the registry table for lookup + update workflows.
        """
        if not self._read_only:
            self.registry.db.lock_tables()

    def unlock(self):
        if not self._read_only:
            self.registry.db.unlock_tables()

    def _save_items(self, items):
        """
        Save the items into history.
        @param items          List of dataset and block names.

        @return [dataset id], [block id]
        """
        dataset_names = []
        block_names = []

        for item in items:
            # names are validated already
            try:
                dataset_name, block_name = df.Block.from_full_name(item)
            except df.ObjectError:
                dataset_names.append(item)
            else:
                block_names.append((dataset_name, df.Block.to_real_name(block_name)))

        dataset_ids = self.history.save_datasets(dataset_names, get_ids = True)
        block_ids = self.history.save_blocks(block_names, get_ids = True)

        return dataset_ids, block_ids

    def _get_saved_item_ids(self, items):
        """
        Get the history dataset and block ids from the items list.
        @param items          List of dataset and block names.

        @return [dataset id], [block id]
        """
        dataset_names = []
        block_names = []

        for item in items:
            # names are validated already
            try:
                dataset_name, block_name = df.Block.from_full_name(item)
            except df.ObjectError:
                dataset_names.append(item)
            else:
                block_names.append((dataset_name, df.Block.to_real_name(block_name)))

        dataset_ids = self.history.db.select_many('datasets', 'id', 'name', dataset_names)
        block_ids = self.history.db.select_many('blocks', 'id', 'name', block_names)

        return dataset_ids, block_ids

    def _make_temp_registry_tables(self, items, sites):
        """
        Make temporary tables to be used to constrain request search.
        @param items   List of dataset and block names.
        @param sites   List of site names.
        """

        # Make temporary tables and fill copy_ids_tmp with ids of requests whose item and site lists fully cover the provided list of items and sites.
        columns = ['`item` varchar(512) CHARACTER SET latin1 COLLATE latin1_general_cs NOT NULL']
        self.registry.db.create_tmp_table('items_tmp', columns)
        columns = ['`site` varchar(32) CHARACTER SET latin1 COLLATE latin1_general_cs NOT NULL']
        self.registry.db.create_tmp_table('sites_tmp', columns)

        if items is not None:
            self.registry.db.insert_many('items_tmp', ('item',), MySQL.make_tuple, items, db = self.registry.db.scratch_db)

        LOG.info(sites)

        if sites is not None:
            self.registry.db.insert_many('sites_tmp', ('site',), MySQL.make_tuple, sites, db = self.registry.db.scratch_db)

        columns = [
            '`id` int(10) unsigned NOT NULL AUTO_INCREMENT',
            'PRIMARY KEY (`id`)'
        ]
        self.registry.db.create_tmp_table('ids_tmp', columns)


        sql = 'INSERT INTO `{db}`.`ids_tmp`'
        sql += ' SELECT r.`id` FROM `{op}_requests` AS r WHERE'
        sql += ' 0 NOT IN (SELECT (`site` IN (SELECT `site` FROM `{op}_request_sites` AS s WHERE s.`request_id` = r.`id`)) FROM `{db}`.`sites_tmp`)'
        sql += ' AND '
        sql += ' 0 NOT IN (SELECT (`item` IN (SELECT `item` FROM `{op}_request_items` AS i WHERE i.`request_id` = r.`id`)) FROM `{db}`.`items_tmp`)'
        self.registry.db.query(sql.format(db = self.registry.db.scratch_db, op = self.optype))

        self.registry.db.drop_tmp_table('items_tmp')
        self.registry.db.drop_tmp_table('sites_tmp')

        return '`{db}`.`ids_tmp`'.format(db = self.registry.db.scratch_db)

    def _make_temp_history_tables(self, dataset_ids, block_ids, site_ids):
        """
        Make temporary tables to be used to constrain request search.
        @param dataset_ids   List of dataset ids.
        @param block_ids     List of block ids.
        @param site_ids      List of site ids.
        """

        columns = [
            '`id` int(10) unsigned NOT NULL AUTO_INCREMENT',
            'PRIMARY KEY (`id`)'
        ]
        self.history.db.create_tmp_table('ids_tmp', columns)

        tmp_table_name = '`{db}`.`ids_tmp`'.format(db = self.history.db.scratch_db)

        if (dataset_ids is not None and len(dataset_ids) == 0) or \
                (block_ids is not None and len(block_ids) == 0) or \
                (site_ids is not None and len(site_ids) == 0):
            # temp table must be empty
            return tmp_table_name

        # Make temporary tables and fill ids_tmp with ids of requests whose item and site lists fully cover the provided list of items and sites.
        columns = ['`id` int(10) unsigned NOT NULL']
        self.history.db.create_tmp_table('datasets_tmp', columns)
        columns = ['`id` bigint(20) unsigned NOT NULL']
        self.history.db.create_tmp_table('blocks_tmp', columns)
        columns = ['`id` int(10) unsigned NOT NULL']
        self.history.db.create_tmp_table('sites_tmp', columns)

        if dataset_ids is not None:
            self.history.db.insert_many('datasets_tmp', ('id',), MySQL.make_tuple, dataset_ids, db = self.history.db.scratch_db)
        if block_ids is not None:
            self.history.db.insert_many('blocks_tmp', ('id',), MySQL.make_tuple, block_ids, db = self.history.db.scratch_db)
        if site_ids is not None:
            self.history.db.insert_many('sites_tmp', ('id',), MySQL.make_tuple, site_ids, db = self.history.db.scratch_db)

        # Explaining the query outwards:
        # SELECT `X_id` FROM `{op}_request_X` WHERE `request_id` = r.`id` -> Full list of X for the request
        # `id` IN (SELECT `X_id` ...) -> 0 or 1
        # SELECT (`id` IN (SELECT `X_id` ...)) FROM tmp.`X_tmp` -> 0s and 1s for all entries in X_tmp
        # 0 NOT IN (SELECT ... FROM tmp.`X_tmp`) -> All entries in X_tmp are contained in {op}_request_X for the specific request

        sql = 'INSERT INTO `{db}`.`ids_tmp`'
        sql += ' SELECT r.`id` FROM `{op}_requests` AS r WHERE'
        sql += ' 0 NOT IN (SELECT (`id` IN (SELECT `site_id` FROM `{op}_request_sites` AS s WHERE s.`request_id` = r.`id`)) FROM `{db}`.`sites_tmp`)'
        sql += ' AND '
        sql += ' 0 NOT IN (SELECT (`id` IN (SELECT `dataset_id` FROM `{op}_request_datasets` AS d WHERE d.`request_id` = r.`id`)) FROM `{db}`.`datasets_tmp`)'
        sql += ' AND '
        sql += ' 0 NOT IN (SELECT (`id` IN (SELECT `block_id` FROM `{op}_request_blocks` AS b WHERE b.`request_id` = r.`id`)) FROM `{db}`.`blocks_tmp`)'
        self.history.db.query(sql.format(db = self.history.db.scratch_db, op = self.optype))

        self.history.db.drop_tmp_table('datasets_tmp')
        self.history.db.drop_tmp_table('blocks_tmp')
        self.history.db.drop_tmp_table('sites_tmp')

        return tmp_table_name

    def _make_registry_constraints(self, request_id, statuses, users, items, sites):
        constraints = []

        if request_id is not None:
            constraints.append('r.`id` = %d' % request_id)

        if statuses is not None:
            constraints.append('r.`status` IN ' + MySQL.stringify_sequence(statuses))

        if users is not None:
            constraints.append('r.`user` IN ' + MySQL.stringify_sequence(users))

        if items is not None or sites is not None:
            temp_table = self._make_temp_registry_tables(items, sites)
            constraints.append('r.`id` IN (SELECT `id` FROM {0})'.format(temp_table))

        if len(constraints) != 0:
            return ' WHERE ' + ' AND '.join(constraints)
        else:
            return ''

    def _make_history_constraints(self, request_id, statuses, users, items, sites):
        if users is not None:
            history_user_ids = self.history.db.select_many('users', 'id', 'name', users)
        else:
            history_user_ids = None

        if items is not None:
            history_dataset_ids, history_block_ids = self._get_saved_item_ids(items)
        else:
            history_dataset_ids = None
            history_block_ids = None

        if sites is not None:
            history_site_ids = self.history.db.select_many('sites', 'id', 'name', sites)
        else:
            history_site_ids = None

        constraints = []

        if request_id is not None:
            constraints.append('r.`id` = %d' % request_id)

        if statuses is not None:
            constraints.append('r.`status` IN ' + MySQL.stringify_sequence(statuses))

        if users is not None:
            constraints.append('r.`user_id` IN ' + MySQL.stringify_sequence(history_user_ids))

        if items is not None or sites is not None:
            temp_table = self._make_temp_history_tables(history_dataset_ids, history_block_ids, history_site_ids)
            constraints.append('r.`id` IN (SELECT `id` FROM {0})'.format(temp_table))

        if len(constraints) != 0:
            return ' WHERE ' + ' AND '.join(constraints)
        else:
            return ''