Example #1
def spec4data_records():
    "Return spec part for data_records"
    data_record = record_codes('data_record')
    empty_record = record_codes('empty_record')
    gridfs_record = record_codes('gridfs_record')
    spec = {'$in': [data_record, empty_record, gridfs_record]}
    return spec
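
Every snippet in this listing keys on record_codes, which maps a record-type name ('query_record', 'data_record', 'empty_record', 'gridfs_record') to the numeric code stored under the das.record field. The helper itself is not shown here; the following is only a minimal sketch of such a mapping, with made-up numeric values:

# Hypothetical sketch of a record-type -> code mapping; the real DAS
# record_codes helper may use different values.
RECORD_CODES = {'query_record': 0, 'data_record': 1,
                'empty_record': 2, 'gridfs_record': 3}

def record_codes(name):
    "Return the numeric code stored under das.record for a record type"
    return RECORD_CODES[name]

With such a mapping, the $in fragment returned by spec4data_records matches any data-carrying record when placed under the das.record key of a MongoDB spec, as the incache examples further down do.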
Example #2
    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if  not results:
            return

        dasheader  = header['das']
        expire     = adjust_expire(dasheader['expire'])
        system     = dasheader['system'] # DAS service names, e.g. combined
        services   = dasheader['services'] # CMS services used to get data
        api        = dasheader['api']
        prim_key   = header.get('prim_key', None)
        if  not prim_key:
            # get primary key from a list of lookup keys which has the
            # following structure [{'api':[keys]}, {...}]
            lup_keys = header['lookup_keys']
            lkeys    = [l for i in lup_keys for k in i.values() for l in k]
            prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
        cond_keys  = list(dasquery.mongo_query['spec'].keys())
        # get API record id
        spec       = {'qhash':dasquery.qhash, 'das.system':system,
                      'das.expire': {'$gt':time.time()},
                      'das.record': record_codes('query_record')}
        counter    = 0
        rids = [str(r['_id']) for r in \
                self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
        if  rids:
            if  isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    if  'das' in item:
                        expire = item.get('das').get('expire', expire)
                        dasheader['expire'] = expire
                    item['das'] = dict(expire=expire, primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system, services=services,
                                       record=record_codes('data_record'),
                                       ts=time.time(), api=api)
                    item['das_id'] = rids
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print("\n\n ### results = ", str(results))
                raise Exception('Provided results is not a list/generator type')
        if  expire != dasheader['expire']: # update DAS records
            header['das']['expire'] = expire
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        msg = "\n%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)
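
generate_records is a generator whose output is meant to be consumed by the next cache level (update_cache). A minimal, hypothetical consumer, assuming it lives on the same class and that the class stores raw records in self.col, could look like this:

    def update_cache(self, dasquery, results, header):
        "Hypothetical sketch: persist records produced by generate_records"
        docs = list(self.generate_records(dasquery, results, header))
        if docs:  # insert_many rejects an empty document list
            self.col.insert_many(docs)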
Example #3
 def insert_query_record(self, dasquery, header):
     """
     Insert query record into DAS cache.
     """
     # check presence of API record in a cache
     dasheader = header['das']
     system = dasheader['system']
     api = dasheader['api']
     collection = 'cache'
     check_query = True
     expire = dasheader.get('expire', None)
     if expire:
         dasheader['expire'] = adjust_expire(expire)
     if not self.incache(dasquery, collection, system, api, check_query):
         msg = "query=%s, header=%s" % (dasquery, header)
         self.logger.debug(msg)
         q_record = dict(das=dasheader, query=dasquery.storage_query)
         q_record['das']['record'] = record_codes('query_record')
         q_record['das']['status'] = "requested"
         q_record['qhash'] = dasquery.qhash
         q_record['das']['ctime'] = [time.time()]
         res = self.col.insert_one(q_record)
         if not res:
             msg = 'unable to insert query record'
             print(dastimestamp('DAS ERROR '), dasquery, msg,
                   ', will retry')
             time.sleep(1)
             res = self.col.insert_one(q_record)
             if not res:
                 print(dastimestamp('DAS ERROR '), dasquery, msg)
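
Both insert_query_record and generate_records read the same header structure. The field names below follow the lookups in these examples; the concrete values and API names are invented for illustration only:

# Hypothetical header; actual content is produced by the DAS data-service layer.
header = {
    'das': {
        'system': ['combined'],                       # DAS service names
        'services': {'combined': ['dbs', 'phedex']},  # assumed layout of CMS services
        'api': ['combined_dataset4site'],             # assumed API name
        'expire': 300,                                # normalized via adjust_expire
    },
    'lookup_keys': [{'combined_dataset4site': ['dataset.name']}],
}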
Example #4
 def incache(self, dasquery, collection='merge', system=None, api=None,
         query_record=False):
     """
     Check if we have query results in cache; return True if found,
     False otherwise. Please note, the spec built below is a MongoDB query;
     consult the MongoDB API for more details,
     http://api.mongodb.org/python/
     """
     if  query_record:
         record = record_codes('query_record')
     else:
         record = spec4data_records()
     spec = {'qhash':dasquery.qhash, 'das.record':record,
             'das.expire':{'$gt':time.time()}}
     if  system:
         spec.update({'das.system': system})
     if  api:
         spec.update({'das.api': api})
     conn = db_connection(self.dburi)
     mdb  = conn[self.dbname]
     mdb.add_son_manipulator(self.das_son_manipulator)
     col  = mdb[collection]
     res  = col.find(spec, **PYMONGO_OPTS).count()
     msg  = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
     self.logger.info(msg)
     if  res:
         return True
     return False
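
A hypothetical call sketch, assuming a cache object wired with the attributes used above (self.dburi, self.dbname, self.col) and a DASQuery-like object exposing qhash; the system and API names are the invented ones from the header sketch above:

# Hypothetical: only create a fresh query record if no live one exists
# for this system/API combination in the 'cache' collection.
if not cache.incache(dasquery, collection='cache', system='combined',
                     api='combined_dataset4site', query_record=True):
    cache.insert_query_record(dasquery, header)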
Example #5
    def update_query_record_system(self, dasquery, system, api, status):
        "Update system status of dasquery in das.cache collection"
        spec = {'qhash': dasquery.qhash, 'das.system': system, 'das.api': api,
                'das.record':record_codes('query_record')}
        udict = {'$set': {'das.status':status}}
#         print("### update_query_record", spec)
        doc=self.col.find_one_and_update(spec, udict, return_document=ReturnDocument.AFTER)
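
For reference, ReturnDocument is the pymongo helper controlling what find_one_and_update hands back; with ReturnDocument.AFTER the updated query record is returned (None when nothing matched the spec):

from pymongo import ReturnDocument  # ReturnDocument.BEFORE is the default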
Example #6
 def find_query_record(self, dasquery):
     "Find DAS query records and return them to the caller"
     spec = {
         'qhash': dasquery.qhash,
         'das.record': record_codes('query_record')
     }
     return self.col.find(spec, **PYMONGO_OPTS)
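
A hypothetical use of the returned cursor, e.g. collecting the per-API status fields that insert_query_record writes into each query record:

# Hypothetical: summarize which APIs have been queried and their status.
statuses = [(row['das'].get('api'), row['das'].get('status'))
            for row in cache.find_query_record(dasquery)]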
Example #7
 def insert_query_record(self, dasquery, header):
     """
     Insert query record into DAS cache.
     """
     # check presence of API record in a cache
     dasheader   = header['das']
     system      = dasheader['system']
     api         = dasheader['api']
     collection  = 'cache'
     check_query = True
     expire = dasheader.get('expire', None)
     if  expire:
         dasheader['expire'] = adjust_expire(expire)
     if  not self.incache(dasquery, collection, system, api, check_query):
         msg = "query=%s, header=%s" % (dasquery, header)
         self.logger.debug(msg)
         q_record = dict(das=dasheader, query=dasquery.storage_query)
         q_record['das']['record'] = record_codes('query_record')
         q_record['das']['status'] = "requested"
         q_record['qhash'] = dasquery.qhash
         q_record['das']['ctime'] = [time.time()]
         res = self.col.insert_one(q_record)
         if  not res:
             msg = 'unable to insert query record'
             print(dastimestamp('DAS ERROR '), dasquery, msg, ', will retry')
             time.sleep(1)
             res = self.col.insert_one(q_record)
             if  not res:
                 print(dastimestamp('DAS ERROR '), dasquery, msg)
Example #8
 def apilist(self, dasquery):
     "Return list of apis for given dasquery"
     spec = {'qhash':dasquery.qhash,
             'das.record':record_codes('query_record')}
     apis = []
     for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
         try:
             apis += row['das']['api']
         except Exception as _err:
             pass
     return apis
Example #9
 def update_query_record_system(self, dasquery, system, api, status):
     "Update system status of dasquery in das.cache collection"
     spec = {
         'qhash': dasquery.qhash,
         'das.system': system,
         'das.api': api,
         'das.record': record_codes('query_record')
     }
     udict = {'$set': {'das.status': status}}
     #         print("### update_query_record", spec)
     doc = self.col.find_one_and_update(
         spec, udict, return_document=ReturnDocument.AFTER)
Example #10
 def apilist(self, dasquery):
     "Return list of apis for given dasquery"
     spec = {
         'qhash': dasquery.qhash,
         'das.record': record_codes('query_record')
     }
     apis = []
     for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
         try:
             apis += row['das']['api']
         except Exception as _err:
             pass
     return apis
Example #11
File: key_learning.py Project: ktf/DAS
    def __call__(self):
        """__call__ implementation"""
        self.das.rawcache.clean_cache("cache")
        rawcache = self.das.rawcache.col
        autodeque = lambda: collections.deque(maxlen=self.redundancy)
        found_ids = collections.defaultdict(autodeque)

        self.logger.info("finding das_ids")
        for doc in rawcache.find({'das.record': record_codes('data_record'),
                                  'das.primary_key': {'$exists': True}},
                                 fields=['das.primary_key', 'das_id']):
            for das_id in doc['das_id']:
                found_ids[doc['das']['primary_key']].append(das_id)

        hit_ids = set()
        self.logger.info("found %s primary_keys" % len(found_ids))
        for key in found_ids:
            self.logger.info("primary_key=%s" % key)
            for das_id in found_ids[key]:
                if _DEBUG:
                    print('-======= DAS ID ======')
                    pprint(das_id)
                    print('-======= HIT ID (ALREADY VISITED) ======')
                    pprint(hit_ids)

                if not das_id in hit_ids:
                    self.logger.info("das_id=%s" % das_id)
                    hit_ids.add(das_id)
                    doc = rawcache.find_one({'_id': ObjectId(das_id)})
                    if doc:
                        self.process_query_record(doc)
                    else:
                        self.logger.warning("no record for das_id=%s" % das_id)

        if _DEBUG:
            print('result attributes (all):')
            for row in self.das.keylearning.list_members():
                pprint(row)
                res_t = self.das.mapping.primary_key(row['system'], row['urn'])
                print(row.get('keys', ''), '-->', res_t, ':',
                      ', '.join([m for m in row.get('members', [])]))

        return {}
Example #12
 def incache(self,
             dasquery,
             collection='merge',
             system=None,
             api=None,
             query_record=False):
     """
     Check if we have query results in cache; return True if found,
     False otherwise. Please note, the spec built below is a MongoDB query;
     consult the MongoDB API for more details,
     http://api.mongodb.org/python/
     """
     if query_record:
         record = record_codes('query_record')
     else:
         record = spec4data_records()
     spec = {
         'qhash': dasquery.qhash,
         'das.record': record,
         'das.expire': {
             '$gt': time.time()
         }
     }
     if system:
         spec.update({'das.system': system})
     if api:
         spec.update({'das.api': api})
     conn = db_connection(self.dburi)
     mdb = conn[self.dbname]
     mdb.add_son_manipulator(self.das_son_manipulator)
     col = mdb[collection]
     res = col.find(spec, **PYMONGO_OPTS).count()
     msg = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
     self.logger.info(msg)
     if res:
         return True
     return False
Example #13
    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if not results:
            return

        dasheader = header['das']
        expire = adjust_expire(dasheader['expire'])
        system = dasheader['system']  # DAS service names, e.g. combined
        services = dasheader['services']  # CMS services used to get data
        api = dasheader['api']
        prim_key = header.get('prim_key', None)
        if not prim_key:
            # get primary key from a list of lookup keys which has the
            # following structure [{'api':[keys]}, {...}]
            lup_keys = header['lookup_keys']
            lkeys = [l for i in lup_keys for k in i.values() for l in k]
            prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
        cond_keys = list(dasquery.mongo_query['spec'].keys())
        # get API record id
        spec = {
            'qhash': dasquery.qhash,
            'das.system': system,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
        counter = 0
        rids = [str(r['_id']) for r in \
                self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
        if rids:
            if isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    if 'das' in item:
                        expire = item.get('das').get('expire', expire)
                        dasheader['expire'] = expire
                    item['das'] = dict(expire=expire,
                                       primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system,
                                       services=services,
                                       record=record_codes('data_record'),
                                       ts=time.time(),
                                       api=api)
                    item['das_id'] = rids
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print("\n\n ### results = ", str(results))
                raise Exception(
                    'Provided results is not a list/generator type')
        if expire != dasheader['expire']:  # update DAS records
            header['das']['expire'] = expire
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        msg = "\n%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)
Example #14
    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run aggregator function to merge neighbors
        3. insert records into das.merge
        """
        ### TMP for asyncio
#         time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash':dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire  = 9999999999 # future
        # get all API records for given DAS query
        spec    = {'qhash':dasquery.qhash,
                   'das.expire':{'$gt':time.time()},
                   'das.record':record_codes('query_record')}
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if  rexpire < expire:
                expire = rexpire
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if  not fields: # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if  self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg  = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg  = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'), 'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das':{'expire':expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key':[k for k in lookup_keys],
                        'system': ['gridfs']}, 'qhash':dasquery.qhash,
                        'cache_id':[], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert_one(row)
            except InvalidOperation as exp:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'), 'DuplicateKeyError during merge')
                if  not isinstance(gen, list):
                    raise err
        status = 'fail'
        if  inserted:
            status = 'ok'
        elif  not lookup_keys: # we get query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else: # we didn't merge anything, it is DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire, primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'], services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(), api=[])
            empty_record = {'das':das, 'qhash': dasquery.qhash,
                            'cache_id':[], 'das_id': id_list}
            for key in lkeys:
                empty_record.update({key.split('.')[0]:[]})
            for key, val in dasquery.mongo_query['spec'].items():
                if  key.find('.') == -1:
                    empty_record[key] = []
                else: # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert_one(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire':empty_expire}}
            spec = {'qhash':dasquery.qhash}
            self.col.update_many(spec, nval)
        return status
Example #15
    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run aggregator function to merge neighbors
        3. insert records into das.merge
        """
        ### TMP for asyncio
        #         time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash': dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire = 9999999999  # future
        # get all API records for given DAS query
        spec = {
            'qhash': dasquery.qhash,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if rexpire < expire:
                expire = rexpire
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if not fields:  # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'),
                      'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {
                    'das': {
                        'expire': expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key': [k for k in lookup_keys],
                        'system': ['gridfs']
                    },
                    'qhash': dasquery.qhash,
                    'cache_id': [],
                    'das_id': id_list
                }
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert_one(row)
            except InvalidOperation as exp:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'),
                      'DuplicateKeyError during merge')
                if not isinstance(gen, list):
                    raise err
        status = 'fail'
        if inserted:
            status = 'ok'
        elif not lookup_keys:  # we get query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else:  # we didn't merge anything, it is DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire,
                       primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'],
                       services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(),
                       api=[])
            empty_record = {
                'das': das,
                'qhash': dasquery.qhash,
                'cache_id': [],
                'das_id': id_list
            }
            for key in lkeys:
                empty_record.update({key.split('.')[0]: []})
            for key, val in dasquery.mongo_query['spec'].items():
                if key.find('.') == -1:
                    empty_record[key] = []
                else:  # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert_one(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire': empty_expire}}
            spec = {'qhash': dasquery.qhash}
            self.col.update_many(spec, nval)
        return status
Example #16
 def find_query_record(self, dasquery):
     "Find DAS query records and return them to the caller"
     spec = {'qhash':dasquery.qhash,
             'das.record':record_codes('query_record')}
     return self.col.find(spec, **PYMONGO_OPTS)
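
Taken together, the examples above cover one request cycle: create a query record, annotate and store the data records, then merge them. A hypothetical driver, assuming all of these methods live on a single cache object and that dasquery, header and results are available as in the sketches above:

# Hypothetical end-to-end flow built from the methods shown above.
if not cache.incache(dasquery, collection='cache', query_record=True):
    cache.insert_query_record(dasquery, header)        # creates das.record = query_record
    docs = list(cache.generate_records(dasquery, results, header))
    if docs:                                           # insert_many rejects an empty list
        cache.col.insert_many(docs)                    # stores das.record = data_record
    cache.merge_records(dasquery)                      # aggregates into das.merge
have_data = cache.incache(dasquery)                    # default: check merged data records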