def spec4data_records():
    "Return spec part for data_records"
    data_record = record_codes('data_record')
    empty_record = record_codes('empty_record')
    gridfs_record = record_codes('gridfs_record')
    spec = {'$in': [data_record, empty_record, gridfs_record]}
    return spec
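# Usage sketch (illustrative only; coll and dasquery are assumed to exist in
# the calling scope): the returned spec is meant to be plugged in under the
# 'das.record' key of a MongoDB query to select any non-query record type,
# e.g.
#
#   spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()}
#   for row in coll.find(spec):
#       pass  # data, empty and gridfs records for this query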
def generate_records(self, dasquery, results, header):
    """
    Iterate over provided results, update records and yield them
    to next level (update_cache)
    """
    self.logger.debug("(%s) store to cache" % dasquery)
    if not results:
        return
    dasheader = header['das']
    expire = adjust_expire(dasheader['expire'])
    system = dasheader['system'] # DAS service names, e.g. combined
    services = dasheader['services'] # CMS services used to get data
    api = dasheader['api']
    prim_key = header.get('prim_key', None)
    if not prim_key:
        # get primary key from a list of lookup keys which has the
        # following structure [{'api':[keys]}, {...}]
        lup_keys = header['lookup_keys']
        lkeys = [l for i in lup_keys for k in i.values() for l in k]
        prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
    cond_keys = list(dasquery.mongo_query['spec'].keys())
    # get API record id
    spec = {'qhash': dasquery.qhash, 'das.system': system,
            'das.expire': {'$gt': time.time()},
            'das.record': record_codes('query_record')}
    counter = 0
    rids = [str(r['_id']) for r in
            self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
    if rids:
        if isinstance(results, (list, GeneratorType)):
            for item in results:
                counter += 1
                if 'das' in item:
                    expire = item.get('das').get('expire', expire)
                    dasheader['expire'] = expire
                item['das'] = dict(expire=expire, primary_key=prim_key,
                                   condition_keys=cond_keys,
                                   instance=dasquery.instance,
                                   system=system, services=services,
                                   record=record_codes('data_record'),
                                   ts=time.time(), api=api)
                item['das_id'] = rids
                item['qhash'] = dasquery.qhash
                yield item
        else:
            print("\n\n ### results = ", str(results))
            raise Exception('Provided results is not a list/generator type')
    if expire != dasheader['expire']: # update DAS records
        header['das']['expire'] = expire
    # update das record with new status
    status = 'Update DAS cache, %s API' % header['das']['api'][0]
    self.update_query_record(dasquery, status, header)
    msg = "\n%s yield %s rows" % (dasheader['system'], counter)
    self.logger.info(msg)
def insert_query_record(self, dasquery, header):
    """
    Insert query record into DAS cache.
    """
    # check presence of API record in a cache
    dasheader = header['das']
    system = dasheader['system']
    api = dasheader['api']
    collection = 'cache'
    check_query = True
    expire = dasheader.get('expire', None)
    if expire:
        dasheader['expire'] = adjust_expire(expire)
    if not self.incache(dasquery, collection, system, api, check_query):
        msg = "query=%s, header=%s" % (dasquery, header)
        self.logger.debug(msg)
        q_record = dict(das=dasheader, query=dasquery.storage_query)
        q_record['das']['record'] = record_codes('query_record')
        q_record['das']['status'] = "requested"
        q_record['qhash'] = dasquery.qhash
        q_record['das']['ctime'] = [time.time()]
        res = self.col.insert_one(q_record)
        if not res:
            msg = 'unable to insert query record'
            print(dastimestamp('DAS ERROR '), dasquery, msg, ', will retry')
            time.sleep(1)
            # retry once with the same pymongo 3 API (insert_one) as above
            res = self.col.insert_one(q_record)
            if not res:
                print(dastimestamp('DAS ERROR '), dasquery, msg)
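# For reference, a freshly inserted query record has approximately the shape
# below; concrete values are hypothetical:
#
#   {'qhash': 'a1b2...',                      # dasquery.qhash
#    'query': {...},                          # dasquery.storage_query
#    'das': {'system': [...], 'api': [...],
#            'record': record_codes('query_record'),
#            'status': 'requested',
#            'expire': <timestamp>, 'ctime': [<timestamp>]}}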
def incache(self, dasquery, collection='merge', system=None, api=None,
            query_record=False):
    """
    Check if we have query results in cache; return True if found,
    False otherwise. Please note, the spec built here is a MongoDB
    query, please consult the MongoDB API for more details,
    http://api.mongodb.org/python/
    """
    if query_record:
        record = record_codes('query_record')
    else:
        record = spec4data_records()
    spec = {'qhash': dasquery.qhash, 'das.record': record,
            'das.expire': {'$gt': time.time()}}
    if system:
        spec.update({'das.system': system})
    if api:
        spec.update({'das.api': api})
    conn = db_connection(self.dburi)
    mdb = conn[self.dbname]
    mdb.add_son_manipulator(self.das_son_manipulator)
    col = mdb[collection]
    res = col.find(spec, **PYMONGO_OPTS).count()
    msg = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
    self.logger.info(msg)
    if res:
        return True
    return False
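# Typical call (a sketch; the surrounding variables are assumed to exist),
# mirroring how insert_query_record checks for an existing query record:
#
#   if self.incache(dasquery, collection='cache',
#                   system=system, api=api, query_record=True):
#       pass  # an unexpired query record is already present, skip insertion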
def update_query_record_system(self, dasquery, system, api, status):
    "Update system status of dasquery in das.cache collection"
    spec = {'qhash': dasquery.qhash, 'das.system': system, 'das.api': api,
            'das.record': record_codes('query_record')}
    udict = {'$set': {'das.status': status}}
    # print("### update_query_record", spec)
    doc = self.col.find_one_and_update(
        spec, udict, return_document=ReturnDocument.AFTER)
def find_query_record(self, dasquery):
    "Find DAS query records and return them to the caller"
    spec = {'qhash': dasquery.qhash,
            'das.record': record_codes('query_record')}
    return self.col.find(spec, **PYMONGO_OPTS)
def apilist(self, dasquery):
    "Return list of apis for given dasquery"
    spec = {'qhash': dasquery.qhash,
            'das.record': record_codes('query_record')}
    apis = []
    for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
        try:
            apis += row['das']['api']
        except Exception as _err:
            pass
    return apis
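# Example (API names are hypothetical): for a query whose data was collected
# via two APIs of the same service, the call would return something like
#
#   self.apilist(dasquery)  ->  ['datasets', 'filesummaries']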
def __call__(self):
    """__call__ implementation"""
    self.das.rawcache.clean_cache("cache")
    rawcache = self.das.rawcache.col
    autodeque = lambda: collections.deque(maxlen=self.redundancy)
    found_ids = collections.defaultdict(autodeque)
    self.logger.info("finding das_ids")
    # collect das_ids of data records, grouped by their primary key
    for doc in rawcache.find({'das.record': record_codes('data_record'),
                              'das.primary_key': {'$exists': True}},
                             ['das.primary_key', 'das_id']):
        for das_id in doc['das_id']:
            found_ids[doc['das']['primary_key']].append(das_id)
    hit_ids = set()
    self.logger.info("found %s primary_keys" % len(found_ids))
    for key in found_ids:
        self.logger.info("primary_key=%s" % key)
        for das_id in found_ids[key]:
            if _DEBUG:
                print('-======= DAS ID ======')
                pprint(das_id)
                print('-======= HIT ID (ALREADY VISITED) ======')
                pprint(hit_ids)
            if das_id not in hit_ids:
                self.logger.info("das_id=%s" % das_id)
                hit_ids.add(das_id)
                doc = rawcache.find_one({'_id': ObjectId(das_id)})
                if doc:
                    self.process_query_record(doc)
                else:
                    self.logger.warning("no record for das_id=%s" % das_id)
    if _DEBUG:
        print('result attributes (all):')
        for row in self.das.keylearning.list_members():
            pprint(row)
            res_t = self.das.mapping.primary_key(row['system'], row['urn'])
            print(row.get('keys', ''), '-->', res_t, ':',
                  ', '.join([m for m in row.get('members', [])]))
    return {}
def merge_records(self, dasquery, attempt=0):
    """
    Merge DAS records for provided query. We perform the following
    steps:

    1. get all queries from das.cache by ordering them by primary key
    2. run aggregator function to merge neighbors
    3. insert records into das.merge
    """
    ### TMP for asyncio
    # time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time

    # remove any entries in merge collection for this query
    self.merge.delete_many({'qhash': dasquery.qhash})
    # proceed
    self.logger.debug(dasquery)
    id_list = []
    expire = 9999999999 # future
    # get all API records for given DAS query
    spec = {'qhash': dasquery.qhash,
            'das.expire': {'$gt': time.time()},
            'das.record': record_codes('query_record')}
    records = self.col.find(spec, **PYMONGO_OPTS)
    for row in records:
        # find smallest expire timestamp to be used by aggregator
        rexpire = row.get('das', {}).get('expire', expire)
        if rexpire < expire:
            expire = rexpire
        if row['_id'] not in id_list:
            id_list.append(row['_id'])
    inserted = 0
    lookup_keys = set()
    fields = dasquery.mongo_query.get('fields')
    if not fields: # Mongo
        fields = []
    for key in fields:
        for pkey in self.mapping.mapkeys(key):
            lookup_keys.add(pkey)
    for pkey in lookup_keys:
        skey = [(pkey, DESCENDING)]
        # lookup all service records
        spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
        if self.verbose:
            nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
            msg = "merging %s records, for %s key" % (nrec, pkey)
        else:
            msg = "merging records, for %s key" % pkey
        self.logger.debug(msg)
        # use exhaust=False since we process all records in aggregator
        # and there can be a delay in processing
        records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
        # aggregate all records
        agen = aggregator(dasquery, records, expire)
        # diff aggregated records
        gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
        # insert all records into das.merge using bulk insert
        size = self.cache_size
        try:
            res = self.merge.insert_many(gen)
            inserted += len(res.inserted_ids)
        except InvalidDocument as exp:
            print(dastimestamp('DAS WARNING'),
                  'InvalidDocument during merge', str(exp))
            msg = "Caught bson error: " + str(exp)
            self.logger.info(msg)
            records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
            gen = aggregator(dasquery, records, expire)
            genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
            das_dict = {'das': {'expire': expire,
                                'das.record': record_codes('gridfs_record'),
                                'primary_key': [k for k in lookup_keys],
                                'system': ['gridfs']},
                        'qhash': dasquery.qhash,
                        'cache_id': [], 'das_id': id_list}
            for row in genrows:
                row.update(das_dict)
                self.merge.insert_one(row)
        except InvalidOperation as exp:
            pass
        except DuplicateKeyError as err:
            print(dastimestamp('DAS WARNING'),
                  'DuplicateKeyError during merge')
            if not isinstance(gen, list):
                raise err
    status = 'fail'
    if inserted:
        status = 'ok'
    elif not lookup_keys: # we get query w/o fields
        msg = 'qhash %s, no lookup_keys' % dasquery.qhash
        print(dastimestamp('DAS WARNING'), msg)
        status = 'ok'
    else: # we didn't merge anything, it is DB look-up failure
        msg = 'qhash %s, did not insert into das.merge, attempt %s' \
                % (dasquery.qhash, attempt)
        print(dastimestamp('DAS WARNING'), msg)
        empty_expire = etstamp()
        lkeys = list(lookup_keys)
        das = dict(expire=empty_expire, primary_key=lkeys[0],
                   condition_keys=lkeys,
                   instance=dasquery.instance,
                   system=['das'], services=dasquery.services,
                   record=record_codes('empty_record'),
                   ts=time.time(), api=[])
        empty_record = {'das': das, 'qhash': dasquery.qhash,
                        'cache_id': [], 'das_id': id_list}
        for key in lkeys:
            empty_record.update({key.split('.')[0]: []})
        for key, val in dasquery.mongo_query['spec'].items():
            if key.find('.') == -1:
                empty_record[key] = []
            else: # it is compound key, e.g. site.name
                newkey, newval = convert_dot_notation(key, val)
                empty_record[newkey] = adjust_mongo_keyvalue(newval)
        self.merge.insert_one(empty_record)
        # update DAS records (both meta and data ones, by using qhash)
        nval = {'$set': {'das.expire': empty_expire}}
        spec = {'qhash': dasquery.qhash}
        self.col.update_many(spec, nval)
    return status
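# Call sketch (the retry policy shown here is an assumption; only the
# attempt parameter and the 'ok'/'fail' statuses come from the code above):
#
#   status = self.merge_records(dasquery)
#   if status != 'ok':
#       status = self.merge_records(dasquery, attempt=1)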