def test_aggregator(self):
    """Test aggregator function"""
    # 1 row in results
    dasquery = DASQuery(dict(fields=None, spec={'dataset': '/a/b/c'}))
    qhash = dasquery.qhash
    das = {'expire': 10, 'primary_key': 'vk', 'record': 1, 'api': 'api',
           'system': ['foo'], 'services': [], 'condition_keys': ['run'],
           'instance': None}
    row = {'run': 10, 'das': das, '_id': 1, 'das_id': 1}
    rows = (row for i in range(0, 1))
    result = [r for r in aggregator(dasquery, rows, das['expire'])]
    del result[0]['das']['ts'] # we don't need record timestamp
    expect = [{'run': 10, 'das': das, 'cache_id': [1], 'das_id': [1],
               'qhash': qhash}]
    self.assertEqual(result, expect)

    # 2 rows with different values for common key
    rows = []
    row = {'run': 1, 'das': das, '_id': 1, 'das_id': 1}
    rows.append(row)
    row = {'run': 2, 'das': das, '_id': 1, 'das_id': 1}
    rows.append(row)
    res = (r for r in rows)
    result = [r for r in aggregator(dasquery, res, das['expire'])]
    for r in result:
        del r['das']['ts'] # we don't need record timestamp
    expect = [{'run': 1, 'das': das, 'das_id': [1], 'cache_id': [1],
               'qhash': qhash},
              {'run': 2, 'das': das, 'das_id': [1], 'cache_id': [1],
               'qhash': qhash}]
    self.assertEqual(result, expect)

    # 2 rows with common value for common key
    das = {'expire': 10, 'primary_key': 'run.a', 'record': 1, 'api': ['api'],
           'system': ['foo'], 'services': [], 'condition_keys': ['run'],
           'instance': None}
    rows = []
    row = {'run': {'a': 1, 'b': 1}, 'das': das, '_id': 1, 'das_id': [1]}
    rows.append(row)
    row = {'run': {'a': 1, 'b': 2}, 'das': das, '_id': 1, 'das_id': [1]}
    rows.append(row)
    res = (r for r in rows)
    result = [r for r in aggregator(dasquery, res, das['expire'])]
    for r in result:
        del r['das']['ts'] # we don't need record timestamp
    expect = [{'run': [{'a': 1, 'b': 1}, {'a': 1, 'b': 2}], 'das': das,
               'das_id': [1], 'cache_id': [1], 'qhash': qhash}]
    self.assertEqual(result, expect)
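# The expected records above capture what aggregator() is being tested for:
# it walks cache rows sorted by the primary key, folds consecutive rows whose
# primary-key value coincides, and collects their '_id'/'das_id' values into
# 'cache_id'/'das_id' lists while stamping each merged record with the query
# qhash and a 'das' metadata block. The helper below is a minimal standalone
# sketch of that folding step only (hypothetical code, not the DAS
# implementation; it ignores the 'das'/'qhash' bookkeeping).
from itertools import groupby

def fold_neighbors(rows, keyfunc, primary='run'):
    """Fold consecutive rows whose primary-key value, given by keyfunc, is equal."""
    for _, group in groupby(rows, key=keyfunc):
        group = list(group)
        merged = {primary: [row[primary] for row in group],
                  'cache_id': sorted({row['_id'] for row in group}),
                  'das_id': sorted({row['das_id'] for row in group})}
        if len(group) == 1:
            merged[primary] = group[0][primary] # a single row keeps its original value
        yield merged

# usage, mirroring the last test case above (ids here are plain integers)
rows = [{'run': {'a': 1, 'b': 1}, '_id': 1, 'das_id': 1},
        {'run': {'a': 1, 'b': 2}, '_id': 1, 'das_id': 1}]
print(list(fold_neighbors(rows, lambda row: row['run']['a'])))
# [{'run': [{'a': 1, 'b': 1}, {'a': 1, 'b': 2}], 'cache_id': [1], 'das_id': [1]}]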
def test_aggregator_duplicates(self):
    """Test aggregator function"""
    dasquery = DASQuery(dict(fields=None, spec={'dataset': '/a/b/c'}))
    qhash = dasquery.qhash
    das = {'expire': 10, 'primary_key': 'run.a', 'empty_record': 0,
           'system': ['foo'], 'condition_keys': ['run'], 'instance': None}
    rows = []
    row = {'run': {'a': 1, 'b': 1}, 'das': das, '_id': 1, 'das_id': 1}
    rows.append(row)
    row = {'run': {'a': 1, 'b': 1}, 'das': das, '_id': 2, 'das_id': 2}
    rows.append(row)
    res = (r for r in rows)
    result = [r for r in aggregator(dasquery, res, das['expire'])]
    for r in result:
        del r['das']['ts'] # we don't need record timestamp
    expect = [{'run': [{'a': 1, 'b': 1}], 'das': das, 'qhash': qhash,
               'das_id': [1, 2], 'cache_id': [1, 2]}]
    self.assertEqual(result, expect)
def test_aggregator_duplicates(self):
    """Test aggregator function"""
    dasquery = DASQuery(dict(fields=None, spec={'dataset': '/a/b/c'}))
    qhash = dasquery.qhash
    das = {'expire': 10, 'primary_key': 'run.a', 'record': 1, 'api': ['api'],
           'system': ['foo'], 'services': [], 'condition_keys': ['run'],
           'instance': None}
    rows = []
    row = {'run': {'a': 1, 'b': 1}, 'das': das, '_id': 1, 'das_id': [1]}
    rows.append(row)
    row = {'run': {'a': 1, 'b': 1}, 'das': das, '_id': 2, 'das_id': [2]}
    rows.append(row)
    res = (r for r in rows)
    result = [r for r in aggregator(dasquery, res, das['expire'])]
    for r in result:
        del r['das']['ts'] # we don't need record timestamp
    expect = [{'run': [{'a': 1, 'b': 1}, {'a': 1, 'b': 1}], 'das': das,
               'qhash': qhash, 'das_id': [1, 2], 'cache_id': [1, 2]}]
    self.assertEqual(result, expect)
def merge_records(self, dasquery):
    """
    Merge DAS records for provided query. We perform the following steps:
    1. get all queries from das.cache by ordering them by primary key
    2. run aggregator function to merge neighbors
    3. insert records into das.merge
    """
    self.logger.debug(dasquery)
    id_list = []
    expire = 9999999999 # future
    # get all API records for given DAS query
    spec = {'qhash': dasquery.qhash, 'query': {'$exists': True}}
    records = self.col.find(spec)
    for row in records:
        # find smallest expire timestamp to be used by aggregator
        if row['das']['expire'] < expire:
            expire = row['das']['expire']
        if row['_id'] not in id_list:
            id_list.append(row['_id'])
    inserted = 0
    lookup_keys = set()
    fields = dasquery.mongo_query.get('fields')
    if not fields: # Mongo
        fields = []
    for key in fields:
        for pkey in self.mapping.mapkeys(key):
            lookup_keys.add(pkey)
    for pkey in lookup_keys:
        skey = [(pkey, DESCENDING)]
        # lookup all service records
        spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
        if self.verbose:
            nrec = self.col.find(spec).sort(skey).count()
            msg = "merging %s records, for %s key" % (nrec, pkey)
        else:
            msg = "merging records, for %s key" % pkey
        self.logger.debug(msg)
        records = self.col.find(spec).sort(skey)
        # aggregate all records
        agen = aggregator(dasquery, records, expire)
        # diff aggregated records
        gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
        # insert all records into das.merge using bulk insert
        size = self.cache_size
        try:
            while True:
                nres = self.merge.insert(itertools.islice(gen, size),
                                         safe=True)
                if nres and isinstance(nres, list):
                    inserted += len(nres)
                else:
                    break
        except InvalidDocument as exp:
            msg = "Caught bson error: " + str(exp)
            self.logger.info(msg)
            records = self.col.find(spec).sort(skey)
            gen = aggregator(dasquery, records, expire)
            genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
            das_dict = {'das': {'expire': expire, 'empty_record': 0,
                                'primary_key': [k for k in lookup_keys],
                                'system': ['gridfs']},
                        'qhash': dasquery.qhash,
                        'cache_id': [], 'das_id': id_list}
            for row in genrows:
                row.update(das_dict)
                self.merge.insert(row, safe=True)
        except InvalidOperation:
            pass
    if inserted:
        self.logdb.insert('merge', {'insert': inserted})
    elif not lookup_keys: # we get query w/o fields
        pass
    else: # we didn't merge anything, it is a DB look-up failure
        empty_expire = time.time() + 20 # secs, short enough to expire
        empty_record = {'das': {'expire': empty_expire,
                                'primary_key': list(lookup_keys),
                                'empty_record': 1},
                        'cache_id': [], 'das_id': id_list}
        for key, val in dasquery.mongo_query['spec'].iteritems():
            if key.find('.') == -1:
                empty_record[key] = []
            else: # it is a compound key, e.g. site.name
                newkey, newval = convert_dot_notation(key, val)
                empty_record[newkey] = adjust_mongo_keyvalue(newval)
        self.merge.insert(empty_record, safe=True)
        # update DAS records (both meta and data ones, by using qhash)
        nval = {'$set': {'das.expire': empty_expire}}
        spec = {'qhash': dasquery.qhash}
        self.col.update(spec, nval, multi=True, safe=True)
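# The bulk-insert loop above feeds das.merge in chunks of self.cache_size
# documents via itertools.islice until the aggregated generator is exhausted.
# The helper below is a minimal sketch of that chunking pattern in isolation,
# assuming only that the supplied insert callable accepts a list of documents
# (it is not tied to pymongo).
import itertools

def insert_in_chunks(gen, size, insert):
    """Slice `size` documents at a time from `gen` and hand each chunk to `insert`."""
    total = 0
    while True:
        chunk = list(itertools.islice(gen, size))
        if not chunk: # generator exhausted
            break
        insert(chunk)
        total += len(chunk)
    return total

# usage with a plain list as the sink instead of a MongoDB collection
sink = []
docs = ({'doc': i} for i in range(10))
print(insert_in_chunks(docs, 3, sink.extend)) # 10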
def merge_records(self, dasquery, attempt=0):
    """
    Merge DAS records for provided query. We perform the following steps:
    1. get all queries from das.cache by ordering them by primary key
    2. run aggregator function to merge neighbors
    3. insert records into das.merge
    """
    ### TMP for asyncio
    # time.sleep(attempt+3) # pymongo 3.2 doesn't yet flush in time

    # remove any entries in merge collection for this query
    self.merge.delete_many({'qhash': dasquery.qhash})
    # proceed
    self.logger.debug(dasquery)
    id_list = []
    expire = 9999999999 # future
    # get all API records for given DAS query
    spec = {'qhash': dasquery.qhash,
            'das.expire': {'$gt': time.time()},
            'das.record': record_codes('query_record')}
    records = self.col.find(spec, **PYMONGO_OPTS)
    for row in records:
        # find smallest expire timestamp to be used by aggregator
        rexpire = row.get('das', {}).get('expire', expire)
        if rexpire < expire:
            expire = rexpire
        if row['_id'] not in id_list:
            id_list.append(row['_id'])
    inserted = 0
    lookup_keys = set()
    fields = dasquery.mongo_query.get('fields')
    if not fields: # Mongo
        fields = []
    for key in fields:
        for pkey in self.mapping.mapkeys(key):
            lookup_keys.add(pkey)
    for pkey in lookup_keys:
        skey = [(pkey, DESCENDING)]
        # lookup all service records
        spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
        if self.verbose:
            nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
            msg = "merging %s records, for %s key" % (nrec, pkey)
        else:
            msg = "merging records, for %s key" % pkey
        self.logger.debug(msg)
        # use exhaust=False since we process all records in aggregator
        # and there can be a delay in processing
        records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
        # aggregate all records
        agen = aggregator(dasquery, records, expire)
        # diff aggregated records
        gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
        # insert all records into das.merge using bulk insert
        size = self.cache_size
        try:
            res = self.merge.insert_many(gen)
            inserted += len(res.inserted_ids)
        except InvalidDocument as exp:
            print(dastimestamp('DAS WARNING'),
                  'InvalidDocument during merge', str(exp))
            msg = "Caught bson error: " + str(exp)
            self.logger.info(msg)
            records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
            gen = aggregator(dasquery, records, expire)
            genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
            das_dict = {'das': {'expire': expire,
                                'das.record': record_codes('gridfs_record'),
                                'primary_key': [k for k in lookup_keys],
                                'system': ['gridfs']},
                        'qhash': dasquery.qhash,
                        'cache_id': [], 'das_id': id_list}
            for row in genrows:
                row.update(das_dict)
                self.merge.insert(row)
        except InvalidOperation as exp:
            pass
        except DuplicateKeyError as err:
            print(dastimestamp('DAS WARNING'),
                  'DuplicateKeyError during merge')
            if not isinstance(gen, list):
                raise err
    status = 'fail'
    if inserted:
        status = 'ok'
    elif not lookup_keys: # we get query w/o fields
        msg = 'qhash %s, no lookup_keys' % dasquery.qhash
        print(dastimestamp('DAS WARNING'), msg)
        status = 'ok'
    else: # we didn't merge anything, it is a DB look-up failure
        msg = 'qhash %s, did not insert into das.merge, attempt %s' \
                % (dasquery.qhash, attempt)
        print(dastimestamp('DAS WARNING'), msg)
        empty_expire = etstamp()
        lkeys = list(lookup_keys)
        das = dict(expire=empty_expire, primary_key=lkeys[0],
                   condition_keys=lkeys,
                   instance=dasquery.instance,
                   system=['das'], services=dasquery.services,
                   record=record_codes('empty_record'),
                   ts=time.time(), api=[])
        empty_record = {'das': das, 'qhash': dasquery.qhash,
                        'cache_id': [], 'das_id': id_list}
        for key in lkeys:
            empty_record.update({key.split('.')[0]: []})
        for key, val in dasquery.mongo_query['spec'].items():
            if key.find('.') == -1:
                empty_record[key] = []
            else: # it is a compound key, e.g. site.name
                newkey, newval = convert_dot_notation(key, val)
                empty_record[newkey] = adjust_mongo_keyvalue(newval)
        self.merge.insert(empty_record)
        # update DAS records (both meta and data ones, by using qhash)
        nval = {'$set': {'das.expire': empty_expire}}
        spec = {'qhash': dasquery.qhash}
        self.col.update_many(spec, nval)
    return status
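# The empty-record branch above expands compound spec keys such as 'site.name'
# via convert_dot_notation before the record is stored. The helper below is a
# hypothetical stand-in that illustrates the simplest possible semantics of
# that conversion; the real DAS helpers may normalize the value further (e.g.
# through adjust_mongo_keyvalue).
def convert_dot_notation_sketch(key, val):
    """Turn a compound MongoDB spec key like 'site.name' into (top_key, nested_value)."""
    parts = key.split('.')
    newval = val
    for part in reversed(parts[1:]):
        newval = {part: newval} # wrap the value one level per trailing key part
    return parts[0], newval

print(convert_dot_notation_sketch('site.name', 'T1_CH_CERN'))
# ('site', {'name': 'T1_CH_CERN'})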