def validate(self, build_config='mygene_allspecies', n=10):
    '''Validate merged genedoc, currently for ES backend only.'''
    import random
    import itertools
    import pyes

    self.load_build_config(build_config)
    last_build = self._build_config['build'][-1]
    logging.info("Last build record:")
    logging.info(pformat(last_build))
    #assert last_build['target_backend'] == 'es', '"validate" currently works for "es" backend only'
    target_name = last_build['target']
    self.validate_src_collections()
    self.prepare_target(target_name=target_name)
    logging.info("Validating...")
    target_cnt = self.target.count()
    stats_cnt = last_build['stats']['total_genes']
    if target_cnt == stats_cnt:
        logging.info("OK [total count={}]".format(target_cnt))
    else:
        logging.info("Warning: total count of gene documents does not match [{}, should be {}]".format(target_cnt, stats_cnt))

    if n > 0:
        for src in self._build_config['sources']:
            logging.info("\nSrc: %s" % src)
            # if 'id_type' in self.src_master[src] and self.src_master[src]['id_type'] != 'entrez_gene':
            #     print "skipped."
            #     continue
            cnt = self.src[src].count()
            fdr1 = doc_feeder(self.src[src], step=10000, s=cnt - n)
            rand_s = random.randint(0, cnt - n)
            fdr2 = doc_feeder(self.src[src], step=n, s=rand_s, e=rand_s + n)
            _first_exception = True
            for doc in itertools.chain(fdr1, fdr2):
                _id = doc['_id']
                try:
                    es_doc = self.target.get_from_id(_id)
                except pyes.exceptions.NotFoundException:
                    if _first_exception:
                        logging.info("")  # was logging.info() with no args, which raises a TypeError
                        _first_exception = False
                    logging.info("%s not found." % _id)
                    continue
                for k in doc:
                    if src == 'entrez_homologene' and k == 'taxid':
                        # there is an occasional known error for taxid in homologene data
                        continue
                    assert es_doc.get(k, None) == doc[k], (_id, k, es_doc.get(k, None), doc[k])

def diff_collections2(b1, b2, result_dir, step=10000):
    '''b2 is the new collection, b1 is the old collection'''
    DIFFFILE_PATH = '/home/kevinxin/diff_result/'
    DATA_FOLDER = os.path.join(DIFFFILE_PATH, result_dir)
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields=[])
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields=[])
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0
    for _batch in data_new:
        cnt += 1
        id_list_new = [_doc['_id'] for _doc in _batch]
        docs_common = b1.target_collection.find({'_id': {'$in': id_list_new}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common), fastdiff=True)
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
        print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
        cnt_update += len(_updates)
        cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("=" * 100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        docs_common = b2.target_collection.find({'_id': {'$in': id_list_old}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_old = list(set(id_list_old) - set(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(id_in_old) != 0:
            dump(_result, file_name)
        print("(Deleted: {})".format(len(id_in_old)), end='')
        cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))

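# Hedged usage sketch (not part of the original code): reading back the
# per-batch diff payloads that diff_collections2() dumps as "<n>.pyobj" files.
# Assumes loadobj() is the counterpart of the dump() helper used above (both
# live in biothings.utils.common); the directory layout matches what
# diff_collections2() writes.
def load_diff_payloads(result_dir):
    import os
    from biothings.utils.common import loadobj
    for fn in sorted(os.listdir(result_dir)):
        if not fn.endswith('.pyobj'):
            continue
        payload = loadobj(os.path.join(result_dir, fn))
        # each payload carries: 'add' (new _ids), 'update' (per-doc patches),
        # 'delete' (removed _ids), 'source' (collection name), 'timestamp'
        yield payload
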
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']
    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))
    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name, sync_src.name))
    print('\tsync_target:\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME, sync_target.name))
    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))

def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True,
                            update=False, allow_upsert=True):

    def rate_control(cnt, t):
        # back off when a batch starts taking too long
        delay = 0
        if t > 90:
            delay = 30
        elif t > 60:
            delay = 10
        if delay:
            time.sleep(delay)

    from biothings.utils.mongo import doc_feeder
    src_docs = doc_feeder(collection, step=self.step, s=self.s,
                          batch_callback=rate_control, query=query)
    if bulk:
        if update:
            # input doc will update the existing one;
            # if allow_upsert, create a new one if it does not exist
            res = self.update_docs(src_docs, upsert=allow_upsert)
        else:
            # input doc will overwrite the existing one
            res = self.index_bulk(src_docs)
        # if len(res[1]) > 0:
        if res[1]:
            raise IndexerException("Error: {} docs failed indexing.".format(len(res[1])))
        return res[0]
    else:
        cnt = 0
        for doc in src_docs:
            self.index(doc)
            cnt += 1
        return cnt

def merge_index_worker(col_name, ids, pindexer, batch_num):
    col = create_backend(col_name).target_collection
    idxer = pindexer()
    upd_cnt = 0
    new_cnt = 0
    cur = doc_feeder(col, step=len(ids), inbatch=False,
                     query={'_id': {'$in': ids}})
    docs = [d for d in cur]
    [d.pop("_timestamp", None) for d in docs]
    dids = dict([(d["_id"], d) for d in docs])
    dexistings = dict([(d["_id"], d) for d in idxer.get_docs([k for k in dids.keys()])])
    for _id in dexistings:
        d = dexistings[_id]
        # update in-place
        d.update(dids[_id])
        # mark as processed/updated
        dids.pop(_id)
    # updated docs (those existing in col *and* index)
    upd_cnt = idxer.index_bulk(dexistings.values(), len(dexistings))
    logging.debug("%s documents updated in index" % repr(upd_cnt))
    # new docs (only in col, *not* in index)
    new_cnt = idxer.index_bulk(dids.values(), len(dids))
    logging.debug("%s new documents in index" % repr(new_cnt))
    # return a single (count, list-of-errors) tuple combining both bulk calls
    ret = (upd_cnt[0] + new_cnt[0], upd_cnt[1] + new_cnt[1])
    return ret

def merger_worker(col_name, dest_name, ids, mapper, upsert, batch_num):
    try:
        src = mongo.get_src_db()
        tgt = mongo.get_target_db()
        col = src[col_name]
        #if batch_num == 2:
        #    raise ValueError("uh oh, not good")
        dest = DocMongoBackend(tgt, tgt[dest_name])
        cur = doc_feeder(col, step=len(ids), inbatch=False,
                         query={'_id': {'$in': ids}})
        mapper.load()
        docs = mapper.process(cur)
        cnt = dest.update(docs, upsert=upsert)
        return cnt
    except Exception as e:
        logger_name = "build_%s_%s_batch_%s" % (dest_name, col_name, batch_num)
        logger = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception(e)
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump(e, open(exc_fn, "wb"))
        logger.info("Exception was dumped in pickle file '%s'" % exc_fn)
        raise

def mark_timestamp(timestamp):
    #.update({'_id': {'$in': xli1}}, {'$set': {'_timestamp': ts}}, multi=True)
    target = get_target_db()
    #genedoc_col = target.genedoc_mygene_allspecies_current
    genedoc_col = target.genedoc_mygene_xxxxx
    for doc in doc_feeder(genedoc_col):
        genedoc_col.update({'_id': doc['_id']},
                           {'$set': {'_timestamp': timestamp}},
                           manipulate=False,
                           check_keys=False,
                           upsert=False,
                           w=0)

def new_index_worker(col_name, ids, pindexer, batch_num):
    col = create_backend(col_name).target_collection
    idxer = pindexer()
    cur = doc_feeder(col, step=len(ids), inbatch=False,
                     query={'_id': {'$in': ids}})
    cnt = idxer.index_bulk(cur)
    return cnt

def validate_src(self, collection, return_false=False, return_none=False,
                 return_true=False, verbose=False, flag_invalid=False,
                 generator=False):
    '''Validate hgvs ids from a src collection.'''
    return_dict = {
        False: return_false,
        True: return_true,
        None: return_none
    }
    # read in the collection from mongodb
    if is_str(collection):
        src = get_src_db()
        _coll = src[collection]
    else:
        _coll = collection
    cursor = doc_feeder(_coll, step=10000)
    out = {}
    print_only = not (return_false or return_none or return_true)
    if not print_only:
        # output dictionary, three keys: 'false', 'true', 'none'
        for k in return_dict:
            if return_dict[k]:
                out[k] = []
    # initialize the counts
    cnt_d = {True: 0, False: 0, None: 0}
    # validate each item in the cursor
    for item in cursor:
        _id = item['_id']
        valid = self.validate_hgvs(_id, verbose=verbose)
        if valid is False and flag_invalid:
            # use _coll here: "collection" may have been passed as a string name
            _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
        cnt_d[valid] += 1
        if return_dict[valid]:
            out[valid].append(_id)
    # print out counts
    print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
    print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
    print("# of HGVS IDs skipped:\t{0}".format(cnt_d[None]))
    out['summary'] = cnt_d
    return out

def index(self):
    clients = self._get_clients()
    docs = doc_feeder(clients.mongo, step=len(self.ids), inbatch=False,
                      query={'_id': {'$in': self.ids}})
    self.logger.info("%s: %d documents.", self.name, len(self.ids))
    return clients.es.mindex(docs)

def _merge_parallel_ipython(self, collection, geneid_set, step=100000, idmapping_d=None):
    from IPython.parallel import Client, require
    rc = Client()
    dview = rc[:]
    #dview = rc.load_balanced_view()
    dview.block = False
    target_collection = self.target.target_collection
    dview['server'], dview['port'] = target_collection.database.client.address
    dview['database'] = target_collection.database.name
    dview['collection_name'] = target_collection.name

    def partition(lst, n):
        q, r = divmod(len(lst), n)
        indices = [q * i + min(i, r) for i in range(n + 1)]
        return [lst[indices[i]:indices[i + 1]] for i in range(n)]

    @require('pymongo', 'time')
    def worker(doc_li):
        conn = pymongo.MongoClient(server, port)
        target_collection = conn[database][collection_name]
        t0 = time.time()
        for doc in doc_li:
            __id = doc.pop('_id')
            doc.pop('taxid', None)
            target_collection.update({'_id': __id}, {'$set': doc},
                                     manipulate=False,
                                     upsert=False)  # ,safe=True)
        logging.info('Done. [%.1fs]' % (time.time() - t0))

    for doc in doc_feeder(self.src[collection], step=step):
        _id = doc['_id']
        if idmapping_d:
            _id = idmapping_d.get(_id, None) or _id
        # there could be cases where idmapping returns multiple entrez_gene ids
        for __id in alwayslist(_id):
            __id = str(__id)
            if __id in geneid_set:
                doc['_id'] = __id
                self.doc_queue.append(doc)
                if len(self.doc_queue) >= step:
                    #dview.scatter('doc_li', self.doc_queue)
                    #dview.apply_async(worker)
                    dview.map_async(worker, partition(self.doc_queue, len(rc.ids)))
                    self.doc_queue = []
                    logging.info("!")

def do_index_worker(col_name, ids, pindexer, batch_num):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    idxer = pindexer()
    cur = doc_feeder(col, step=len(ids), inbatch=False,
                     query={'_id': {'$in': ids}})
    cnt = idxer.index_bulk(cur)
    return cnt

def inspect_data(backend_provider, ids, mode, pre_mapping, **kwargs):
    col = create_backend(backend_provider).target_collection
    cur = doc_feeder(col, step=len(ids), inbatch=False,
                     query={'_id': {'$in': ids}})
    return btinspect.inspect_docs(cur, mode=mode, pre_mapping=pre_mapping,
                                  metadata=False, **kwargs)

def _merge_sequential(self, collection, geneid_set, step=100000, idmapping_d=None):
    for doc in doc_feeder(self.src[collection], step=step):
        _id = doc['_id']
        if idmapping_d:
            _id = idmapping_d.get(_id, None) or _id
        # there could be cases where idmapping returns multiple entrez_gene ids
        for __id in alwayslist(_id):
            __id = str(__id)
            if __id in geneid_set:
                doc.pop('_id', None)
                doc.pop('taxid', None)
                # target_collection.update({'_id': __id}, {'$set': doc},
                #                          manipulate=False,
                #                          upsert=False)  # ,safe=True)
                self.target.update(__id, doc)

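# Hedged illustration (values below are hypothetical, not from the original
# code): the idmapping_d argument maps a source document _id to one or several
# root entrez gene ids, which is why the merge code loops over alwayslist(_id).
example_idmapping_d = {
    'ENSG00000139618': '675',           # one-to-one ensembl -> entrez mapping
    'ENSG00000000001': ['101', '102'],  # one-to-many mapping, handled by alwayslist()
}
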
def backup_timestamp(self, outfile=None, compress=True):
    '''Back up the "_id" and "_timestamp" fields into an output file.'''
    ts = time.strftime('%Y%m%d')
    outfile = outfile or self._target_col.name + '_tsbk_' + ts + '.txt'
    if compress:
        outfile += '.bz'
        import bz2
    logging.info('Backing up timestamps into "{}"...'.format(outfile))
    t0 = time.time()
    file_handler = bz2.BZ2File if compress else open
    with file_handler(outfile, 'wb') as out_f:
        for doc in doc_feeder(self._target_col, step=100000, fields=['_timestamp']):
            data = '%s\t%s\n' % (doc['_id'], doc['_timestamp'].strftime('%Y%m%d'))
            out_f.write(data.encode())
    logging.info("Done. %s" % timesofar(t0))
    return outfile

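# Hedged counterpart sketch (not part of the original code): reading a backup
# file written by backup_timestamp() back into (_id, datetime) pairs, following
# the "<_id>\t<YYYYMMDD>\n" line format produced above.
import bz2
from datetime import datetime

def read_timestamp_backup(path):
    opener = bz2.BZ2File if path.endswith('.bz') else open
    with opener(path, 'rb') as in_f:
        for line in in_f:
            _id, ts = line.decode().rstrip('\n').split('\t')
            yield _id, datetime.strptime(ts, '%Y%m%d')
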
def _merge_parallel(self, collection, geneid_set, step=100000, idmapping_d=None):
    from multiprocessing import Process, Queue
    NUMBER_OF_PROCESSES = 8

    input_queue = Queue()
    input_queue.conn_pool = []

    def worker(q, target):
        while True:
            doc = q.get()
            if doc == 'STOP':
                break
            __id = doc.pop('_id')
            doc.pop('taxid', None)
            target.update(__id, doc)
            # target_collection.update({'_id': __id}, {'$set': doc},
            #                          manipulate=False,
            #                          upsert=False)  # ,safe=True)

    # Start worker processes
    for i in range(NUMBER_OF_PROCESSES):
        Process(target=worker, args=(input_queue, self.target)).start()

    for doc in doc_feeder(self.src[collection], step=step):
        _id = doc['_id']
        if idmapping_d:
            _id = idmapping_d.get(_id, None) or _id
        # there could be cases where idmapping returns multiple entrez_gene ids
        for __id in alwayslist(_id):
            __id = str(__id)
            if __id in geneid_set:
                doc['_id'] = __id
                input_queue.put(doc)

    # Tell child processes to stop
    for i in range(NUMBER_OF_PROCESSES):
        input_queue.put('STOP')

def post_merge(self, source_names, batch_size, job_manager):
    # get the lineage mapper
    mapper = LineageMapper(name="lineage")
    # load cache (it's being loaded automatically as it's not part of an upload process)
    mapper.load()
    # create a storage to save docs back to the merged collection
    db = get_target_db()
    col_name = self.target_backend.target_collection.name
    storage = UpsertStorage(db, col_name)
    for docs in doc_feeder(self.target_backend.target_collection, step=batch_size, inbatch=True):
        docs = mapper.process(docs)
        storage.process(docs, batch_size)
    # add indices used to create metadata stats
    keys = ["rank", "taxid"]
    self.logger.info("Creating indices on %s" % repr(keys))
    for k in keys:
        self.target_backend.target_collection.ensure_index(k)

def merge(self):
    clients = self._get_clients()
    upd_cnt, docs_old = 0, {}
    new_cnt, docs_new = 0, {}

    # populate docs_old
    for doc in clients.es.mget(self.ids):
        docs_old[doc['_id']] = doc

    # populate docs_new
    for doc in doc_feeder(clients.mongo, step=len(self.ids), inbatch=False,
                          query={'_id': {'$in': self.ids}}):
        docs_new[doc['_id']] = doc
        doc.pop("_timestamp", None)

    # merge existing ids
    for key in list(docs_new):
        if key in docs_old:
            docs_old[key].update(docs_new[key])
            del docs_new[key]

    # updated docs (those existing in col *and* index)
    upd_cnt = clients.es.mindex(docs_old.values())
    self.logger.info("%s: %d documents updated.", self.name, upd_cnt)

    # new docs (only in col, *not* in index)
    new_cnt = clients.es.mindex(docs_new.values())
    self.logger.info("%s: %d new documents.", self.name, new_cnt)
    return upd_cnt + new_cnt

def sync_es_coldhot_jsondiff_worker(diff_file, es_config, new_db_col_names, batch_size, cnt,
                                    force=False, selfcontained=False, metadata={}, debug=False):
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    eskwargs = {}
    # pass optional ES Indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)" % (es_config, eskwargs))
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)
    # add: the diff between hot collections showed we have new documents, but it's
    # possible some of those docs already exist in the premerge/cold collection.
    # If so, they should be treated as a dict.update() where the hot document content
    # has precedence over the cold content for fields in common.
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff["source"], \
            "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        # remove potential existing _timestamp from documents
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        # check which docs already exist in the index (meaning they exist in the cold collection)
        dids = dict([(d["_id"], d) for d in docs])
        dexistings = dict([(d["_id"], d) for d in indexer.get_docs([k for k in dids.keys()])])
        logging.debug("From current batch, %d already exist" % len(dexistings))
        # remove existing docs from "add" so the rest of the dict will be treated
        # as "real" added documents, while updating existing ones with new content
        toremove = []
        for _id, d in dexistings.items():
            # update in-place
            if d == dids[d["_id"]]:
                logging.debug("%s was already added, skip it" % d["_id"])
                toremove.append(d["_id"])
                res["skipped"] += 1
            else:
                newd = copy.deepcopy(d)
                d.update(dids[d["_id"]])
                if d == newd:
                    logging.debug("%s was already updated, skip it" % d["_id"])
                    toremove.append(d["_id"])
                    res["skipped"] += 1
            dids.pop(d["_id"])
        for _id in toremove:
            dexistings.pop(_id)
        logging.info("Syncing 'add' documents (%s in total) from cold/hot merge: " % len(docs) +
                     "%d documents will be updated as they already exist in the index, " % len(dexistings) +
                     "%d documents will be added (%d skipped as already processed)" % (len(dids), len(toremove)))
        # treat real "added" documents
        # Note: no need to check for "already exists" errors, as we already checked that before
        # in order to know what to do
        try:
            res["added"] += indexer.index_bulk(dids.values(), batch_size, action="create")[0]
        except BulkIndexError:
            logging.error("Error while adding documents %s" % [k for k in dids.keys()])
        # update docs already existing in the cold collection
        try:
            res["updated"] += indexer.index_bulk(dexistings.values(), batch_size)[0]
        except BulkIndexError as e:
            logging.error("Error while updating (via new hot detected docs) documents: %s" % e)
    # update: get doc from indexer and apply diff
    # note: it's the same process as for non-coldhot
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res, debug)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res

def sync_es_jsondiff_worker(diff_file, es_config, new_db_col_names, batch_size, cnt,
                            force=False, selfcontained=False, metadata={}, debug=False):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    eskwargs = {}
    # pass optional ES Indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)" % (es_config, eskwargs))
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)
    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff["source"], \
            "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        # remove potential existing _timestamp from documents
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        try:
            res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0]
        except BulkIndexError:
            for doc in docs:
                _id = doc.pop("_id")
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, _id, action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    logging.warning("_id '%s' already added" % _id)
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": _id, "file": diff_file, "error": e})
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise
        except Exception as e:
            if debug:
                logging.error("From diff file '%s', following IDs couldn't be synced because: %s\n%s"
                              % (diff_file, e, [d.get("_id") for d in docs]))
                pickfile = "batch_%s_%s.pickle" % (cnt, os.path.basename(diff_file))
                logging.error("Documents pickled in '%s'" % pickfile)
                pickle.dump(docs, open(pickfile, "wb"))
            raise
    # update: get doc from indexer and apply diff
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res, debug)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res

def sync_mongo_jsondiff_worker(diff_file, old_db_col_names, new_db_col_names, batch_size, cnt,
                               force=False, selfcontained=False, metadata={}, debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name, logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
        for docs in iter_n(cur, batch_size):
            # use a generator, otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)
    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)
    # we potentially modified the "old" collection, so invalidate the cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res

def get_all(self, batch_size=100000):
    # renamed the loop variable from "doc_ids": doc_feeder(inbatch=True)
    # yields batches of full documents, not ids
    for docs in doc_feeder(self.col, step=batch_size, inbatch=True):
        for d in docs:
            yield d

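# Summary sketch (an assumption drawn from the calls in this file, not an
# official API reference): the two doc_feeder() modes used throughout.
from biothings.utils.mongo import doc_feeder

def doc_feeder_modes(collection, ids):
    # inbatch=True yields lists of up to `step` documents at a time
    for batch in doc_feeder(collection, step=10000, inbatch=True):
        first_batch_size = len(batch)
        break
    # inbatch=False yields individual documents, optionally restricted by a query
    for doc in doc_feeder(collection, step=len(ids), inbatch=False,
                          query={'_id': {'$in': ids}}):
        pass
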
def make_genedoc_root(self):
    if not self._entrez_geneid_d:
        self._load_entrez_geneid_d()

    if 'ensembl_gene' in self._build_config['gene_root']:
        self._load_ensembl2entrez_li()
        ensembl2entrez = self._idmapping_d_cache['ensembl_gene']

    if "species" in self._build_config:
        _query = {'taxid': {'$in': self._build_config['species']}}
    elif "species_to_exclude" in self._build_config:
        _query = {'taxid': {'$nin': self._build_config['species_to_exclude']}}
    else:
        _query = None

    geneid_set = []
    species_set = set()
    if "entrez_gene" in self._build_config['gene_root']:
        for doc_li in doc_feeder(self.src['entrez_gene'], inbatch=True,
                                 step=self.step, query=_query):
            #target_collection.insert(doc_li, manipulate=False, check_keys=False)
            self.target.insert(doc_li)
            geneid_set.extend([doc['_id'] for doc in doc_li])
            species_set |= set([doc['taxid'] for doc in doc_li])
        cnt_total_entrez_genes = len(geneid_set)
        cnt_total_species = len(species_set)
        logging.info('# of entrez Gene IDs in total: %d' % cnt_total_entrez_genes)
        logging.info('# of species in total: %d' % cnt_total_species)

    if "ensembl_gene" in self._build_config['gene_root']:
        cnt_ensembl_only_genes = 0
        cnt_total_ensembl_genes = 0
        for doc_li in doc_feeder(self.src['ensembl_gene'], inbatch=True,
                                 step=self.step, query=_query):
            _doc_li = []
            for _doc in doc_li:
                cnt_total_ensembl_genes += 1
                ensembl_id = _doc['_id']
                entrez_gene = ensembl2entrez.get(ensembl_id, None)
                if entrez_gene is None:
                    # this is an Ensembl-only gene
                    _doc_li.append(_doc)
                    cnt_ensembl_only_genes += 1
                    geneid_set.append(_doc['_id'])
            if _doc_li:
                #target_collection.insert(_doc_li, manipulate=False, check_keys=False)
                self.target.insert(_doc_li)
        cnt_matching_ensembl_genes = cnt_total_ensembl_genes - cnt_ensembl_only_genes
        logging.info('# of ensembl Gene IDs in total: %d' % cnt_total_ensembl_genes)
        logging.info('# of ensembl Gene IDs match entrez Gene IDs: %d' % cnt_matching_ensembl_genes)
        logging.info('# of ensembl Gene IDs DO NOT match entrez Gene IDs: %d' % cnt_ensembl_only_genes)

    geneid_set = set(geneid_set)
    logging.info('# of total Root Gene IDs: %d' % len(geneid_set))
    _stats = {'total_entrez_genes': cnt_total_entrez_genes,
              'total_species': cnt_total_species,
              'total_ensembl_genes': cnt_total_ensembl_genes,
              'total_ensembl_genes_mapped_to_entrez': cnt_matching_ensembl_genes,
              'total_ensembl_only_genes': cnt_ensembl_only_genes,
              'total_genes': len(geneid_set)}
    self._stats = _stats
    self._src_version = self.get_src_version()
    self.log_src_build({'stats': _stats, 'src_version': self._src_version})
    return geneid_set

def diff_collections2(b1, b2, result_dir, use_parallel=True, step=10000):
    '''b2 is the new collection, b1 is the old collection'''
    if use_parallel:
        import multiprocessing
        from functools import partial
    DATA_FOLDER = result_dir
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields={'_id': 1})
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields={'_id': 1})
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0
    _timestamp = get_timestamp()
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    for batch in data_new:
        cnt += 1
        id_list_new = [doc['_id'] for doc in batch]
        ids_common = [doc['_id'] for doc in
                      b1.target_collection.find({'_id': {'$in': id_list_new}}, {'_id': 1})]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            if use_parallel:
                # split the common ids into one chunk per CPU; max(1, ...) guards
                # against a zero chunk size (and a range() error) on small batches
                chunk_size = max(1, int(len(ids_common) / multiprocessing.cpu_count()))
                task_list = [ids_common[i:i + chunk_size]
                             for i in range(0, len(ids_common), chunk_size)]
                pool = multiprocessing.Pool()
                partial_worker = partial(_diff_parallel_worker,
                                         b1.target_collection.name,
                                         b2.target_collection.name)
                results = pool.map(partial_worker, task_list)
                pool.close()
                pool.join()
                for result in results:
                    _updates += result
            else:
                _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': _timestamp}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
        print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
        cnt_update += len(_updates)
        cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("=" * 100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        ids_common = [doc['_id'] for doc in
                      b2.target_collection.find({'_id': {'$in': id_list_old}}, {'_id': 1})]
        id_in_old = list(set(id_list_old) - set(ids_common))
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': _timestamp}
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        if len(id_in_old) != 0:
            dump(_result, file_name)
        print("(Deleted: {})".format(len(id_in_old)), end='')
        cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))

def sync_es_jsondiff_worker(diff_file, es_config, new_db_col_names, batch_size, cnt,
                            force=False, selfcontained=False, metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    if not force and diff.get("synced", {}).get("es") is True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": doc["_id"], "file": diff_file, "error": e})
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise
    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as what's returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res

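# Minimal sketch (assumption, not from the original code) of the jsonpatch
# round-trip the sync workers above rely on: a stored patch transforms the old
# document into the new one, and applying it to an already-updated document
# yields the same document again, which the workers detect via newdoc == doc.
import jsonpatch

old_doc = {'_id': '1017', 'symbol': 'CDK2'}
new_doc = {'_id': '1017', 'symbol': 'CDK2', 'taxid': 9606}
patch = jsonpatch.make_patch(old_doc, new_doc)
assert jsonpatch.apply_patch(old_doc, patch) == new_doc
assert jsonpatch.apply_patch(new_doc, patch) == new_doc  # re-apply is a no-op here
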