def diff_collections2(b1, b2, result_dir, step=10000):
    '''b2 is new collection, b1 is old collection'''
    DIFFFILE_PATH = '/home/kevinxin/diff_result/'
    DATA_FOLDER = os.path.join(DIFFFILE_PATH, result_dir)
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields=[])
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields=[])
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0

    for _batch in data_new:
        cnt += 1
        id_list_new = [_doc['_id'] for _doc in _batch]
        docs_common = b1.target_collection.find({'_id': {'$in': id_list_new}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common), fastdiff=True)
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
        print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
        cnt_update += len(_updates)
        cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("=" * 100)

    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        docs_common = b2.target_collection.find({'_id': {'$in': id_list_old}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_old = list(set(id_list_old) - set(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(id_in_old) != 0:
            dump(_result, file_name)
        print("(Deleted: {})".format(len(id_in_old)), end='')
        cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))
def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True, update=False, allow_upsert=True):
    from utils.mongo import doc_feeder

    def rate_control(cnt, t):
        delay = 0
        if t > 90:
            delay = 30
        elif t > 60:
            delay = 10
        if delay:
            print("\tPausing for {}s...".format(delay), end='')
            time.sleep(delay)
            print("done.")

    src_docs = doc_feeder(collection, step=self.step, s=self.s, batch_callback=rate_control, query=query)
    if bulk:
        if update:
            # input doc will update existing one
            # if allow_upsert, create new one if not exist
            res = self.update_docs(src_docs, upsert=allow_upsert)
        else:
            # input doc will overwrite existing one
            res = self.index_bulk(src_docs)
        if len(res[1]) > 0:
            print("Error: {} docs failed indexing.".format(len(res[1])))
            file_name = collection + '_es_error.pyobj'
            dump(res, file_name)
        return res[0]
    else:
        cnt = 0
        for doc in src_docs:
            self.index(doc)
            cnt += 1
            if verbose:
                print(cnt, ':', doc['_id'])
        return cnt
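# A minimal sketch of the batch-throttling idea behind rate_control above:
# doc_feeder calls batch_callback with (docs_processed, seconds_spent_on_last_batch),
# and the indexer pauses after slow batches to let Elasticsearch catch up.
# The factory below is an illustration only; the thresholds and pause lengths
# mirror the values hard-coded in _build_index_sequential but are assumptions,
# not part of the original module.
import time

def make_rate_control(slow=60, very_slow=90, short_pause=10, long_pause=30):
    def rate_control(cnt, t):
        # pick a pause length based on how long the last batch took
        delay = long_pause if t > very_slow else (short_pause if t > slow else 0)
        if delay:
            print("\tbatch took {:.0f}s, pausing for {}s...".format(t, delay), end='')
            time.sleep(delay)
            print("done.")
    return rate_control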
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return
    logging.info("Found {} new source collections need to update:".format(len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))
    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)
            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
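# Usage sketch (an assumption, not part of the original module): run the sync
# non-interactively for the 'genedoc_mygene' config shown above. no_confirm=True
# skips the ask() prompts; use_parallel controls how GeneDocSyncer.get_changes
# computes the per-source changes.
if __name__ == '__main__':
    update_from_temp_collections('genedoc_mygene', no_confirm=True, use_parallel=False)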
def finalize(self):
    '''dump target_dict into a file.'''
    from utils.common import dump
    dump(self.target_dict, self.target_name + '.pyobj')
def diff_collections2(b1, b2, result_dir, use_parallel=True, step=10000):
    '''b2 is new collection, b1 is old collection'''
    if use_parallel:
        import multiprocessing
        from functools import partial
    DATA_FOLDER = result_dir
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields={'_id': 1})
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields={'_id': 1})
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0
    _timestamp = get_timestamp()
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)

    for batch in data_new:
        cnt += 1
        id_list_new = [doc['_id'] for doc in batch]
        ids_common = [doc['_id'] for doc in
                      b1.target_collection.find({'_id': {'$in': id_list_new}}, {'_id': 1})]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            if use_parallel:
                # split the common ids into one chunk per CPU core;
                # max(1, ...) guards against a zero chunk size when there are fewer ids than cores
                step = max(1, int(len(ids_common) / multiprocessing.cpu_count()))
                task_list = [ids_common[i:i + step] for i in range(0, len(ids_common), step)]
                pool = multiprocessing.Pool()
                partial_worker = partial(_diff_parallel_worker,
                                         b1.target_collection.name,
                                         b2.target_collection.name)
                results = pool.map(partial_worker, task_list)
                pool.close()
                pool.join()
                for result in results:
                    _updates += result
            else:
                _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': _timestamp}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
        print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
        cnt_update += len(_updates)
        cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("=" * 100)

    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        ids_common = [doc['_id'] for doc in
                      b2.target_collection.find({'_id': {'$in': id_list_old}}, {'_id': 1})]
        id_in_old = list(set(id_list_old) - set(ids_common))
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': _timestamp}
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        if len(id_in_old) != 0:
            dump(_result, file_name)
        print("(Deleted: {})".format(len(id_in_old)), end='')
        cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))
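# The per-batch diff files written by diff_collections2 are dumped dicts with
# keys 'add', 'update', 'delete', 'source' and 'timestamp'. A minimal sketch of
# reading them back and tallying totals, assuming utils.common provides loadobj
# as the counterpart of the dump() helper used above (summarize_diff_folder
# itself is a hypothetical helper, not part of the original code):
import os
from utils.common import loadobj

def summarize_diff_folder(result_dir):
    totals = {'add': 0, 'update': 0, 'delete': 0}
    for fn in sorted(os.listdir(result_dir)):
        if not fn.endswith('.pyobj'):
            continue
        diff = loadobj(os.path.join(result_dir, fn))
        for k in totals:
            totals[k] += len(diff.get(k, []))
    return totals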
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''Return a dictionary mapping current/retired geneids to current geneids.
       This is useful because other annotations mapped to geneids may contain
       retired gene ids.
       If species_li is None, genes from all species are loaded.
       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([TAXONOMY[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):
        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li    # include all species
    # _includefn restricts rows to the requested taxids and keeps only those
    # whose mapped-to geneid exists in the gene_info list
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0, includefn=_includefn)
    load_done('[%d]' % len(retired2gene))

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''Return a dictionary mapping current/retired geneids to current geneids.
       This is useful because other annotations mapped to geneids may contain
       retired gene ids.
       If species_li is None, genes from all species are loaded.
       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([taxid_d[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):
        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li    # include all species
    # _includefn restricts rows to the requested taxids and keeps only those
    # whose mapped-to geneid exists in the gene_info list
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0, includefn=_includefn)
    load_done('[%d]' % len(retired2gene))

    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)    # convert key/value to int
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
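# Usage sketch for get_geneid_d: remap possibly retired Entrez gene ids to
# their current ids. The species keys ('human', 'mouse') and the example ids
# are assumptions for illustration only; ids absent from the mapping are
# reported as unknown/withdrawn.
geneid_d = get_geneid_d(species_li=['human', 'mouse'])
raw_ids = [1017, 245794, 100418733]
current_ids = []
for gid in raw_ids:
    mapped = geneid_d.get(int(gid))
    if mapped is None:
        print("unknown or withdrawn gene id: {}".format(gid))
    else:
        current_ids.append(mapped)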