def create_backend(db_col_names, name_only=False, **kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (by default, will look up a mongo collection in the target database)
    - or a tuple ("target|src", "col_name")
    - or a ("mongodb://*****:*****@host", "db", "col_name") URI
    - or a ("es_host:port", "index_name", "doc_type") tuple
    If name_only is true, just return the name uniquely identifying the
    collection or index URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if isinstance(db_col_names, str):
        db = mongo.get_target_db()
        col = db[db_col_names]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(db_col_names) == 3, "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1], doc_type=db_col_names[2],
                         es_host=db_col_names[0], **kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(db_col_names) == 2, "Missing connection information for %s" % repr(db_col_names)
        db = db_col_names[0] == "target" and mongo.get_target_db() or mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0:host, 1:port)
        db_col_names = ["%s:%s" % (db.client.address[0], db.client.address[1]), db.name, col.name]
    assert col is not None, "Could not create collection object from %s" % repr(db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(":", "_"),
                                       db_col_names[1], db_col_names[2])
        else:
            return "es_%s_%s_%s" % (db_col_names[0].replace(":", "_"),
                                    db_col_names[1], db_col_names[2])
    else:
        if is_mongo:
            return DocMongoBackend(db, col)
        else:
            return DocESBackend(db)
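# A minimal usage sketch for create_backend, assuming a reachable mongo target
# db and ES host; every collection/index name below is a hypothetical placeholder.
def _example_create_backend():
    # string -> collection looked up in the target database
    tgt_backend = create_backend("mygene_20200101_abcdef")
    # ("target"|"src", collection name) tuple
    src_backend = create_backend(("src", "clinvar_hg38"))
    # full mongo URI form
    uri_backend = create_backend(("mongodb://user:pw@host", "mydb", "mycol"))
    # ES form: host:port, index name, doc_type
    es_backend = create_backend(("localhost:9200", "myindex", "gene"))
    # name_only returns a unique identifier string instead of a backend object
    name = create_backend("mygene_20200101_abcdef", name_only=True)
    return tgt_backend, src_backend, uri_backend, es_backend, name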
def do(srcs, tgt):
    pinfo = {
        "category": "cache",
        "source": None,
        "step": "rebuild",
        "description": ""
    }
    config.logger.info("Rebuild cache for sources: %s, target: %s" % (srcs, tgt))
    for src in srcs:
        # src can be a full name (eg. clinvar.clinvar_hg38) but id_feeder only knows
        # the collection name (clinvar_hg38)
        if "." in src:
            src = src.split(".")[1]
        config.logger.info("Rebuilding cache for source '%s'" % src)
        col = mongo.get_src_db()[src]
        pinfo["source"] = src
        job = yield from job_manager.defer_to_thread(pinfo, partial(rebuild, col))
        yield from job
        config.logger.info("Done rebuilding cache for source '%s'" % src)
    if tgt:
        config.logger.info("Rebuilding cache for target '%s'" % tgt)
        col = mongo.get_target_db()[tgt]
        pinfo["source"] = tgt
        job = yield from job_manager.defer_to_thread(pinfo, partial(rebuild, col))
        yield from job
def post_merge(self, source_names, batch_size, job_manager):
    tgt = mongo.get_target_db()[self.target_name]
    # background=True, or it'll lock the whole database...
    self.logger.info("Indexing 'taxid'")
    tgt.create_index("taxid", background=True)
    self.logger.info("Indexing 'entrezgene'")
    tgt.create_index("entrezgene", background=True)
def test():
    target = get_target_db()
    sync_src = backend.GeneDocMongoDBBackend(
        target['genedoc_mygene_allspecies_20130402_uiu7bkyi'])
    idxer = ESIndexer()
    sync_target = backend.GeneDocESBackend(idxer)
    return sync_src, sync_target
def chrom_worker(col_name, ids):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    bob = col.initialize_unordered_bulk_op()
    disagreed = []
    missing = []
    root_keys = {}
    at_least_one = False
    for doc in cur:
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] is False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom": chrom}})
            at_least_one = True
        # count root keys for later metadata; other root keys are actual sources
        # and are counted under the "src" key during merge_stats
        for k in doc:
            if k in ["_id", "vcf", "total", "hg19", "hg38", "observed"]:
                root_keys.setdefault(k, 0)
                root_keys[k] += 1
    if at_least_one:
        bob.execute()
    return {"missing": missing, "disagreed": disagreed, "root_keys": root_keys}
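# Note on the get_chrom contract, inferred from its usage above (hypothetical
# illustration, not the actual implementation): it returns the chromosome
# reported by the document's sources and whether those sources all agree, e.g.
#   get_chrom(doc) -> {"chrom": "7", "agreed": True}    # all sources agree
#   get_chrom(doc) -> {"chrom": "7", "agreed": False}   # sources disagree, one picked
#   get_chrom(doc) -> {"chrom": None, "agreed": False}  # no chrom info found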
def pick_target_collection(self, autoselect=True):
    '''Print out a list of available target collections and let the user pick one.'''
    target_db = get_target_db()
    target_collection_prefix = 'genedoc_' + self._build_config['name']
    target_collection_list = [
        target_db[name] for name in sorted(target_db.collection_names())
        if name.startswith(target_collection_prefix)
    ]
    if target_collection_list:
        logging.info("Found {} target collections:".format(len(target_collection_list)))
        logging.info('\n'.join([
            '\t{0:<5}{1.name:<45}\t{2}'.format(str(i + 1) + ':', target, target.count())
            for (i, target) in enumerate(target_collection_list)
        ]))
        logging.info('')
        while 1:
            if autoselect:
                selected_idx = input("Pick one above [{}]:".format(len(target_collection_list)))
            else:
                selected_idx = input("Pick one above:")
            if autoselect:
                selected_idx = selected_idx or len(target_collection_list)
            try:
                selected_idx = int(selected_idx)
                break
            except ValueError:
                continue
        return target_collection_list[selected_idx - 1]
    else:
        logging.info("Found no target collections.")
def chrom_worker(col_name, ids):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    bob = col.initialize_unordered_bulk_op()
    disagreed = []
    missing = []
    root_keys = {}
    at_least_one = False
    for doc in cur:
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] is False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom": chrom}})
            at_least_one = True
        # count root keys for later metadata
        for k in doc:
            root_keys.setdefault(k, 0)
            root_keys[k] += 1
    if at_least_one:
        bob.execute()
    return {"missing": missing, "disagreed": disagreed, "root_keys": root_keys}
def merger_worker(col_name, dest_name, ids, mapper, upsert, batch_num):
    try:
        src = mongo.get_src_db()
        tgt = mongo.get_target_db()
        col = src[col_name]
        dest = DocMongoBackend(tgt, tgt[dest_name])
        cur = doc_feeder(col, step=len(ids), inbatch=False,
                         query={'_id': {'$in': ids}})
        mapper.load()
        docs = mapper.process(cur)
        cnt = dest.update(docs, upsert=upsert)
        return cnt
    except Exception as e:
        logger_name = "build_%s_%s_batch_%s" % (dest_name, col_name, batch_num)
        logger = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception(e)
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump(e, open(exc_fn, "wb"))
        logger.info("Exception was dumped in pickle file '%s'" % exc_fn)
        raise
def get_stats(self, sources, job_manager):
    self.stats = super(MyGeneDataBuilder, self).get_stats(sources, job_manager)
    # enrich with some mygene-specific counts, especially regarding ensembl vs. entrez
    tgt = mongo.get_target_db()[self.target_name]
    self.stats["total_genes"] = tgt.count()
    # entrez genes are digits only (also, don't count the entrez_gene collection:
    # because tgt can be a subset, we have to work with the merged collection)
    self.logger.debug("Counting 'total_entrez_genes'")
    entrez_cnt = tgt.find({"entrezgene": {"$exists": 1}}, {"_id": 1}).count()
    self.stats["total_entrez_genes"] = entrez_cnt
    # ensembl gene counts are taken from:
    # 1. the "ensembl" field, which can be a list => use aggregation.
    #    Note: "ensembl.0" means the first element of the list, so it implicitly
    #    selects docs with a list. Filtering with {$type:"array"} doesn't work because
    #    mongo applies it to the innermost field (that's weird, but it is what it is...)
    # 2. root docs coming from the ensembl_gene collection without an "ensembl" key ("orphans")
    # Note: we can't create a sparse or conditional index to help query "ensembl"
    # because the data is too long for an index key, and "hashed" mode doesn't work
    # because lists aren't supported. Queries will use a collection scan...
    self.logger.debug("Counting 'total_ensembl_genes'")
    res = tgt.aggregate([
        {"$match": {"ensembl.0": {"$exists": True}}},
        {"$project": {"num_gene": {"$size": "$ensembl"}}},
        {"$group": {"_id": None, "sum": {"$sum": "$num_gene"}}}
    ])
    try:
        list_count = next(res)["sum"]
    except StopIteration:
        list_count = 0
    object_count = tgt.find({"ensembl": {"$type": "object"}}, {"_id": 1}).count()
    orphan_count = tgt.find({"_id": {"$regex": '''\\w'''}, "ensembl": {"$exists": 0}}, {"_id": 1}).count()
    total_ensembl_genes = list_count + object_count + orphan_count
    self.stats["total_ensembl_genes"] = total_ensembl_genes
    # this one can't be computed from the merged collection, and is only valid when the
    # build involves all data (no filter, no subset)
    self.logger.debug("Counting 'total_ensembl_genes_mapped_to_entrez'")
    # similar to total_ensembl_genes, except we cross with entrezgene (ie. they're mapped)
    try:
        list_count = next(tgt.aggregate([
            {"$match": {"$and": [{"ensembl.0": {"$exists": True}}, {"entrezgene": {"$exists": 1}}]}},
            {"$project": {"num_gene": {"$size": "$ensembl"}}},
            {"$group": {"_id": None, "sum": {"$sum": "$num_gene"}}}
        ]))["sum"]
    except StopIteration:
        list_count = 0
    object_count = tgt.find({"$and": [{"ensembl": {"$type": "object"}},
                                      {"entrezgene": {"$exists": 1}}]}, {"_id": 1}).count()
    mapped = list_count + object_count
    self.stats["total_ensembl_genes_mapped_to_entrez"] = mapped
    # an ensembl-only gene _id contains letters (if it didn't, it would only contain
    # digits and thus be an entrez gene; \\D = non-digit, can't use \\w since a digit
    # *is* a word character)
    self.logger.debug("Counting 'total_ensembl_only_genes'")
    ensembl_unmapped = tgt.find({"_id": {"$regex": '''\\D'''}}, {"_id": 1}).count()
    self.stats["total_ensembl_only_genes"] = ensembl_unmapped
    self.logger.debug("Counting 'total_species'")
    self.stats["total_species"] = len(tgt.distinct("taxid"))
    return self.stats
def mark_timestamp(timestamp):
    # .update({'_id': {'$in': xli1}}, {'$set': {'_timestamp': ts}}, multi=True)
    target = get_target_db()
    # genedoc_col = target.genedoc_mygene_allspecies_current
    genedoc_col = target.genedoc_mygene_xxxxx
    for doc in doc_feeder(genedoc_col):
        genedoc_col.update({'_id': doc['_id']},
                           {'$set': {'_timestamp': timestamp}},
                           manipulate=False,
                           check_keys=False,
                           upsert=False,
                           w=0)
def __init__(self, pindexer, *args, **kwargs):
    super(IndexerManager, self).__init__(*args, **kwargs)
    self.pindexer = pindexer
    self.src_build = mongo.get_src_build()
    self.target_db = mongo.get_target_db()
    self.t0 = time.time()
    self.prepared = False
    self.log_folder = LOG_FOLDER
    self.timestamp = datetime.now()
    self.setup()
def get_target_collection(self):
    '''Get the latest target_collection from the src_build record.'''
    src_build = getattr(self, 'src_build', None)
    if src_build:
        _cfg = src_build.find_one({'_id': self._build_config['_id']})
        if _cfg['build'][-1].get('status', None) == 'success' and \
           _cfg['build'][-1].get('target', None):
            target_collection = _cfg['build'][-1]['target']
            _db = get_target_db()
            target_collection = _db[target_collection]
            return target_collection
def clean_old_collections(self):
    # use target_name if given, otherwise the build name is used
    # as the collection name prefix, so collections should start like that
    prefix = "%s_" % (self.target_name or self.build_name)
    db = mongo.get_target_db()
    cols = [c for c in db.collection_names() if c.startswith(prefix)]
    # timestamp is what's after _archive_, YYYYMMDD, so we can sort it safely
    cols = sorted(cols, reverse=True)
    to_drop = cols[self.keep_archive:]
    for colname in to_drop:
        self.logger.info("Cleaning old archive collection '%s'" % colname)
        db[colname].drop()
def do_index_worker(col_name, ids, pindexer, batch_num):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    idxer = pindexer()
    cur = doc_feeder(col, step=len(ids), inbatch=False,
                     query={'_id': {'$in': ids}})
    cnt = idxer.index_bulk(cur)
    return cnt
def prepare_target(self, target_name=None):
    '''Call self.update_backend() after validating self._build_config.'''
    if self.target.name == 'mongodb':
        _db = get_target_db()
        target_collection_name = target_name or self._get_target_name()
        self.target.target_collection = _db[target_collection_name]
        logging.info("Target: %s" % repr(target_collection_name))
    elif self.target.name == 'es':
        self.target.target_esidxer.ES_INDEX_NAME = target_name or self._get_target_name()
        self.target.target_esidxer._mapping = self.get_mapping()
    elif self.target.name == 'couchdb':
        self.target.db_name = target_name or ('genedoc' + '_' + self._build_config['name'])
    elif self.target.name == 'memory':
        self.target.target_name = target_name or ('genedoc' + '_' + self._build_config['name'])
def apply_changes(self, changes, verify=True, noconfirm=False):
    if verify:
        self.pre_verify_changes(changes)
    if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
        print("Aborted.")
        return -1
    step = self.step
    _db = get_target_db()
    source_col = _db[changes['source']]
    src = GeneDocMongoDBBackend(source_col)
    target = GeneDocESBackend(self)
    _timestamp = changes['timestamp']

    def _add_docs(ids):
        i = 0
        for _ids in iter_n(ids, step):
            t1 = time.time()
            _doc_li = src.mget_from_ids(_ids)
            for _doc in _doc_li:
                _doc['_timestamp'] = _timestamp
                i += 1
            target.insert(_doc_li)
            print('\t{}\t{}'.format(i, timesofar(t1)))

    t0 = time.time()
    if changes['add']:
        print("Adding {} new docs...".format(len(changes['add'])))
        t00 = time.time()
        _add_docs(changes['add'])
        print("done. [{}]".format(timesofar(t00)))
    if changes['delete']:
        print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
        t00 = time.time()
        target.remove_from_ids(changes['delete'], step=step)
        print("done. [{}]".format(timesofar(t00)))
    if changes['update']:
        print("Updating {} existing docs...".format(len(changes['update'])))
        t00 = time.time()
        ids = [x['_id'] for x in changes['update']]
        _add_docs(ids)
        print("done. [{}]".format(timesofar(t00)))
    target.finalize()
    print("\n")
    print("Finished.", timesofar(t0))
def clean_temp_collections(self, build_name, date=None, prefix=''):
    """
    Delete all target collections created by the builder named "build_name"
    at the given date (or any date if none is given -- careful...).
    Date is a string (YYYYMMDD or regex).
    A common collection name prefix can also be specified if needed.
    """
    target_db = mongo.get_target_db()
    for col_name in target_db.collection_names():
        search = prefix and prefix + "_" or ""
        search += build_name + '_'
        search += date and date + '_' or ''
        pat = re.compile(search)
        if pat.match(col_name) and 'current' not in col_name:
            logging.info("Dropping target collection '%s'" % col_name)
            target_db[col_name].drop()
def validate(build_config=None):
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer
    src_build = get_src_build()
    _cfg = src_build.find_one({'_id': build_config})
    last_build = _cfg['build'][-1]
    print("Last build record:")
    pprint(last_build)
    target_name = last_build['target']
    mongo_target = get_target_db()
    b1 = GeneDocMongoDBBackend(mongo_target[target_name])
    b2 = GeneDocESBackend(ESIndexer(es_index_name=target_name,
                                    es_host='127.0.0.1:' + str(es_local_tunnel_port)))
    changes = diff_collections(b1, b2, use_parallel=True, step=10000)
    return changes
def post_merge(self, source_names, batch_size, job_manager):
    # get the lineage mapper
    mapper = LineageMapper(name="lineage")
    # load cache (it isn't loaded automatically, as this isn't part of an upload process)
    mapper.load()
    # create a storage to save docs back to the merged collection
    db = get_target_db()
    col_name = self.target_backend.target_collection.name
    storage = UpsertStorage(db, col_name)
    for docs in doc_feeder(self.target_backend.target_collection,
                           step=batch_size, inbatch=True):
        docs = mapper.process(docs)
        storage.process(docs, batch_size)
    # add indices used to create metadata stats
    keys = ["rank", "taxid"]
    self.logger.info("Creating indices on %s" % repr(keys))
    for k in keys:
        self.target_backend.target_collection.ensure_index(k)
def chrom_worker(col_name, ids):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    bob = col.initialize_unordered_bulk_op()
    disagreed = []
    missing = []
    at_least_one = False
    for doc in cur:
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] is False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom": chrom}})
            at_least_one = True
    if at_least_one:
        bob.execute()
    return {"missing": missing, "disagreed": disagreed}
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []
    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])
    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))
    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")
    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)
    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
def do(srcs, tgt):
    pinfo = {
        "category": "cache",
        "source": None,
        "step": "rebuild",
        "description": ""
    }
    config.logger.info("Rebuild cache for sources: %s, target: %s" % (srcs, tgt))
    for src in srcs:
        config.logger.info("Rebuilding cache for source '%s'" % src)
        col = mongo.get_src_db()[src]
        pinfo["source"] = src
        job = yield from job_manager.defer_to_thread(pinfo, partial(rebuild, col))
        yield from job
        config.logger.info("Done rebuilding cache for source '%s'" % src)
    if tgt:
        config.logger.info("Rebuilding cache for target '%s'" % tgt)
        col = mongo.get_target_db()[tgt]
        pinfo["source"] = tgt
        job = yield from job_manager.defer_to_thread(pinfo, partial(rebuild, col))
        yield from job
def __init__(self, build_config='genedoc_mygene'):
    self.build_config = build_config
    self._db = get_target_db()
    self._target_col = self._db[self.build_config + '_current']
    self.step = 10000
def diff_two(col_1, col_2, use_parallel=True):
    target = get_target_db()
    b1 = GeneDocMongoDBBackend(target[col_1])
    b2 = GeneDocMongoDBBackend(target[col_2])
    return diff_collections(b1, b2, use_parallel=use_parallel)
def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None, noconfirm=False):
    """Build an ES index from the last successfully-merged mongodb collection.
    The optional "es_host" argument can be used to specify another ES host (default: ES_HOST).
    The optional "es_index_name" argument can be used to pass an alternative index name
    (default: same as the mongodb collection name).
    """
    self.load_build_config(build_config)
    assert "build" in self._build_config, "Abort. No build records for config %s" % build_config
    last_build = self._build_config['build'][last_build_idx]
    logging.info("Last build record:")
    logging.info(pformat(last_build))
    assert last_build['status'] == 'success', \
        "Abort. Last build did not succeed."
    assert last_build['target_backend'] == "mongodb", \
        'Abort. Last build needs to have been built using the "mongodb" backend.'
    assert last_build.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = last_build['stats']
    assert last_build.get('target', None), \
        'Abort. Last build target_collection is not available.'
    # Get the source collection to build the ES index
    # IMPORTANT: the collection in last_build['target'] does not contain the _timestamp
    # field, only the "genedoc_*_current" collection does. When "timestamp" is enabled
    # in mappings, the last_build['target'] collection won't be indexed by ES correctly,
    # therefore we use the "genedoc_*_current" collection as the source here:
    # target_collection = last_build['target']
    target_collection = "genedoc_{}_current".format(build_config)
    _db = get_target_db()
    target_collection = _db[target_collection]
    logging.info("")
    logging.info('Source: %s' % target_collection.name)
    _mapping = self.get_mapping()
    _meta = {}
    src_version = self.get_src_version()
    if src_version:
        _meta['src_version'] = src_version
    if getattr(self, '_stats', None):
        _meta['stats'] = self._stats
    if 'timestamp' in last_build:
        _meta['timestamp'] = last_build['timestamp']
    if _meta:
        _mapping['_meta'] = _meta
    es_index_name = es_index_name or target_collection.name
    es_idxer = ESIndexer(mapping=_mapping,
                         es_index_name=es_index_name,
                         es_host=es_host,
                         step=5000)
    if build_config == 'mygene_allspecies':
        es_idxer.number_of_shards = 10  # default 5
    es_idxer.check()
    if noconfirm or ask("Continue to build ES index?") == 'Y':
        es_idxer.use_parallel = use_parallel
        # es_idxer.s = 609000
        if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
            if noconfirm or ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
            else:
                logging.info("Abort.")
                return
        es_idxer.create_index()
        # es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
        es_idxer.build_index(target_collection, verbose=False)
def get_stats(self, sources, job_manager):
    self.stats = super(MyChemDataBuilder, self).get_stats(sources, job_manager)
    tgt = mongo.get_target_db()[self.target_name]
    self.stats["total"] = tgt.count()
    return self.stats
def export_ids(col_name):
    """
    Export all _ids from the collection named col_name.
    If col_name refers to a build where a cold_collection is defined,
    it will also extract the _ids and sort/uniq them to get the full list
    of _ids of the actual merged (cold+hot) collection.
    The output file is stored in DATA_EXPORT_FOLDER/ids,
    defaulting to <DATA_ARCHIVE_ROOT>/export/ids.
    The output filename is returned at the end, if successful.
    """
    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig, "DATA_EXPORT_FOLDER", None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT, "export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER, "ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)
    build = get_src_build().find_one({"_id": col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config", {}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'" % (cold_name, col_name))
    else:
        # it's a src
        col = get_src_db()[col_name]
    # first iterate over all _ids. This will potentially update the underlying _id cache
    # if it's not valid anymore, so we're sure to work with the latest data.
    # If the cache is valid, this will be pretty fast
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col, validate_only=True):
        pass
    # now access the cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold, validate_only=True):
            pass
        # now access the cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)
    outfn = os.path.join(ids_export_folder, "%s_ids.xz" % col_name)
    # NOTE: can't use anyfile to open cache files and send _id through pipes
    # because it would load _id in memory (unless using hacks), so use cat (and
    # existing uncompressing ones, like gzcat/xzcat/...) to fully run the pipe
    # on the shell
    if cold:
        fout = anyfile(outfn, "wb")
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        assert colext == coldext, "Hot and cold _id caches are compressed differently " \
            "(%s and %s), they should be the same" % (colext, coldext)
        comp = colext.replace(".", "")
        supportedcomps = ["xz", "gz", ""]  # no compression allowed as well
        assert comp in supportedcomps, "Compression '%s' isn't supported (%s)" % (comp, supportedcomps)
        # IDs are sent to the pipe's input (sort), then compressed (xz)
        pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache],
                                stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort", "-u"], stdin=pcat.stdout, stdout=subprocess.PIPE,
                                 universal_newlines=True)
        pcat.stdout.close()  # will raise end-of-pipe error when finished
        if comp:
            pcomp = subprocess.Popen(["xz", "-c"], stdin=psort.stdout, stdout=fout)
        else:
            # just print stdin to stdout
            pcomp = subprocess.Popen(["tee"], stdin=psort.stdout, stdout=fout)
        psort.stdout.close()
        try:
            logging.info("Running pipe to compute list of unique _ids")
            (out, err) = pcomp.communicate()  # run the pipe! (blocking)
            if err:
                raise Exception(err)
        except Exception as e:
            logging.error("Error while running pipe to export _ids: %s" % e)
            # make sure to clean empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache, outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise
    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn
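# A minimal usage sketch for export_ids, assuming btconfig is configured and the
# build name below is a hypothetical placeholder. For a build with a cold
# collection, the hot and cold _id caches are merged through the equivalent of
# `xzcat hot.xz cold.xz | sort -u | xz -c > <col_name>_ids.xz` on the shell.
def _example_export_ids():
    outfn = export_ids("mygene_20200101_abcdef")
    logging.info("_ids exported to %s" % outfn)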
def create_backend(db_col_names, name_only=False, follow_ref=False, **kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (will first check the src_build doc for a backend_url
      field; if nothing there, will look up a mongo collection in the target database)
    - or a tuple ("target|src", "col_name")
    - or a ("mongodb://*****:*****@host", "db", "col_name") URI
    - or a ("es_host:port", "index_name", "doc_type") tuple
    If name_only is true, just return the name uniquely identifying the
    collection or index URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if isinstance(db_col_names, str):
        # first check the build doc: if there's a backend_url key, we'll use it instead
        # of directly using db_col_names as the target collection (see LinkDataBuilder)
        bdoc = get_src_build().find_one({"_id": db_col_names})
        if follow_ref and bdoc and bdoc.get("backend_url") \
                and bdoc["backend_url"] != db_col_names:
            return create_backend(bdoc["backend_url"], name_only=name_only,
                                  follow_ref=follow_ref, **kwargs)
        else:
            db = mongo.get_target_db()
            col = db[db_col_names]
            # normalize params
            db_col_names = ["%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(db_col_names) == 3, "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1], doc_type=db_col_names[2],
                         es_host=db_col_names[0], **kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(db_col_names) == 2, "Missing connection information for %s" % repr(db_col_names)
        db = db_col_names[0] == "target" and mongo.get_target_db() or mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0:host, 1:port)
        db_col_names = ["%s:%s" % (db.client.address[0], db.client.address[1]), db.name, col.name]
    assert col is not None, "Could not create collection object from %s" % repr(db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(":", "_"),
                                       db_col_names[1], db_col_names[2])
        else:
            return "es_%s_%s_%s" % (db_col_names[0].replace(":", "_"),
                                    db_col_names[1], db_col_names[2])
    else:
        if is_mongo:
            return DocMongoBackend(db, col)
        else:
            return DocESBackend(db)
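# A sketch of the follow_ref behavior, under the assumption that a build's
# src_build doc can carry a "backend_url" pointing at the real data location
# (as with LinkDataBuilder); the build name below is a hypothetical placeholder.
def _example_follow_ref():
    # with follow_ref=True, a build whose src_build doc contains e.g.
    # {"_id": "mybuild", "backend_url": ("localhost:9200", "myindex", "gene")}
    # resolves to an ES backend instead of a mongo collection named "mybuild"
    return create_backend("mybuild", follow_ref=True)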
def index(self, target_name, index_name, job_manager,
          steps=["index", "post"], batch_size=10000, ids=None, mode="index"):
    """
    Build an index named "index_name" with data from collection "target_name".
    "ids" can be passed to selectively index documents.
    "mode" can have the following values:
    - 'purge': will delete the index if it exists
    - 'resume': will use the existing index and add documents. "ids" can be passed
      as a list of missing IDs, or, if not passed, ES will be queried to identify
      which IDs are missing for each batch in order to complete the index.
    - None (default): will create a new index, assuming it doesn't already exist
    """
    assert job_manager
    # check what to do
    if type(steps) == str:
        steps = [steps]
    self.target_name = target_name
    self.index_name = index_name
    self.setup_log()
    self.load_build()
    got_error = False
    cnt = 0
    if "index" in steps:
        _db = mongo.get_target_db()
        target_collection = _db[target_name]
        _mapping = self.get_mapping()
        _extra = self.get_index_creation_settings()
        _meta = {}
        # partially instantiated indexer instance for process workers
        partial_idxer = partial(ESIndexer,
                                doc_type=self.doc_type,
                                index=index_name,
                                es_host=self.host,
                                step=batch_size,
                                number_of_shards=self.num_shards,
                                number_of_replicas=self.num_replicas)
        # instantiate one here for index creation
        es_idxer = partial_idxer()
        if es_idxer.exists_index():
            if mode == "purge":
                es_idxer.delete_index()
            elif mode != "resume":
                raise IndexerException(
                    "Index '%s' already exists (use mode='purge' to auto-delete it "
                    "or mode='resume' to add more documents)" % index_name)
        if mode != "resume":
            es_idxer.create_index({self.doc_type: _mapping}, _extra)
        jobs = []
        total = target_collection.count()
        btotal = math.ceil(total / batch_size)
        bnum = 1
        if ids:
            self.logger.info("Indexing from '%s' with a specific list of _ids, "
                             "creating indexer job with batch_size=%d" % (target_name, batch_size))
            id_provider = [ids]
        else:
            self.logger.info("Fetching _ids from '%s', and creating indexer job "
                             "with batch_size=%d" % (target_name, batch_size))
            id_provider = id_feeder(target_collection, batch_size=batch_size, logger=self.logger)
        for ids in id_provider:
            yield from asyncio.sleep(0.0)
            cnt += len(ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = self.target_name
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal, (cnt / total * 100))
            self.logger.info("Creating indexer job #%d/%d, to index '%s' %d/%d (%.1f%%)" %
                             (bnum, btotal, target_name, cnt, total, (cnt / total * 100.)))
            job = yield from job_manager.defer_to_process(
                pinfo, partial(indexer_worker, self.target_name, ids, partial_idxer, bnum, mode))

            def batch_indexed(f, batch_num):
                nonlocal got_error
                res = f.result()
                if type(res) != tuple or type(res[0]) != int:
                    got_error = Exception("Batch #%s failed while indexing collection '%s' [result:%s]" %
                                          (batch_num, self.target_name, repr(f.result())))

            job.add_done_callback(partial(batch_indexed, batch_num=bnum))
            jobs.append(job)
            bnum += 1
            # raise error as soon as we know
            if got_error:
                raise got_error
        self.logger.info("%d jobs created for indexing step" % len(jobs))
        tasks = asyncio.gather(*jobs)

        def done(f):
            nonlocal got_error, cnt
            if None in f.result():
                got_error = Exception("Some batches failed")
                return
            # compute overall inserted/updated records
            # returned values look like [(num,[]),(num,[]),...]
            cnt = sum([val[0] for val in f.result()])
            self.logger.info("Index '%s' successfully created" % index_name, extra={"notify": True})

        tasks.add_done_callback(done)
        yield from tasks
    if "post" in steps:
        self.logger.info("Running post-index process for index '%s'" % index_name)
        pinfo = self.get_pinfo()
        pinfo["step"] = "post_index"
        # for some reason (like maintaining an object's state between picklings),
        # we can't use a process there. Need to use a thread to maintain that state
        # without building an unmaintainable monster
        job = yield from job_manager.defer_to_thread(
            pinfo, partial(self.post_index, target_name, index_name, job_manager,
                           steps=steps, batch_size=batch_size, ids=ids, mode=mode))

        def posted(f):
            nonlocal got_error
            try:
                res = f.result()
                self.logger.info("Post-index process done for index '%s': %s" % (index_name, res))
            except Exception as e:
                got_error = e
                self.logger.error("Post-index process failed for index '%s': %s" % (index_name, e),
                                  extra={"notify": True})
                raise

        job.add_done_callback(posted)
        yield from asyncio.gather(job)  # consume future
    if got_error:
        raise got_error
    else:
        return {"%s" % self.target_name: cnt}
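# A minimal driver sketch (hypothetical names; assumes index() is wrapped as an
# asyncio coroutine, as elsewhere in the hub, and that a JobManager is running):
def _example_build_index(indexer, job_manager, loop):
    coro = indexer.index("mygene_20200101_abcdef",  # merged target collection
                         "mygene_20200101",          # ES index to create
                         job_manager,
                         mode="purge")               # drop any existing index first;
                                                     # use mode="resume" to fill in missing docs
    return loop.run_until_complete(coro)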
def sync_mongo_jsondiff_worker(diff_file, old_db_col_names, new_db_col_names,
                               batch_size, cnt, force=False, selfcontained=False,
                               metadata={}, debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if the diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name, logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
        for docs in iter_n(cur, batch_size):
            # use a generator, otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)
    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming it's already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)
    # we potentially modified the "old" collection, so invalidate the cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
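# For reference, a hypothetical minimal diff structure as consumed above,
# inferred from the worker's accesses (actual diff files are produced elsewhere
# in the pipeline and serialized/deserialized with loadobj):
example_diff = {
    "source": "mygene_20200102_bcdefa",      # name of the "new" collection
    "add": ["id1", "id2"],                    # _ids to fetch from "new" (or full docs if selfcontained)
    "update": [{"_id": "id3",
                "patch": [{"op": "replace", "path": "/taxid", "value": 9606}]}],
    "delete": ["id4"],                        # _ids to remove from "old"
}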