Example #1
def create_backend(db_col_names,name_only=False,**kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (by default, will lookup a mongo collection in target database)
    - or a tuple("target|src","col_name")
    - or a ("mongodb://*****:*****@host","db","col_name") URI.
    - or a ("es_host:port","index_name","doc_type")
    If name_only is true, just return the name uniquely identifying the collection or index
    URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if type(db_col_names) == str:
        db = mongo.get_target_db()
        col = db[db_col_names]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST,db.client.PORT),db.name,col.name]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(db_col_names) == 3, "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST,db.client.PORT),db.name,col.name]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1],doc_type=db_col_names[2],es_host=db_col_names[0],**kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(db_col_names) == 2, "Missing connection information for %s" % repr(db_col_names)
        db = db_col_names[0] == "target" and mongo.get_target_db() or mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0:host, 1:port)
        db_col_names = ["%s:%s" % (db.client.address[0],db.client.address[1]),db.name,col.name]
    assert col is not None, "Could not create collection object from %s" % repr(db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(":","_"),
                                      db_col_names[1],db_col_names[2])
        else:
            return "es_%s_%s_%s" % (db_col_names[0].replace(":","_"),
                                    db_col_names[1],db_col_names[2])
    else:
        if is_mongo:
            return DocMongoBackend(db,col)
        else:
            return DocESBackend(db)
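
A minimal usage sketch for create_backend() above; every collection, database and host name below is hypothetical:

    # string form: looked up as a collection in the target database
    backend = create_backend("mygene_20200101_abcdefgh")
    # ("target"|"src", collection_name) form
    backend = create_backend(("src", "clinvar_hg38"))
    # full MongoDB URI form: (uri, db_name, collection_name)
    backend = create_backend(("mongodb://user:password@somehost", "mydb", "mycol"))
    # Elasticsearch form: ("host:port", index_name, doc_type)
    backend = create_backend(("localhost:9200", "myindex", "gene"))
    # name_only=True returns a unique identifier string instead of a backend object
    name = create_backend(("src", "clinvar_hg38"), name_only=True)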
Example #2
 def do(srcs, tgt):
     pinfo = {
         "category": "cache",
         "source": None,
         "step": "rebuild",
         "description": ""
     }
     config.logger.info("Rebuild cache for sources: %s, target: %s" %
                        (srcs, tgt))
     for src in srcs:
         # src can be a full name (e.g. clinvar.clinvar_hg38) but id_feeder only knows the short name (clinvar_hg38)
         if "." in src:
             src = src.split(".")[1]
         config.logger.info("Rebuilding cache for source '%s'" % src)
         col = mongo.get_src_db()[src]
         pinfo["source"] = src
         job = yield from job_manager.defer_to_thread(
             pinfo, partial(rebuild, col))
         yield from job
         config.logger.info("Done rebuilding cache for source '%s'" % src)
     if tgt:
         config.logger.info("Rebuilding cache for target '%s'" % tgt)
         col = mongo.get_target_db()[tgt]
         pinfo["source"] = tgt
         job = job_manager.defer_to_thread(pinfo, partial(rebuild, col))
         yield from job
Example #3
 def post_merge(self, source_names, batch_size, job_manager):
     tgt = mongo.get_target_db()[self.target_name]
     # background=true or it'll lock the whole database...
     self.logger.info("Indexing 'taxid'")
     tgt.create_index("taxid",background=True)
     self.logger.info("Indexing 'entrezgene'")
     tgt.create_index("entrezgene",background=True)
Example #4
def test():
    target = get_target_db()
    sync_src = backend.GeneDocMongoDBBackend(
        target['genedoc_mygene_allspecies_20130402_uiu7bkyi'])
    idxer = ESIndexer()
    sync_target = backend.GeneDocESBackend(idxer)
    return sync_src, sync_target
Example #5
def chrom_worker(col_name, ids):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    bob = col.initialize_unordered_bulk_op()
    disagreed = []
    missing = []
    root_keys = {}
    at_least_one = False
    for doc in cur:
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] is False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom" : chrom}})
            at_least_one = True
        # count root keys for later metadata
        for k in doc:
            # other root keys are actual sources and
            # are counted under the "src" key during merge_stats
            if k in ["_id","vcf","total","hg19","hg38","observed"]:
                root_keys.setdefault(k,0)
                root_keys[k] += 1

    at_least_one and bob.execute()

    return {"missing": missing, "disagreed" : disagreed, "root_keys" : root_keys}
Example #6
 def pick_target_collection(self, autoselect=True):
     '''print out a list of available target collections and let the user pick one.'''
     target_db = get_target_db()
     target_collection_prefix = 'genedoc_' + self._build_config['name']
     target_collection_list = [
         target_db[name] for name in sorted(target_db.collection_names())
         if name.startswith(target_collection_prefix)
     ]
     if target_collection_list:
         logging.info("Found {} target collections:".format(
             len(target_collection_list)))
         logging.info('\n'.join([
             '\t{0:<5}{1.name:<45}\t{2}'.format(
                 str(i + 1) + ':', target, target.count())
             for (i, target) in enumerate(target_collection_list)
         ]))
         logging.info('')
         while 1:
             if autoselect:
                 selected_idx = input("Pick one above [{}]:".format(
                     len(target_collection_list)))
             else:
                 selected_idx = input("Pick one above:")
             if autoselect:
                 selected_idx = selected_idx or len(target_collection_list)
             try:
                 selected_idx = int(selected_idx)
                 break
             except ValueError:
                 continue
         return target_collection_list[selected_idx - 1]
     else:
         logging.info("Found no target collections.")
Example #7
def chrom_worker(col_name, ids):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    bob = col.initialize_unordered_bulk_op()
    disagreed = []
    missing = []
    root_keys = {}
    at_least_one = False
    for doc in cur:
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] == False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom" : chrom}})
            at_least_one = True
        # count root keys for later metadata
        for k in doc:
            root_keys.setdefault(k,0)
            root_keys[k] += 1

    at_least_one and bob.execute()

    return {"missing": missing, "disagreed" : disagreed, "root_keys" : root_keys}
Example #8
def merger_worker(col_name, dest_name, ids, mapper, upsert, batch_num):
    try:
        src = mongo.get_src_db()
        tgt = mongo.get_target_db()
        col = src[col_name]
        #if batch_num == 2:
        #    raise ValueError("oula pa bon")
        dest = DocMongoBackend(tgt, tgt[dest_name])
        cur = doc_feeder(col,
                         step=len(ids),
                         inbatch=False,
                         query={'_id': {
                             '$in': ids
                         }})
        mapper.load()
        docs = mapper.process(cur)
        cnt = dest.update(docs, upsert=upsert)
        return cnt
    except Exception as e:
        logger_name = "build_%s_%s_batch_%s" % (dest_name, col_name, batch_num)
        logger = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception(e)
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump(e, open(exc_fn, "wb"))
        logger.info("Exception was dumped in pickle file '%s'" % exc_fn)
        raise
Example #9
def chrom_worker(col_name, ids):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    bob = col.initialize_unordered_bulk_op()
    disagreed = []
    missing = []
    root_keys = {}
    at_least_one = False
    for doc in cur:
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] == False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom": chrom}})
            at_least_one = True
        # count root keys for later metadata
        for k in doc:
            root_keys.setdefault(k, 0)
            root_keys[k] += 1

    at_least_one and bob.execute()

    return {"missing": missing, "disagreed": disagreed, "root_keys": root_keys}
Example #10
 def pick_target_collection(self, autoselect=True):
     '''print out a list of available target collections and let the user pick one.'''
     target_db = get_target_db()
     target_collection_prefix = 'genedoc_' + self._build_config['name']
     target_collection_list = [target_db[name] for name in sorted(target_db.collection_names()) if name.startswith(target_collection_prefix)]
     if target_collection_list:
         logging.info("Found {} target collections:".format(len(target_collection_list)))
         logging.info('\n'.join(['\t{0:<5}{1.name:<45}\t{2}'.format(
             str(i + 1) + ':', target, target.count()) for (i, target) in enumerate(target_collection_list)]))
          logging.info('')
         while 1:
             if autoselect:
                 selected_idx = input("Pick one above [{}]:".format(len(target_collection_list)))
             else:
                 selected_idx = input("Pick one above:")
             if autoselect:
                 selected_idx = selected_idx or len(target_collection_list)
             try:
                 selected_idx = int(selected_idx)
                 break
             except ValueError:
                 continue
         return target_collection_list[selected_idx - 1]
     else:
         logging.info("Found no target collections.")
Example #11
    def get_stats(self,sources,job_manager):
        self.stats = super(MyGeneDataBuilder,self).get_stats(sources,job_manager)
        # enrich with some specific mygene counts, especially regarding ensembl vs. entrez
        tgt = mongo.get_target_db()[self.target_name]
        self.stats["total_genes"] = tgt.count()
        # entrez gene _ids are digits only (also, don't count the entrez_gene collection:
        # because tgt can be a subset, we have to work with the merged collection)
        self.logger.debug("Counting 'total_entrez_genes'")
        entrez_cnt = tgt.find({"entrezgene":{"$exists":1}},{"_id":1}).count()
        self.stats["total_entrez_genes"] = entrez_cnt
        # ensembl gene counts are taken from:
        # 1. the "ensembl" field, but it can be a list => use aggregation.
        #    Note: "ensembl.0" means the first element of the list, so it implicitly
        #    selects docs with a list. Finally, filtering with {$type:"array"} doesn't work because
        #    mongo applies this filter to the innermost field (that's weird, but it is what it is...)
        # 2. when the document is a root doc coming from the ensembl_gene collection without an "ensembl" key ("orphan")
        # Note: we can't create a sparse or conditional index to help querying "ensembl"
        # because the data is too long for an index key, and "hashed" mode doesn't work because lists aren't supported.
        # Queries are going to use a collection scan (COLLSCAN) strategy...
        self.logger.debug("Counting 'total_ensembl_genes'")
        res = tgt.aggregate([
            {"$match" : {"ensembl.0" : {"$exists" : True}}},
            {"$project" : {"num_gene" : {"$size" : "$ensembl"}}},
            {"$group" : {"_id" : None, "sum" : {"$sum": "$num_gene"}}}
            ])
        try:
            list_count = next(res)["sum"]
        except StopIteration:
            list_count = 0
        object_count = tgt.find({"ensembl" : {"$type" : "object"}},{"_id":1}).count()
        orphan_count = tgt.find({"_id":{"$regex":'''\\w'''},"ensembl":{"$exists":0}},{"_id":1}).count()
        total_ensembl_genes = list_count + object_count + orphan_count
        self.stats["total_ensembl_genes"] = total_ensembl_genes
        # this one can't be computed from merged collection, and is only valid when build
        # involves all data (no filter, no subset)
        self.logger.debug("Counting 'total_ensembl_genes_mapped_to_entrez'")
        # this one is similar to total_ensembl_genes except we cross with entrezgene (i.e. so they're mapped)
        try:
            list_count = next(tgt.aggregate([
                {"$match" : {"$and" : [{"ensembl.0" : {"$exists" : True}},{"entrezgene":{"$exists":1}}]}},
                {"$project" : {"num_gene" : {"$size" : "$ensembl"}}},
                {"$group" : {"_id" : None, "sum" : {"$sum": "$num_gene"}}}
                ]))["sum"]
        except StopIteration:
            list_count = 0
        object_count = tgt.find({"$and": [{"ensembl" : {"$type" : "object"}},{"entrezgene":{"$exists":1}}]},{"_id":1}).count()
        mapped = list_count + object_count
        self.stats["total_ensembl_genes_mapped_to_entrez"] = mapped
        # an ensembl-only gene _id contains letters (if it didn't, it would contain only digits
        # and thus be an entrez gene; \\D = non-digit, can't use \\w because \\w also matches digits)
        self.logger.debug("Counting 'total_ensembl_only_genes'")
        ensembl_unmapped = tgt.find({"_id":{"$regex":'''\\D'''}},{"_id":1}).count()
        self.stats["total_ensembl_only_genes"] = ensembl_unmapped
        self.logger.debug("Counting 'total_species'")
        self.stats["total_species"] = len(tgt.distinct("taxid"))

        return self.stats
Example #12
def mark_timestamp(timestamp):
    #.update({'_id': {'$in': xli1}}, {'$set': {'_timestamp': ts}}, multi=True)
    target = get_target_db()
    #genedoc_col = target.genedoc_mygene_allspecies_current
    genedoc_col = target.genedoc_mygene_xxxxx
    for doc in doc_feeder(genedoc_col):
        genedoc_col.update({'_id': doc['_id']},
                           {'$set': {'_timestamp': timestamp}},
                           manipulate=False, check_keys=False,
                           upsert=False, w=0)
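
The legacy Collection.update() call (with the manipulate/check_keys/w keyword arguments) was removed in PyMongo 4. A rough equivalent on a modern driver is sketched below; unacknowledged writes are requested through the collection's write concern instead of w=0:

    from pymongo import WriteConcern

    # same collection, but with an unacknowledged (w=0) write concern
    unack_col = genedoc_col.with_options(write_concern=WriteConcern(w=0))
    for doc in doc_feeder(genedoc_col):
        unack_col.update_one({'_id': doc['_id']},
                             {'$set': {'_timestamp': timestamp}},
                             upsert=False)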
Example #13
 def __init__(self, pindexer, *args, **kwargs):
     super(IndexerManager, self).__init__(*args, **kwargs)
     self.pindexer = pindexer
     self.src_build = mongo.get_src_build()
     self.target_db = mongo.get_target_db()
     self.t0 = time.time()
     self.prepared = False
     self.log_folder = LOG_FOLDER
     self.timestamp = datetime.now()
     self.setup()
Example #14
 def get_target_collection(self):
     '''get the latest target_collection from the src_build record.'''
     src_build = getattr(self, 'src_build', None)
     if src_build:
         _cfg = src_build.find_one({'_id': self._build_config['_id']})
         if _cfg['build'][-1].get('status', None) == 'success' and \
            _cfg['build'][-1].get('target', None):
             target_collection = _cfg['build'][-1]['target']
             _db = get_target_db()
             target_collection = _db[target_collection]
             return target_collection
Example #15
 def get_target_collection(self):
     '''get the latest target_collection from the src_build record.'''
     src_build = getattr(self, 'src_build', None)
     if src_build:
         _cfg = src_build.find_one({'_id': self._build_config['_id']})
         if _cfg['build'][-1].get('status', None) == 'success' and \
            _cfg['build'][-1].get('target', None):
             target_collection = _cfg['build'][-1]['target']
             _db = get_target_db()
             target_collection = _db[target_collection]
             return target_collection
Example #16
 def clean_old_collections(self):
     # use target_name if given, otherwise the build name will be used
     # as the collection name prefix, so collection names should start with it
     prefix = "%s_" % (self.target_name or self.build_name)
     db = mongo.get_target_db()
     cols = [c for c in db.collection_names() if c.startswith(prefix)]
     # timestamp is what's after _archive_, YYYYMMDD, so we can sort it safely
     cols = sorted(cols, reverse=True)
     to_drop = cols[self.keep_archive:]
     for colname in to_drop:
         self.logger.info("Cleaning old archive collection '%s'" % colname)
         db[colname].drop()
Example #17
def do_index_worker(col_name, ids, pindexer, batch_num):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    idxer = pindexer()
    cur = doc_feeder(col,
                     step=len(ids),
                     inbatch=False,
                     query={'_id': {
                         '$in': ids
                     }})
    cnt = idxer.index_bulk(cur)
    return cnt
Example #18
def mark_timestamp(timestamp):
    #.update({'_id': {'$in': xli1}}, {'$set': {'_timestamp': ts}}, multi=True)
    target = get_target_db()
    #genedoc_col = target.genedoc_mygene_allspecies_current
    genedoc_col = target.genedoc_mygene_xxxxx
    for doc in doc_feeder(genedoc_col):
        genedoc_col.update({'_id': doc['_id']},
                           {'$set': {
                               '_timestamp': timestamp
                           }},
                           manipulate=False,
                           check_keys=False,
                           upsert=False,
                           w=0)
Example #19
 def prepare_target(self, target_name=None):
     '''call self.update_backend() after validating self._build_config.'''
     if self.target.name == 'mongodb':
         _db = get_target_db()
         target_collection_name = target_name or self._get_target_name()
         self.target.target_collection = _db[target_collection_name]
         logging.info("Target: %s" % repr(target_collection_name))
     elif self.target.name == 'es':
         self.target.target_esidxer.ES_INDEX_NAME = target_name or self._get_target_name()
         self.target.target_esidxer._mapping = self.get_mapping()
     elif self.target.name == 'couchdb':
         self.target.db_name = target_name or ('genedoc' + '_' + self._build_config['name'])
     elif self.target.name == 'memory':
         self.target.target_name = target_name or ('genedoc' + '_' + self._build_config['name'])
Example #20
    def apply_changes(self, changes, verify=True, noconfirm=False):
        if verify:
            self.pre_verify_changes(changes)

        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            i = 0
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                print('\t{}\t{}'.format(i, timesofar(t1)))

        t0 = time.time()
        if changes['add']:
            print("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            _add_docs(changes['add'])
            print("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            print("Deleting {} discontinued docs...".format(
                len(changes['delete'])),
                  end='')
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            print("done. [{}]".format(timesofar(t00)))
        if changes['update']:
            print("Updating {} existing docs...".format(len(
                changes['update'])))
            t00 = time.time()
            ids = [x['_id'] for x in changes['update']]
            _add_docs(ids)
            print("done. [{}]".format(timesofar(t00)))

        target.finalize()

        print("\n")
        print("Finished.", timesofar(t0))
Example #21
 def clean_temp_collections(self, build_name, date=None, prefix=''):
     """
     Delete all target collections created from builder named
     "build_name" at given date (or any date is none given -- carefull...).
     Date is a string (YYYYMMDD or regex)
     Common collection name prefix can also be specified if needed.
     """
     target_db = mongo.get_target_db()
     for col_name in target_db.collection_names():
         search = prefix and prefix + "_" or ""
         search += build_name + '_'
         search += date and date + '_' or ''
         pat = re.compile(search)
         if pat.match(col_name) and 'current' not in col_name:
             logging.info("Dropping target collection '%s'" % col_name)
             target_db[col_name].drop()
Example #22
    def apply_changes(self, changes, verify=True, noconfirm=False):
        if verify:
            self.pre_verify_changes(changes)

        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            i = 0
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                print('\t{}\t{}'.format(i, timesofar(t1)))

        t0 = time.time()
        if changes['add']:
            print("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            _add_docs(changes['add'])
            print("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            print("done. [{}]".format(timesofar(t00)))
        if changes['update']:
            print("Updating {} existing docs...".format(len(changes['update'])))
            t00 = time.time()
            ids = [x['_id'] for x in changes['update']]
            _add_docs(ids)
            print("done. [{}]".format(timesofar(t00)))

        target.finalize()

        print("\n")
        print("Finished.", timesofar(t0))
Example #23
 def prepare_target(self, target_name=None):
     '''call self.update_backend() after validating self._build_config.'''
     if self.target.name == 'mongodb':
         _db = get_target_db()
         target_collection_name = target_name or self._get_target_name()
         self.target.target_collection = _db[target_collection_name]
         logging.info("Target: %s" % repr(target_collection_name))
     elif self.target.name == 'es':
         self.target.target_esidxer.ES_INDEX_NAME = target_name or self._get_target_name(
         )
         self.target.target_esidxer._mapping = self.get_mapping()
     elif self.target.name == 'couchdb':
         self.target.db_name = target_name or ('genedoc' + '_' +
                                               self._build_config['name'])
     elif self.target.name == 'memory':
         self.target.target_name = target_name or (
             'genedoc' + '_' + self._build_config['name'])
Example #24
def validate(build_config=None):
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer

    src_build = get_src_build()
    _cfg = src_build.find_one({'_id': build_config})
    last_build = _cfg['build'][-1]
    print("Last build record:")
    pprint(last_build)
    target_name = last_build['target']

    mongo_target = get_target_db()
    b1 = GeneDocMongoDBBackend(mongo_target[target_name])
    b2 = GeneDocESBackend(ESIndexer(es_index_name=target_name,
                                    es_host='127.0.0.1:' + str(es_local_tunnel_port)))
    changes = diff_collections(b1, b2, use_parallel=True, step=10000)
    return changes
Example #25
    def post_merge(self, source_names, batch_size, job_manager):
        # get the lineage mapper
        mapper = LineageMapper(name="lineage")
        # load the cache (it's not loaded automatically
        # since this isn't part of an upload process)
        mapper.load()

        # create a storage to save docs back to merged collection
        db = get_target_db()
        col_name = self.target_backend.target_collection.name
        storage = UpsertStorage(db,col_name)

        for docs in doc_feeder(self.target_backend.target_collection, step=batch_size, inbatch=True):
            docs = mapper.process(docs)
            storage.process(docs,batch_size)

        # add indices used to create metadata stats
        keys = ["rank","taxid"]
        self.logger.info("Creating indices on %s" % repr(keys))
        for k in keys:
            self.target_backend.target_collection.ensure_index(k)
Example #26
    def post_merge(self, source_names, batch_size, job_manager):
        # get the lineage mapper
        mapper = LineageMapper(name="lineage")
        # load the cache (it's not loaded automatically
        # since this isn't part of an upload process)
        mapper.load()

        # create a storage to save docs back to merged collection
        db = get_target_db()
        col_name = self.target_backend.target_collection.name
        storage = UpsertStorage(db,col_name)

        for docs in doc_feeder(self.target_backend.target_collection, step=batch_size, inbatch=True):
            docs = mapper.process(docs)
            storage.process(docs,batch_size)

        # add indices used to create metadata stats
        keys = ["rank","taxid"]
        self.logger.info("Creating indices on %s" % repr(keys))
        for k in keys:
            self.target_backend.target_collection.ensure_index(k)
Example #27
def validate(build_config=None):
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer

    src_build = get_src_build()
    _cfg = src_build.find_one({'_id': build_config})
    last_build = _cfg['build'][-1]
    print("Last build record:")
    pprint(last_build)
    target_name = last_build['target']

    mongo_target = get_target_db()
    b1 = GeneDocMongoDBBackend(mongo_target[target_name])
    b2 = GeneDocESBackend(
        ESIndexer(es_index_name=target_name,
                  es_host='127.0.0.1:' + str(es_local_tunnel_port)))
    changes = diff_collections(b1, b2, use_parallel=True, step=10000)
    return changes
Example #28
def chrom_worker(col_name, ids):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    bob = col.initialize_unordered_bulk_op()  
    disagreed = []
    missing = []
    at_least_one = False
    for doc in cur:
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] == False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom" : chrom}})
            at_least_one = True
    at_least_one and bob.execute()

    return {"missing": missing, "disagreed" : disagreed}
Example #29
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src,
                                        sync_target,
                                        use_parallel=use_parallel)
        return changes
Example #30
 def do(srcs,tgt):
     pinfo = {"category" : "cache",
             "source" : None,
             "step" : "rebuild",
             "description" : ""}
     config.logger.info("Rebuild cache for sources: %s, target: %s" % (srcs,tgt))
     for src in srcs:
         # src can be a full name (e.g. clinvar.clinvar_hg38) but id_feeder only knows the short name (clinvar_hg38)
         if "." in src:
             src = src.split(".")[1]
         config.logger.info("Rebuilding cache for source '%s'" % src)
         col = mongo.get_src_db()[src]
         pinfo["source"] = src
         job = yield from job_manager.defer_to_thread(pinfo, partial(rebuild,col))
         yield from job
         config.logger.info("Done rebuilding cache for source '%s'" % src)
     if tgt:
         config.logger.info("Rebuilding cache for target '%s'" % tgt)
         col = mongo.get_target_db()[tgt]
         pinfo["source"] = tgt
         job = job_manager.defer_to_thread(pinfo, partial(rebuild,col))
         yield from job
Example #31
 def do(srcs, tgt):
     pinfo = {
         "category": "cache",
         "source": None,
         "step": "rebuild",
         "description": ""
     }
     config.logger.info("Rebuild cache for sources: %s, target: %s" %
                        (srcs, tgt))
     for src in srcs:
         config.logger.info("Rebuilding cache for source '%s'" % src)
         col = mongo.get_src_db()[src]
         pinfo["source"] = src
         job = yield from job_manager.defer_to_thread(
             pinfo, partial(rebuild, col))
         yield from job
         config.logger.info("Done rebuilding cache for source '%s'" % src)
     if tgt:
         config.logger.info("Rebuilding cache for target '%s'" % tgt)
         col = mongo.get_target_db()[tgt]
         pinfo["source"] = tgt
         job = job_manager.defer_to_thread(pinfo, partial(rebuild, col))
         yield from job
Example #32
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb') for name in sorted(target_db.collection_names()) if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
Example #33
 def __init__(self, build_config='genedoc_mygene'):
     self.build_config = build_config
     self._db = get_target_db()
     self._target_col = self._db[self.build_config + '_current']
     self.step = 10000
Example #34
def diff_two(col_1, col_2, use_parallel=True):
    target = get_target_db()
    b1 = GeneDocMongoDBBackend(target[col_1])
    b2 = GeneDocMongoDBBackend(target[col_2])
    return diff_collections(b1, b2, use_parallel=use_parallel)
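
A hypothetical call comparing two archived target collections (both collection names below are made up); the returned value is whatever diff_collections() produces for the two MongoDB backends:

    changes = diff_two('genedoc_mygene_20130402_xxxxxxxx',
                       'genedoc_mygene_20130410_yyyyyyyy',
                       use_parallel=False)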
Example #35
    def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1, use_parallel=False, es_host=None, es_index_name=None, noconfirm=False):
        """Build ES index from last successfully-merged mongodb collection.
            optional "es_host" argument can be used to specified another ES host, otherwise default ES_HOST.
            optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name
        """
        self.load_build_config(build_config)
        assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
        last_build = self._build_config['build'][last_build_idx]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        assert last_build['status'] == 'success', \
            "Abort. Last build did not succeed."
        assert last_build['target_backend'] == "mongodb", \
            'Abort. Last build needs to be built using the "mongodb" backend.'
        assert last_build.get('stats', None), \
            'Abort. Last build stats are not available.'
        self._stats = last_build['stats']
        assert last_build.get('target', None), \
            'Abort. Last build target_collection is not available.'

        # Get the source collection to build the ES index
        # IMPORTANT: the collection in last_build['target'] does not contain _timestamp field,
        #            only the "genedoc_*_current" collection does. When "timestamp" is enabled
        #            in mappings, last_build['target'] collection won't be indexed by ES correctly,
        #            therefore, we use "genedoc_*_current" collection as the source here:
        #target_collection = last_build['target']
        target_collection = "genedoc_{}_current".format(build_config)
        _db = get_target_db()
        target_collection = _db[target_collection]
        logging.info("")
        logging.info('Source: %s' % target_collection.name)
        _mapping = self.get_mapping()
        _meta = {}
        src_version = self.get_src_version()
        if src_version:
            _meta['src_version'] = src_version
        if getattr(self, '_stats', None):
            _meta['stats'] = self._stats
        if 'timestamp' in last_build:
            _meta['timestamp'] = last_build['timestamp']
        if _meta:
            _mapping['_meta'] = _meta
        es_index_name = es_index_name or target_collection.name
        es_idxer = ESIndexer(mapping=_mapping,
                             es_index_name=es_index_name,
                             es_host=es_host,
                             step=5000)
        if build_config == 'mygene_allspecies':
            es_idxer.number_of_shards = 10   # default 5
        es_idxer.check()
        if noconfirm or ask("Continue to build ES index?") == 'Y':
            es_idxer.use_parallel = use_parallel
            #es_idxer.s = 609000
            if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
                if noconfirm or ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                    es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
                else:
                    logging.info("Abort.")
                    return
            es_idxer.create_index()
            #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
            es_idxer.build_index(target_collection, verbose=False)
Example #36
def diff_two(col_1, col_2, use_parallel=True):
    target = get_target_db()
    b1 = GeneDocMongoDBBackend(target[col_1])
    b2 = GeneDocMongoDBBackend(target[col_2])
    return diff_collections(b1, b2, use_parallel=use_parallel)
Example #37
 def get_stats(self, sources, job_manager):
     self.stats = super(MyChemDataBuilder,
                        self).get_stats(sources, job_manager)
     tgt = mongo.get_target_db()[self.target_name]
     self.stats["total"] = tgt.count()
     return self.stats
Example #38
def export_ids(col_name):
    """
    Export all _ids from collection named col_name.
    If col_name refers to a build where a cold_collection is defined,
    the cold collection's _ids will also be extracted and sorted/uniq'ed to produce
    the full list of _ids of the actual merged (cold+hot) collection.
    Output file is stored in DATA_EXPORT_FOLDER/ids,
    defaulting to <DATA_ARCHIVE_ROOT>/export/ids. The output filename is
    returned at the end, if successful.
    """
    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig,"DATA_EXPORT_FOLDER",None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT,"export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER,"ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)
    build = get_src_build().find_one({"_id":col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config",{}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'" % (cold_name,col_name))
    else:
        # it's a src
        col = get_src_db()[col_name]
    
    # first iterate over all _ids. This will potentially update the underlying _id cache if it's not valid anymore,
    # so we're sure to work with the latest data. If the cache is valid, this will be pretty fast
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col,validate_only=True):
        pass
    # now accessing cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold,validate_only=True):
            pass
        # now accessing cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)
    outfn = os.path.join(ids_export_folder,"%s_ids.xz" % col_name)
    # NOTE: can't use anyfile to open cache files and send _id through pipes
    # because it would load _id in memory (unless using hacks) so use cat (and
    # existing uncompressing ones, like gzcat/xzcat/...) to fully run the pipe
    # on the shell
    if cold:
        fout = anyfile(outfn,"wb")
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        assert colext == coldext, "Hot and cold _id caches are compressed differently (%s and %s), they should be the same" % (colext,coldext)
        comp = colext.replace(".","")
        supportedcomps = ["xz","gz",""] # no compression allowed as well
        assert comp in supportedcomps, "Compression '%s' isn't supported (%s)" % (comp,supportedcomps)
        # IDs sent to pipe's input (sort) then compress it (xz)
        pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache],stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort","-u"],stdin=pcat.stdout,stdout=subprocess.PIPE,universal_newlines=True)
        pcat.stdout.close() # will raise end of pipe error when finished
        if comp:
            pcomp = subprocess.Popen(["xz","-c"],stdin=psort.stdout,stdout=fout)
        else:
            # just print stdin to stdout
            pcomp = subprocess.Popen(["tee"],stdin=psort.stdout,stdout=fout)
        psort.stdout.close()
        try:
            logging.info("Running pipe to compute list of unique _ids")
            (out,err) = pcomp.communicate() # run the pipe! (blocking)
            if err:
                raise Exception(err)
        except Exception as e:
            logging.error("Error while running pipe to export _ids: %s" % e)
            # make sure to clean empty or half processed files
            try:
                os.unlink(outfn)
            finally:
                pass
            raise
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache,outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean empty or half processed files
            try:
                os.unlink(outfn)
            finally:
                pass
            raise

    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn
Example #39
 def __init__(self, build_config='genedoc_mygene'):
     self.build_config = build_config
     self._db = get_target_db()
     self._target_col = self._db[self.build_config + '_current']
     self.step = 10000
Example #40
def create_backend(db_col_names, name_only=False, follow_ref=False, **kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (will first check for an src_build doc to check
      a backend_url field, if nothing there, will lookup a mongo collection
      in target database)
    - or a tuple("target|src","col_name")
    - or a ("mongodb://*****:*****@host","db","col_name") URI.
    - or a ("es_host:port","index_name","doc_type")
    If name_only is true, just return the name uniquely identifying the collection or index
    URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if type(db_col_names) == str:
        # first check the build doc; if there's a backend_url key, we'll use it instead of
        # directly using db_col_names as the target collection (see LinkDataBuilder)
        bdoc = get_src_build().find_one({"_id": db_col_names})
        if follow_ref and bdoc and bdoc.get(
                "backend_url") and bdoc["backend_url"] != db_col_names:
            return create_backend(bdoc["backend_url"],
                                  name_only=name_only,
                                  follow_ref=follow_ref,
                                  **kwargs)
        else:
            db = mongo.get_target_db()
            col = db[db_col_names]
            # normalize params
            db_col_names = [
                "%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name
            ]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(
            db_col_names
        ) == 3, "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = [
            "%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name
        ]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1],
                         doc_type=db_col_names[2],
                         es_host=db_col_names[0],
                         **kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(
            db_col_names
        ) == 2, "Missing connection information for %s" % repr(db_col_names)
        db = db_col_names[0] == "target" and mongo.get_target_db(
        ) or mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0:host, 1:port)
        db_col_names = [
            "%s:%s" % (db.client.address[0], db.client.address[1]), db.name,
            col.name
        ]
    assert col is not None, "Could not create collection object from %s" % repr(
        db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(
                ":", "_"), db_col_names[1], db_col_names[2])
        else:
            return "es_%s_%s_%s" % (db_col_names[0].replace(
                ":", "_"), db_col_names[1], db_col_names[2])
    else:
        if is_mongo:
            return DocMongoBackend(db, col)
        else:
            return DocESBackend(db)
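
Compared to example #1, this variant can also follow a reference stored in a build document. A small sketch, assuming a hypothetical src_build document named 'mygene_hot':

    # if the build doc 'mygene_hot' carries a backend_url, the backend is
    # created from that URL instead of the collection named 'mygene_hot'
    backend = create_backend('mygene_hot', follow_ref=True)
    # with follow_ref=False (the default), 'mygene_hot' is simply looked up
    # as a collection in the target database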
Example #41
    def index(self,
              target_name,
              index_name,
              job_manager,
              steps=["index", "post"],
              batch_size=10000,
              ids=None,
              mode="index"):
        """
        Build an index named "index_name" with data from collection
        "target_collection". "ids" can be passed to selectively index documents. "mode" can have the following
        values:
        - 'purge': will delete index if it exists
        - 'resume': will use existing index and add documents. "ids" can be passed as a list of missing IDs,
                 or, if not passed, ES will be queried to identify which IDs are missing for each batch in
                 order to complete the index.
        - None (default): will create a new index, assuming it doesn't already exist
        """
        assert job_manager
        # check what to do
        if type(steps) == str:
            steps = [steps]
        self.target_name = target_name
        self.index_name = index_name
        self.setup_log()
        self.load_build()

        got_error = False
        cnt = 0

        if "index" in steps:
            _db = mongo.get_target_db()
            target_collection = _db[target_name]
            _mapping = self.get_mapping()
            _extra = self.get_index_creation_settings()
            _meta = {}
            # partially instantiated indexer instance for process workers
            partial_idxer = partial(ESIndexer,
                                    doc_type=self.doc_type,
                                    index=index_name,
                                    es_host=self.host,
                                    step=batch_size,
                                    number_of_shards=self.num_shards,
                                    number_of_replicas=self.num_replicas)
            # instantiate one here for index creation
            es_idxer = partial_idxer()
            if es_idxer.exists_index():
                if mode == "purge":
                    es_idxer.delete_index()
                elif mode != "resume":
                    raise IndexerException(
                        "Index already '%s' exists, (use mode='purge' to auto-delete it or mode='resume' to add more documents)"
                        % index_name)

            if mode != "resume":
                es_idxer.create_index({self.doc_type: _mapping}, _extra)

            jobs = []
            total = target_collection.count()
            btotal = math.ceil(total / batch_size)
            bnum = 1
            if ids:
                self.logger.info(
                    "Indexing from '%s' with specific list of _ids, create indexer job with batch_size=%d"
                    % (target_name, batch_size))
                id_provider = [ids]
            else:
                self.logger.info(
                    "Fetch _ids from '%s', and create indexer job with batch_size=%d"
                    % (target_name, batch_size))
                id_provider = id_feeder(target_collection,
                                        batch_size=batch_size,
                                        logger=self.logger)
            for ids in id_provider:
                yield from asyncio.sleep(0.0)
                cnt += len(ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = self.target_name
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal,
                                                            (cnt / total *
                                                             100))
                self.logger.info("Creating indexer job #%d/%d, to index '%s' %d/%d (%.1f%%)" % \
                        (bnum,btotal,target_name,cnt,total,(cnt/total*100.)))
                job = yield from job_manager.defer_to_process(
                    pinfo,
                    partial(indexer_worker, self.target_name, ids,
                            partial_idxer, bnum, mode))

                def batch_indexed(f, batch_num):
                    nonlocal got_error
                    res = f.result()
                    if type(res) != tuple or type(res[0]) != int:
                        got_error = Exception(
                            "Batch #%s failed while indexing collection '%s' [result:%s]"
                            % (batch_num, self.target_name, repr(f.result())))

                job.add_done_callback(partial(batch_indexed, batch_num=bnum))
                jobs.append(job)
                bnum += 1
                # raise error as soon as we know
                if got_error:
                    raise got_error
            self.logger.info("%d jobs created for indexing step" % len(jobs))
            tasks = asyncio.gather(*jobs)

            def done(f):
                nonlocal got_error
                if None in f.result():
                    got_error = Exception("Some batches failed")
                    return
                # compute overall inserted/updated records
                # returned values looks like [(num,[]),(num,[]),...]
                cnt = sum([val[0] for val in f.result()])
                self.logger.info("Index '%s' successfully created" %
                                 index_name,
                                 extra={"notify": True})

            tasks.add_done_callback(done)
            yield from tasks

        if "post" in steps:
            self.logger.info("Running post-index process for index '%s'" %
                             index_name)
            pinfo = self.get_pinfo()
            pinfo["step"] = "post_index"
            # for some reason (like maintaining the object's state between picklings)
            # we can't use a process there; we need to use a thread to maintain that state
            # without building an unmaintainable monster
            job = yield from job_manager.defer_to_thread(
                pinfo,
                partial(self.post_index,
                        target_name,
                        index_name,
                        job_manager,
                        steps=steps,
                        batch_size=batch_size,
                        ids=ids,
                        mode=mode))

            def posted(f):
                nonlocal got_error
                try:
                    res = f.result()
                    self.logger.info(
                        "Post-index process done for index '%s': %s" %
                        (index_name, res))
                except Exception as e:
                    got_error = e
                    self.logger.error(
                        "Post-index process failed for index '%s': %s" %
                        (index_name, e),
                        extra={"notify": True})
                    raise

            job.add_done_callback(posted)
            yield from asyncio.gather(job)  # consume future

        if got_error:
            raise got_error
        else:
            return {"%s" % self.target_name: cnt}
Example #42
    def build_index2(self,
                     build_config='mygene_allspecies',
                     last_build_idx=-1,
                     use_parallel=False,
                     es_host=None,
                     es_index_name=None,
                     noconfirm=False):
        """Build ES index from last successfully-merged mongodb collection.
            optional "es_host" argument can be used to specified another ES host, otherwise default ES_HOST.
            optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name
        """
        self.load_build_config(build_config)
        assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
        last_build = self._build_config['build'][last_build_idx]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        assert last_build['status'] == 'success', \
            "Abort. Last build did not succeed."
        assert last_build['target_backend'] == "mongodb", \
            'Abort. Last build needs to be built using the "mongodb" backend.'
        assert last_build.get('stats', None), \
            'Abort. Last build stats are not available.'
        self._stats = last_build['stats']
        assert last_build.get('target', None), \
            'Abort. Last build target_collection is not available.'

        # Get the source collection to build the ES index
        # IMPORTANT: the collection in last_build['target'] does not contain _timestamp field,
        #            only the "genedoc_*_current" collection does. When "timestamp" is enabled
        #            in mappings, last_build['target'] collection won't be indexed by ES correctly,
        #            therefore, we use "genedoc_*_current" collection as the source here:
        #target_collection = last_build['target']
        target_collection = "genedoc_{}_current".format(build_config)
        _db = get_target_db()
        target_collection = _db[target_collection]
        logging.info("")
        logging.info('Source: %s' % target_collection.name)
        _mapping = self.get_mapping()
        _meta = {}
        src_version = self.get_src_version()
        if src_version:
            _meta['src_version'] = src_version
        if getattr(self, '_stats', None):
            _meta['stats'] = self._stats
        if 'timestamp' in last_build:
            _meta['timestamp'] = last_build['timestamp']
        if _meta:
            _mapping['_meta'] = _meta
        es_index_name = es_index_name or target_collection.name
        es_idxer = ESIndexer(mapping=_mapping,
                             es_index_name=es_index_name,
                             es_host=es_host,
                             step=5000)
        if build_config == 'mygene_allspecies':
            es_idxer.number_of_shards = 10  # default 5
        es_idxer.check()
        if noconfirm or ask("Continue to build ES index?") == 'Y':
            es_idxer.use_parallel = use_parallel
            #es_idxer.s = 609000
            if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
                if noconfirm or ask('Index "{}" exists. Delete?'.format(
                        es_idxer.ES_INDEX_NAME)) == 'Y':
                    es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
                else:
                    logging.info("Abort.")
                    return
            es_idxer.create_index()
            #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
            es_idxer.build_index(target_collection, verbose=False)
Example #43
def sync_mongo_jsondiff_worker(diff_file,
                               old_db_col_names,
                               new_db_col_names,
                               batch_size,
                               cnt,
                               force=False,
                               selfcontained=False,
                               metadata={},
                               debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if the diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name,
                            logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff[
        "source"], "Source is different in diff file '%s': %s" % (
            diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, not mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
        for docs in iter_n(cur, batch_size):
            # use generator otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming it was already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
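
The update branch relies on the jsonpatch package; a tiny standalone illustration of apply_patch (the document and patch below are made up):

    import jsonpatch

    doc = {"_id": "x", "total": 1}
    patch = [{"op": "replace", "path": "/total", "value": 2}]
    patched = jsonpatch.apply_patch(doc, patch)   # -> {"_id": "x", "total": 2}
    # jsonpatch.JsonPatchConflict is raised when a patch can no longer be applied,
    # which the worker above interprets as "already applied" and counts as skipped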
Example #44
def test():
    target = get_target_db()
    sync_src = backend.GeneDocMongoDBBackend(target['genedoc_mygene_allspecies_20130402_uiu7bkyi'])
    idxer = ESIndexer()
    sync_target = backend.GeneDocESBackend(idxer)
    return sync_src, sync_target