Exemplo n.º 1
0
    def __init__(self, build_config=None, backend='mongodb'):
        """Initialise build settings and select the target backend.

        :param build_config: kept untouched on ``self._build_config``.
        :param backend: name of the target backend; one of ``'mongodb'``,
            ``'es'``, ``'couchdb'`` or ``'memory'``.
        :raises ValueError: when ``backend`` is not one of the names above.
            The check happens last, i.e. after ``get_src_db()`` and
            ``self.get_src_master()`` have already been called.
        """
        self.src = get_src_db()
        self.step = 10000
        self.use_parallel = False
        # When merge is called, its output is also saved into a logging file.
        self.merge_logging = True
        # Upper bound on the records kept in the "build" field of the
        # src_build collection.
        self.max_build_status = 10

        self.using_ipython_cluster = False
        self.shutdown_ipengines_after_done = False
        self.log_folder = LOG_FOLDER

        self._build_config = build_config
        self._entrez_geneid_d = None
        self._idmapping_d_cache = {}

        self.get_src_master()

        if backend == 'mongodb':
            target = databuild.backend.GeneDocMongoDBBackend()
        elif backend == 'es':
            target = databuild.backend.GeneDocESBackend(ESIndexer())
        elif backend == 'couchdb':
            # couchdb pieces are imported lazily, only when this backend is
            # actually requested.
            from config import COUCHDB_URL
            import couchdb
            server = couchdb.Server(COUCHDB_URL)
            target = databuild.backend.GeneDocCouchDBBackend(server)
        elif backend == 'memory':
            target = databuild.backend.GeneDocMemeoryBackend()
        else:
            raise ValueError('Invalid backend "%s".' % backend)
        self.target = target
Exemplo n.º 2
0
def load_contig(contig):
    '''Save the CADD documents of one contig into the "cadd" mongodb collection.

    A tabix handle on ``whole_genome`` is opened and handed, together with
    ``contig``, to ``fetch_generator``; every document it yields is inserted
    into ``get_src_db()["cadd"]`` in batches of 100 using unacknowledged
    writes (``w=0``). Progress is printed every 100000 documents and a
    summary is printed at the end. Returns ``None``.

    The tabix handle is closed once loading finishes or fails.
    '''
    # NOTE: the input is always the whole-genome file; a switch on
    # CADD_INPUT == "exome" was sketched here but never enabled.
    tabix = pysam.Tabixfile(whole_genome)
    try:
        src_db = get_src_db()
        target_coll = src_db["cadd"]
        t0 = time.time()
        cnt = 0
        doc_list = []
        for doc in fetch_generator(tabix, contig):
            doc_list.append(doc)
            cnt += 1
            if len(doc_list) == 100:
                target_coll.insert(doc_list,
                                   manipulate=False,
                                   check_keys=False,
                                   w=0)
                doc_list = []
            if cnt % 100000 == 0:
                print(cnt, timesofar(t0))
        # flush the last, partially filled batch
        if doc_list:
            target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
    finally:
        # release the file handle even when fetching/inserting raised
        tabix.close()
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
Exemplo n.º 3
0
def load_contig(contig):
    '''Save the CADD documents of one contig into the "cadd" mongodb collection.

    A tabix handle on ``whole_genome`` is opened and handed, together with
    ``contig``, to ``fetch_generator``; every document it yields is inserted
    into ``get_src_db()["cadd"]`` in batches of 100 using unacknowledged
    writes (``w=0``). Progress is printed every 100000 documents and a
    summary is printed at the end. Returns ``None``.

    The tabix handle is closed once loading finishes or fails.
    '''
    # NOTE: the input is always the whole-genome file; a switch on
    # CADD_INPUT == "exome" was sketched here but never enabled.
    tabix = pysam.Tabixfile(whole_genome)
    try:
        src_db = get_src_db()
        target_coll = src_db["cadd"]
        t0 = time.time()
        cnt = 0
        doc_list = []
        for doc in fetch_generator(tabix, contig):
            doc_list.append(doc)
            cnt += 1
            if len(doc_list) == 100:
                target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
                doc_list = []
            if cnt % 100000 == 0:
                print(cnt, timesofar(t0))
        # flush the last, partially filled batch
        if doc_list:
            target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
    finally:
        # release the file handle even when fetching/inserting raised
        tabix.close()
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
Exemplo n.º 4
0
 def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
     """Wire up the ES client, an ``ESIndexer`` and the source db handle.

     :param index: index name; a falsy value selects ``config.ES_INDEX_NAME``.
     :param doc_type: doc type; a falsy value selects ``config.ES_DOC_TYPE``.
     :param es_host: passed to both ``get_es`` and ``ESIndexer``.
     :param step: stored on ``self.step``.
     """
     self._es = get_es(es_host)
     self._index = index if index else config.ES_INDEX_NAME
     self._doc_type = doc_type if doc_type else config.ES_DOC_TYPE
     # The indexer is pointed at the same index as this object.
     indexer = ESIndexer(es_host=es_host)
     indexer._index = self._index
     self._esi = indexer
     self._src = get_src_db()
     self.step = step
Exemplo n.º 5
0
def get_backend(target_name, bk_type, **kwargs):
    '''Build a backend instance for ``target_name``.

    ``bk_type`` selects the flavour:

    * ``'mongodb'`` -- ``GeneDocMongoDBBackend`` around the ``target_name``
      collection of the db returned by ``get_src_db()`` (``kwargs`` unused).
    * ``'es'`` -- ``GeneDocESBackend`` around ``ESIndexer(target_name, **kwargs)``.

    Any other ``bk_type`` yields ``None``.
    '''
    if bk_type == 'es':
        return GeneDocESBackend(ESIndexer(target_name, **kwargs))
    if bk_type == 'mongodb':
        return GeneDocMongoDBBackend(get_src_db()[target_name])
    return None
Exemplo n.º 6
0
def get_backend(target_name, bk_type, **kwargs):
    '''Return the backend object matching ``bk_type`` for ``target_name``.

    Supported types are ``'mongodb'`` (wraps the ``target_name`` collection
    of the source db; ``kwargs`` are not used) and ``'es'`` (wraps an
    ``ESIndexer`` created with ``target_name`` and ``kwargs``). For any
    other type nothing is built and ``None`` is returned.
    '''
    backend = None
    if bk_type == 'mongodb':
        collection = get_src_db()[target_name]
        backend = GeneDocMongoDBBackend(collection)
    elif bk_type == 'es':
        indexer = ESIndexer(target_name, **kwargs)
        backend = GeneDocESBackend(indexer)
    return backend
Exemplo n.º 7
0
def get_discrepancy_id(output_file_name, mongo_collection, uri):
	"""Write the ``_id`` of every document whose value for ``uri`` is a list.

	Each document of ``mongo_collection`` (looked up in the db returned by
	``get_src_db()``) is merged with the 'mygene.info' context, run through
	``nquads_transform`` and queried with ``fetch_value_by_uri``. When that
	lookup returns a list, the document's ``_id`` is written on its own line
	to ``output_file_name``, which is created or truncated.

	NOTE(review): a list result presumably means several values were found
	where a single one was expected (the "discrepancy") -- confirm against
	``fetch_value_by_uri``.
	"""
	src = get_src_db()
	context = load_context('mygene.info')
	data = src[mongo_collection].find()
	with open(output_file_name, 'w') as f:
		for _doc in data:
			_doc.update(context)
			jsonld_doc = nquads_transform(_doc)
			rsid = fetch_value_by_uri(jsonld_doc, uri)
			# isinstance also accepts list subclasses, unlike type(...) == list
			if isinstance(rsid, list):
				f.write(_doc['_id'] + "\n")
Exemplo n.º 8
0
    def validate_src(self,
                     collection,
                     return_false=False,
                     return_none=False,
                     return_true=False,
                     verbose=False,
                     flag_invalid=False):
        '''Validate the hgvs ids (``_id`` values) of a src collection.

        :param collection: either a collection name (resolved through
            ``get_src_db()``) or a collection object used directly.
        :param return_false: collect ids validated as ``False`` under
            ``out[False]``.
        :param return_none: collect ids validated as ``None`` (skipped)
            under ``out[None]``.
        :param return_true: collect ids validated as ``True`` under
            ``out[True]``.
        :param verbose: forwarded to ``self.validate_hgvs``.
        :param flag_invalid: when true, every document validated as ``False``
            gets ``"unmatched_ref": "True"`` set in the collection.
        :return: dict holding one id list per requested result (keys are the
            objects ``True``/``False``/``None``) plus ``'summary'``, the
            per-result counts. The counts are also printed.
        '''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # read in the collection from mongodb
        if is_str(collection):
            src = get_src_db()
            _coll = src[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        # output dictionary: one (initially empty) list per requested result
        out = {k: [] for k, wanted in return_dict.items() if wanted}

        # initialize the count
        cnt_d = {True: 0, False: 0, None: 0}
        # validate each item in the cursor
        for item in cursor:
            _id = item['_id']
            # expected to be True, False or None (the keys of cnt_d)
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid is False and flag_invalid:
                # Update through the resolved collection object: `collection`
                # may be just the collection *name*, which has no update().
                _coll.update({"_id": _id},
                             {'$set': {
                                 "unmatched_ref": "True"
                             }})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
Exemplo n.º 9
0
def load_source(collection_name,
                src_module=None,
                src_data=None,
                inbatch=True,
                new_collection=True):
    '''Save src data into the mongodb collection ``collection_name``.

    The documents come from ``src_module.load_data()`` when ``src_module``
    is given, otherwise from the iterable ``src_data``. With
    ``new_collection`` true the target collection must be empty, else an
    error is printed and nothing is loaded. Documents are inserted in
    batches of 100 (``inbatch`` true) or one at a time, always with
    unacknowledged writes (``w=0``). Progress is printed every 100000
    documents. Returns ``None``.
    '''
    target_coll = get_src_db()[collection_name]
    if new_collection and target_coll.count() > 0:
        print("Error: target collection {} exists.".format(collection_name))
        return

    started = time.time()
    if src_module:
        src_data = src_module.load_data()
    if not src_data:
        print("Error: no src data to load.")
        return

    # options shared by every insert call below
    insert_opts = dict(manipulate=False, check_keys=False, w=0)
    total = 0
    pending = []
    for total, doc in enumerate(src_data, 1):
        if inbatch:
            pending.append(doc)
            if len(pending) == 100:
                target_coll.insert(pending, **insert_opts)
                pending = []
        else:
            target_coll.insert(doc, **insert_opts)
        if total % 100000 == 0:
            print(total, timesofar(started))
    # whatever is left of the last, incomplete batch
    if pending:
        target_coll.insert(pending, **insert_opts)

    print("successfully loaded %s into mongodb" % collection_name)
    print("total docs: {}; total time: {}".format(total, timesofar(started)))
Exemplo n.º 10
0
    def validate_src(self, collection, return_false=False,
                     return_none=False, return_true=False, verbose=False, flag_invalid=False, generator=False):
        '''Validate the hgvs ids (``_id`` values) of a src collection.

        :param collection: either a collection name (resolved through
            ``get_src_db()``) or a collection object used directly.
        :param return_false: collect ids validated as ``False`` under
            ``out[False]``.
        :param return_none: collect ids validated as ``None`` (skipped)
            under ``out[None]``.
        :param return_true: collect ids validated as ``True`` under
            ``out[True]``.
        :param verbose: forwarded to ``self.validate_hgvs``.
        :param flag_invalid: when true, every document validated as ``False``
            gets ``"unmatched_ref": "True"`` set in the collection.
        :param generator: accepted but not used by this method.
        :return: dict holding one id list per requested result (keys are the
            objects ``True``/``False``/``None``) plus ``'summary'``, the
            per-result counts. The counts are also printed.
        '''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # read in the collection from mongodb
        if is_str(collection):
            src = get_src_db()
            _coll = src[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        # output dictionary: one (initially empty) list per requested result
        out = {k: [] for k, wanted in return_dict.items() if wanted}

        # initialize the count
        cnt_d = {True: 0, False: 0, None: 0}
        # validate each item in the cursor
        for item in cursor:
            _id = item['_id']
            # expected to be True, False or None (the keys of cnt_d)
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid is False and flag_invalid:
                # Update through the resolved collection object: `collection`
                # may be just the collection *name*, which has no update().
                _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
Exemplo n.º 11
0
def load_source(collection_name, src_module=None, src_data=None, inbatch=True, new_collection=True, step=1000):
    '''Save src data into the mongodb collection ``collection_name``.

    :param collection_name: name of the target collection in the db returned
        by ``get_src_db()``.
    :param src_module: if provided, ``src_data = src_module.load_data()``.
    :param src_data: iterable of documents, used when no ``src_module`` is
        given.
    :param inbatch: insert with ``insert_many`` in batches of ``step``
        documents; otherwise insert one document at a time and report (but
        skip) documents whose insert fails.
    :param new_collection: if true, the target collection must be empty;
        otherwise an error is printed and nothing is loaded.
    :param step: batch size used when ``inbatch`` is true.
    :return: ``None``; progress and a summary are printed.
    '''
    src_db = get_src_db()
    target_coll = src_db[collection_name]
    if new_collection and target_coll.count() > 0:
        print("Error: target collection {} exists.".format(collection_name))
        return

    t0 = time.time()
    cnt = 0
    if src_module:
        src_data = src_module.load_data()
    if not src_data:
        print("Error: no src data to load.")
        return

    doc_list = []
    for doc in src_data:
        cnt += 1
        if not inbatch:
            try:
                target_coll.insert_one(doc)
            except Exception:
                # Not a bare ``except``: Ctrl-C / SystemExit must still stop
                # the load.
                # NOTE(review): any insert failure is reported as a duplicate
                # id -- consider narrowing to pymongo's DuplicateKeyError.
                print('One duplicate id exists, id is {}'.format(doc['_id']))
                continue
        else:
            doc_list.append(doc)
            # ">=" so that a non-positive step still flushes
            if len(doc_list) >= step:
                target_coll.insert_many(doc_list)
                doc_list = []
        if cnt % 100000 == 0:
            print(cnt, timesofar(t0))
    # Flush the last, partially filled batch with the same API as the full
    # batches (the legacy ``insert`` call used here before no longer exists
    # in current pymongo and used a different write concern).
    if doc_list:
        target_coll.insert_many(doc_list)

    print("successfully loaded %s into mongodb" % collection_name)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
Exemplo n.º 12
0
 def __init__(self, src_module):
     """Remember ``src_module``, its source name and a source db handle.

     The source name is read from ``src_module.__METADATA__['src_name']``;
     ``temp_collection`` starts out unset (``None``).
     """
     self.src_module = src_module
     metadata = src_module.__METADATA__
     self.src_name = metadata['src_name']
     self.src_db = get_src_db()
     self.temp_collection = None
Exemplo n.º 13
0
 def __init__(self):
     # Keep a handle on the source database returned by get_src_db().
     self._src = get_src_db()
Exemplo n.º 14
0
 def __init__(self):
     # Keep a handle on the source database returned by get_src_db().
     self._src = get_src_db()