def pyobj_compare_worker(file_range, dir1, dir2):
    print("Starting worker on: " + str(file_range))
    fail_list = []
    for _file_num in file_range:
        # First doc starts at '1'
        _obj1 = loadobj(dir1 + '/' + str(_file_num) + '.pyobj')
        _obj2 = loadobj(dir2 + '/' + str(_file_num) + '.pyobj')
        if (_obj1['source'] != _obj2['source'] or
                _obj1['add'] != _obj2['add'] or
                _obj1['delete'] != _obj2['delete'] or
                _obj1['update'] != _obj2['update']):
            fail_list.append(_file_num)
    print("Finished worker on: " + str(file_range))
    return (file_range, fail_list)
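# --- Usage sketch (hypothetical, not part of the original source) ---
# One way to fan pyobj_compare_worker out over a process pool. The chunking
# scheme, `total_files`, and `n_workers` are assumptions for illustration;
# the worker only needs an iterable of file numbers plus the two directories.
from multiprocessing import Pool

def compare_all(dir1, dir2, total_files, n_workers=4):
    # split 1..total_files into n_workers contiguous ranges
    chunk = (total_files + n_workers - 1) // n_workers
    ranges = [range(start, min(start + chunk, total_files + 1))
              for start in range(1, total_files + 1, chunk)]
    with Pool(n_workers) as pool:
        results = pool.starmap(pyobj_compare_worker,
                               [(r, dir1, dir2) for r in ranges])
    # flatten the per-worker fail lists into one list of file numbers
    return [num for _range, fails in results for num in fails]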
def main(self, diff_filepath, merge_collection, field):
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_ids = diff['add']
    delete_ids = diff['delete']
    update_ids = [_doc['_id'] for _doc in diff['update']]
    self.add_update(source_collection, merge_collection, add_ids)
    self.add_update(source_collection, merge_collection, update_ids)
    self.delete(merge_collection, field, delete_ids)
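# --- Illustration (assumed diff layout, inferred from main() above) ---
# The .pyobj diff file appears to deserialize into a dict shaped like this;
# the field names match what main() reads, but the exact structure is an
# assumption, not a documented format.
example_diff = {
    'source': 'mysrc_20170101',   # source collection name (hypothetical)
    'add': ['id1', 'id2'],        # _ids present only in the source
    'delete': ['id3'],            # _ids present only in the target
    'update': [{'_id': 'id4'}],   # changed docs; main() only reads '_id'
}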
def _load_ensembl2entrez_li(self):
    ensembl2entrez_li = loadobj(("ensembl_gene__2entrezgene_list.pyobj", self.src),
                                mode='gridfs')
    # filter out those deprecated entrez gene ids
    logging.info(len(ensembl2entrez_li))
    ensembl2entrez_li = [(ensembl_id, self._entrez_geneid_d[int(entrez_id)])
                         for (ensembl_id, entrez_id) in ensembl2entrez_li
                         if int(entrez_id) in self._entrez_geneid_d]
    logging.info(len(ensembl2entrez_li))
    ensembl2entrez = list2dict(ensembl2entrez_li, 0)
    self._idmapping_d_cache['ensembl_gene'] = ensembl2entrez
def load_genedoc(self=None):
    genedoc_d = loadobj(os.path.join(
        DATA_ARCHIVE_ROOT,
        'by_resources/gnfreagents/gnfreagents_20110817.pyobj'))
    # Fixing invalid key "GNF_hs-ORFeome1.1_reads" (replacing "." with "_")
    for k in genedoc_d:
        doc = genedoc_d[k]
        if "GNF_hs-ORFeome1.1_reads" in doc['reagent']:
            doc['reagent']['GNF_hs-ORFeome1_1_reads'] = \
                doc['reagent']['GNF_hs-ORFeome1.1_reads']
            del doc['reagent']['GNF_hs-ORFeome1.1_reads']
        genedoc_d[k] = doc
    return genedoc_d
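# --- Generic variant (a sketch generalizing the fix above) ---
# MongoDB field names cannot contain '.', which is why the single reagent
# key above is rewritten. A hypothetical helper applying the same rule to
# every key in a dict:
def sanitize_keys(d):
    """Return a copy of d with '.' in keys replaced by '_'."""
    return {k.replace('.', '_'): v for k, v in d.items()}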
def main(self, index, collection, diff_filepath, validate=False, wait=60):
    self._index = index
    self._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_list = self.add(source_collection, diff['add'])
    delete_list = self.delete(collection, diff['delete'])
    update_list = self.update(diff['update'])
    t00 = time()
    print('Adding {} new docs...'.format(len(diff['add'])))
    t0 = time()
    bulk(self._es, add_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    bulk(self._es, delete_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    bulk(self._es, update_list)
    print("Done. [{}]".format(timesofar(t0)))
    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))
    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": collection
                        }
                    }
                }
            }
        }
        data = self._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        self._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        self._src[temp_collection].drop()
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene'
        #config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_host = 'localhost:' + str(es_local_tunnel_port)
    _es_index = config + TARGET_ES_INDEX_SUFFIX    # '_current_1'
    # for test
    #_es_host = 'localhost:9200'
    #_es_index = config + TARGET_ES_INDEX_SUFFIX   # '_current_1'

    with open_tunnel() as tunnel:
        if tunnel.ok:
            esi = ESIndexer2(_es_index, es_host=_es_host)
            meta = esi.get_mapping_meta(changes)
            print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
            pprint(meta)
            code = esi.apply_changes(changes, noconfirm=noconfirm)
            if code != -1:    # aborted when code == -1
                _meta = {'_meta': meta}
                # somehow when only "_meta" is updated, "_timestamp" gets
                # emptied, so add "_timestamp" explicitly here (an ES bug)
                _meta['_timestamp'] = {
                    "enabled": True,
                    "path": "_timestamp"
                }
                #esi.update_mapping_meta(_meta)
                print(esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta,
                                                   [esi.ES_INDEX_NAME]))
                esi.post_verify_changes(changes)
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_index = config + TARGET_ES_INDEX_SUFFIX
    # ES host will be set depending on whether a tunnel is used or not
    with open_tunnel() as tunnel:
        if tunnel.ok:
            _es_host = 'localhost:' + str(es_local_tunnel_port)
        else:
            _es_host = ES_HOST
        esi = ESIndexer2(_es_index, es_host=_es_host)
        meta = esi.get_mapping_meta(changes)
        print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
        pprint(meta)
        code = esi.apply_changes(changes, noconfirm=noconfirm)
        if code != -1:    # aborted when code == -1
            _meta = {'_meta': meta}
            print(esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta,
                                               [esi.ES_INDEX_NAME]))
            esi.post_verify_changes(changes)
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''Return a dictionary of current/retired geneid to current geneid mapping.
       This is useful when other annotations mapped to geneids may contain
       retired gene ids.
       If species_li is None, genes from all species are loaded.
       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([TAXONOMY[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):
        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li   # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure taxid is within species_li and keeps only
    # rows whose mapped_to geneid exists in the gene_info list
    load_done('[%d]' % len(retired2gene))

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''Return a dictionary of current/retired geneid to current geneid mapping.
       This is useful when other annotations mapped to geneids may contain
       retired gene ids.
       If species_li is None, genes from all species are loaded.
       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([taxid_d[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):
        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li   # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure taxid is within species_li and keeps only
    # rows whose mapped_to geneid exists in the gene_info list
    load_done('[%d]' % len(retired2gene))

    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)   # convert key/value to int
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
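# --- Usage sketch (hypothetical) ---
# All keys and values in the returned mapping are ints: retired ids map to
# their current replacement, and current ids map to themselves. The species
# names below are assumptions about the taxonomy dict's keys.
geneid_d = get_geneid_d(species_li=['human', 'mouse'])
some_id = 1017                       # an Entrez gene id
current_id = geneid_d.get(some_id)   # None if the id is unknown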
def load_data(step=1000, offset=0, gwas_data_local=None):
    if gwas_data_local:
        gwas_data = loadobj('gwasdata.pyobj')
        for item in gwas_data:
            snp = item
            chrom = snp[1]
            chrom = chrom[3:]
            rsid = snp[4]
            pubMedID = snp[5]
            title = snp[9]
            trait = snp[10]
            region = snp[13]
            gene_name = snp[14]
            riskAllele = snp[15]
            riskAlleleFreq = snp[16]
            if not is_float(riskAlleleFreq):
                riskAlleleFreq = None
            pValue = snp[17]
            pValue_desc = snp[18]
            if not is_float(pValue):
                pValue = None
                pValue_desc = None
            # parse from myvariant.info to get hgvs_id, ref, alt information
            # based on rsid
            url = 'http://localhost:8000/v1/query?q=dbsnp.rsid:'\
                + rsid + '&fields=_id,dbsnp.ref,dbsnp.alt,dbsnp.chrom,dbsnp.hg19'
            r = requests.get(url)
            for hits in r.json()['hits']:
                HGVS = hits['_id']
                one_snp_json = {
                    "_id": HGVS,
                    "gwassnp": {
                        "rsid": rsid,
                        "pubmed": pubMedID,
                        "title": title,
                        "trait": trait,
                        "region": region,
                        "genename": gene_name,
                        "risk_allele": riskAllele,
                        "risk_allele_freq": riskAlleleFreq,
                        "pvalue": pValue,
                        "pvalue_desc": pValue_desc
                    }
                }
                yield one_snp_json
    else:
        MySQLHG19 = MySQLdb.connect('genome-mysql.cse.ucsc.edu', db='hg19',
                                    user='******', passwd='password')
        Cursor = MySQLHG19.cursor()
        # get the row number of gwasCatalog
        sql = "SELECT COUNT(*) FROM gwasCatalog"
        Cursor.execute(sql)
        numrows = Cursor.fetchone()[0]
        print(numrows)
        sql = "SELECT * FROM gwasCatalog"
        Cursor.execute(sql)
        for i in range(numrows):
            snp = Cursor.fetchone()
            if i and i % step == 0:
                print(i)
            chrom = snp[1]
            chrom = chrom[3:]
            rsid = snp[4]
            pubMedID = snp[5]
            title = snp[9]
            trait = snp[10]
            region = snp[13]
            gene_name = snp[14]
            riskAllele = snp[15]
            riskAlleleFreq = snp[16]
            if not is_float(riskAlleleFreq):
                riskAlleleFreq = None
            pValue = snp[17]
            pValue_desc = snp[18]
            if not is_float(pValue):
                pValue = None
                pValue_desc = None
            # parse from myvariant.info to get hgvs_id, ref, alt information
            # based on rsid
            url = 'http://localhost:8000/v1/query?q=dbsnp.rsid:'\
                + rsid + '&fields=_id,dbsnp.ref,dbsnp.alt,dbsnp.chrom,dbsnp.hg19'
            r = requests.get(url)
            for hits in r.json()['hits']:
                HGVS = hits['_id']
                one_snp_json = {
                    "_id": HGVS,
                    "gwassnp": {
                        "rsid": rsid,
                        "pubmed": pubMedID,
                        "title": title,
                        "trait": trait,
                        "region": region,
                        "genename": gene_name,
                        "risk_allele": riskAllele,
                        "risk_allele_freq": riskAlleleFreq,
                        "pvalue": pValue,
                        "pvalue_desc": pValue_desc
                    }
                }
                yield one_snp_json
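# --- Usage sketch (hypothetical, not part of the original source) ---
# load_data() is a generator yielding one JSON doc per matching HGVS id.
# A loader would typically consume it in batches; the batch size and the
# insert callback here are assumptions for illustration.
def consume_in_batches(doc_iter, insert_fn, batch_size=1000):
    batch = []
    for doc in doc_iter:
        batch.append(doc)
        if len(batch) >= batch_size:
            insert_fn(batch)   # e.g. a MongoDB collection.insert_many
            batch = []
    if batch:
        insert_fn(batch)       # flush the final partial batch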
def load_chr_data(self):
    print("\tLoading chromosome data...", end='')
    self._chr_data = loadobj(HG19_DATAFILE)
    print("Done.")
def _load_entrez_geneid_d(self):
    self._entrez_geneid_d = loadobj(("entrez_gene__geneid_d.pyobj", self.src),
                                    mode='gridfs')
def sync_from_one_diff(index, collection, diff_filepath, validate=False,
                       wait=60, dryrun=False, returncnt=False, save2file=None):
    sync = ESSyncer(index=index)
    #sync._index = index
    #sync._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding {} new docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except:
            # ignore bulk indexing errors on add
            pass
    print("Done. [{}]".format(timesofar(t0)))
    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))
    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))
    # flush and refresh so the changes become searchable
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except:
        pass
    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
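# --- Usage sketch (hypothetical) ---
# With dryrun=True nothing is written to ES; combined with returncnt=True
# the call just reports how many operations the diff would perform. The
# index/collection names and the path are placeholders.
cnt = sync_from_one_diff('myindex', 'clinvar', '/path/to/diff_1.pyobj',
                         dryrun=True, returncnt=True)
print(cnt)   # {'add': ..., 'delete': ..., 'update': ...}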