def load_ensembl2interpro(self):
    """Yield InterPro annotation documents read from the Ensembl dump.

    Streams 'gene_ensembl__prot_interpro__dm.txt' from ``self.data_folder``
    through ``tab2dict_iter`` (columns 1, 4, 5, 6; first selected column is
    the key), turns every row value into an ``id``/``short_desc``/``desc``
    dict, wraps it under the 'interpro' key and passes each chunk through
    ``map_id`` together with ``self.ensembl2entrez``.
    """
    def _as_interpro(row):
        return {
            'id': row[0],
            'short_desc': row[1],
            'desc': row[2]
        }

    def _wrap(entries):
        # __aslistofdict__ : merge to 'interpro' as list of dict, not merging keys as list
        # (these are merging instructions for later called merge_struct)
        # 'interpro' : {'a': 1, 'b': 2} and 'interpro' : {'a': 3, 'b': 4} should result in:
        # => 'interpro' : [{'a': 1, 'b': 2},{'a': 3, 'b': 4}]
        # or not:
        # => 'interpro' : {'a': [1,3], 'b': [2,4]}
        return {
            'interpro': entries,
            '__aslistofdict__': 'interpro'
        }

    interpro_file = os.path.join(self.data_folder, 'gene_ensembl__prot_interpro__dm.txt')
    for chunk in tab2dict_iter(interpro_file, (1, 4, 5, 6), 0):
        deduped = dict_nodup(chunk)
        # two passes: first per-row conversion, then wrapping of the whole value
        converted = value_convert(deduped, _as_interpro)
        wrapped = value_convert(converted, _wrap, traverse_list=False)
        for doc in map_id(wrapped, self.ensembl2entrez):
            yield doc
def load_ensembl_main(self):
    """loading ensembl gene to symbol+name mapping

    Streams 'gene_ensembl__gene__main.txt' from ``self.data_folder`` and
    yields one document per gene id with 'taxid' and, when present,
    'symbol' and 'name', plus '_id' set to the gene id.

    Ids made only of digits are skipped with a warning. Once
    ``ERR_THRESHOLD`` of them have been skipped, the next one raises.

    Raises:
        ValueError: an all-digit id is met after ``ERR_THRESHOLD`` such ids
            were already skipped.
    """
    def _fn(x):
        # x holds the selected columns minus the key: taxid, symbol, description
        out = {'taxid': int(x[0])}
        symbol = x[1].strip()
        if symbol not in ['', '\\N']:
            out['symbol'] = symbol
        description = x[2].strip()
        if description not in ['', '\\N']:
            # keep only the text in front of the ' [Source:' suffix
            _name = SubStr(description, '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    skip_count = 0
    datafile = os.path.join(
        self.data_folder, 'gene_ensembl__gene__main.txt')
    for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1, includefn=_not_LRG):
        datadict = value_convert(datadict, _fn)
        for gene_id, doc in datadict.items():
            if gene_id.isdigit():
                # the threshold check comes first, so the record that trips
                # it raises without being logged as skipped
                if skip_count < ERR_THRESHOLD:
                    skip_count += 1
                else:
                    raise ValueError('Too many ensembl ids are entirely numeric')
                self.logger.warning(
                    "Document Skipped: All-digit id {}".format(gene_id))
                continue
            doc['_id'] = gene_id
            yield doc
def _cvt(pli):
    """Group ``pli`` with ``list2dict`` (index 2), convert, and wrap as 'pathway'.

    Every converted value that is a list is sorted in place by the "id" of
    its elements; other values are left untouched.
    """
    by_source = value_convert(list2dict(pli, 2), _inner_cvt)
    for entries in by_source.values():
        if not isinstance(entries, list):
            continue
        entries.sort(key=lambda item: item["id"])
    return {'pathway': by_source}
def restructure_dict(dictionary):
    """Build the ChEBI document for one parsed record.

    '_id' is taken from the record's 'ChEBI ID' entry (KeyError if absent)
    and the cleaned-up record is stored under 'chebi'. Placeholder values
    are then swept, lists unwrapped and values converted, except for the
    keys listed as skipped.
    """
    placeholder_vals = [None, ".", "-", "", "NA", "none", " ", "Not Available", "unknown", "null", "None"]
    unconverted_keys = [
        "beilstein_registry_numbers",
        "pubchem_database_links",
        "pubmed_citation_links",
        "sabio_rk_database_links",
        "gmelin_registry_numbers",
        "molbase_database_links",
    ]
    doc = {
        '_id': dictionary['ChEBI ID'],
        'chebi': clean_up(dictionary),
    }
    swept = dict_sweep(doc, vals=placeholder_vals)
    return value_convert(unlist(swept), skipped_keys=unconverted_keys)
def _load_ensembl_2taxid(self):
    """ensembl2taxid

    Reads 'gene_ensembl__translation__main.txt' from ``self.data_folder``
    (columns 0 and 1, keyed on the second), removes duplicates and returns
    the mapping with every value converted to ``int``.
    """
    def _to_int(raw):
        return int(raw)

    translation_file = os.path.join(self.data_folder,
                                    'gene_ensembl__translation__main.txt')
    raw_mapping = tab2dict(translation_file, (0, 1), 1, includefn=_not_LRG)
    deduped = dict_nodup(raw_mapping)
    # need to convert taxid to integer here
    return value_convert(deduped, _to_int)
def load_ensembl2pos(self):
    """Yield genomic position documents read from the Ensembl gene dump.

    Streams 'gene_ensembl__gene__main.txt' from ``self.data_folder`` in
    chunks. Every row becomes a dict with 'ensemblgene', 'chr', 'start',
    'end' and 'strand' (the last three cast to ``int``), wrapped under
    'genomic_pos' with the '__aslistofdict__' merge instruction, and each
    chunk is passed through ``map_id`` with ``self.ensembl2entrez``.

    The earlier version also loaded and converted the complete file into
    an in-memory dict that was never read; only the streaming pass remains.
    """
    def _to_pos(x):
        return {
            'ensemblgene': x[0],
            'chr': x[3],
            'start': int(x[1]),
            'end': int(x[2]),
            'strand': int(x[4])}

    def _wrap(x):
        return {
            'genomic_pos': x,
            '__aslistofdict__': 'genomic_pos'}

    datafile = os.path.join(
        self.data_folder, 'gene_ensembl__gene__main.txt')
    # Twice 1 because first is the dict key, the second because we need gene id within genomic_pos
    for datadict in tab2dict_iter(datafile, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG):
        datadict = dict_nodup(datadict)
        datadict = value_convert(datadict, _to_pos)
        datadict = value_convert(datadict, _wrap, traverse_list=False)
        for doc in map_id(datadict, self.ensembl2entrez):
            yield doc
def load_data(self, data_folder):
    """Always raise: this resource is collection-only and no longer loads data.

    The statements that used to follow the ``raise`` (merging the output of
    every reporter module under a 'reporter' key) could never execute and
    have been removed.

    Args:
        data_folder: unused; kept so the signature stays the same.

    Raises:
        Exception: unconditionally.
    """
    raise Exception("Collection-only resource, no more dataload")
def load_ensembl2pfam(self):
    """Yield Pfam documents read from the Ensembl dump.

    Streams 'gene_ensembl__prot_pfam__dm.txt' from ``self.data_folder``
    (columns 1 and 4, keyed on the first), removes duplicates, wraps every
    value under the 'pfam' key and passes each chunk through ``map_id``
    with ``self.ensembl2entrez``.
    """
    def _wrap(value):
        return {'pfam': value}

    pfam_file = os.path.join(
        self.data_folder, 'gene_ensembl__prot_pfam__dm.txt')
    for chunk in tab2dict_iter(pfam_file, (1, 4), 0):
        deduped = dict_nodup(chunk)
        wrapped = value_convert(deduped, _wrap, traverse_list=False)
        for doc in map_id(wrapped, self.ensembl2entrez):
            yield doc
def transform(xli2):
    """Turn de-duplicated tuples into a list of documents keyed by gene id.

    ``xli2`` is grouped with ``list2dict`` on index 2, each group is
    converted with ``_dict_convert``, and one document per key is built with
    '_id' set to the key and the converted mapping merged in.

    Returns:
        list of dict, empty when the grouping produces no keys. The earlier
        version unpacked the first item into variables the loop immediately
        overwrote, which raised IndexError in that case.
    """
    gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert, traverse_list=False)
    docs = []
    for gid, uniprot in gene2uniprot.items():
        doc = {"_id": gid}
        doc.update(uniprot)
        docs.append(doc)
    return docs
def _load_ensembl2name(self):
    """loading ensembl gene to symbol+name mapping

    Reads 'gene_ensembl__gene__main.txt' from ``self.data_folder``
    (columns 1, 2, 7, keyed on the first) and returns the mapping with each
    value reduced to a dict holding 'symbol' and/or 'name' when available.
    """
    blanks = ['', '\\N']

    def _fn(x):
        out = {}
        symbol = x[0].strip()
        if symbol not in blanks:
            out['symbol'] = symbol
        description = x[1].strip()
        if description not in blanks:
            # drop the trailing ' [Source:...' annotation
            _name = SubStr(description, '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    gene_file = os.path.join(
        self.data_folder, 'gene_ensembl__gene__main.txt')
    raw_mapping = tab2dict(gene_file, (1, 2, 7), 0, includefn=_not_LRG)
    return value_convert(raw_mapping, _fn)
def convert2entrez(self, ensembl2x):
    '''convert a dict with ensembl gene ids as the keys to matching
    entrezgene ids as the keys.

    Ensembl ids without an entrez match are kept under their own id.
    Three summary counts are printed before the conversion.
    '''
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()

    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()

    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

    # Now make a dictionary indexed by entrez gene id
    input_ids = set(ensembl2x)
    mapped_ids = set(ensembl2entrez)
    unmatched_ids = input_ids - mapped_ids
    print('# of ensembl IDs in total: %d' % len(input_ids | mapped_ids))
    print('# of ensembl IDs match entrez Gene IDs: %d' % len(input_ids & mapped_ids))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' % len(unmatched_ids))

    # all genes with matched entrez
    def _fn(eid, taxid=None):
        # A copy is required: several entrezgene ids may match the same
        # ensembl gene, for example,
        #      ENSMUSG00000027104 --> (11909, 100047997)
        # and they must not share one value object.
        return copy.copy(ensembl2x.get(eid, {}))

    data = value_convert(entrez2ensembl, _fn)

    # add those has no matched entrez geneid, using ensembl id as the key
    for eid in unmatched_ids:
        data[eid] = ensembl2x[eid]

    for key in data:
        value = data[key]
        if isinstance(value, dict):
            merged = dict_nodup(value, sort=True)
        else:
            # if one entrez gene matches multiple ensembl genes
            merged = dict_attrmerge(value, removedup=True, sort=True)
        data[key] = merged

    return data
def load_ensembl_main(self):
    """loading ensembl gene to symbol+name mapping

    Streams 'gene_ensembl__gene__main.txt' from ``self.data_folder`` and
    yields one document per gene id with 'taxid' and, when present,
    'symbol' and 'name', plus '_id' set to the gene id.
    """
    def _fn(x):
        # x holds the selected columns minus the key: taxid, symbol, description
        out = {'taxid': int(x[0])}
        symbol = x[1].strip()
        if symbol not in ['', '\\N']:
            out['symbol'] = symbol
        description = x[2].strip()
        if description not in ['', '\\N']:
            # keep only the text in front of the ' [Source:' suffix
            _name = SubStr(description, '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    datafile = os.path.join(self.data_folder, 'gene_ensembl__gene__main.txt')
    for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1, includefn=_not_LRG):
        datadict = value_convert(datadict, _fn)
        for gene_id, doc in datadict.items():
            doc['_id'] = gene_id
            yield doc
def restructure_dict(dictionary):
    """Build the ChEMBL document for one molecule record.

    '_id' comes from 'molecule_chembl_id'. The record itself (the same
    object, mutated in place) is stored under 'chembl'. When
    'molecule_structures' is exactly a ``dict``, its InChIKey, SMILES and
    InChI entries are copied to 'inchi_key', 'smiles' and 'inchi'. The
    'molecule_structures' entry is then deleted (KeyError if the record has
    none), after which lists are unwrapped, placeholder values swept and
    values / booleans converted.
    """
    doc = dict()
    structures_found = False
    for field in list(dictionary):
        if field == 'molecule_chembl_id':
            doc['_id'] = dictionary[field]
        if field != 'molecule_structures':
            continue
        structures = dictionary['molecule_structures']
        # exact type test on purpose: dict subclasses are not treated as structures
        if type(structures) != dict:
            continue
        doc['chembl'] = dictionary
        structures_found = True
        for name, content in structures.items():
            if name == 'standard_inchi_key':
                dictionary['inchi_key'] = content
            if name == 'canonical_smiles':
                dictionary['smiles'] = content
            if name == 'standard_inchi':
                dictionary['inchi'] = content
    if not structures_found:
        doc['chembl'] = dictionary
    del doc['chembl']['molecule_structures']

    placeholder_vals = [
        None, ".", "-", "", "NA", "None", "none", " ", "Not Available",
        "unknown", "null"
    ]
    boolean_keys = [
        "topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
        "therapeutic_flag", "med_chem_friendly", "ro3_pass"
    ]
    doc = dict_sweep(unlist(doc), vals=placeholder_vals)
    doc = value_convert(doc, skipped_keys=["chebi_par_id", "first_approval"])
    return boolean_convert(doc, added_keys=boolean_keys)
def load(self, aslist=False):
    '''
    loading ncbi "gene_info" file
    This must be called first to create basic gene documents
    with all basic fields, e.g., name, symbol, synonyms, etc.

    format of gene_info file:
    #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs
             map_location description type_of_gene
             Symbol_from_nomenclature_authority
             Full_name_from_nomenclature_authority
             Nomenclature_status Other_designations Modification_date
             (tab is used as a separator, pound sign - start of a comment)

    Yields the result of ``self.format`` for every chunk produced by
    ``tab2dict_iter`` over ``self.datafile`` (keyed on GeneID, filtered by
    ``self.species_filter``). ``aslist`` is accepted but not used here.

    Raises:
        ValueError: a dbXrefs entry does not split into exactly a database
            name and an id; the offending entry is printed first.
    '''
    gene_d = tab2dict_iter(self.datafile,
                           (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                           key=1, alwayslist=0,
                           includefn=self.species_filter)

    def _ff(d):
        (
            taxid, symbol, locus_tag, synonyms,
            dbxrefs, map_location,
            description, type_of_gene, other_designations,
            modification_date
        ) = d
        out = dict(taxid=int(taxid),
                   symbol=symbol,
                   name=description)
        # '-' marks an empty column in the input
        if map_location != '-':
            out['map_location'] = map_location
        if type_of_gene != '-':
            out['type_of_gene'] = type_of_gene
        if synonyms != '-':
            out['alias'] = normalized_value(synonyms.split('|'))
        if locus_tag != '-':
            out['locus_tag'] = locus_tag
        if other_designations != "-":
            out['other_names'] = normalized_value(
                other_designations.split('|'))

        ### when merged, this will become the default timestamp
        ### as of 2017/12/10, some timestamps can have different formats
        ##if len(modification_date) > 8:
        ##    out["_timestamp"] = datetime.datetime.strptime(modification_date,"%m/%d/%Y %H:%M:%S")
        ##else:
        ##    out["_timestamp"] = datetime.datetime.strptime(modification_date,"%Y%m%d")

        for x in dbxrefs.split('|'):
            if x == '-':
                continue
            xd = x.split(':')
            if len(xd) == 3 and xd[0] == xd[1] and \
                    xd[0] in ['VGNC', 'HGNC', 'MGI']:
                # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                xd = xd[1:]
            try:
                _db, _id = xd
            except ValueError:
                # unpacking is the only statement guarded here; show the
                # malformed xref before propagating the error
                print(repr(x))
                raise
            # we don't need ensembl xref from here, we will get it from
            # Ensembl directly
            if _db.lower() in ['ensembl', 'imgt/gene-db']:
                # we don't need 'IMGT/GENE-DB" xref either, because they
                # are mostly the same as gene symbol
                continue
            # add "MGI:" prefix for MGI ids.
            if _db.lower() == 'mgi':
                _id = "MGI:" + _id
            out[_db] = _id
        return out

    # presumably self.format is where the entrezgene field is added - confirm
    for d in gene_d:
        d = value_convert(d, _ff)
        yield self.format(d)
def _map_line_to_json(fields):
    """Generator turning one split CADD row into at most one document.

    ``fields`` must hold exactly ``VALID_COLUMN_NO`` items (enforced with an
    ``assert``). The id is computed by ``get_hgvs_from_vcf`` from columns
    0, 1, 2 and 4. When that returns ``None`` the generator ends without
    yielding. Otherwise the columns are laid out by position under the
    'cadd' key, passed through ``value_convert`` and ``unlist``, and every
    "NA" value is swept out before the document is yielded.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[0]
    chromStart = fields[1]
    ref = fields[2]
    alt = fields[4]
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)

    # load as json data
    if HGVS is None:
        # no id could be built for this row: yield nothing
        return
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'mutindex': fields[29],
            'dna': {
                'helt': fields[30],
                'mgw': fields[31],
                'prot': fields[32],
                'roll': fields[33]
            },
            'mirsvr': {
                'score': fields[34],
                'e': fields[35],
                'aln': fields[36]
            },
            'targetscans': fields[37],
            'fitcons': fields[38],
            'chmm': {
                'tssa': fields[39],
                'tssaflnk': fields[40],
                'txflnk': fields[41],
                'tx': fields[42],
                'txwk': fields[43],
                'enh': fields[44],
                # column 45 is not mapped: the disabled entry below would
                # reuse the 'enh' key and overwrite column 44
                # 'enh': fields[45],
                'znfrpts': fields[46],
                'het': fields[47],
                'tssbiv': fields[48],
                'bivflnk': fields[49],
                'enhbiv': fields[50],
                'reprpc': fields[51],
                'reprpcwk': fields[52],
                'quies': fields[53],
            },
            'encode': {
                'exp': fields[54],
                'h3k27ac': fields[55],
                'h3k4me1': fields[56],
                'h3k4me3': fields[57],
                'nucleo': fields[58],
                'occ': fields[59],
                'p_val': {
                    'comb': fields[60],
                    'dnas': fields[61],
                    'faire': fields[62],
                    'polii': fields[63],
                    'ctcf': fields[64],
                    'mycp': fields[65]
                },
                'sig': {
                    'dnase': fields[66],
                    'faire': fields[67],
                    'polii': fields[68],
                    'ctcf': fields[69],
                    'myc': fields[70]
                },
            },
            'segway': fields[71],
            'motif': {
                'toverlap': fields[72],
                'dist': fields[73],
                'ecount': fields[74],
                'ename': fields[75],
                'ehipos': fields[76],
                'escorechng': fields[77]
            },
            'tf': {
                'bs': fields[78],
                'bs_peaks': fields[79],
                'bs_peaks_max': fields[80]
            },
            'isknownvariant': fields[81],
            'esp': {
                'af': fields[82],
                'afr': fields[83],
                'eur': fields[84]
            },
            '1000g': {
                'af': fields[85],
                'asn': fields[86],
                'amr': fields[87],
                'afr': fields[88],
                'eur': fields[89]
            },
            'min_dist_tss': fields[90],
            'min_dist_tse': fields[91],
            'gene': {
                'gene_id': fields[92],
                'feature_id': fields[93],
                'ccds_id': fields[94],
                'genename': fields[95],
                'cds': {
                    'cdna_pos': fields[96],
                    'rel_cdna_pos': fields[97],
                    'cds_pos': fields[98],
                    'rel_cds_pos': fields[99]
                },
                'prot': {
                    'protpos': fields[100],
                    'rel_prot_pos': fields[101],
                    'domain': fields[102]
                }
            },
            'dst2splice': fields[103],
            'dst2spltype': fields[104],
            'exon': fields[105],
            'intron': fields[106],
            'oaa': fields[107],   # ref aa
            'naa': fields[108],   # alt aa
            'grantham': fields[109],
            'polyphen': {
                'cat': fields[110],
                'val': fields[111]
            },
            'sift': {
                'cat': fields[112],
                'val': fields[113]
            },
            'rawscore': fields[114],    # raw CADD score
            'phred': fields[115]    # log-percentile of raw CADD score
        }
    }
    # convert values first, then unwrap lists, then drop "NA" placeholders
    obj = dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
    yield obj
def restructure_dict(dictionary):
    """Flatten one parsed PubChem compound record into a document.

    Walks the 'PC-Compound_id', 'PC-Compound_charge', 'PC-Compound_props'
    and 'PC-Compound_count' entries of ``dictionary`` and collects the
    recognised values into a flat dict. The result has '_id' set to the
    compound id and the collected dict under 'pubchem', whose 'cid' is the
    same id prefixed with 'CID'. The document is passed through
    ``value_convert`` before being returned.

    Raises KeyError when no compound id was found, since ``d["cid"]`` is
    read unconditionally at the end.

    NOTE(review): the nesting of the label scan relative to the
    ``for z in val`` loop was reconstructed; verify against upstream.
    """
    smile_dict = dict()
    iupac_dict = dict()
    d = dict()

    for key, value in iter(dictionary.items()):
        if key == "PC-Compound_id":
            # the innermost value met last wins as the compound id
            for cnt in value:
                for m, n in iter(value[cnt].items()):
                    for x, y in iter(n.items()):
                        d["cid"] = y

        elif key == "PC-Compound_charge":
            d["formal_charge"] = dictionary[key]

        elif key == "PC-Compound_props":
            for cnt in value:
                for ele in value[cnt]:
                    for x, y in iter(ele.items()):
                        if x == "PC-InfoData_urn":
                            for i, j in iter(y.items()):
                                if i == "PC-Urn":
                                    val = ele["PC-InfoData_value"]
                                    for z in val:
                                        val1 = val[z]
                                        # NOTE(review): with a single-entry
                                        # value mapping it makes no
                                        # difference whether this scan runs
                                        # inside or after the loop - confirm
                                        # the intended nesting.
                                        # The scan matches on the VALUES of
                                        # the urn mapping, whatever their key.
                                        for k, l in iter(j.items()):
                                            if l == "Hydrogen Bond Acceptor":
                                                d["hydrogen_bond_acceptor_count"] = val1
                                            elif l == "Hydrogen Bond Donor":
                                                d["hydrogen_bond_donor_count"] = val1
                                            elif l == "Rotatable Bond":
                                                d["rotatable_bond_count"] = val1
                                            elif l == "IUPAC Name":
                                                IUPAC = j["PC-Urn_name"]
                                                IUPAC = IUPAC.lower()
                                                iupac_dict[IUPAC] = val1
                                                d["iupac"] = iupac_dict
                                                # NOTE(review): resetting here
                                                # means each IUPAC name variant
                                                # replaces d["iupac"] with a new
                                                # one-entry dict, so only the
                                                # last variant survives - confirm
                                                # this is intended.
                                                iupac_dict = {}
                                            elif l == "InChI":
                                                d["inchi"] = val1
                                                break
                                            elif l == "InChIKey":
                                                d["inchi_key"] = val1
                                                break
                                            elif l == "Log P":
                                                d["xlogp"] = val1
                                            elif l == "Mass":
                                                d["exact_mass"] = val1
                                            elif l == "Molecular Formula":
                                                d["molecular_formula"] = val1
                                            elif l == "Molecular Weight":
                                                d["molecular_weight"] = val1
                                            elif l == "SMILES":
                                                smiles = j["PC-Urn_name"]
                                                smiles = smiles.lower()
                                                smile_dict[smiles] = val1
                                                d["smiles"] = smile_dict
                                                # NOTE(review): same reset
                                                # pattern as for IUPAC names
                                                # above.
                                                smile_dict = {}
                                            elif l == "Topological":
                                                d["topological_polar_surface_area"] = val1
                                            elif l == "Weight":
                                                d["monoisotopic_weight"] = val1
                                            elif l == "Compound Complexity":
                                                d["complexity"] = val1

        elif key == "PC-Compound_count":
            for cnt in value:
                for x, y in iter(value[cnt].items()):
                    if x == "PC-Count_heavy-atom":
                        d["heavy_atom_count"] = y
                    elif x == "PC-Count_atom-chiral":
                        d["chiral_atom_count"] = y
                    elif x == "PC-Count_atom-chiral-def":
                        d["defined_atom_stereoceter_count"] = y
                    elif x == "PC-Count_atom-chiral-undef":
                        d["undefined_atom_stereoceter_count"] = y
                    elif x == "PC-Count_bond-chiral":
                        d["chiral_bond_count"] = y
                    elif x == "PC-Count_bond-chiral-def":
                        d["defined_bond_stereocenter_count"] = y
                    elif x == "PC-Count_bond-chiral-undef":
                        d["undefined_bond_stereocenter_count"] = y
                    elif x == "PC-Count_isotope-atom":
                        d["isotope_atom_count"] = y
                    elif x == "PC-Count_covalent-unit":
                        d["covalently-bonded_unit_count"] = y
                    elif x == "PC-Count_tautomers":
                        d["tautomers_count"] = y

    restr_dict = {}
    restr_dict['_id'] = d["cid"]
    # string concatenation: the id has to be a str at this point
    d["cid"] = 'CID' + restr_dict['_id']
    restr_dict["pubchem"] = d
    restr_dict = value_convert(restr_dict)
    return restr_dict
def restructure_dict(dictionary):
    """Flatten one parsed DrugBank record into a document.

    Walks the record's top-level entries in order and collects the
    recognised ones into a dict stored under 'drugbank'. '_id' is taken
    from the '#text' of the 'drugbank-id' entry. Lists are then unwrapped,
    placeholder values swept, and boolean / value conversions applied.

    Order matters: the pharmacology sub-dict is created by the
    'description' entry, and every later pharmacology field
    (``d1['pharmacology'].update``) raises KeyError if 'description' has
    not been seen before it. Likewise 'Monoisotopic Weight' expects
    'Molecular Weight' to have been handled first.

    Changes from the earlier version:
      * a single (non-list) 'external-link' is now read from the entry
        itself; it used to read a stale or undefined loop variable and
        hide the failure behind a bare ``except``;
      * the bare ``except`` clauses around external links are narrowed;
      * ``isinstance(x, dict) or isinstance(x, OrderedDict)`` is collapsed
        to ``isinstance(x, dict)`` (OrderedDict is a dict subclass), which
        also removes the dependency on a bare ``OrderedDict`` name.

    NOTE(review): the nesting of a few trailing ``d1.update`` statements
    was reconstructed; verify against upstream.
    """
    restr_dict = dict()
    d1 = dict()
    pred_properties_dict = {}
    products_list = []
    enzymes_list = []
    targets_list = []
    carriers_list = []
    transporters_list = []
    atccode_list = []

    for key, value in iter(dictionary.items()):
        if key == 'name' and value:
            d1[key] = value

        elif key == 'drugbank-id' and value:
            id_list = []
            if isinstance(value, list):
                for ele in value:
                    if isinstance(ele, collections.OrderedDict):
                        for x, y in iter(ele.items()):
                            if x == '#text':
                                key = key.replace('-', '_')
                                id_list.append(y)
                                d1.update({'accession_number': id_list})
                                restr_dict['_id'] = y
                    if isinstance(ele, str):
                        key = key.replace('-', '_')
                        id_list.append(ele)
                        d1.update({'accession_number': id_list})
            elif isinstance(value, dict):
                for x, y in iter(value.items()):
                    if x == '#text':
                        key = key.replace('-', '_')
                        id_list.append(y)
                        d1.update({key: id_list})
                        restr_dict['_id'] = y

        elif key == 'description':
            # creates the pharmacology sub-dict every later branch updates
            d1.update({'pharmacology': {key: value}})

        elif key == 'groups':
            for i, j in iter(value.items()):
                d1[key] = j

        elif key == 'indication':
            d1['pharmacology'].update({key: value})

        elif key == 'pharmacodynamics':
            d1['pharmacology'].update({key: value})

        elif key == 'mechanism-of-action':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'toxicity':
            d1['pharmacology'].update({key: value})

        elif key == 'metabolism':
            d1['pharmacology'].update({key: value})

        elif key == 'absorption':
            d1['pharmacology'].update({key: value})

        elif key == 'half-life':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'protein-binding':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'route-of-elimination':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'volume-of-distribution':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'clearance':
            d1['pharmacology'].update({key: value})

        elif key == 'classification' and value:
            for m, n in iter(value.items()):
                m = m.lower().replace('-', '_')
                d1.update({'taxonomy': value})

        elif key == 'salts' and value:
            salts_list = []
            for m, n in iter(value.items()):
                if isinstance(n, list):
                    for ele in n:
                        for k in ele:
                            if k == 'name':
                                salts_list.append(ele[k])
                                d1.update({key: salts_list})
                elif isinstance(n, dict):
                    d1.update({key: n['name']})

        elif key == 'synonyms' and value:
            synonym_list = []
            if isinstance(value, collections.OrderedDict):
                for x, y in iter(value.items()):
                    for ele in y:
                        for name in ele:
                            if name == '#text':
                                synonym_list.append(ele[name])
                                d1.update({key: synonym_list})

        elif key == 'products' and value:
            def restr_product_dict(dictionary):
                # keep only the known product fields, renaming hyphenated ones
                products_dict = {}
                for x in dictionary:
                    if x == 'name':
                        products_dict[x] = dictionary[x]
                    elif x == 'dosage-form':
                        products_dict['dosage_form'] = dictionary[x]
                    elif x == 'strength':
                        products_dict[x] = dictionary[x]
                    elif x == 'route':
                        products_dict[x] = dictionary[x]
                    elif x == 'over-the-counter':
                        products_dict['otc'] = dictionary[x]
                    elif x == 'generic':
                        products_dict[x] = dictionary[x]
                    elif x == 'ndc-id':
                        products_dict['ndc_id'] = dictionary[x]
                    elif x == 'ndc-product-code':
                        products_dict['ndc_product_code'] = dictionary[x]
                    elif x == 'dpd-id':
                        products_dict['dpd'] = dictionary[x]
                    elif x == 'started-marketing-on':
                        products_dict[x.replace('-', '_')] = dictionary[x]
                    elif x == 'ended-marketing-on':
                        products_dict[x.replace('-', '_')] = dictionary[x]
                    elif x == 'fda-application-number':
                        products_dict[x.replace('-', '_')] = dictionary[x]
                    elif x == 'approved':
                        products_dict[x] = dictionary[x]
                    elif x == 'country':
                        products_dict[x] = dictionary[x]
                    elif x == 'source':
                        products_dict[x] = dictionary[x]
                return products_dict

            for x, y in iter(value.items()):
                if isinstance(y, dict):
                    _d = restr_product_dict(y)
                    products_list.append(_d)
                elif isinstance(y, list):
                    for _d in y:
                        products_list.append(restr_product_dict(_d))

        elif key == 'packagers' and value:
            pack_list = []
            for pack in value:
                for pack1 in value[pack]:
                    for s in pack1:
                        if s == 'name' and pack1[s]:
                            pack_list.append(pack1[s])
                            d1.update({key: pack_list})

        elif key == 'manufacturers' and value:
            manuf_list = []
            for x, y in iter(value.items()):
                if isinstance(y, dict):
                    for i in y:
                        if i == '#text':
                            manuf_list.append(y[i])
                            d1.update({key: manuf_list})
                if isinstance(y, list):
                    for i in y:
                        for m, n in iter(i.items()):
                            if m == '#text':
                                manuf_list.append(n)
                                d1.update({key: manuf_list})

        elif key == 'categories' and value:
            for x, y in iter(value.items()):
                d1.update({key: y})

        elif key == "snp-effects" and value:
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == "snp-adverse-drug-reactions" and value:
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'affected-organisms' and value:
            for x, y in iter(value.items()):
                key = key.replace('-', '_')
                d1['pharmacology'].update({key: value["affected-organism"]})

        elif key == 'ahfs-codes' and value:
            for x in value:
                key = key.replace('-', '_')
                d1.update({key: value[x]})

        elif key == 'food-interactions' and value:
            food_interaction_list = []
            for x, y in iter(value.items()):
                if isinstance(y, list):
                    key = key.replace('-', '_')
                    for i in y:
                        food_interaction_list.append(i)
                        d1.update({key: food_interaction_list})
                else:
                    # NOTE(review): the key keeps its hyphen in this branch
                    # unless a list was seen first - confirm this is intended
                    d1.update({key: y})

        elif key == 'drug-interactions' and value:
            key = key.replace('-', '_')
            for x, y in iter(value.items()):
                d1.update({key: y})

        elif key == 'sequences' and value:
            for x, y in iter(value.items()):
                for i in y:
                    if i == '@format':
                        str1 = y[i] + '_sequences'
                        d1[str1] = y['#text'].replace('\n', ' ')

        elif key == 'experimental-properties' and value:
            d1_exp_properties = {}

            def restr_properties_dict(dictionary):
                # accumulates into the shared d1_exp_properties dict
                for x, y in iter(dictionary.items()):
                    k1 = dictionary['kind']
                    k1 = k1.lower().replace(' ', '_').replace('-', '_')
                    d1_exp_properties[k1] = dictionary['value']
                return d1_exp_properties

            for ele in value:
                key = key.replace('-', '_')
                if isinstance(value[ele], list):
                    for _d in value[ele]:
                        _d = restr_properties_dict(_d)
                        d1.update({key: _d})
                if isinstance(value[ele], dict):
                    _d = restr_properties_dict(value[ele])
                    d1.update({key: _d})

        elif key == 'calculated-properties' and value:
            def restr_properties_dict(dictionary):
                # fills pred_properties_dict and promotes a few well-known
                # kinds to top-level fields of d1
                for x in dictionary:
                    k = dictionary['kind']
                    k = k.lower().replace(' ', '_').replace('-', '_')
                    pred_properties_dict[k] = dictionary['value']
                    if dictionary['kind'] == "IUPAC Name":
                        d1.update({'iupac': dictionary['value']})
                    elif dictionary['kind'] == "SMILES":
                        d1.update({'smiles': dictionary['value']})
                    elif dictionary['kind'] == "Molecular Formula":
                        d1.update({'formula': dictionary['value']})
                    elif dictionary['kind'] == "InChI":
                        d1.update({'inchi': dictionary['value']})
                    elif dictionary['kind'] == "InChIKey":
                        if dictionary['value'][0:9] == 'InChIKey=':
                            d1.update({'inchi_key': dictionary['value'][9:]})
                        else:
                            d1.update({'inchi_key': dictionary['value']})
                    elif dictionary['kind'] == "Molecular Weight":
                        d1.update({'weight': {'average': dictionary['value']}})
                    elif dictionary['kind'] == "Monoisotopic Weight":
                        d1['weight'].update({'monoisotopic': dictionary['value']})

            for x, y in iter(value.items()):
                if isinstance(y, list):
                    for _d in y:
                        _d = restr_properties_dict(_d)
                if isinstance(y, dict):
                    _d = restr_properties_dict(y)

        elif key == 'external-identifiers' and value:
            for ele in value['external-identifier']:
                for x in ele:
                    if x == 'resource':
                        if ele[x] == "Drugs Product Database (DPD)":
                            d1['dpd'] = ele['identifier']
                        elif ele[x] == "KEGG Drug":
                            d1['kegg_drug'] = ele['identifier']
                        elif ele[x] == "KEGG Compound":
                            d1['kegg_compound'] = ele['identifier']
                        elif ele[x] == "National Drug Code Directory":
                            d1['ndc_directory'] = ele['identifier']
                        elif ele[x] == "PharmGKB":
                            d1['pharmgkb'] = ele['identifier']
                        elif ele[x] == "UniProtKB":
                            d1['uniprotkb'] = ele['identifier']
                        elif ele[x] == "Wikipedia":
                            d1['wikipedia'] = ele['identifier']
                        elif ele[x] == "ChemSpider":
                            d1['chemspider'] = ele['identifier']
                        elif ele[x] == "ChEBI":
                            d1['chebi'] = ele['identifier']
                        elif ele[x] == "PubChem Compound":
                            d1['pubchem_compound'] = ele['identifier']
                        elif ele[x] == "PubChem Substance":
                            d1['pubchem_substance'] = ele['identifier']
                        elif ele[x] == "GenBank":
                            d1['genbank'] = ele['identifier']
                        else:
                            source = ele[x].lower().replace('-', '_').replace(' ', '_')
                            d1[source] = ele['identifier']

        elif key == 'external-links' and value:
            if isinstance(value['external-link'], list):
                for ele in value['external-link']:
                    for x in ele:
                        # best effort: links lacking a usable resource/url
                        # are skipped
                        try:
                            resource = ele['resource']
                            d1[resource.lower().replace('.', '_')] = ele['url']
                        except (KeyError, TypeError, AttributeError):
                            pass
            else:
                # single link: read it from the entry itself
                link = value['external-link']
                try:
                    resource = link['resource']
                    d1[resource.lower().replace('.', '_')] = link['url']
                except (KeyError, TypeError, AttributeError):
                    pass

        elif key == 'patents' and value:
            if isinstance(value, dict):
                for x in value:
                    d1.update({key: value[x]})

        elif key == 'international-brands' and value:
            key = key.lower().replace('-', '_')
            d1.update({key: value['international-brand']})

        elif key == 'mixtures' and value:
            d1.update({key: value['mixture']})

        elif key == 'pathways' and value:
            _li = []

            def restr_pathway_dict(dictionary):
                _dict = {}
                for x, y in iter(dictionary.items()):
                    if x == 'smpdb-id':
                        _dict.update({'smpdb_id': y})
                    elif x == 'name':
                        _dict.update({x: y})
                    elif x == 'drugs':
                        _dict.update({x: y['drug']})
                    elif x == 'enzymes':
                        _dict.update({x: y})
                return _dict

            if isinstance(value['pathway'], list):
                for ele in value['pathway']:
                    _dict = restr_pathway_dict(ele)
                    _li.append(_dict)
                    d1.update({key: _li})
            elif isinstance(value['pathway'], dict):
                _dict = restr_pathway_dict(value['pathway'])
                d1.update({key: _dict})

        elif key == 'targets' and value:
            if isinstance(value['target'], list):
                for dictionary in value['target']:
                    _dict = restr_protein_dict(dictionary)
                    targets_list.append(_dict)
            elif isinstance(value['target'], dict):
                _dict = restr_protein_dict(value['target'])
                targets_list.append(_dict)

        elif key == 'enzymes' and value:
            if isinstance(value['enzyme'], list):
                for dictionary in value['enzyme']:
                    _dict = restr_protein_dict(dictionary)
                    enzymes_list.append(_dict)
            elif isinstance(value['enzyme'], dict):
                _dict = restr_protein_dict(value['enzyme'])
                enzymes_list.append(_dict)

        elif key == 'transporters' and value:
            if isinstance(value['transporter'], list):
                for dictionary in value['transporter']:
                    _dict = restr_protein_dict(dictionary)
                    transporters_list.append(_dict)
            elif isinstance(value['transporter'], dict):
                _dict = restr_protein_dict(value['transporter'])
                transporters_list.append(_dict)

        elif key == 'carriers' and value:
            if isinstance(value['carrier'], list):
                for dictionary in value['carrier']:
                    _dict = restr_protein_dict(dictionary)
                    carriers_list.append(_dict)
            elif isinstance(value['carrier'], dict):
                _dict = restr_protein_dict(value['carrier'])
                carriers_list.append(_dict)

        elif key == 'atc-codes' and value:
            def restr_atccode_dict(dictionary):
                for x in dictionary:
                    if x == '@code':
                        atccode_list.append(dictionary[x])
                return atccode_list

            if isinstance(value['atc-code'], list):
                for _d in value['atc-code']:
                    restr_atccode_dict(_d)
            elif isinstance(value['atc-code'], dict):
                restr_atccode_dict(value['atc-code'])

    # these are always set, possibly to empty containers
    d1['atc_codes'] = atccode_list
    d1['targets'] = targets_list
    d1['carriers'] = carriers_list
    d1['enzymes'] = enzymes_list
    d1['transporters'] = transporters_list
    d1['predicted_properties'] = pred_properties_dict
    d1['products'] = products_list

    restr_dict['drugbank'] = d1
    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict, vals=[None, ".", "-", "", "NA", "none", " ", "Not Available", "unknown", "null", "None"])
    restr_dict = boolean_convert(restr_dict, added_keys=["mddr_like_rule", "bioavailability", "ghose_filter", "rule_of_five"])
    restr_dict = value_convert(restr_dict, skipped_keys=["dpd", "chemspider", "chebi", "pubchem_compound", "pubchem_substance", "bindingdb"])
    return restr_dict
def load_pharmgkb(data_folder):
    """Return a mapping whose values are wrapped under the 'pharmgkb' key.

    Reads 'genes.tsv' inside 'genes.zip' from ``data_folder`` (columns 0
    and 1, keyed on the second, one header line), keeping only rows whose
    second column is not empty.
    """
    def _has_key_column(ld):
        return ld[1] != ''

    def _wrap(value):
        return {'pharmgkb': value}

    archive = os.path.join(data_folder, 'genes.zip')
    mapping = tab2dict((archive, 'genes.tsv'), (0, 1), 1,
                       header=1, includefn=_has_key_column)
    return value_convert(mapping, _wrap, traverse_list=False)
def load_all(data_folder):
    '''Load "uniprot" using yield, while building "PDB" and "PIR"
    data dict while reading data file. These dict are then dumped
    (pickled) and stored later

    Reads 'idmapping_selected.tab.gz' from ``data_folder``. UniProt
    documents are yielded as lines are read. Rows that only carry an
    Ensembl id are set aside and resolved after the whole file has been
    seen. The PDB and PIR mappings are dumped to 'gene2pdb.pyobj' and
    'gene2pir.pyobj' in ``data_folder`` once the generator is exhausted.
    '''

    def cvt_fn(pdb_id):
        # keep only the part in front of the first ':'
        return pdb_id.split(':')[0]

    def merge(xli, transcode=False):
        """Return (uniprot_acc, section, gene id) tuples for one row.

        Raises KeyError when only an Ensembl id is available and
        ``transcode`` is False, so the caller can postpone the row.
        """
        xli2 = []
        uniprot_acc, section, entrez_id, ensembl_id = xli
        if entrez_id:
            xli2.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                entrez_id = ensembl2geneid[ensembl_id]
                # if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((uniprot_acc, section, _eid))
            except KeyError:
                # no mapping known: fall back to the ensembl id
                xli2.append((uniprot_acc, section, ensembl_id))
        return xli2

    def transform(xli2):
        """Build one document per gene id from merged tuples."""
        gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
        gene2uniprot = value_convert(gene2uniprot, _dict_convert, traverse_list=False)
        docs = []
        for gid, uniprot in gene2uniprot.items():
            doc = {"_id": gid}
            doc.update(uniprot)
            docs.append(doc)
        return docs

    def merge_x(xli, gene2x, transcode=False, cvt_fn=None, k=None):
        """Append the row's value to ``gene2x`` under its gene id(s).

        ``k`` is accepted but not used. Raises KeyError when only an
        Ensembl id is available and ``transcode`` is False.
        """
        xli2 = []
        entrez_id, ensembl_id, x_value = xli
        if not x_value:
            return
        if cvt_fn:
            x_value = cvt_fn(x_value)
        if entrez_id:
            xli2.append((entrez_id, x_value))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                entrez_id = x_ensembl2geneid[ensembl_id]
                # if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((_eid, x_value))
            except KeyError:
                xli2.append((ensembl_id, x_value))
        for x in xli2:
            gene2x.setdefault(x[0], []).append(x[1])

    uniprot_datafile = os.path.join(data_folder, 'idmapping_selected.tab.gz')

    # cache for uniprot
    ensembl2geneid = {}
    # cache for PDB and PIR
    x_ensembl2geneid = {}

    remains = []
    pdb_remains = []
    pir_remains = []

    # once filled, will be dumped for later storage
    gene2pdb = {}
    gene2pir = {}

    # store all PDB & PIR data while looping, the whole will be stored later
    for ld in tabfile_feeder(uniprot_datafile, header=1, assert_column_no=VALID_COLUMN_NO):
        # Uniprot data will be stored as we read line by line
        xlis = []
        pdbxlis = []
        pirxlis = []

        # raw lines for each sources
        # NOTE(review): the uniprot row takes its Ensembl id from column 18
        # while the PDB/PIR rows use column 19 - confirm this is intended
        uniprotld = [ld[0], ld[1], ld[2], ld[18]]
        pdbld = [ld[2], ld[19], ld[5]]
        pirld = [ld[2], ld[19], ld[11]]

        # UniProt
        # GeneID and EnsemblID columns may have duplicates
        for value in dupline_seperator(dupline=uniprotld,
                                       dup_idx=[2, 3],
                                       dup_sep='; '):
            value = list(value)
            value[1] = get_uniprot_section(value[1])
            value = tuple(value)
            xlis.append(value)
        # PDB
        for value in dupline_seperator(dupline=pdbld,
                                       dup_sep='; '):
            pdbxlis.append(value)

        # PIR
        for value in dupline_seperator(dupline=pirld,
                                       dup_sep='; '):
            pirxlis.append(value)

        for xli in xlis:
            # feed mapping
            if xli[2] != '' and xli[3] != '':
                ensembl2geneid.setdefault(xli[3], []).append(xli[2])
            try:
                # postpone ensemblid->entrezid resolution while parsing uniprot as the
                # full transcodification dict is only correct at the end.
                # ex:
                #     1. UniprotID-A    EntrezID-A  EnsemblID
                #     2. UniprotID-B                EnsemblID
                #     3. UniprotID-C    EntrezID-B  EnsemblID
                #
                #     UniprotID-B should associated to both EntrezID-A and EntrezID-B
                #     but we need to read up to line 3 to do so
                xli2 = merge(xli, transcode=False)
                if not xli2:
                    continue
                docs = transform(xli2)
                for doc in docs:
                    yield doc
            except KeyError:
                remains.append(xli)

        for xli in pdbxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pdb, transcode=False, cvt_fn=cvt_fn, k="pdb")
            except KeyError:
                pdb_remains.append(xli)

        for xli in pirxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pir, transcode=False)
            except KeyError:
                pir_remains.append(xli)

    # now transcode with what we have
    for remain in remains:
        try:
            xli2 = merge(remain, transcode=True)
            if not xli2:
                continue
            docs = transform(xli2)
            for doc in docs:
                yield doc
        except KeyError:
            pass

    for remain in pdb_remains:
        try:
            merge_x(remain, gene2pdb, transcode=True, cvt_fn=cvt_fn)
        except KeyError:
            pass

    for remain in pir_remains:
        try:
            merge_x(remain, gene2pir, transcode=True)
        except KeyError:
            pass

    def normalize(value, keyname):
        # several distinct values are kept as a sorted list, a single one as a scalar
        uniq = sorted(set(value))
        if len(uniq) > 1:
            return {keyname: uniq}
        return {keyname: uniq[0]}

    def normalize_pdb(value):
        return normalize(value, "pdb")

    def normalize_pir(value):
        return normalize(value, "pir")

    # PDB
    gene2pdb = value_convert(gene2pdb, normalize_pdb, traverse_list=False)
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    dump(gene2pdb, pdb_dumpfile)

    # PIR
    gene2pir = value_convert(gene2pir, normalize_pir, traverse_list=False)
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    dump(gene2pir, pir_dumpfile)