Exemplo n.º 1
0
 def load_ensembl2interpro(self):
     """Yield documents carrying InterPro annotations.

     Reads ``gene_ensembl__prot_interpro__dm.txt`` from ``self.data_folder``
     chunk by chunk, turns each value tuple into an
     ``{'id', 'short_desc', 'desc'}`` record, nests the result under the
     ``'interpro'`` key and hands every chunk to ``map_id`` together with
     ``self.ensembl2entrez``.
     """
     def _as_record(cols):
         # cols are the three value columns selected in tab2dict_iter below
         return {'id': cols[0], 'short_desc': cols[1], 'desc': cols[2]}

     def _nest(records):
         # '__aslistofdict__' is a merging instruction for the merge_struct
         # step that runs later: it asks for 'interpro' values to be merged
         # as a list of dicts, e.g.
         #   {'a': 1, 'b': 2} and {'a': 3, 'b': 4}
         #   => 'interpro': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
         # rather than merging key by key:
         #   => 'interpro': {'a': [1, 3], 'b': [2, 4]}
         return {'interpro': records, '__aslistofdict__': 'interpro'}

     interpro_path = os.path.join(self.data_folder,
                                  'gene_ensembl__prot_interpro__dm.txt')
     for chunk in tab2dict_iter(interpro_path, (1, 4, 5, 6), 0):
         records = value_convert(dict_nodup(chunk), _as_record)
         nested = value_convert(records, _nest, traverse_list=False)
         yield from map_id(nested, self.ensembl2entrez)
Exemplo n.º 2
0
    def load_ensembl_main(self):
        """Yield one document per Ensembl gene with taxid, symbol and name.

        Reads ``gene_ensembl__gene__main.txt`` from ``self.data_folder``.
        Each yielded document holds ``taxid`` (int), optionally ``symbol``
        and ``name``, and ``_id`` set to the key produced by
        ``tab2dict_iter``.

        Ids made only of digits are skipped with a warning. Up to
        ``ERR_THRESHOLD`` such ids are tolerated.

        Raises:
            ValueError: when one more all-digit id is met after
                ``ERR_THRESHOLD`` of them have already been skipped.
        """
        # values that mark an empty column in the dump
        missing = ('', '\\N')

        def _fn(x):
            out = {'taxid': int(x[0])}
            symbol = x[1].strip()
            if symbol not in missing:
                out['symbol'] = symbol
            raw_name = x[2].strip()
            if raw_name not in missing:
                # keep only the text that precedes the ' [Source:' suffix
                _name = SubStr(raw_name, '', ' [Source:').strip()
                if _name:
                    out['name'] = _name
            return out

        skip_count = 0
        datafile = os.path.join(
            self.data_folder, 'gene_ensembl__gene__main.txt')
        for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1, includefn=_not_LRG):
            datadict = value_convert(datadict, _fn)
            for gene_id, doc in datadict.items():
                if gene_id.isdigit():
                    if skip_count >= ERR_THRESHOLD:
                        raise ValueError('Too many ensembl ids are entirely numeric')
                    skip_count += 1
                    self.logger.warning(
                        "Document Skipped: All-digit id {}".format(gene_id))
                    continue
                doc['_id'] = gene_id
                yield doc
Exemplo n.º 3
0
 def _cvt(pli):
     """Build the ``{'pathway': ...}`` structure from ``pli``.

     ``pli`` is turned into a dict with ``list2dict(pli, 2)`` and each
     value is converted with ``_inner_cvt``. Values that end up as lists
     are sorted in place by their ``"id"`` entry.
     """
     converted = value_convert(list2dict(pli, 2), _inner_cvt)
     for entries in converted.values():
         if not isinstance(entries, list):
             continue
         entries.sort(key=lambda entry: entry["id"])
     return {'pathway': converted}
Exemplo n.º 4
0
def restructure_dict(dictionary):
    """Wrap a ChEBI record into a ``{'_id': ..., 'chebi': ...}`` document.

    ``_id`` is taken from the record's ``'ChEBI ID'`` entry, the record is
    passed through ``clean_up``, placeholder values are swept out with
    ``dict_sweep`` and the result goes through ``unlist`` and
    ``value_convert``.
    """
    # values treated as "no data" and removed by dict_sweep
    placeholders = [None, ".", "-", "", "NA", "none", " ", "Not Available",
                    "unknown", "null", "None"]
    # keys handed to value_convert as skipped_keys
    untouched = ["beilstein_registry_numbers",
                 "pubchem_database_links",
                 "pubmed_citation_links",
                 "sabio_rk_database_links",
                 "gmelin_registry_numbers",
                 "molbase_database_links"]

    doc = {
        '_id': dictionary['ChEBI ID'],
        'chebi': clean_up(dictionary),
    }
    swept = dict_sweep(doc, vals=placeholders)
    return value_convert(unlist(swept), skipped_keys=untouched)
Exemplo n.º 5
0
 def _load_ensembl_2taxid(self):
     """Return the ensembl -> taxid mapping with integer taxids.

     The mapping is read from ``gene_ensembl__translation__main.txt`` in
     ``self.data_folder`` (columns 0 and 1, keyed on column 1), filtered
     with ``_not_LRG`` and de-duplicated with ``dict_nodup``.
     """
     translation_file = os.path.join(self.data_folder,
                                     'gene_ensembl__translation__main.txt')
     raw_mapping = tab2dict(translation_file, (0, 1), 1, includefn=_not_LRG)
     # taxids come out of the file as text; cast them to int
     return value_convert(dict_nodup(raw_mapping), int)
Exemplo n.º 6
0
 def load_ensembl2pos(self):
     """Yield documents carrying the genomic position of each gene.

     Reads ``gene_ensembl__gene__main.txt`` from ``self.data_folder`` chunk
     by chunk. Every value becomes a dict with ``ensemblgene``, ``chr``,
     ``start``, ``end`` and ``strand`` (the last three as int), nested
     under ``'genomic_pos'``, and each chunk is handed to ``map_id``
     together with ``self.ensembl2entrez``.

     The file used to be loaded a second time, in full, into a local
     mapping that was never read; that redundant pass has been removed.
     """
     def _to_pos(x):
         return {
             'ensemblgene': x[0],
             'chr': x[3],
             'start': int(x[1]),
             'end': int(x[2]),
             'strand': int(x[4]),
         }

     def _nest(x):
         # '__aslistofdict__' asks the later merge step to keep several
         # positions as a list of dicts instead of merging them key by key
         return {'genomic_pos': x, '__aslistofdict__': 'genomic_pos'}

     datafile = os.path.join(
         self.data_folder, 'gene_ensembl__gene__main.txt')
     # Column 1 is listed twice: the first occurrence is the dict key, the
     # second is needed as the gene id inside genomic_pos.
     for datadict in tab2dict_iter(datafile, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG):
         datadict = dict_nodup(datadict)
         datadict = value_convert(datadict, _to_pos)
         datadict = value_convert(datadict, _nest, traverse_list=False)
         for doc in map_id(datadict, self.ensembl2entrez):
             yield doc
Exemplo n.º 7
0
 def load_data(self, data_folder):
     """Refuse to load: this resource is collection-only.

     Args:
         data_folder: accepted for interface compatibility; not used.

     Raises:
         Exception: always, with the message
             "Collection-only resource, no more dataload".
     """
     # The former reporter-merging logic sat after this raise and could
     # never execute, so it has been dropped.
     raise Exception("Collection-only resource, no more dataload")
Exemplo n.º 8
0
 def load_ensembl2pfam(self):
     """Yield documents carrying Pfam annotations.

     Reads ``gene_ensembl__prot_pfam__dm.txt`` from ``self.data_folder``
     chunk by chunk, de-duplicates each chunk, nests every value under the
     ``'pfam'`` key and hands the chunk to ``map_id`` together with
     ``self.ensembl2entrez``.
     """
     def _nest(value):
         return {'pfam': value}

     pfam_path = os.path.join(
         self.data_folder, 'gene_ensembl__prot_pfam__dm.txt')
     for chunk in tab2dict_iter(pfam_path, (1, 4), 0):
         nested = value_convert(dict_nodup(chunk), _nest, traverse_list=False)
         yield from map_id(nested, self.ensembl2entrez)
Exemplo n.º 9
0
 def transform(xli2):
     """Turn ``xli2`` into a list of ``{"_id": gene_id, ...}`` documents.

     ``xli2`` is de-duplicated with ``list_nondup`` and grouped with
     ``list2dict(..., 2, alwayslist=True)``; each grouped value is
     converted with ``_dict_convert`` and merged into a document whose
     ``_id`` is the group key.

     Returns an empty list when no groups are produced. Earlier the
     function indexed the first item unconditionally (a leftover
     assignment whose result was never used), which raised IndexError in
     that case.
     """
     gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
     gene2uniprot = value_convert(gene2uniprot,
                                  _dict_convert,
                                  traverse_list=False)
     docs = []
     for gid, uniprot in gene2uniprot.items():
         doc = {"_id": gid}
         doc.update(uniprot)
         docs.append(doc)
     return docs
Exemplo n.º 10
0
    def _load_ensembl2name(self):
        """Return the ensembl gene -> ``{'symbol', 'name'}`` mapping.

        Data comes from ``gene_ensembl__gene__main.txt`` in
        ``self.data_folder`` (columns 1, 2 and 7, keyed on the first of
        them, filtered with ``_not_LRG``). ``symbol`` and ``name`` are only
        set when the corresponding column is neither empty nor ``\\N``.
        """
        def _fn(x):
            out = {}
            symbol = x[0].strip()
            if symbol != '' and symbol != '\\N':
                out['symbol'] = symbol
            raw_name = x[1].strip()
            if raw_name != '' and raw_name != '\\N':
                # keep only the text that precedes the ' [Source:' suffix
                _name = SubStr(raw_name, '', ' [Source:').strip()
                if _name:
                    out['name'] = _name
            return out

        main_file = os.path.join(
            self.data_folder, 'gene_ensembl__gene__main.txt')
        raw_mapping = tab2dict(main_file, (1, 2, 7), 0, includefn=_not_LRG)
        return value_convert(raw_mapping, _fn)
Exemplo n.º 11
0
    def convert2entrez(self, ensembl2x):
        '''Re-key a dict of ensembl gene ids onto matching entrezgene ids.

        Args:
            ensembl2x: dict whose keys are ensembl gene ids.

        Returns:
            A dict keyed by entrez gene id for every id found in
            ``self.ensembl2entrez_li``, plus the entries of ``ensembl2x``
            that have no entrez match, kept under their ensembl id. Dict
            values are de-duplicated with ``dict_nodup``; any other value
            is merged with ``dict_attrmerge``.

        Side effects:
            Loads ``self.ensembl2entrez_li`` and ``self.ensembl_main`` when
            they are empty, and prints three id-count lines.
        '''
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()

        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        # Build each id set once instead of once per statistic.
        input_ids = set(ensembl2x)
        mapped_ids = set(ensembl2entrez)
        unmatched_ids = input_ids - mapped_ids

        # Now make a dictionary indexed by entrez gene id
        print('# of ensembl IDs in total: %d' %
              len(input_ids | mapped_ids))
        print('# of ensembl IDs match entrez Gene IDs: %d' %
              len(input_ids & mapped_ids))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
              len(unmatched_ids))

        # all genes with matched entrez
        def _fn(eid, taxid=None):
            # Return a copy of the value: sharing one object would cause
            # issues when multiple entrezgene ids match the same ensembl
            # gene, for example
            #      ENSMUSG00000027104 --> (11909, 100047997)
            # (taxid is not used.)
            return copy.copy(ensembl2x.get(eid, {}))

        data = value_convert(entrez2ensembl, _fn)

        # add those with no matched entrez geneid, using ensembl id as the key
        for eid in unmatched_ids:
            data[eid] = ensembl2x[eid]

        for key in data:
            if isinstance(data[key], dict):
                _doc = dict_nodup(data[key], sort=True)
            else:
                # if one entrez gene matches multiple ensembl genes
                _doc = dict_attrmerge(data[key], removedup=True, sort=True)
            data[key] = _doc

        return data
Exemplo n.º 12
0
    def load_ensembl_main(self):
        """Yield one document per Ensembl gene with taxid, symbol and name.

        Reads ``gene_ensembl__gene__main.txt`` from ``self.data_folder``.
        Each yielded document holds ``taxid`` (int), optionally ``symbol``
        and ``name`` (set only when the column is neither empty nor
        ``\\N``), and ``_id`` set to the key produced by ``tab2dict_iter``.
        """
        # values that mark an empty column in the dump
        missing = ('', '\\N')

        def _fn(x):
            out = {'taxid': int(x[0])}
            symbol = x[1].strip()
            if symbol not in missing:
                out['symbol'] = symbol
            raw_name = x[2].strip()
            if raw_name not in missing:
                # keep only the text that precedes the ' [Source:' suffix
                _name = SubStr(raw_name, '', ' [Source:').strip()
                if _name:
                    out['name'] = _name
            return out

        datafile = os.path.join(self.data_folder,
                                'gene_ensembl__gene__main.txt')
        for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8),
                                      1,
                                      includefn=_not_LRG):
            datadict = value_convert(datadict, _fn)
            for gene_id, doc in datadict.items():
                doc['_id'] = gene_id
                yield doc
Exemplo n.º 13
0
def restructure_dict(dictionary):
    """Wrap a ChEMBL molecule record into a ``{'_id', 'chembl'}`` document.

    ``_id`` is taken from ``'molecule_chembl_id'`` when present. When
    ``'molecule_structures'`` is a dict, its ``standard_inchi_key``,
    ``canonical_smiles`` and ``standard_inchi`` entries are copied onto
    the record as ``inchi_key``, ``smiles`` and ``inchi``. The
    ``'molecule_structures'`` entry itself is then removed, and the
    document goes through ``unlist``, ``dict_sweep``, ``value_convert``
    and ``boolean_convert``.

    Note: as before, ``dictionary`` itself is stored under ``'chembl'``
    and is modified in place.

    Args:
        dictionary: the molecule record (a dict).

    Returns:
        The restructured document.
    """
    # key inside 'molecule_structures' -> flattened key on the record
    structure_keys = {
        'standard_inchi_key': 'inchi_key',
        'canonical_smiles': 'smiles',
        'standard_inchi': 'inchi',
    }

    restr_dict = {}
    if 'molecule_chembl_id' in dictionary:
        restr_dict['_id'] = dictionary['molecule_chembl_id']
    restr_dict['chembl'] = dictionary

    structures = dictionary.get('molecule_structures')
    if isinstance(structures, dict):
        for source_key, value in structures.items():
            target_key = structure_keys.get(source_key)
            if target_key is not None:
                dictionary[target_key] = value

    # pop() rather than del: a record without 'molecule_structures' must not
    # raise KeyError.
    dictionary.pop('molecule_structures', None)

    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict,
                            vals=[
                                None, ".", "-", "", "NA", "None", "none", " ",
                                "Not Available", "unknown", "null"
                            ])
    restr_dict = value_convert(restr_dict,
                               skipped_keys=["chebi_par_id", "first_approval"])
    restr_dict = boolean_convert(restr_dict,
                                 added_keys=[
                                     "topical", "oral", "parenteral",
                                     "dosed_ingredient", "polymer_flag",
                                     "therapeutic_flag", "med_chem_friendly",
                                     "ro3_pass"
                                 ])
    return restr_dict
Exemplo n.º 14
0
    def load(self, aslist=False):
        '''
        Load the NCBI "gene_info" file and yield formatted gene documents.

        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        format of gene_info file:
        #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs
                 map_location description type_of_gene Symbol_from
                 nomenclature_authority Full_name_from_nomenclature_authority
                 Nomenclature_status Other_designations Modification_date
        (tab is used as a separator, pound sign - start of a comment)

        ``aslist`` is accepted but not used by this method.

        Each chunk read from ``self.datafile`` is converted with ``_ff`` and
        passed through ``self.format`` before being yielded.

        Raises:
            ValueError: when a dbXrefs entry does not split into exactly a
                database name and an id (the offending entry is printed
                first).
        '''
        gene_d = tab2dict_iter(self.datafile,
                               (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                               key=1,
                               alwayslist=0,
                               includefn=self.species_filter)

        def _ff(d):
            # d holds the ten selected columns other than the key (GeneID)
            (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
             description, type_of_gene, other_designations,
             modification_date) = d
            out = dict(taxid=int(taxid), symbol=symbol, name=description)
            # '-' marks an empty column in gene_info
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))
            if locus_tag != '-':
                out['locus_tag'] = locus_tag
            if other_designations != "-":
                out['other_names'] = normalized_value(
                    other_designations.split('|'))

            ### when merged, this will become the default timestamp
            ### as of 2017/12/10, some timestamps can have different formats
            ##if len(modification_date) > 8:
            ##    out["_timestamp"] = datetime.datetime.strptime(modification_date,"%m/%d/%Y %H:%M:%S")
            ##else:
            ##    out["_timestamp"] = datetime.datetime.strptime(modification_date,"%Y%m%d")

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and \
                        xd[0] in ['VGNC', 'HGNC', 'MGI']:
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except ValueError:
                    # unpacking fails only when xd does not have exactly two
                    # parts; show the raw entry before propagating the error
                    print(repr(x))
                    raise
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly
                if _db.lower() in ['ensembl', 'imgt/gene-db']:
                    # we don't need 'IMGT/GENE-DB" xref either, because they
                    # are mostly the same as gene symbol
                    continue
                # add "MGI:" prefix for MGI ids.
                if _db.lower() == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out

        # add entrezgene field
        for d in gene_d:
            d = value_convert(d, _ff)
            yield self.format(d)
Exemplo n.º 15
0
def _map_line_to_json(fields):
    """Map one row of column values onto a nested ``cadd`` document.

    This is a generator. It yields at most one document, shaped as
    ``{"_id": <hgvs id>, "cadd": {...}}``, and yields nothing when
    ``get_hgvs_from_vcf`` returns ``None`` for the row.

    ``fields`` is indexed positionally (0..115) and must contain exactly
    ``VALID_COLUMN_NO`` items, otherwise the leading ``assert`` fails.
    Before yielding, the document is passed through ``value_convert`` and
    ``unlist``, and ``dict_sweep`` is called with ``["NA"]``.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[0]
    chromStart = fields[1]
    ref = fields[2]
    alt = fields[4]
    # the document id is derived from chromosome, position, ref and alt
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)

    # load as json data
    if HGVS is None:
        # no id could be built for this row: end the generator without yielding
        return
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'mutindex': fields[29],
            'dna': {
                'helt': fields[30],
                'mgw': fields[31],
                'prot': fields[32],
                'roll': fields[33]
            },
            'mirsvr': {
                'score': fields[34],
                'e': fields[35],
                'aln': fields[36]
            },
            'targetscans': fields[37],
            'fitcons': fields[38],
            'chmm': {
                'tssa': fields[39],
                'tssaflnk': fields[40],
                'txflnk': fields[41],
                'tx': fields[42],
                'txwk': fields[43],
                'enh': fields[44],
                # NOTE(review): column 45 is deliberately left out below, so
                # 'enh' carries column 44 only. Both columns would map to the
                # same key; confirm against the input header which of the two
                # is intended.
                # 'enh': fields[45],
                'znfrpts': fields[46],
                'het': fields[47],
                'tssbiv': fields[48],
                'bivflnk': fields[49],
                'enhbiv': fields[50],
                'reprpc': fields[51],
                'reprpcwk': fields[52],
                'quies': fields[53],
            },
            'encode': {
                'exp': fields[54],
                'h3k27ac': fields[55],
                'h3k4me1': fields[56],
                'h3k4me3': fields[57],
                'nucleo': fields[58],
                'occ': fields[59],
                'p_val': {
                    'comb': fields[60],
                    'dnas': fields[61],
                    'faire': fields[62],
                    'polii': fields[63],
                    'ctcf': fields[64],
                    'mycp': fields[65]
                },
                'sig': {
                    'dnase': fields[66],
                    'faire': fields[67],
                    'polii': fields[68],
                    'ctcf': fields[69],
                    'myc': fields[70]
                },
            },
            'segway': fields[71],
            'motif': {
                'toverlap': fields[72],
                'dist': fields[73],
                'ecount': fields[74],
                'ename': fields[75],
                'ehipos': fields[76],
                'escorechng': fields[77]
            },
            'tf': {
                'bs': fields[78],
                'bs_peaks': fields[79],
                'bs_peaks_max': fields[80]
            },
            'isknownvariant': fields[81],
            'esp': {
                'af': fields[82],
                'afr': fields[83],
                'eur': fields[84]
            },
            '1000g': {
                'af': fields[85],
                'asn': fields[86],
                'amr': fields[87],
                'afr': fields[88],
                'eur': fields[89]
            },
            'min_dist_tss': fields[90],
            'min_dist_tse': fields[91],
            'gene': {
                'gene_id': fields[92],
                'feature_id': fields[93],
                'ccds_id': fields[94],
                'genename': fields[95],
                'cds': {
                    'cdna_pos': fields[96],
                    'rel_cdna_pos': fields[97],
                    'cds_pos': fields[98],
                    'rel_cds_pos': fields[99]
                },
                'prot': {
                    'protpos': fields[100],
                    'rel_prot_pos': fields[101],
                    'domain': fields[102]
                }
            },
            'dst2splice': fields[103],
            'dst2spltype': fields[104],
            'exon': fields[105],
            'intron': fields[106],
            'oaa': fields[107],   # ref aa
            'naa': fields[108],   # alt aa
            'grantham': fields[109],
            'polyphen': {
                'cat': fields[110],
                'val': fields[111]
            },
            'sift': {
                'cat': fields[112],
                'val': fields[113]
            },
            'rawscore': fields[114],    # raw CADD score
            'phred': fields[115]        # log-percentile of raw CADD score
        }
    }

    # convert values, run unlist, then sweep with "NA" as the value list
    obj = dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
    yield obj
Exemplo n.º 16
0
def restructure_dict(dictionary):
    """Flatten a nested "PC-Compound" record into ``{'_id', 'pubchem'}``.

    Four top-level keys of ``dictionary`` are read: ``PC-Compound_id``,
    ``PC-Compound_charge``, ``PC-Compound_props`` and
    ``PC-Compound_count``. Their contents are copied into a flat dict that
    is stored under ``'pubchem'``; ``_id`` is the compound id and
    ``pubchem['cid']`` is that id prefixed with ``'CID'``. The result is
    passed through ``value_convert`` before being returned.

    Raises:
        KeyError: when no compound id was found, since ``d["cid"]`` is read
            unconditionally at the end.
    """
    smile_dict = dict()
    iupac_dict = dict()
    d = dict()

    for key, value in iter(dictionary.items()):
        if key == "PC-Compound_id":
            # walk three nesting levels; the last innermost value seen wins
            for cnt in value:
                for m, n in iter(value[cnt].items()):
                    for x, y in iter(n.items()):
                        d["cid"] = y

        elif key == "PC-Compound_charge":
            d["formal_charge"] = dictionary[key]

        elif key == "PC-Compound_props":
            for cnt in value:
                for ele in value[cnt]:
                    for x, y in iter(ele.items()):
                        if x == "PC-InfoData_urn":
                            for i, j in iter(y.items()):
                                if i == "PC-Urn":
                                    val = ele["PC-InfoData_value"]
                                    # keep the last value of the value dict
                                    # NOTE(review): val1 is left unset (or
                                    # stale from a previous property) when
                                    # val is empty - confirm that cannot occur
                                    for z in val:
                                        val1 = val[z]
                                    # every value of the urn dict is compared
                                    # against the known property names
                                    for k, l in iter(j.items()):
                                        if l == "Hydrogen Bond Acceptor":
                                            d["hydrogen_bond_acceptor_count"] = val1

                                        elif l == "Hydrogen Bond Donor":
                                            d["hydrogen_bond_donor_count"] = val1

                                        elif l == "Rotatable Bond":
                                            d["rotatable_bond_count"] = val1

                                        elif l == "IUPAC Name":
                                            # keyed by the lower-cased urn name;
                                            # d["iupac"] is replaced, not merged,
                                            # on every match
                                            IUPAC = j["PC-Urn_name"]
                                            IUPAC = IUPAC.lower()
                                            iupac_dict[IUPAC] = val1
                                            d["iupac"] = iupac_dict
                                            iupac_dict = {}

                                        elif l == "InChI":
                                            d["inchi"] = val1
                                            # stop scanning this urn's remaining values
                                            break

                                        elif l == "InChIKey":
                                            d["inchi_key"] = val1
                                            # stop scanning this urn's remaining values
                                            break

                                        elif l == "Log P":
                                            d["xlogp"] = val1

                                        elif l == "Mass":
                                            d["exact_mass"] = val1

                                        elif l == "Molecular Formula":
                                            d["molecular_formula"] = val1

                                        elif l == "Molecular Weight":
                                            d["molecular_weight"] = val1

                                        elif l == "SMILES":
                                            # keyed by the lower-cased urn name;
                                            # d["smiles"] is replaced, not merged,
                                            # on every match
                                            smiles = j["PC-Urn_name"]
                                            smiles = smiles.lower()
                                            smile_dict[smiles] = val1
                                            d["smiles"] = smile_dict
                                            smile_dict = {}

                                        elif l == "Topological":
                                            d["topological_polar_surface_area"] = val1

                                        elif l == "Weight":
                                            d["monoisotopic_weight"] = val1

                                        elif l == "Compound Complexity":
                                            d["complexity"] = val1

        elif key == "PC-Compound_count":
            # rename the count entries to flat output field names
            for cnt in value:
                for x, y in iter(value[cnt].items()):
                    if x == "PC-Count_heavy-atom":
                        d["heavy_atom_count"] = y

                    elif x == "PC-Count_atom-chiral":
                        d["chiral_atom_count"] = y

                    elif x == "PC-Count_atom-chiral-def":
                        d["defined_atom_stereoceter_count"] = y

                    elif x == "PC-Count_atom-chiral-undef":
                        d["undefined_atom_stereoceter_count"] = y

                    elif x == "PC-Count_bond-chiral":
                        d["chiral_bond_count"] = y

                    elif x == "PC-Count_bond-chiral-def":
                        d["defined_bond_stereocenter_count"] = y

                    elif x == "PC-Count_bond-chiral-undef":
                        d["undefined_bond_stereocenter_count"] = y

                    elif x == "PC-Count_isotope-atom":
                        d["isotope_atom_count"] = y

                    elif x == "PC-Count_covalent-unit":
                        d["covalently-bonded_unit_count"] = y

                    elif x == "PC-Count_tautomers":
                        d["tautomers_count"] = y

    restr_dict = {}
    # _id keeps the bare id; the 'cid' field gets the 'CID' prefix
    # (the concatenation requires the id to be a str)
    restr_dict['_id'] = d["cid"]
    d["cid"] = 'CID' + restr_dict['_id']
    restr_dict["pubchem"] = d
    restr_dict = value_convert(restr_dict)
    return restr_dict
Exemplo n.º 17
0
def restructure_dict(dictionary):
    """Reshape one parsed drug record into a ``{'_id', 'drugbank'}`` document.

    ``dictionary`` maps record field names (``'name'``, ``'drugbank-id'``,
    ``'products'``, ...) to values.  Nested values use ``'#text'`` and
    ``'@attr'`` keys -- presumably xmltodict output for a DrugBank ``<drug>``
    element; confirm against the caller.  Hyphenated field names are stored
    with underscores, free-text pharmacology fields are grouped under
    ``'pharmacology'``, and repeated children (a list) as well as single
    children (one dict) are both accepted.

    Returns the assembled document after it has been passed through
    ``unlist``, ``dict_sweep``, ``boolean_convert`` and ``value_convert``.
    ``'_id'`` is only present when a ``'drugbank-id'`` entry carrying
    ``'#text'`` was seen.

    Raises ``KeyError`` when a container lacks its expected child key
    (e.g. ``'targets'`` without ``'target'``).
    """
    restr_dict = dict()
    d1 = dict()
    pred_properties_dict = {}
    products_list = []
    enzymes_list = []
    targets_list = []
    carriers_list = []
    transporters_list = []
    atccode_list = []

    # Fields copied into d1['pharmacology'] even when their value is empty.
    pharmacology_fields = (
        'description', 'indication', 'pharmacodynamics', 'mechanism-of-action',
        'toxicity', 'metabolism', 'absorption', 'half-life', 'protein-binding',
        'route-of-elimination', 'volume-of-distribution', 'clearance',
    )
    # Fields copied into d1['pharmacology'] only when non-empty.
    optional_pharmacology_fields = ('snp-effects', 'snp-adverse-drug-reactions')

    # Source product field -> output field; anything not listed is dropped.
    product_fields = {
        'name': 'name',
        'dosage-form': 'dosage_form',
        'strength': 'strength',
        'route': 'route',
        'over-the-counter': 'otc',
        'generic': 'generic',
        'ndc-id': 'ndc_id',
        'ndc-product-code': 'ndc_product_code',
        'dpd-id': 'dpd',
        'started-marketing-on': 'started_marketing_on',
        'ended-marketing-on': 'ended_marketing_on',
        'fda-application-number': 'fda_application_number',
        'approved': 'approved',
        'country': 'country',
        'source': 'source',
    }

    # External resources whose output key is not just the snake-cased name.
    identifier_overrides = {
        "Drugs Product Database (DPD)": 'dpd',
        "National Drug Code Directory": 'ndc_directory',
    }

    # Container field -> (child key, list collecting the converted children).
    protein_groups = {
        'targets': ('target', targets_list),
        'enzymes': ('enzyme', enzymes_list),
        'transporters': ('transporter', transporters_list),
        'carriers': ('carrier', carriers_list),
    }

    def _as_list(node):
        """Return a list as-is, wrap a single dict, and map anything else to []."""
        if isinstance(node, list):
            return node
        if isinstance(node, dict):
            return [node]
        return []

    def _snake(text):
        """Lower-case ``text`` and replace spaces and hyphens with underscores."""
        return text.lower().replace(' ', '_').replace('-', '_')

    def _pharmacology():
        """Return d1['pharmacology'], creating it on first use.

        The sub-dict used to be created only by the 'description' field, so
        any other pharmacology field seen first raised KeyError.
        """
        return d1.setdefault('pharmacology', {})

    def restr_product_dict(product):
        """Keep the known fields of one product under their output names."""
        products_dict = {}
        for field in product:
            if field in product_fields:
                products_dict[product_fields[field]] = product[field]
        return products_dict

    def restr_pathway_dict(pathway):
        """Keep id, name, drugs and enzymes of one pathway."""
        _dict = {}
        for field, content in pathway.items():
            if field == 'smpdb-id':
                _dict['smpdb_id'] = content
            elif field in ('name', 'enzymes'):
                _dict[field] = content
            elif field == 'drugs':
                _dict[field] = content['drug']
        return _dict

    def record_calculated_property(prop):
        """Store one calculated property and mirror well-known kinds into d1."""
        if not prop:
            return
        kind = prop['kind']
        prop_value = prop['value']
        pred_properties_dict[_snake(kind)] = prop_value

        if kind == "IUPAC Name":
            d1['iupac'] = prop_value
        elif kind == "SMILES":
            d1['smiles'] = prop_value
        elif kind == "Molecular Formula":
            d1['formula'] = prop_value
        elif kind == "InChI":
            d1['inchi'] = prop_value
        elif kind == "InChIKey":
            if prop_value.startswith('InChIKey='):
                prop_value = prop_value[len('InChIKey='):]
            d1['inchi_key'] = prop_value
        elif kind == "Molecular Weight":
            d1.setdefault('weight', {})['average'] = prop_value
        elif kind == "Monoisotopic Weight":
            # setdefault: must not depend on "Molecular Weight" coming first.
            d1.setdefault('weight', {})['monoisotopic'] = prop_value

    for key, value in dictionary.items():
        if key == 'name' and value:
            d1[key] = value

        elif key == 'drugbank-id' and value:
            id_list = []
            if isinstance(value, list):
                for entry in value:
                    if isinstance(entry, dict):
                        if '#text' in entry:
                            id_list.append(entry['#text'])
                            # The last entry carrying '#text' becomes the _id.
                            restr_dict['_id'] = entry['#text']
                    elif isinstance(entry, str):
                        id_list.append(entry)
                if id_list:
                    d1['accession_number'] = id_list
            elif isinstance(value, dict):
                if '#text' in value:
                    # NOTE(review): a single id is stored under 'drugbank_id'
                    # while a list goes to 'accession_number'; kept as-is,
                    # confirm which key consumers expect.
                    d1['drugbank_id'] = [value['#text']]
                    restr_dict['_id'] = value['#text']

        elif key in pharmacology_fields:
            _pharmacology()[key.replace('-', '_')] = value

        elif key in optional_pharmacology_fields and value:
            _pharmacology()[key.replace('-', '_')] = value

        elif key == 'groups' and value:
            for group in value.values():
                d1[key] = group

        elif key == 'classification' and value:
            # Stored unchanged: the original computed normalised key names
            # but never applied them.
            if isinstance(value, dict):
                d1['taxonomy'] = value

        elif key == 'salts' and value:
            salts_list = []
            for salt in value.values():
                if isinstance(salt, list):
                    for item in salt:
                        if isinstance(item, dict) and 'name' in item:
                            salts_list.append(item['name'])
                            d1[key] = salts_list
                elif isinstance(salt, dict):
                    d1[key] = salt['name']

        elif key == 'synonyms' and value:
            synonym_list = []
            if isinstance(value, dict):
                for group in value.values():
                    for synonym in _as_list(group):
                        if isinstance(synonym, dict) and '#text' in synonym:
                            synonym_list.append(synonym['#text'])
            if synonym_list:
                d1[key] = synonym_list

        elif key == 'products' and value:
            for group in value.values():
                for product in _as_list(group):
                    products_list.append(restr_product_dict(product))

        elif key == 'packagers' and value:
            pack_list = []
            for group in value.values():
                for packager in _as_list(group):
                    if isinstance(packager, dict) and packager.get('name'):
                        pack_list.append(packager['name'])
            if pack_list:
                d1[key] = pack_list

        elif key == 'manufacturers' and value:
            manuf_list = []
            for group in value.values():
                for manufacturer in _as_list(group):
                    if isinstance(manufacturer, dict) and '#text' in manufacturer:
                        manuf_list.append(manufacturer['#text'])
            if manuf_list:
                d1[key] = manuf_list

        elif key == 'categories' and value:
            for category in value.values():
                d1[key] = category

        elif key == 'affected-organisms' and value:
            _pharmacology()['affected_organisms'] = value['affected-organism']

        elif key == 'ahfs-codes' and value:
            for child in value:
                d1['ahfs_codes'] = value[child]

        elif key == 'food-interactions' and value:
            # Always use the underscored key; a single (non-list) interaction
            # used to be stored under the hyphenated field name.
            food_interaction_list = []
            for interaction in value.values():
                if isinstance(interaction, list):
                    if interaction:
                        food_interaction_list.extend(interaction)
                        d1['food_interactions'] = food_interaction_list
                else:
                    d1['food_interactions'] = interaction

        elif key == 'drug-interactions' and value:
            for interaction in value.values():
                d1['drug_interactions'] = interaction

        elif key == 'sequences' and value:
            for sequence in value.values():
                if isinstance(sequence, dict) and '@format' in sequence:
                    field = sequence['@format'] + '_sequences'
                    d1[field] = sequence['#text'].replace('\n', ' ')

        elif key == 'experimental-properties' and value:
            exp_properties = {}
            for group in value.values():
                for prop in _as_list(group):
                    if prop:
                        exp_properties[_snake(prop['kind'])] = prop['value']
                    d1['experimental_properties'] = exp_properties

        elif key == 'calculated-properties' and value:
            for group in value.values():
                for prop in _as_list(group):
                    record_calculated_property(prop)

        elif key == 'external-identifiers' and value:
            for identifier in _as_list(value['external-identifier']):
                if not isinstance(identifier, dict) or 'resource' not in identifier:
                    continue
                resource = identifier['resource']
                field = identifier_overrides.get(resource) or _snake(resource)
                d1[field] = identifier['identifier']

        elif key == 'external-links' and value:
            # A single link arrives as one dict; the original else-branch read
            # a stale loop variable instead of this value and hid the failure
            # behind a bare except.
            for link in _as_list(value['external-link']):
                if not isinstance(link, dict):
                    continue
                resource = link.get('resource')
                if isinstance(resource, str) and 'url' in link:
                    d1[resource.lower().replace('.', '_')] = link['url']

        elif key == 'patents' and value:
            if isinstance(value, dict):
                for child in value:
                    d1[key] = value[child]

        elif key == 'international-brands' and value:
            d1['international_brands'] = value['international-brand']

        elif key == 'mixtures' and value:
            d1[key] = value['mixture']

        elif key == 'pathways' and value:
            pathways = value['pathway']
            if isinstance(pathways, list):
                if pathways:
                    d1[key] = [restr_pathway_dict(pathway) for pathway in pathways]
            elif isinstance(pathways, dict):
                d1[key] = restr_pathway_dict(pathways)

        elif key in protein_groups and value:
            child_key, bucket = protein_groups[key]
            for protein in _as_list(value[child_key]):
                bucket.append(restr_protein_dict(protein))

        elif key == 'atc-codes' and value:
            for atc in _as_list(value['atc-code']):
                if isinstance(atc, dict) and '@code' in atc:
                    atccode_list.append(atc['@code'])

    d1['atc_codes'] = atccode_list
    d1['targets'] = targets_list
    d1['carriers'] = carriers_list
    d1['enzymes'] = enzymes_list
    d1['transporters'] = transporters_list
    d1['predicted_properties'] = pred_properties_dict
    d1['products'] = products_list
    restr_dict['drugbank'] = d1
    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict,vals=[None,".", "-", "", "NA", "none", " ", "Not Available", "unknown","null","None"])
    restr_dict = boolean_convert(restr_dict,added_keys=["mddr_like_rule","bioavailability","ghose_filter","rule_of_five"])
    restr_dict = value_convert(restr_dict,skipped_keys=["dpd","chemspider","chebi","pubchem_compound","pubchem_substance","bindingdb"])
    return restr_dict
Exemplo n.º 18
0
def _map_line_to_json(fields):
    """Yield one annotation document built from a row of column values.

    ``fields`` must hold exactly ``VALID_COLUMN_NO`` entries.  Columns 0, 1,
    2 and 4 (chrom, pos, ref, alt) are passed to ``get_hgvs_from_vcf`` to
    build the ``_id``; when that returns ``None`` nothing is yielded.
    Otherwise the columns are grouped under the ``"cadd"`` key and the
    document is passed through ``value_convert``, ``unlist`` and
    ``dict_sweep`` (dropping ``"NA"``) before being yielded.
    """
    assert len(fields) == VALID_COLUMN_NO

    variant_id = get_hgvs_from_vcf(fields[0], fields[1], fields[2], fields[4])
    if variant_id is None:
        return

    def _take(start, *names):
        # Map consecutive columns, beginning at ``start``, onto ``names``.
        return {name: fields[start + offset] for offset, name in enumerate(names)}

    # Keys are inserted in the column order of the input row.
    cadd = _take(0, 'chrom', 'pos', 'ref', 'anc', 'alt', 'type', 'length',
                 'istv', 'isderived', 'annotype', 'consequence', 'consscore',
                 'consdetail', 'gc', 'cpg')
    cadd['mapability'] = _take(15, '20bp', '35bp')
    cadd['scoresegdup'] = fields[17]
    cadd['phast_cons'] = _take(18, 'primate', 'mammalian', 'vertebrate')
    cadd['phylop'] = _take(21, 'primate', 'mammalian', 'vertebrate')
    cadd['gerp'] = _take(24, 'n', 's', 'rs', 'rs_pval')
    cadd.update(_take(28, 'bstatistic', 'mutindex'))
    cadd['dna'] = _take(30, 'helt', 'mgw', 'prot', 'roll')
    cadd['mirsvr'] = _take(34, 'score', 'e', 'aln')
    cadd.update(_take(37, 'targetscans', 'fitcons'))

    # Column 45 is not mapped: the original mapping had it commented out
    # under the same 'enh' key as column 44.  NOTE(review): confirm intent.
    chromatin = _take(39, 'tssa', 'tssaflnk', 'txflnk', 'tx', 'txwk', 'enh')
    chromatin.update(_take(46, 'znfrpts', 'het', 'tssbiv', 'bivflnk',
                           'enhbiv', 'reprpc', 'reprpcwk', 'quies'))
    cadd['chmm'] = chromatin

    encode = _take(54, 'exp', 'h3k27ac', 'h3k4me1', 'h3k4me3', 'nucleo', 'occ')
    encode['p_val'] = _take(60, 'comb', 'dnas', 'faire', 'polii', 'ctcf', 'mycp')
    encode['sig'] = _take(66, 'dnase', 'faire', 'polii', 'ctcf', 'myc')
    cadd['encode'] = encode

    cadd['segway'] = fields[71]
    cadd['motif'] = _take(72, 'toverlap', 'dist', 'ecount', 'ename',
                          'ehipos', 'escorechng')
    cadd['tf'] = _take(78, 'bs', 'bs_peaks', 'bs_peaks_max')
    cadd['isknownvariant'] = fields[81]
    cadd['esp'] = _take(82, 'af', 'afr', 'eur')
    cadd['1000g'] = _take(85, 'af', 'asn', 'amr', 'afr', 'eur')
    cadd.update(_take(90, 'min_dist_tss', 'min_dist_tse'))

    gene = _take(92, 'gene_id', 'feature_id', 'ccds_id', 'genename')
    gene['cds'] = _take(96, 'cdna_pos', 'rel_cdna_pos', 'cds_pos', 'rel_cds_pos')
    gene['prot'] = _take(100, 'protpos', 'rel_prot_pos', 'domain')
    cadd['gene'] = gene

    # oaa / naa: reference and alternate amino acid.
    cadd.update(_take(103, 'dst2splice', 'dst2spltype', 'exon', 'intron',
                      'oaa', 'naa', 'grantham'))
    cadd['polyphen'] = _take(110, 'cat', 'val')
    cadd['sift'] = _take(112, 'cat', 'val')
    # rawscore: raw CADD score; phred: log-percentile of the raw score.
    cadd.update(_take(114, 'rawscore', 'phred'))

    document = {"_id": variant_id, "cadd": cadd}
    converted = value_convert(document)
    flattened = unlist(converted)
    yield dict_sweep(flattened, ["NA"])
Exemplo n.º 19
0
def load_pharmgkb(data_folder):
    """Load the mapping read from ``genes.tsv`` inside ``<data_folder>/genes.zip``.

    Rows whose column 1 is empty are excluded.  Every value of the resulting
    dict is wrapped as ``{'pharmgkb': value}`` without descending into lists.
    """
    def _has_key_column(ld):
        # Keep only rows with a non-empty column 1.
        return ld[1] != ''

    def _wrap(value):
        return {'pharmgkb': value}

    archive = os.path.join(data_folder, 'genes.zip')
    mapping = tab2dict(
        (archive, 'genes.tsv'),
        (0, 1),
        1,
        header=1,
        includefn=_has_key_column,
    )
    return value_convert(mapping, _wrap, traverse_list=False)
Exemplo n.º 20
0
def load_all(data_folder):
    """Yield "uniprot" gene documents while building the PDB and PIR dicts.

    This is a generator. It reads ``idmapping_selected.tab.gz`` from
    *data_folder* line by line and:

    * yields one ``{"_id": <gene id>, ...}`` document per gene id found on
      each UniProt row (the remaining keys come from ``_dict_convert``);
    * accumulates gene id -> PDB values and gene id -> PIR values, which are
      dumped (pickled) to ``gene2pdb.pyobj`` and ``gene2pir.pyobj`` in
      *data_folder* once the whole file has been read.

    Because the dumps happen after the last ``yield``, the two ``.pyobj``
    files are only written when the generator is consumed to exhaustion.

    Rows that carry an Ensembl id but no Entrez id are postponed: the
    Ensembl -> Entrez mapping is only complete after the whole file has been
    read, so those rows are resolved in a second pass. If the Ensembl id
    still cannot be mapped, it is used as the gene id itself.

    :param data_folder: directory containing ``idmapping_selected.tab.gz``;
        also the directory the ``.pyobj`` dumps are written to
    :return: generator of gene documents (dicts with an ``"_id"`` key)
    """
    def first_colon_field(pdb_id):
        # keep only the part of the PDB value that precedes the first ':'
        return pdb_id.split(':')[0]

    def merge(xli, transcode=False):
        """Expand one UniProt row into (uniprot_acc, section, gene_id) tuples.

        Raises KeyError(ensembl_id) when the row only has an Ensembl id and
        *transcode* is false, so the caller can postpone the row.
        """
        uniprot_acc, section, entrez_id, ensembl_id = xli
        if entrez_id:
            return [(uniprot_acc, section, entrez_id)]
        if not ensembl_id:
            return []
        if not transcode:
            raise KeyError(ensembl_id)
        gene_ids = ensembl2geneid.get(ensembl_id)
        if gene_ids is None:
            # the Ensembl id cannot be mapped to an Entrez id: keep it as is
            return [(uniprot_acc, section, ensembl_id)]
        return [(uniprot_acc, section, _eid) for _eid in gene_ids]

    def transform(xli2):
        """Turn merged tuples into a list of documents, one per gene id."""
        gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
        gene2uniprot = value_convert(gene2uniprot,
                                     _dict_convert,
                                     traverse_list=False)
        docs = []
        for gid, uniprot in gene2uniprot.items():
            doc = {"_id": gid}
            doc.update(uniprot)
            docs.append(doc)
        return docs

    def merge_x(xli, gene2x, transcode=False, cvt_fn=None):
        """Append the value of one (entrez_id, ensembl_id, value) row to gene2x.

        Rows with an empty value are ignored. Raises KeyError(ensembl_id)
        when the row only has an Ensembl id and *transcode* is false; with
        *transcode* true this function never raises KeyError.
        """
        entrez_id, ensembl_id, x_value = xli

        if not x_value:
            return

        if cvt_fn:
            x_value = cvt_fn(x_value)

        if entrez_id:
            gene_ids = [entrez_id]
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            # fall back to the Ensembl id itself when it cannot be mapped
            gene_ids = x_ensembl2geneid.get(ensembl_id, [ensembl_id])
        else:
            gene_ids = []

        for gene_id in gene_ids:
            gene2x.setdefault(gene_id, []).append(x_value)

    def normalize(value, keyname):
        # de-duplicate and sort; a single value is stored bare, not as a list
        uniq = sorted(set(value))
        return {keyname: uniq if len(uniq) > 1 else uniq[0]}

    def normalize_pdb(value):
        return normalize(value, "pdb")

    def normalize_pir(value):
        return normalize(value, "pir")

    uniprot_datafile = os.path.join(data_folder, 'idmapping_selected.tab.gz')

    # Ensembl id -> [Entrez ids], built from the UniProt rows
    ensembl2geneid = {}
    # Ensembl id -> [Entrez ids], built from the PDB and PIR rows
    x_ensembl2geneid = {}

    # rows postponed until the mappings above are complete
    remains = []
    pdb_remains = []
    pir_remains = []

    # once filled, will be dumped for later storage
    gene2pdb = {}
    gene2pir = {}

    for ld in tabfile_feeder(uniprot_datafile,
                             header=1,
                             assert_column_no=VALID_COLUMN_NO):
        # raw lines for each source
        # NOTE(review): the UniProt row takes its Ensembl id from column 18
        # whereas the PDB/PIR rows take it from column 19 - confirm against
        # the data file layout that both are intended.
        uniprotld = [ld[0], ld[1], ld[2], ld[18]]
        pdbld = [ld[2], ld[19], ld[5]]
        pirld = [ld[2], ld[19], ld[11]]

        # UniProt: GeneID and EnsemblID columns may hold several values
        xlis = []
        for value in dupline_seperator(dupline=uniprotld,
                                       dup_idx=[2, 3],
                                       dup_sep='; '):
            value = list(value)
            value[1] = get_uniprot_section(value[1])
            xlis.append(tuple(value))
        pdbxlis = list(dupline_seperator(dupline=pdbld, dup_sep='; '))
        pirxlis = list(dupline_seperator(dupline=pirld, dup_sep='; '))

        for xli in xlis:
            # feed mapping
            if xli[2] != '' and xli[3] != '':
                ensembl2geneid.setdefault(xli[3], []).append(xli[2])
            # postpone ensemblid->entrezid resolution while parsing uniprot
            # as the full transcodification dict is only correct at the end.
            # ex:
            #     1. UniprotID-A    EntrezID-A  EnsemblID
            #     2. UniprotID-B                EnsemblID
            #     3. UniprotID-C    EntrezID-B  EnsemblID
            #
            #     UniprotID-B should be associated to both EntrezID-A and
            #     EntrezID-B but we need to read up to line 3 to do so
            try:
                xli2 = merge(xli, transcode=False)
                docs = transform(xli2) if xli2 else []
            except KeyError:
                remains.append(xli)
            else:
                for doc in docs:
                    yield doc

        # PDB first, then PIR; both feed the same Ensembl -> Entrez cache
        for x_rows, gene2x, x_remains, x_cvt in (
                (pdbxlis, gene2pdb, pdb_remains, first_colon_field),
                (pirxlis, gene2pir, pir_remains, None)):
            for xli in x_rows:
                if xli[0] != '' and xli[1] != '':
                    x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
                try:
                    merge_x(xli, gene2x, transcode=False, cvt_fn=x_cvt)
                except KeyError:
                    x_remains.append(xli)

    # second pass: transcode the postponed rows with what we have
    for remain in remains:
        try:
            xli2 = merge(remain, transcode=True)
            docs = transform(xli2) if xli2 else []
        except KeyError:
            # merge() does not raise with transcode=True; this only skips
            # a row if one of the conversion helpers raises KeyError
            continue
        for doc in docs:
            yield doc

    for remain in pdb_remains:
        merge_x(remain, gene2pdb, transcode=True, cvt_fn=first_colon_field)

    for remain in pir_remains:
        merge_x(remain, gene2pir, transcode=True)

    # PDB
    gene2pdb = value_convert(gene2pdb, normalize_pdb, traverse_list=False)
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    dump(gene2pdb, pdb_dumpfile)

    # PIR
    gene2pir = value_convert(gene2pir, normalize_pir, traverse_list=False)
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    dump(gene2pir, pir_dumpfile)