示例#1
0
 def _load_ensembl2entrez_li(self):
     """Build the (ensembl_gid, entrez_gid) pair list and store it on ``self``.

     Loads ``gene_ensembl__xref_entrezgene__dm.txt`` and then overrides its
     entries with the custom mapping from ``gene_ensembl__gene__extra.txt``
     (that file is generated first when it does not exist).  The flattened
     list of pairs is assigned to ``self.ensembl2entrez_li``; nothing is
     returned.
     """
     extra_path = os.path.join(DATA_FOLDER,
                               'gene_ensembl__gene__extra.txt')
     if not os.path.exists(extra_path):
         print("Missing extra mapping file, now generating")
         from . import ensembl_ncbi_mapping
         ensembl_ncbi_mapping.main(confirm=False)
     load_start(extra_path)
     custom = tab2dict(extra_path, (0, 1), 0, alwayslist=True)

     xref_path = os.path.join(DATA_FOLDER,
                              'gene_ensembl__xref_entrezgene__dm.txt')
     load_start(xref_path)
     mapping = tab2dict(xref_path, (1, 2), 0,
                        includefn=_not_LRG, alwayslist=True)

     # entries of the custom mapping take precedence over the xref ones
     for gid, custom_ids in custom.items():
         mapping[gid] = custom_ids

     # flatten {ensembl_gid: [entrez_gid, ...]} into a list of tuples
     pairs = [(gid, entrez_id)
              for gid, entrez_ids in mapping.items()
              for entrez_id in entrez_ids]
     load_done('[%d]' % len(pairs))
     self.ensembl2entrez_li = pairs
示例#2
0
 def _load_ensembl2entrez_li(self):
     """Build the (ensembl_gid, entrez_gid) pair list and store it on ``self``.

     The xref table ``gene_ensembl__xref_entrezgene__dm.txt`` is loaded and
     then extended (not replaced) with the custom mapping found in
     ``gene_ensembl__gene__extra.txt``, which is generated first when the
     file does not exist.  Sets ``self.ensembl2entrez_li``; returns nothing.
     """
     extra_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__extra.txt')
     if not os.path.exists(extra_path):
         print("Missing extra mapping file, now generating")
         from . import ensembl_ncbi_mapping
         ensembl_ncbi_mapping.main(confirm=False)
     load_start(extra_path)
     custom = tab2dict(extra_path, (0, 1), 0, alwayslist=True)

     xref_path = os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt')
     load_start(xref_path)
     mapping = tab2dict(xref_path, (1, 2), 0, includefn=_not_LRG, alwayslist=True)

     # Extend rather than replace: when all ensemblIDs are resolved to
     # EntrezIDs different from those defined in the xref file, there can be
     # "orphan" EntrezIDs with no EnsemblID associated any more (because they
     # were resolved).  Extending keeps the ensembl data for those.
     # NOTE(review): a key that exists only in the custom mapping is looked up
     # directly below -- confirm tab2dict's return type tolerates that.
     for gid, custom_ids in custom.items():
         mapping[gid].extend(custom_ids)

     # flatten {ensembl_gid: [entrez_gid, ...]} into a list of tuples
     pairs = [(gid, entrez_id)
              for gid, entrez_ids in mapping.items()
              for entrez_id in entrez_ids]
     load_done('[%d]' % len(pairs))
     self.ensembl2entrez_li = pairs
示例#3
0
def loaddata():
    """Load the GNF1H and GNF1M annotation tables.

    Returns a dict with the keys ``'GNF1H'`` and ``'GNF1M'``, each holding the
    mapping ``tab2dict`` builds from the corresponding file under
    ``DATA_FOLDER/gnf``.
    """
    def _has_sixth_field(ld):
        # keep only rows long enough to have a non-empty field at index 5
        return len(ld) > 5 and ld[5] != ''

    def _load(filename):
        path = os.path.join(DATA_FOLDER, 'gnf', filename)
        load_start(path)
        mapping = tab2dict(path, (0, 5), 1, header=0, includefn=_has_sixth_field)
        load_done('[%d]' % len(mapping))
        return mapping

    # GNF1H first, then GNF1m -- same order as the progress messages
    gnf1h = _load('GNF1H.ANNO7.LOAD_20130402.tab')
    gnf1m = _load('gnf1m.NEW_ANNO6.LOAD_20130402.tab')
    return {'GNF1H': gnf1h, 'GNF1M': gnf1m}
示例#4
0
def loaddata():
    """Load the Snowball array annotation from the ``pigatlas`` folder.

    Returns ``{'snowball': mapping}`` where ``mapping`` is whatever
    ``tab2dict`` builds from ``snowball_array_annotation.txt``.
    """
    annotation_path = os.path.join(DATA_FOLDER, 'pigatlas', 'snowball_array_annotation.txt')
    load_start(annotation_path)
    snowball = tab2dict(annotation_path, (0, 1), 1, header=0)
    load_done('[%d]' % len(snowball))
    return dict(snowball=snowball)
示例#5
0
 def load(self):
     """Load GeneRIF records from ``self.datafile``.

     Each row kept by ``tab2dict`` becomes a ``{'pubmed': ..., 'text': ...}``
     entry (the pubmed value goes through ``self._cvt_pubmed``); the entries
     of one key are wrapped as ``{'generif': [...]}``.  Returns the converted
     mapping.
     """
     def _as_generif(rows):
         entries = []
         for row in rows:
             entries.append({'pubmed': self._cvt_pubmed(row[0]), 'text': row[1]})
         return {'generif': entries}

     load_start(self.datafile)
     by_gene = tab2dict(self.datafile, (1, 2, 4), 0, alwayslist=1)
     by_gene = dict_convert(by_gene, valuefn=_as_generif)
     load_done('[%d]' % len(by_gene))
     return by_gene
示例#6
0
def load_broadinstitute_exac_any(one_file, key):
    """Load one ExAC constraint file into ``{transcript: {"exac": {...}}}``.

    ``one_file`` is a file name relative to ``DATA_FOLDER``; ``key`` is the
    name under which the float statistics are nested in each record.  The
    outer dict is keyed by the transcript id without its version suffix,
    while the record keeps the versioned id under ``"transcript"``.
    """
    int_fields = ("n_exons", "cds_start", "cds_end", "bp")
    float_fields = ("mu_syn", "mu_mis", "mu_lof",
                    "n_syn", "n_mis", "n_lof",
                    "exp_syn", "exp_mis", "exp_lof",
                    "syn_z", "mis_z", "lof_z",
                    "p_li", "p_rec", "p_null")
    first_float = len(int_fields)

    print("Loading file %s (%s)" % (one_file,key))
    # column 0 plus columns 3..21
    columns = (0,) + tuple(range(3, 22))
    data = tab2dict(os.path.join(DATA_FOLDER, one_file), columns, 0)

    exacs = {}
    for transcript, values in data.items():
        record = {"transcript": transcript}  # versioned id is kept here
        for pos, name in enumerate(int_fields):
            record[name] = int(values[pos])
        record[key] = {name: float(values[first_float + pos])
                       for pos, name in enumerate(float_fields)}
        # remove version in key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {"exac": record}
    return exacs
示例#7
0
    def load_ensembl2acc(self):
        """
        Load Ensembl gene ids with their transcript and protein ids.

        Reads ``gene_ensembl__translation__main.txt`` and builds, per gene id,
        ``{'ensembl': {'gene': ..., 'transcript': ..., 'protein': ...}}``
        (the transcript/protein keys are only set when a usable value
        exists).  The mapping is then passed through ``self.convert2entrez``
        and that result is returned.
        """
        #Loading all ensembl GeneIDs, TranscriptIDs and ProteinIDs
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
        load_start(DATAFILE)
        ensembl2acc = tab2dict(DATAFILE, (1, 2, 3), 0, includefn=_not_LRG)

        def _usable(value):
            # empty values and the literal '\N' placeholder are skipped
            return bool(value) and value != '\\N'

        def _fn(x, eid):
            out = {'gene': eid}
            # isinstance() replaces the Python-2-only types.ListType check
            if isinstance(x, list):
                transcript_li = [_x[0] for _x in x if _usable(_x[0])]
                # The protein column is validated on its own value; the
                # previous code tested the transcript column here, which let
                # empty protein ids through and dropped valid ones.
                protein_li = [_x[1] for _x in x if _usable(_x[1])]

                if transcript_li:
                    out['transcript'] = normalized_value(transcript_li)
                if protein_li:
                    out['protein'] = normalized_value(protein_li)
            else:
                if _usable(x[0]):
                    out['transcript'] = x[0]
                if _usable(x[1]):
                    out['protein'] = x[1]
            return out

        for k in ensembl2acc:
            ensembl2acc[k] = {'ensembl': _fn(ensembl2acc[k], k)}

        load_done('[%d]' % len(ensembl2acc))
        return self.convert2entrez(ensembl2acc)
示例#8
0
 def load_ensembl2pfam(self):
     """Load the Ensembl -> Pfam table and convert it via ``convert2entrez``.

     Every value of the de-duplicated mapping is wrapped as
     ``{'pfam': value}`` (lists are wrapped as a whole, not per element).
     """
     # Pfam
     pfam_file = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
     load_start(pfam_file)
     raw = tab2dict(pfam_file, (1, 4), 0)
     pfam_by_gene = dict_nodup(raw)

     def _wrap(value):
         return {'pfam': value}

     pfam_by_gene = value_convert(pfam_by_gene, _wrap, traverse_list=False)
     load_done('[%d]' % len(pfam_by_gene))
     return self.convert2entrez(pfam_by_gene)
示例#9
0
 def load_ensembl2pfam(self):
     """Return the Pfam annotation per gene, after ``convert2entrez``.

     Reads ``gene_ensembl__prot_pfam__dm.txt``, removes duplicates with
     ``dict_nodup`` and nests each value under the ``'pfam'`` key without
     descending into lists.
     """
     # Pfam
     source = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
     load_start(source)
     gene2pfam = tab2dict(source, (1, 4), 0)
     gene2pfam = dict_nodup(gene2pfam)
     gene2pfam = value_convert(gene2pfam,
                               lambda pfam: dict(pfam=pfam),
                               traverse_list=False)
     load_done('[%d]' % len(gene2pfam))
     return self.convert2entrez(gene2pfam)
def loaddata():
    """Read the Snowball array annotation file.

    The file lives at ``DATA_FOLDER/pigatlas/snowball_array_annotation.txt``;
    the mapping built by ``tab2dict`` is returned under the ``'snowball'``
    key.
    """
    path_parts = ('pigatlas', 'snowball_array_annotation.txt')
    source = os.path.join(DATA_FOLDER, *path_parts)
    load_start(source)
    mapping = tab2dict(source, (0, 1), 1, header=0)
    load_done('[%d]' % len(mapping))
    result = {}
    result['snowball'] = mapping
    return result
示例#11
0
 def load_ensembl2pos(self):
     """Load genomic positions per Ensembl gene and convert via ``convert2entrez``.

     Each row becomes ``{'chr', 'start', 'end', 'strand'}`` (start, end and
     strand as ints) and the value of every gene is nested under
     ``'genomic_pos'``.
     """
     def _as_position(x):
         return {'chr': x[2], 'start': int(x[0]), 'end': int(x[1]), 'strand': int(x[3])}

     def _wrap(positions):
         return {'genomic_pos': positions}

     # Genomic position
     source = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(source)
     rows = tab2dict(source, (1, 3, 4, 5, 6), 0, includefn=_not_LRG)
     gene2pos = dict_nodup(rows)
     gene2pos = value_convert(gene2pos, _as_position)
     gene2pos = value_convert(gene2pos, _wrap, traverse_list=False)
     load_done('[%d]' % len(gene2pos))
     return self.convert2entrez(gene2pos)
示例#12
0
 def _load_ensembl_2taxid(self):
     """Return the ensembl2taxid mapping with taxids converted to ``int``.

     Data comes from ``gene_ensembl__translation__main.txt``; duplicates are
     removed with ``dict_nodup`` before the conversion.
     """
     source = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
     load_start(source)
     raw = tab2dict(source, (0, 1), 1, includefn=_not_LRG)
     taxid_by_gene = dict_nodup(raw)
     # taxids are read as text; callers need integers
     taxid_by_gene = value_convert(taxid_by_gene, int)
     load_done('[%d]' % len(taxid_by_gene))
     return taxid_by_gene
示例#13
0
 def load_ensembl2pos(self):
     """Return ``{'genomic_pos': ...}`` per gene, after ``convert2entrez``.

     Reads ``gene_ensembl__gene__main.txt``; each position is a dict with
     ``chr``, ``start``, ``end`` and ``strand`` (the last three as ints).
     """
     # Genomic position
     main_file = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(main_file)
     positions = dict_nodup(
         tab2dict(main_file, (1, 3, 4, 5, 6), 0, includefn=_not_LRG))

     def _to_pos(fields):
         return dict(chr=fields[2],
                     start=int(fields[0]),
                     end=int(fields[1]),
                     strand=int(fields[3]))

     positions = value_convert(positions, _to_pos)
     # wrap the whole value (list or single dict) under one key
     positions = value_convert(positions,
                               lambda pos: dict(genomic_pos=pos),
                               traverse_list=False)
     load_done('[%d]' % len(positions))
     return self.convert2entrez(positions)
示例#14
0
 def load_ensembl2interpro(self):
     """Load InterPro annotations per gene and convert via ``convert2entrez``.

     Each row becomes ``{'id', 'short_desc', 'desc'}`` and the value of every
     gene is nested under ``'interpro'``.
     """
     def _as_entry(x):
         return {'id': x[0], 'short_desc': x[1], 'desc': x[2]}

     def _wrap(entries):
         return {'interpro': entries}

     # Interpro
     source = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
     load_start(source)
     rows = tab2dict(source, (1, 4, 5, 6), 0)
     gene2interpro = dict_nodup(rows)
     gene2interpro = value_convert(gene2interpro, _as_entry)
     gene2interpro = value_convert(gene2interpro, _wrap, traverse_list=False)
     load_done('[%d]' % len(gene2interpro))
     return self.convert2entrez(gene2interpro)
示例#15
0
 def _load_ensembl_2taxid(self):
     """Build ensembl2taxid from ``gene_ensembl__translation__main.txt``.

     Returns the de-duplicated mapping with every taxid turned into an
     ``int``.
     """
     translation_file = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
     load_start(translation_file)

     def _to_int(taxid):
         # need to convert taxid to integer here
         return int(taxid)

     gene2taxid = tab2dict(translation_file, (0, 1), 1, includefn=_not_LRG)
     gene2taxid = value_convert(dict_nodup(gene2taxid), _to_int)
     load_done('[%d]' % len(gene2taxid))
     return gene2taxid
示例#16
0
 def load_ensembl2interpro(self):
     """Return ``{'interpro': ...}`` per gene, after ``convert2entrez``.

     Reads ``gene_ensembl__prot_interpro__dm.txt``; every InterPro entry is a
     dict with the keys ``id``, ``short_desc`` and ``desc``.
     """
     # Interpro
     interpro_file = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
     load_start(interpro_file)
     entries = dict_nodup(tab2dict(interpro_file, (1, 4, 5, 6), 0))

     def _to_entry(fields):
         return dict(id=fields[0], short_desc=fields[1], desc=fields[2])

     entries = value_convert(entries, _to_entry)
     # wrap the whole value (list or single dict) under one key
     entries = value_convert(entries,
                             lambda entry: dict(interpro=entry),
                             traverse_list=False)
     load_done('[%d]' % len(entries))
     return self.convert2entrez(entries)
示例#17
0
def load_pharmgkb():
    """Load the PharmGKB table from ``genes.tsv`` inside ``genes.zip``.

    Rows whose field at index 1 is empty are skipped.  Every value of the
    resulting mapping is wrapped as ``{'pharmgkb': value}`` and the mapping
    is returned.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    archive = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(archive)

    def _has_second_field(ld):
        return ld[1] != ''

    def _wrap(value):
        return {'pharmgkb': value}

    mapping = tab2dict((archive, 'genes.tsv'), (0, 1), 1, header=1,
                       includefn=_has_second_field)
    mapping = value_convert(mapping, _wrap, traverse_list=False)

    load_done('[%d]' % len(mapping))
    return mapping
示例#18
0
 def _load_ensembl2entrez_li(self):
     """Populate ``self.ensembl2entrez_li`` with (ensembl_gid, entrez_gid) tuples.

     Source is ``gene_ensembl__xref_entrezgene__dm.txt``; any gene id that
     also appears in ``gene_ensembl__gene__extra.txt`` gets its entrez ids
     replaced by the ones from that file.  The extra file is generated when
     it is missing.  Nothing is returned.
     """
     def _data_path(name):
         return os.path.join(DATA_FOLDER, name)

     override_file = _data_path('gene_ensembl__gene__extra.txt')
     if not os.path.exists(override_file):
         print("Missing extra mapping file, now generating")
         from . import ensembl_ncbi_mapping
         ensembl_ncbi_mapping.main(confirm=False)
     load_start(override_file)
     overrides = tab2dict(override_file, (0, 1), 0, alwayslist=True)

     xref_file = _data_path('gene_ensembl__xref_entrezgene__dm.txt')
     load_start(xref_file)
     gene2entrez = tab2dict(xref_file, (1, 2), 0, includefn=_not_LRG, alwayslist=True)

     # replace with our custom mapping
     for gene_id in overrides:
         gene2entrez[gene_id] = overrides[gene_id]

     # back to list of tuples
     flattened = []
     for gene_id, entrez_ids in gene2entrez.items():
         flattened.extend((gene_id, entrez_id) for entrez_id in entrez_ids)
     load_done('[%d]' % len(flattened))
     self.ensembl2entrez_li = flattened
示例#19
0
def _load_affy(df):
    """Load an Affymetrix CSV annotation file into a gene -> value mapping.

    ``df`` is a file path.  When it ends in ``.zip`` it is passed to
    ``tab2dict`` as ``(path, name_without_extension)``.  Keys that hold
    several gene ids separated by ``' /// '`` are split, and the value is
    registered under each id through ``dict_apply``.
    """
    rawfile, ext = os.path.splitext(os.path.basename(df))
    if ext.lower() == '.zip':
        df = (df, rawfile)

    def _has_geneid(ld):
        # skip short rows, unassigned probes and the header cell
        return len(ld) > 18 and ld[18] not in ('---', 'Entrez Gene')

    dd = tab2dict(df, (0, 18), 1, sep=',', header=1, includefn=_has_geneid)

    # fix for keys like "472 /// 4863" for multiple geneids; a key without
    # the separator splits into just itself
    gene2affy = {}
    for combined_key in dd:
        for geneid in combined_key.split(' /// '):
            dict_apply(gene2affy, geneid.strip(), dd[combined_key])
    return gene2affy
示例#20
0
    def load(self, aslist=False):
        """Load the gene -> unigene mapping from ``self.datafile``.

        Only rows whose first field, as an int, is found in the result of
        ``get_geneid_d(self.species_li)`` are kept.  Returns
        ``{gid: {'unigene': ...}}``, or ``dict_to_list`` of it when
        ``aslist`` is true.
        """
        load_start(self.datafile)
        print()
        geneid_d = get_geneid_d(self.species_li)

        def _known_gene(ld):
            return int(ld[0]) in geneid_d

        unigene_by_gene = tab2dict(self.datafile, (0, 1), 0, alwayslist=0,
                                   includefn=_known_gene)
        gene_d = {gid: {'unigene': unigene}
                  for gid, unigene in unigene_by_gene.items()}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
示例#21
0
    def load(self, aslist=False):
        """Return unigene data keyed by gene id.

        Rows of ``self.datafile`` are filtered against the gene ids returned
        by ``get_geneid_d(self.species_li)``.  Each kept gene maps to
        ``{'unigene': value}``; with ``aslist`` true the mapping is passed
        through ``dict_to_list`` first.
        """
        load_start(self.datafile)
        print()
        allowed_ids = get_geneid_d(self.species_li)
        raw = tab2dict(self.datafile, (0, 1), 0, alwayslist=0,
                       includefn=lambda fields: int(fields[0]) in allowed_ids)

        wrapped = {}
        for gene_id in raw:
            wrapped[gene_id] = dict(unigene=raw[gene_id])
        load_done('[%d]' % len(wrapped))

        if not aslist:
            return wrapped
        return dict_to_list(wrapped)
示例#22
0
    def load_ensembl2acc(self):
        """
        Load Ensembl gene ids with their transcript/protein ids.

        Per gene the result holds ``{'ensembl': {...}}`` with the keys
        ``gene``, ``translation`` (a list of ``{"rna", "protein"}`` pairs
        where both ids are usable) and, when available, ``transcript`` and
        ``protein``.  The mapping is returned after ``self.convert2entrez``.
        """
        #Loading all ensembl GeneIDs, TranscriptIDs and ProteinIDs
        source = os.path.join(DATA_FOLDER,
                              'gene_ensembl__translation__main.txt')
        load_start(source)
        ensembl2acc = tab2dict(source, (1, 2, 3), 0, includefn=_not_LRG)

        def _clean(value):
            # empty values and the '\N' placeholder count as missing
            return value if value and value != '\\N' else None

        def _fn(x, eid):
            out = {'gene': eid, 'translation': []}

            def mapping(transcript_id, protein_id):
                trid = _clean(transcript_id)
                pid = _clean(protein_id)
                if trid and pid:
                    out['translation'].append({"rna": trid, "protein": pid})

            if not isinstance(x, list):
                # single row for this gene
                if _clean(x[0]):
                    out['transcript'] = x[0]
                if _clean(x[1]):
                    out['protein'] = x[1]
                mapping(x[0], x[1])
                return out

            transcript_li = []
            protein_li = []
            for row in x:
                if _clean(row[0]):
                    transcript_li.append(row[0])
                if _clean(row[1]):
                    protein_li.append(row[1])
                mapping(row[0], row[1])

            if transcript_li:
                out['transcript'] = normalized_value(transcript_li)
            if protein_li:
                out['protein'] = normalized_value(protein_li)
            return out

        for gene_id in ensembl2acc:
            ensembl2acc[gene_id] = {'ensembl': _fn(ensembl2acc[gene_id], gene_id)}

        load_done('[%d]' % len(ensembl2acc))
        return self.convert2entrez(ensembl2acc)
示例#23
0
 def _load_ensembl2name(self):
     """Load the ensembl gene -> symbol + name mapping.

     Values are dicts that may hold ``symbol`` and ``name``; a field is left
     out when its source value is empty or ``'\\N'``.  The name is the text
     ``SubStr`` extracts in front of ``' [Source:'``.
     """
     missing = ('', '\\N')

     def _fn(x):
         out = {}
         symbol = x[0].strip()
         if symbol not in missing:
             out['symbol'] = symbol
         description = x[1].strip()
         if description not in missing:
             name = SubStr(description, '', ' [Source:').strip()
             if name:
                 out['name'] = name
         return out

     source = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(source)
     gene2name = tab2dict(source, (1, 2, 7), 0, includefn=_not_LRG)
     gene2name = value_convert(gene2name, _fn)
     load_done('[%d]' % len(gene2name))
     return gene2name
示例#24
0
    def load(self, aslist=False):
        """Load GO annotations per gene from ``self.datafile``.

        Rows are filtered with ``self.species_filter``.  For every gene the
        annotations are grouped under ``'MF'``, ``'BP'`` or ``'CC'``; a
        category with a single record holds that record directly instead of
        a one-element list.  Returns ``{gid: {'go': ...}}``, or
        ``dict_to_list`` of it when ``aslist`` is true.
        """
        load_start(self.datafile)
        gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0,
                           alwayslist=1, includefn=self.species_filter)
        category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

        def _ff(d):
            grouped = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
                short_category = category_d[gocategory]
                rec = {'id': goid, 'term': goterm}
                if evidence != '-':
                    rec['evidence'] = evidence
                if qualifier != '-':
                    # also fixes some inconsistency issues in NCBI data:
                    # Colocalizes_with -> colocalizes_with
                    # Contributes_with -> contributes_with
                    # Not -> NOT
                    fixed = qualifier.replace('Co', 'co')
                    rec['qualifier'] = fixed.replace('Not', 'NOT')
                if pubmed != '-':
                    if '|' in pubmed:
                        rec['pubmed'] = [int(pid) for pid in pubmed.split('|')]
                    else:
                        rec['pubmed'] = int(pubmed)
                grouped.setdefault(short_category, []).append(rec)
            # unwrap categories that ended up with exactly one record
            for category, records in grouped.items():
                if len(records) == 1:
                    grouped[category] = records[0]
            return grouped

        gene2go = dict_convert(gene2go, valuefn=_ff)
        gene_d = {gid: {'go': go} for gid, go in gene2go.items()}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
示例#25
0
    def load_ensembl2acc(self):
        """
        Load ensembl to transcripts/proteins data.

        Builds ``{gene_id: {'ensembl': {...}}}`` where the inner dict always
        has ``gene`` and ``translation`` and gets ``transcript`` / ``protein``
        when usable ids exist, then returns ``self.convert2entrez`` of it.
        """
        #Loading all ensembl GeneIDs, TranscriptIDs and ProteinIDs
        translation_file = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
        load_start(translation_file)
        ensembl2acc = tab2dict(translation_file, (1, 2, 3), 0, includefn=_not_LRG)

        def _is_set(value):
            # '' / None and the '\N' placeholder are treated as "no value"
            return bool(value) and value != '\\N'

        def _fn(x, eid):
            translation = []
            out = {'gene': eid, 'translation': translation}

            def mapping(transcript_id, protein_id):
                # record the pair only when both sides are usable
                if _is_set(transcript_id) and _is_set(protein_id):
                    translation.append({"rna": transcript_id, "protein": protein_id})

            if isinstance(x, list):
                transcript_li = [pair[0] for pair in x if _is_set(pair[0])]
                protein_li = [pair[1] for pair in x if _is_set(pair[1])]
                for pair in x:
                    mapping(pair[0], pair[1])

                if transcript_li:
                    out['transcript'] = normalized_value(transcript_li)
                if protein_li:
                    out['protein'] = normalized_value(protein_li)
            else:
                transcript_id, protein_id = x[0], x[1]
                if _is_set(transcript_id):
                    out['transcript'] = transcript_id
                if _is_set(protein_id):
                    out['protein'] = protein_id
                mapping(transcript_id, protein_id)

            return out

        for gene_id in ensembl2acc:
            converted = _fn(ensembl2acc[gene_id], gene_id)
            ensembl2acc[gene_id] = {'ensembl': converted}

        load_done('[%d]' % len(ensembl2acc))
        return self.convert2entrez(ensembl2acc)
示例#26
0
    def load(self, aslist=False):
        """Return GO annotations keyed by gene id.

        ``self.datafile`` is read with ``self.species_filter`` as row filter.
        Each gene maps to ``{'go': {category: record_or_list}}`` with the
        categories ``MF``, ``BP`` and ``CC``.  When ``aslist`` is true the
        result goes through ``dict_to_list``.
        """
        category_d = {'Function': 'MF',
                      'Process': 'BP',
                      'Component': 'CC'}

        def _parse_pubmed(pubmed):
            # several ids are separated by '|'; a single id stays a scalar
            if pubmed.find('|') == -1:
                return int(pubmed)
            return [int(pid) for pid in pubmed.split('|')]

        def _ff(d):
            out = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
                _gocategory = category_d[gocategory]
                bucket = out.get(_gocategory, [])
                record = dict(id=goid, term=goterm)
                if evidence != '-':
                    record['evidence'] = evidence
                if qualifier != '-':
                    # normalise inconsistent capitalisation in NCBI data:
                    # Colocalizes_with -> colocalizes_with
                    # Contributes_with -> contributes_with
                    # Not -> NOT
                    record['qualifier'] = qualifier.replace('Co', 'co').replace('Not', 'NOT')
                if pubmed != '-':
                    record['pubmed'] = _parse_pubmed(pubmed)
                bucket.append(record)
                out[_gocategory] = bucket
            # a lone record is stored directly, not as a one-element list
            singles = [k for k in out if len(out[k]) == 1]
            for k in singles:
                out[k] = out[k][0]
            return out

        load_start(self.datafile)
        gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0, alwayslist=1,
                           includefn=self.species_filter)
        gene2go = dict_convert(gene2go, valuefn=_ff)

        gene_d = {}
        for gid in gene2go:
            gene_d[gid] = dict(go=gene2go[gid])
        load_done('[%d]' % len(gene_d))

        if not aslist:
            return gene_d
        return dict_to_list(gene_d)
示例#27
0
    def _load_ensembl2name(self):
        """Return the ensembl gene -> ``{'symbol', 'name'}`` mapping.

        Reads ``gene_ensembl__gene__main.txt``.  ``symbol`` and ``name`` are
        only present when the stripped source value is neither empty nor
        ``'\\N'``; the name is cut in front of ``' [Source:'`` via ``SubStr``.
        """
        main_file = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
        load_start(main_file)
        gene2name = tab2dict(main_file, (1, 2, 7), 0, includefn=_not_LRG)

        def _present(text):
            return text != '' and text != '\\N'

        def _fn(x):
            out = {}
            symbol = x[0].strip()
            if _present(symbol):
                out['symbol'] = symbol
            description = x[1].strip()
            if not _present(description):
                return out
            name = SubStr(description, '', ' [Source:').strip()
            if name:
                out['name'] = name
            return out

        gene2name = value_convert(gene2name, _fn)
        load_done('[%d]' % len(gene2name))
        return gene2name
示例#28
0
    def load(self, aslist=False):
        """Load retired gene ids per gene from ``self.datafile``.

        Rows whose field at index 1 is ``'-'`` are skipped; when
        ``self.species_li`` is set, rows whose first field (as int) is not in
        ``self.taxid_set`` are skipped too.  Returns
        ``{gid: {'retired': ...}}`` with the retired ids converted to ints
        and normalised, or ``dict_to_list`` of it when ``aslist`` is true.
        """
        load_start(self.datafile)
        restrict_species = bool(self.species_li)

        def _includefn(ld):
            if restrict_species and int(ld[0]) not in self.taxid_set:
                return False
            return ld[1] != '-'

        def _as_int_ids(ids):
            return normalized_value([int(one_id) for one_id in ids])

        gene2retired = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                                includefn=_includefn)
        gene2retired = dict_convert(gene2retired, valuefn=_as_int_ids)

        gene_d = {gid: {'retired': retired}
                  for gid, retired in gene2retired.items()}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
示例#29
0
    def load(self, aslist=False):
        """Load rna/protein/genomic accessions per gene from ``self.datafile``.

        Rows are filtered with ``self.species_filter``.  Per gene the values
        are collected under ``rna``, ``protein``, ``genomic`` and
        ``translation`` (rna/protein pairs), normalised with
        ``normalized_value``, stripped of empty fields and nested under
        ``self.fieldname``.  A gene without any value maps to an empty dict.
        Returns the mapping, or ``dict_to_list`` of it when ``aslist`` is
        true.
        """
        load_start(self.datafile)
        gene2acc = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                            includefn=self.species_filter)

        def _ff(d):
            collected = {
                'rna': [],
                'protein': [],
                'genomic': [],
                'translation': []
            }
            for row in d:
                # '-' marks a missing accession
                rna, prot, dna = (None if field == '-' else field for field in row)
                for field_name, accession in (('rna', rna),
                                              ('protein', prot),
                                              ('genomic', dna)):
                    if accession is not None:
                        collected[field_name].append(accession)
                if rna and prot:
                    collected['translation'].append({'rna' : rna, 'protein' : prot})

            # remove dup, then drop fields that ended up empty
            kept = {}
            for field_name, values in collected.items():
                normalized = normalized_value(values)
                if normalized:
                    kept[field_name] = normalized
            if not kept:
                return kept
            return {self.fieldname: kept}

        gene2acc = dict_convert(gene2acc, valuefn=_ff)
        load_done('[%d]' % len(gene2acc))

        return dict_to_list(gene2acc) if aslist else gene2acc
示例#30
0
    def load(self, aslist=False):
        """Return ``{gid: {'retired': ...}}`` built from ``self.datafile``.

        A row is kept when its field at index 1 is not ``'-'`` and, if
        ``self.species_li`` is set, its first field (as int) belongs to
        ``self.taxid_set``.  Retired ids are converted to ints and passed
        through ``normalized_value``.  With ``aslist`` true the result is
        converted by ``dict_to_list``.
        """
        load_start(self.datafile)

        def _not_dash(fields):
            return fields[1] != '-'

        def _in_species_and_not_dash(fields):
            return int(fields[0]) in self.taxid_set and fields[1] != '-'

        row_filter = _in_species_and_not_dash if self.species_li else _not_dash
        retired_by_gene = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                                   includefn=row_filter)
        retired_by_gene = dict_convert(
            retired_by_gene,
            valuefn=lambda ids: normalized_value([int(i) for i in ids]))

        gene_d = {}
        for gid in retired_by_gene:
            gene_d[gid] = dict(retired=retired_by_gene[gid])
        load_done('[%d]' % len(gene_d))

        if not aslist:
            return gene_d
        return dict_to_list(gene_d)
示例#31
0
 def load_ensembl2pos(self):
     """Load genomic positions per Ensembl gene and convert via ``convert2entrez``.

     Each position dict carries ``ensemblgene``, ``chr``, ``start``, ``end``
     and ``strand`` (the last three as ints); the value of every gene is
     nested under ``'genomic_pos'``.
     """
     def _as_position(x):
         return {'ensemblgene': x[0], 'chr': x[3], 'start': int(x[1]),
                 'end': int(x[2]), 'strand': int(x[4])}

     def _wrap(positions):
         return {'genomic_pos': positions}

     # Genomic position
     source = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(source)
     # Column 1 is listed twice: the first occurrence is the dict key, the
     # second keeps the gene id available inside genomic_pos.
     rows = tab2dict(source, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG)
     gene2pos = dict_nodup(rows)
     gene2pos = value_convert(gene2pos, _as_position)
     gene2pos = value_convert(gene2pos, _wrap, traverse_list=False)
     load_done('[%d]' % len(gene2pos))
     return self.convert2entrez(gene2pos)
示例#32
0
def _load_affy(df):
    """Build a gene id -> value mapping from an Affymetrix CSV annotation.

    ``df`` is the path of the annotation file; for a ``.zip`` file the pair
    ``(path, file_name_without_extension)`` is handed to ``tab2dict``.  A key
    listing several gene ids (``"472 /// 4863"``) contributes its value to
    each of them through ``dict_apply``.
    """
    basename = os.path.basename(df)
    rawfile, ext = os.path.splitext(basename)
    source = (df, rawfile) if ext.lower() == '.zip' else df

    def _keep(ld):
        if len(ld) <= 18:
            return False
        return ld[18] != '---' and ld[18] != 'Entrez Gene'

    dd = tab2dict(source, (0, 18), 1, sep=',', header=1, includefn=_keep)

    # keys like "472 /// 4863" carry multiple geneids; splitting a key
    # without the separator yields the key itself
    gene2affy = {}
    for raw_key in dd:
        geneids = raw_key.split(' /// ')
        for geneid in geneids:
            dict_apply(gene2affy, geneid.strip(), dd[raw_key])
    return gene2affy
示例#33
0
def load_exons_for_species(species, exons_key='exons'):
    """Load exon structures per gene for one species.

    Reads ``refFlat.txt.gz`` and ``refLink.txt.gz`` from
    ``DATA_FOLDER/<species>/database``.

    :param species: sub-folder name of the species under ``DATA_FOLDER``.
    :param exons_key: key under which the per-refseq records are stored for
        each gene (default ``'exons'``).
    :return: ``{geneid: {exons_key: {refseq: record}}}``; refseqs without a
        gene id, or mapped to ``'0'``, are left out.
    :raises AssertionError: when the number of exon pairs of a row differs
        from the count in its field 8.
    """
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')
    reflink_file = os.path.join(DATA_FOLDER, species, 'database/refLink.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    ref2exons = []
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chrom = ld[2]
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        # Start/end lists end with a trailing comma, hence the "if x" filter.
        exon_starts = [int(x) for x in ld[9].split(',') if x]
        exon_ends = [int(x) for x in ld[10].split(',') if x]
        # zip() is a lazy iterator on Python 3: materialise it so len() works
        # and the stored record holds real data rather than a one-shot
        # iterator.
        exons = list(zip(exon_starts, exon_ends))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.append((refseq, {
            'chr': chrom,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'exons': exons
        }))
    ref2exons = list2dict(ref2exons, 0)

    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: {refseq: ref2exons[refseq]}}
            else:
                gene2exons[geneid][exons_key][refseq] = ref2exons[refseq]

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
示例#34
0
def _load_affy(df):
    """Build a gene id -> value mapping from a ``gene_assignment`` column.

    ``df`` is the path of the CSV annotation file; for a ``.zip`` file the
    pair ``(path, file_name_without_extension)`` is handed to ``tab2dict``.
    Keys are split on ``'///'`` into assignments and each assignment on
    ``'//'``; the last piece, stripped, is used as gene id unless it is
    ``'---'``.
    """
    rawfile, ext = os.path.splitext(os.path.basename(df))
    if ext.lower() == '.zip':
        df = (df, rawfile)

    def _is_assigned(ld):
        return len(ld) > 7 and ld[7] not in ('---', 'gene_assignment')

    dd = tab2dict(df, (0, 7), 1, sep=',', header=1, includefn=_is_assigned)

    # fix for keys like "472 /// 4863" for multiple geneids
    gene2affy = {}
    for assignment_key in dd:
        assignments = assignment_key.split('///')
        if len(assignments) > 1:
            candidates = [one.split('//') for one in assignments]
        else:
            # a lone assignment only counts when it has '//'-separated parts
            pieces = assignment_key.split('//')
            candidates = [pieces] if len(pieces) > 1 else []
        for pieces in candidates:
            geneid = pieces[-1].strip()
            if geneid != '---':
                dict_apply(gene2affy, geneid, dd[assignment_key])

    return gene2affy
示例#35
0
    def load(self, aslist=False):
        """Load rna/protein/genomic accessions per gene from ``self.datafile``.

        Rows are filtered with ``self.species_filter``.  The version suffix
        (everything from the first dot) is trimmed from every accession and
        ``'-'`` values are skipped.  Per gene the normalised, non-empty
        fields are nested under ``self.fieldname``; a gene without any value
        maps to an empty dict.  Returns the mapping, or ``dict_to_list`` of
        it when ``aslist`` is true.
        """
        field_names = ('rna', 'protein', 'genomic')

        def _ff(d):
            collected = {name: [] for name in field_names}
            for row in d:
                rna, protein, genomic = row
                for name, accession in zip(field_names, (rna, protein, genomic)):
                    if accession != '-':
                        # trim version number after dot
                        collected[name].append(accession.split('.')[0])

            # remove dup, then drop rna/protein/genomic fields that are empty
            kept = {}
            for name, accessions in collected.items():
                normalized = normalized_value(accessions)
                if normalized:
                    kept[name] = normalized
            if not kept:
                return kept
            return {self.fieldname: kept}

        load_start(self.datafile)
        gene2acc = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                            includefn=self.species_filter)
        gene2acc = dict_convert(gene2acc, valuefn=_ff)
        load_done('[%d]' % len(gene2acc))

        return dict_to_list(gene2acc) if aslist else gene2acc
示例#36
0
def load_broadinstitute_exac_any(one_file, key):
    """Load one ExAC constraint file into per-transcript documents.

    ``one_file`` is resolved relative to ``DATA_FOLDER`` and read through
    ``tab2dict`` using column 0 as the key and columns 3 to 21 as values.

    :param one_file: file name, relative to ``DATA_FOLDER``.
    :param key: name of the sub-document that receives the float metrics.
    :return: dict keyed by the transcript id up to its first dot; each value
             is ``{"exac": {...}}`` holding the full transcript id, four
             integer fields and the ``key`` sub-document of float metrics.
    """
    print("Loading file %s (%s)" % (one_file, key))

    wanted_columns = (0,) + tuple(range(3, 22))
    table = tab2dict(os.path.join(DATA_FOLDER, one_file), wanted_columns, 0)

    # float-valued metrics, in the order they follow the four integer fields
    metric_names = (
        "mu_syn", "mu_mis", "mu_lof",
        "n_syn", "n_mis", "n_lof",
        "exp_syn", "exp_mis", "exp_lof",
        "syn_z", "mis_z", "lof_z",
        "p_li", "p_rec", "p_null",
    )

    exacs = {}
    for transcript, row in table.items():
        doc = {
            "transcript": transcript,  # the version suffix is kept here
            "n_exons": int(row[0]),
            "cds_start": int(row[1]),
            "cds_end": int(row[2]),
            "bp": int(row[3]),
        }
        doc[key] = {
            name: float(row[position])
            for position, name in enumerate(metric_names, 4)
        }
        # the version is dropped from the dictionary key so that entries
        # can be looked up easily later
        exacs[transcript.split(".")[0]] = {"exac": doc}
    return exacs
示例#37
0
import os.path
import time
from biothings.utils.common import timesofar
from utils.dataload import (load_start, load_done, tab2dict,
                            tabfile_feeder, list2dict)

from dataload import get_data_folder

# Module-level locations of the UCSC downloads used by this loader.
# (previous location, kept for reference:)
# DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
# Root folder of the per-species data; load_exons_for_species() reads
# '<species>/database/refFlat.txt.gz' underneath it.
DATA_FOLDER = os.path.join(get_data_folder('ucsc'), 'goldenPath/currentGenomes')
REFLINK_FILE = os.path.join(get_data_folder('ucsc'), 'goldenPath/hgFixed/database/refLink.txt.gz')
# NOTE: this lookup is built when the module is imported, so importing the
# module reads REFLINK_FILE. It is built from columns 2 and 6 of that file;
# presumably a refseq accession -> gene id mapping (going by the name) --
# TODO confirm the column meaning against the UCSC refLink schema.
refseq2gene = tab2dict(REFLINK_FILE, (2, 6), 0, alwayslist=False)


def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                     [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq,[]).append({
            'transcript' : refseq,
示例#38
0
    def load(self, aslist=False):
        '''
        Load the NCBI "gene_info" file into basic gene documents.

        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        Format of the gene_info file (tab is used as a separator, pound
        sign - start of a comment), with the column index used below:

             0 tax_id          8 description
             1 GeneID          9 type_of_gene
             2 Symbol         10 Symbol_from_nomenclature_authority
             3 LocusTag       11 Full_name_from_nomenclature_authority
             4 Synonyms       12 Nomenclature_status
             5 dbXrefs        13 Other_designations
             6 chromosome     14 Modification_date
             7 map_location

        :param aslist: when true, return ``dict_to_list(...)`` of the
                       documents instead of the dictionary itself.
        :return: dictionary of gene documents keyed by the GeneID column
                 (or its ``dict_to_list`` form when ``aslist`` is true).
        :raises ValueError: when a dbXrefs entry cannot be split into a
                            ``db:id`` pair; the offending entry is printed
                            before the error propagates.
        '''
        load_start(self.datafile)
        gene_d = tab2dict(self.datafile, (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                          key=1,
                          alwayslist=0,
                          includefn=self.species_filter)

        def _ff(d):
            # "d" holds the selected columns of one row, minus the key column
            (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
             description, type_of_gene, other_designations,
             modification_date) = d
            out = dict(taxid=int(taxid), symbol=symbol, name=description)
            # '-' is the placeholder for an empty field
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))
            if locus_tag != '-':
                out['locus_tag'] = locus_tag
            if other_designations != "-":
                out['other_names'] = normalized_value(
                    other_designations.split('|'))

            # when merged, this will become the default timestamp
            out["_timestamp"] = datetime.datetime.strptime(
                modification_date, "%Y%m%d")

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and \
                        xd[0] in ['VGNC', 'HGNC', 'MGI']:
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except ValueError:
                    # only a wrong number of parts can fail here; show the
                    # offending xref, then let the error propagate
                    print(repr(x))
                    raise
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly
                if _db.lower() in ['ensembl', 'imgt/gene-db']:
                    # we don't need 'IMGT/GENE-DB" xref either, because they
                    # are mostly the same as gene symbol
                    continue
                # add "MGI:" prefix for MGI ids.
                if _db.lower() == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out

        gene_d = value_convert(gene_d, _ff)

        # add entrezgene field
        for geneid in gene_d:
            d = gene_d[geneid]
            d['entrezgene'] = int(geneid)
            gene_d[geneid] = d

        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
示例#39
0
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''Return a dictionary of current/retired geneid to current geneid mapping.

    This is useful when other annotations were mapped to geneids that may
    contain retired gene ids.

    If species_li is None, genes from all species are loaded.

    Note that all ids are int type.

    :param species_li: optional list of species names; each one is looked up
                       in ``taxid_d`` to get the taxids to keep.
    :param load_cache: when true, reuse 'gene/geneid_d.pyobj' (relative to
                       DATA_FOLDER) if ``file_newer`` reports it newer than
                       both 'gene/gene_info.gz' and 'gene/gene_history.gz'.
    :param save_cache: when true, write the freshly built mapping (together
                       with the taxid set it was built for) to that file.
    :raises AssertionError: when the cache file was built for a different
                            set of taxids than the one requested.

    The working directory is switched to DATA_FOLDER while the function
    runs and is always restored afterwards, including on errors.
    '''
    if species_li:
        taxid_set = {taxid_d[species] for species in species_li}
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)
    try:
        # check cache file
        _cache_file = 'gene/geneid_d.pyobj'
        if load_cache and os.path.exists(_cache_file) and \
           file_newer(_cache_file, 'gene/gene_info.gz') and \
           file_newer(_cache_file, 'gene/gene_history.gz'):

            print('Loading "geneid_d" from cache file...', end='')
            _taxid_set, out_d = loadobj(_cache_file)
            # explicit raise (rather than "assert") so that the check is
            # still performed when python runs with -O
            if _taxid_set != taxid_set:
                raise AssertionError(
                    'cached taxid set %r does not match requested %r'
                    % (_taxid_set, taxid_set))
            print('Done.')
            return out_d

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
        load_start(DATAFILE)
        if species_li:
            def species_filter(ld):
                return int(ld[0]) in taxid_set
        else:
            species_filter = None
        geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
        load_done('[%d]' % len(geneid_li))

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
        load_start(DATAFILE)

        # keep a history row only when its column 1 id is present in
        # geneid_li (and, if species were given, its taxid is wanted)
        if species_li:
            def _includefn(ld):
                return int(ld[0]) in taxid_set and ld[1] in geneid_li
        else:
            def _includefn(ld):
                return ld[1] in geneid_li    # include all species
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                                includefn=_includefn)

        load_done('[%d]' % len(retired2gene))

        # convert key/value to int
        out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
        # every id listed in gene_info maps to itself
        for g in geneid_li:
            _g = int(g)
            out_d[_g] = _g

        if save_cache:
            # taxid_set is None when no species list was given
            dump((taxid_set, out_d), _cache_file)

        return out_d
    finally:
        # restore the caller's working directory, even on errors
        os.chdir(orig_cwd)
示例#40
0
    def load(self, aslist=False):
        '''
        Load the NCBI "gene_info" file into basic gene documents.

        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        Format of the gene_info file (tab is used as a separator, pound
        sign - start of a comment):
        #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome
                 map_location description type_of_gene
                 Symbol_from_nomenclature_authority
                 Full_name_from_nomenclature_authority Nomenclature_status
                 Other_designations Modification_date

        :param aslist: when true, return ``dict_to_list(...)`` of the
                       documents instead of the dictionary itself.
        :return: dictionary of gene documents keyed by the GeneID column
                 (or its ``dict_to_list`` form when ``aslist`` is true).
        :raises ValueError: when a dbXrefs entry cannot be split into a
                            ``db:id`` pair; the offending entry is printed
                            before the error propagates.
        '''
        load_start(self.datafile)
        gene_d = tab2dict(self.datafile, (0, 1, 2, 4, 5, 7, 8, 9), key=1,
                          alwayslist=0, includefn=self.species_filter)

        def _ff(d):
            # "d" holds the selected columns of one row, minus the key column
            (
                taxid, symbol, synonyms,
                dbxrefs, map_location,
                description, type_of_gene
            ) = d
            out = dict(taxid=int(taxid),
                       symbol=symbol,
                       name=description)
            # '-' is the placeholder for an empty field
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and \
                        xd[0] in ['HGNC', 'MGI']:
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except ValueError:
                    # only a wrong number of parts can fail here; show the
                    # offending xref, then let the error propagate
                    print(x)
                    raise
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly; we don't need 'IMGT/GENE-DB" xref either,
                # because they are mostly the same as gene symbol
                if _db.lower() in ['ensembl', 'imgt/gene-db']:
                    continue
                # add "MGI:" prefix for MGI ids.
                if _db.lower() == 'mgi':
                    _id = "MGI:"+_id
                out[_db] = _id
            return out

        gene_d = value_convert(gene_d, _ff)

        # add entrezgene field
        for geneid in gene_d:
            d = gene_d[geneid]
            d['entrezgene'] = int(geneid)
            gene_d[geneid] = d

        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
示例#41
0
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''Return a dictionary of current/retired geneid to current geneid mapping.

    This is useful when other annotations were mapped to geneids that may
    contain retired gene ids.

    If species_li is None, genes from all species are loaded.

    Note that all ids are int type.

    :param species_li: optional list of species names; each one is looked up
                       in ``TAXONOMY`` to get the taxids to keep.
    :param load_cache: when true, reuse 'gene/geneid_d.pyobj' (relative to
                       DATA_FOLDER) if ``file_newer`` reports it newer than
                       both 'gene/gene_info.gz' and 'gene/gene_history.gz'.
    :param save_cache: when true, write the freshly built mapping (together
                       with the taxid set it was built for) to that file.
    :raises AssertionError: when the cache file was built for a different
                            set of taxids than the one requested.

    The working directory is switched to DATA_FOLDER while the function
    runs and is always restored afterwards, including on errors.
    '''
    if species_li:
        taxid_set = {TAXONOMY[species] for species in species_li}
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)
    try:
        # check cache file
        _cache_file = 'gene/geneid_d.pyobj'
        if load_cache and os.path.exists(_cache_file) and \
           file_newer(_cache_file, 'gene/gene_info.gz') and \
           file_newer(_cache_file, 'gene/gene_history.gz'):

            print('Loading "geneid_d" from cache file...', end='')
            _taxid_set, out_d = loadobj(_cache_file)
            # explicit raise (rather than "assert") so that the check is
            # still performed when python runs with -O
            if _taxid_set != taxid_set:
                raise AssertionError(
                    'cached taxid set %r does not match requested %r'
                    % (_taxid_set, taxid_set))
            print('Done.')
            return out_d

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
        load_start(DATAFILE)
        if species_li:
            def species_filter(ld):
                return int(ld[0]) in taxid_set
        else:
            species_filter = None
        geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
        load_done('[%d]' % len(geneid_li))

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
        load_start(DATAFILE)

        # keep a history row only when its column 1 id is present in
        # geneid_li (and, if species were given, its taxid is wanted)
        if species_li:
            def _includefn(ld):
                return int(ld[0]) in taxid_set and ld[1] in geneid_li
        else:
            def _includefn(ld):
                return ld[1] in geneid_li  # include all species
        retired2gene = tab2dict(DATAFILE, (1, 2),
                                1,
                                alwayslist=0,
                                includefn=_includefn)

        load_done('[%d]' % len(retired2gene))
        # convert key/value to int
        out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
        # every id listed in gene_info maps to itself
        for g in geneid_li:
            _g = int(g)
            out_d[_g] = _g

        if save_cache:
            # taxid_set is None when no species list was given
            dump((taxid_set, out_d), _cache_file)

        return out_d
    finally:
        # restore the caller's working directory, even on errors
        os.chdir(orig_cwd)
示例#42
0
import os.path
import time
from biothings.utils.common import timesofar
from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder,
                            list2dict)

from dataload import get_data_folder

# Module-level locations of the UCSC downloads used by this loader.
# (previous location, kept for reference:)
# DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
# Root folder of the per-species data; load_exons_for_species() reads
# '<species>/database/refFlat.txt.gz' underneath it.
DATA_FOLDER = os.path.join(get_data_folder('ucsc'),
                           'goldenPath/currentGenomes')
REFLINK_FILE = os.path.join(get_data_folder('ucsc'),
                            'goldenPath/hgFixed/database/refLink.txt.gz')
# NOTE: this lookup is built when the module is imported, so importing the
# module reads REFLINK_FILE. It is built from columns 2 and 6 of that file;
# presumably a refseq accession -> gene id mapping (going by the name) --
# TODO confirm the column meaning against the UCSC refLink schema.
refseq2gene = tab2dict(REFLINK_FILE, (2, 6), 0, alwayslist=False)


def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species,
                                'database/refFlat.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = list(
            zip([int(x) for x in ld[9].split(',') if x],