Example #1
def load_x(idx, fieldname, cvt_fn=None):
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, *(2,19,idx))    # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld,
                                       dup_sep='; '):
            xli.append(value)

    ensembl2geneid = list2dict(list_nondup([(x[1], x[0]) for x in xli if x[0]!='' and x[1]!='']), 0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(xli2), 0)
    fn = lambda value: {fieldname: sorted(value) if isinstance(value, list) else value}
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))

    return gene2x
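Note: load_x above is a generic column loader: idx is a 0-based column index in idmapping_selected.tab.gz and fieldname is the key used in the returned gene documents (Example #27 below is the documented variant). A minimal, hypothetical wrapper could look like the sketch below; the column index and field name are assumptions for illustration, not values from the original module.

# Hypothetical usage; column index and field name are illustrative assumptions.
def load_pdb():
    # assume column 5 holds PDB accessions; they end up under the 'pdb' field
    return load_x(5, 'pdb')

gene2pdb = load_pdb()   # {entrez_or_ensembl_id: {'pdb': value_or_sorted_list}, ...}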
Example #2
def loaddata():
    #Snowball array
    DATAFILE = os.path.join(DATA_FOLDER, 'pigatlas', 'snowball_array_annotation.txt')
    load_start(DATAFILE)
    gene2snowball = tab2dict(DATAFILE, (0, 1), 1, header=0)
    load_done('[%d]' % len(gene2snowball))
    return {'snowball': gene2snowball}
Example #3
    def load(self):
        load_start(self.datafile)
        gene2generif = tab2dict(self.datafile, (1, 2, 4), 0, alwayslist=1)
        gene2generif = dict_convert(gene2generif, valuefn=lambda v: {
            'generif': [dict(pubmed=self._cvt_pubmed(x[0]), text=x[1]) for x in v]})
        load_done('[%d]' % len(gene2generif))
        return gene2generif
Example #4
def load_cpdb():
        
    print('DATA_FOLDER: '+ DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'))
    arr = {}
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        with open(DATA_FILE, "r") as f:
            for line in f:
                line = line.rstrip('\n')
                cols = line.split("\t")
                # last column holds comma-separated entrez gene ids;
                # the preceding columns are pathway name, id and source
                genes = cols[-1].split(",")
                p_name, p_id, p_source = cols[-4], cols[-3], cols[-2].lower()
                for gene in genes:
                    if gene != "entrez_gene_ids" and gene in arr:
                        if p_source not in arr[gene]['pathway']:
                            arr[gene]['pathway'][p_source] = {'name': ''}
                        arr[gene]['pathway'][p_source]['name'] = p_name
                        if p_id != "None":
                            if p_source == "kegg":
                                arr[gene]['pathway'][p_source]['id'] = p_id.replace("path:", "")
                            else:
                                arr[gene]['pathway'][p_source]['id'] = p_id
                    else:
                        if p_id != "None":
                            arr[gene] = {'pathway': {p_source: {'name': p_name, 'id': p_id.replace("path:", "")}}}

        load_done('[%d]' % len(arr))

    return arr
Example #5
def load_broadinstitute_exac():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()
    for k,v in load_broadinstitute_exac_nontcga().items():
        try:
            exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
        except KeyError:
            exacs[k] = v
    for k,v in load_broadinstitute_exac_nonpsych().items():
        try:
            exacs[k]["exac"]["nonpsych"] = v["exac"]["nonpsych"]
        except KeyError:
            exacs[k] = v

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ensembl_parser = ensembl_base.EnsemblParser()
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    ensembl_dir = get_data_folder("ensembl")  
    for line in tabfile_feeder(os.path.join(ensembl_dir,"gene_ensembl__translation__main.txt")):
        _,ensid,transid,_ = line
        if transid in exacs:
            data = exacs.pop(transid) # pop so no-match means no data in the end
            for entrezid in ensembl2entrez.get(ensid,[ensid]):
                exacs[entrezid] = data

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))

    return exacs
Example #6
    def _load_ensembl2entrez_li(self):
        """gene_ensembl__xref_entrezgene__dm"""
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt')
        load_start(DATAFILE)
        ensembl2entrez_li = tab2list(DATAFILE, (1, 2), includefn=_not_LRG)   # [(ensembl_gid, entrez_gid),...]
        load_done('[%d]' % len(ensembl2entrez_li))
        self.ensembl2entrez_li = ensembl2entrez_li
Example #7
    def load_ensembl2acc(self):
        """
        loading ensembl to transcripts/proteins data
        """
        #Loading all ensembl GeneIDs, TranscriptIDs and ProteinIDs
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
        load_start(DATAFILE)
        ensembl2acc = tab2dict(DATAFILE, (1,2,3), 0, includefn=_not_LRG)
        def _fn(x, eid):
            out = {'gene': eid}
            if isinstance(x, list):
                transcript_li = []
                protein_li = []
                for _x in x:
                    if _x[0] and _x[0] != '\\N':
                        transcript_li.append(_x[0])
                    if _x[1] and _x[1] != '\\N':
                        protein_li.append(_x[1])

                if transcript_li:
                    out['transcript'] = normalized_value(transcript_li)
                if protein_li:
                    out['protein'] = normalized_value(protein_li)
            else:
                if x[0] and x[0] != '\\N':
                    out['transcript'] = x[0]
                if x[1] and x[1] != '\\N':
                    out['protein'] = x[1]
            return out

        for k in ensembl2acc:
            ensembl2acc[k] = {'ensembl': _fn(ensembl2acc[k], k)}

        load_done('[%d]' % len(ensembl2acc))
        return self.convert2entrez(ensembl2acc)
Example #8
    def load_ensembl2pfam(self):
        # Pfam
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
        load_start(DATAFILE)
        ensembl2pfam = dict_nodup(tab2dict(DATAFILE, (1, 4), 0))
        ensembl2pfam = value_convert(ensembl2pfam, lambda x: {'pfam': x}, traverse_list=False)
        load_done('[%d]' % len(ensembl2pfam))
        return self.convert2entrez(ensembl2pfam)
Example #9
    def load_ensembl2pos(self):
        # Genomic position
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
        load_start(DATAFILE)
        ensembl2pos = dict_nodup(tab2dict(DATAFILE, (1, 3, 4, 5, 6), 0, includefn=_not_LRG))
        ensembl2pos = value_convert(ensembl2pos, lambda x: {'chr': x[2], 'start': int(x[0]), 'end': int(x[1]), 'strand': int(x[3])})
        ensembl2pos = value_convert(ensembl2pos, lambda x: {'genomic_pos': x}, traverse_list=False)
        load_done('[%d]' % len(ensembl2pos))
        return self.convert2entrez(ensembl2pos)
Example #10
    def load_ensembl2interpro(self):
        # Interpro
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
        load_start(DATAFILE)
        ensembl2interpro = dict_nodup(tab2dict(DATAFILE, (1, 4, 5, 6), 0))
        ensembl2interpro = value_convert(ensembl2interpro, lambda x: {'id': x[0], 'short_desc': x[1], 'desc': x[2]})
        ensembl2interpro = value_convert(ensembl2interpro, lambda x: {'interpro': x}, traverse_list=False)
        load_done('[%d]' % len(ensembl2interpro))
        return self.convert2entrez(ensembl2interpro)
Example #11
    def _load_ensembl_2taxid(self):
        """ensembl2taxid"""
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
        load_start(DATAFILE)
        ensembl2taxid = dict_nodup(tab2dict(DATAFILE, (0, 1), 1, includefn=_not_LRG))
        # need to convert taxid to integer here
        ensembl2taxid = value_convert(ensembl2taxid, lambda x: int(x))
        load_done('[%d]' % len(ensembl2taxid))
        return ensembl2taxid
Example #12
def load_pharmgkb():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(DATAFILE)
    gene2pharmgkb = tab2dict((DATAFILE, 'genes.tsv'), (0, 1), 1, header=1, includefn=lambda ld: ld[1] != '')
    fn = lambda value: {'pharmgkb': value}
    gene2pharmgkb = value_convert(gene2pharmgkb, fn, traverse_list=False)

    load_done('[%d]' % len(gene2pharmgkb))

    return gene2pharmgkb
Example #13
def loaddata():
    affy_d = {}
    for annot in AFFY_ANNOT_FILES:
        name = annot['name']
        DATAFILE = os.path.join(AFFY_DATA_FOLDER, annot['file'] % AFFY_RELEASE)
        load_start(DATAFILE)
        d = _load_affy(DATAFILE)
        affy_d[name] = d
        load_done('[%d]' % len(d))

    return affy_d
Example #14
def load_ucsc_exons():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    species_li = os.listdir(DATA_FOLDER)
    print "Found {} species folders.".format(len(species_li))
    t0 = time.time()
    gene2exons = {}
    for species in species_li:
        print(species, '...')
        gene2exons.update(load_exons_for_species(species))

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
Example #15
def loaddata():
    #GNF1H
    DATAFILE = os.path.join(DATA_FOLDER, 'gnf', 'GNF1H.ANNO7.LOAD_20130402.tab')
    load_start(DATAFILE)
    gene2gnf1h = tab2dict(DATAFILE, (0, 5), 1, header=0, includefn=lambda ld: len(ld) > 5 and ld[5] != '')
    load_done('[%d]' % len(gene2gnf1h))
    #GNF1m
    DATAFILE = os.path.join(DATA_FOLDER, 'gnf', 'gnf1m.NEW_ANNO6.LOAD_20130402.tab')
    load_start(DATAFILE)
    gene2gnf1m = tab2dict(DATAFILE, (0, 5), 1, header=0, includefn=lambda ld: len(ld) > 5 and ld[5] != '')
    load_done('[%d]' % len(gene2gnf1m))

    return {'GNF1H': gene2gnf1h,
            'GNF1M': gene2gnf1m}
Example #16
    def load(self, aslist=False):
        load_start(self.datafile)
        print()
        geneid_d = get_geneid_d(self.species_li)
        gene2unigene = tab2dict(self.datafile, (0, 1), 0, alwayslist=0,
                                includefn=lambda ld: int(ld[0]) in geneid_d)
        gene_d = {}
        for gid, unigene in gene2unigene.items():
            gene_d[gid] = {'unigene': unigene}
        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
Example #17
    def load(self, aslist=False):
        load_start(self.datafile)
        with open(self.datafile) as df:
            geneid_set = set()
            doc_li = []
            for line in df:
                geneid, summary = line.strip().split('\t')
                if geneid not in geneid_set:
                    doc_li.append(dict(_id=geneid, summary=str(summary)))
                    geneid_set.add(geneid)
        load_done('[%d]' % len(doc_li))

        if aslist:
            return doc_li
        else:
            gene_d = dict([(d['_id'], d) for d in doc_li])
            return gene_d
Example #18
    def _load_ensembl2name(self):
        """loading ensembl gene to symbol+name mapping"""
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
        load_start(DATAFILE)
        ensembl2name = tab2dict(DATAFILE, (1, 2, 7), 0, includefn=_not_LRG)
        def _fn(x):
            out = {}
            if x[0].strip() not in ['', '\\N']:
                out['symbol'] = x[0].strip()
            if x[1].strip() not in ['', '\\N']:
                _name = SubStr(x[1].strip(), '', ' [Source:').strip()
                if _name:
                    out['name'] = _name
            return out
        ensembl2name = value_convert(ensembl2name, _fn)
        load_done('[%d]' % len(ensembl2name))
        return ensembl2name
Example #19
def loaddata():
    affy_d = {}
    for annot in AFFY_ANNOT_FILES:
        name = annot['name']
        DATAFILE = annot['file']
        if DATAFILE.find('%s') != -1:
            if DATAFILE.startswith('extra'):
                DATAFILE = DATAFILE % AFFY_RELEASE_EXTRA
            else:
                DATAFILE = DATAFILE % AFFY_RELEASE
        DATAFILE = os.path.join(AFFY_DATA_FOLDER, DATAFILE)
        load_start(DATAFILE)
        d = _load_affy(DATAFILE)
        affy_d[name] = d
        load_done('[%d]' % len(d))

    return affy_d
Example #20
def load_cpdb(__metadata__):
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'))

    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # sort by name/id so the order is deterministic
                # (dicts are not directly comparable on Python 3)
                _d[p_source].sort(key=lambda p: (p.get('name', ''), p.get('id', '')))
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))

    return _out
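To make the returned structure concrete: each value in _out maps a gene id to a 'pathway' dict keyed by source, and each source maps either to a single {'name', 'id'} record or to a sorted list of them. A sketch of one value, with made-up pathway names and ids:

# Illustrative only: pathway names and ids below are made up.
example_pathway_doc = {
    'pathway': {
        'kegg': {'name': 'Cell cycle', 'id': 'hsa04110'},
        'reactome': [{'name': 'G1 Phase', 'id': 'R-HSA-69236'},
                     {'name': 'S Phase', 'id': 'R-HSA-69242'}],
    }
}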
Example #21
    def load_ensembl2acc(self):
        """
        loading ensembl to transcripts/proteins data
        """
        #Loading all ensembl GeneIDs, TranscriptIDs and ProteinIDs
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
        load_start(DATAFILE)
        ensembl2acc = tab2dict(DATAFILE, (1, 2, 3), 0, includefn=_not_LRG)

        def _fn(x, eid):
            out = {'gene': eid, 'translation' : []}
            def mapping(transcript_id, protein_id):
                trid = transcript_id and transcript_id != '\\N' and transcript_id or None
                pid = protein_id and protein_id != '\\N' and protein_id or None
                if trid and pid:
                    out['translation'].append({"rna" : trid, "protein" : pid})

            if isinstance(x, list):
                transcript_li = []
                protein_li = []
                for _x in x:
                    if _x[0] and _x[0] != '\\N':
                        transcript_li.append(_x[0])
                    if _x[1] and _x[1] != '\\N':
                        protein_li.append(_x[1])
                    mapping(_x[0],_x[1])

                if transcript_li:
                    out['transcript'] = normalized_value(transcript_li)
                if protein_li:
                    out['protein'] = normalized_value(protein_li)
            else:
                if x[0] and x[0] != '\\N':
                    out['transcript'] = x[0]
                if x[1] and x[1] != '\\N':
                    out['protein'] = x[1]
                mapping(x[0],x[1])

            return out

        for k in ensembl2acc:
            ensembl2acc[k] = {'ensembl': _fn(ensembl2acc[k], k)}

        load_done('[%d]' % len(ensembl2acc))
        return self.convert2entrez(ensembl2acc)
Example #22
    def load(self, aslist=False):
        load_start(self.datafile)
        gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0, alwayslist=1,
                           includefn=self.species_filter)
        category_d = {'Function': 'MF',
                      'Process': 'BP',
                      'Component': 'CC'}

        def _ff(d):
            out = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
                _gocategory = category_d[gocategory]
                _d = out.get(_gocategory, [])
                _rec = dict(id=goid, term=goterm)
                if evidence != '-':
                    _rec['evidence'] = evidence
                if qualifier != '-':
                    # here we also fix some inconsistency issues in NCBI data
                    # Colocalizes_with -> colocalizes_with
                    # Contributes_with -> contributes_with
                    # Not -> NOT
                    _rec['qualifier'] = qualifier.replace('Co', 'co').replace('Not', 'NOT')
                if pubmed != '-':
                    if pubmed.find('|') != -1:
                        pubmed = [int(pid) for pid in pubmed.split('|')]
                    else:
                        pubmed = int(pubmed)
                    _rec['pubmed'] = pubmed
                _d.append(_rec)
                out[_gocategory] = _d
            for k in out:
                if len(out[k]) == 1:
                    out[k] = out[k][0]
            return out

        gene2go = dict_convert(gene2go, valuefn=_ff)
        gene_d = {}
        for gid, go in gene2go.items():
            gene_d[gid] = {'go': go}
        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
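For reference, each gene document produced here has the shape sketched below; a category collapses to a single record when there is only one annotation. The GO ids, terms and evidence codes are made-up examples:

# Illustrative only: GO ids, terms and codes are made up.
example_go_doc = {
    'go': {
        'MF': {'id': 'GO:0004672', 'term': 'protein kinase activity', 'evidence': 'IEA'},
        'BP': [{'id': 'GO:0007049', 'term': 'cell cycle', 'pubmed': 1234567},
               {'id': 'GO:0006468', 'term': 'protein phosphorylation'}],
    }
}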
Example #23
def load_ucsc_exons():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    species_li = os.listdir(DATA_FOLDER)
    print "Found {} species folders.".format(len(species_li))
    t0 = time.time()
    gene2exons = {}
    for species in species_li:
        print(species, '...')
        if species == 'Homo_sapiens':
            gene2exons.update(load_exons_for_human())
        elif species == 'Mus_musculus':
            gene2exons.update(load_exons_for_mouse())
        else:
            gene2exons.update(load_exons_for_species(species))

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
Example #24
    def load(self, aslist=False):
        '''
        loading ncbi "homologene.data" file
        adding "homologene" field in gene doc
        '''

        load_start(self.datafile)
        with open(self.datafile) as df:
            homologene_d = {}
            doc_li = []
            print()
            geneid_d = get_geneid_d(self.species_li)

            for line in df:
                ld = line.strip().split('\t')
                hm_id, tax_id, geneid = [int(x) for x in ld[:3]]
                if (self.taxid_set is None or tax_id in self.taxid_set) and \
                        geneid in geneid_d:
                    # for selected species only,
                    # and also ignore geneids that do not match any
                    # existing gene doc.
                    # If the original geneid is retired, it is replaced
                    # with the new one, if available.
                    geneid = geneid_d[geneid]
                    genes = homologene_d.get(hm_id, [])
                    genes.append((tax_id, geneid))
                    homologene_d[hm_id] = genes

                    doc_li.append(dict(_id=str(geneid), taxid=tax_id,
                                       homologene={'id': hm_id}))

            for i, gdoc in enumerate(doc_li):
                gdoc['homologene']['genes'] = self._sorted_homologenes(
                    set(homologene_d[gdoc['homologene']['id']]))
                doc_li[i] = gdoc

            load_done('[%d]' % len(doc_li))

        if aslist:
            return doc_li
        else:
            gene_d = dict([(d['_id'], d) for d in doc_li])
            return gene_d
Example #25
    def load(self, aslist=False):
        load_start(self.datafile)
        gene2acc = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                            includefn=self.species_filter)

        def _ff(d):
            out = {
                'rna': [],
                'protein': [],
                'genomic': [],
                'translation': []
            }
            for rna, prot, dna in d:
                if rna == '-': rna = None
                if prot == '-': prot = None
                if dna == '-': dna = None
                if rna is not None:
                    out['rna'].append(rna)
                if prot is not None:
                    out['protein'].append(prot)
                if dna is not None:
                    out['genomic'].append(dna)
                if rna and prot:
                    out['translation'].append({'rna' : rna, 'protein' : prot})
            # remove dup
            for k in out:
                out[k] = normalized_value(out[k])
            # remove empty rna/protein/genomic field
            _out = {}
            for k, v in out.items():
                if v:
                    _out[k] = v
            if _out:
                _out = {self.fieldname: _out}
            return _out

        gene2acc = dict_convert(gene2acc, valuefn=_ff)
        load_done('[%d]' % len(gene2acc))

        if aslist:
            return dict_to_list(gene2acc)
        else:
            return gene2acc
Example #26
    def load(self, aslist=False):
        load_start(self.datafile)
        if self.species_li:
            _includefn = lambda ld: int(ld[0]) in self.taxid_set and ld[1] != '-'
        else:
            _includefn = lambda ld: ld[1] != '-'
        gene2retired = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                                includefn=_includefn)
        gene2retired = dict_convert(gene2retired, valuefn=lambda x: normalized_value([int(xx) for xx in x]))

        gene_d = {}
        for gid, retired in gene2retired.items():
            gene_d[gid] = {'retired': retired}
        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
Example #27
def load_x(idx, fieldname, cvt_fn=None):
    '''idx is 0-based column number'''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE,
                             header=1,
                             assert_column_no=VALID_COLUMN_NO):
        ld = listitems(ld, *(2, 19, idx))  # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            xli.append(value)

    ensembl2geneid = list2dict(list_nondup([(x[1], x[0]) for x in xli
                                            if x[0] != '' and x[1] != '']),
                               0,
                               alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(xli2), 0)
    fn = lambda value: {
        fieldname: sorted(value) if isinstance(value, list) else value
    }
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))

    return gene2x
Example #28
    def load(self, aslist=False):
        load_start(self.datafile)
        gene2acc = tab2dict(self.datafile, (1, 3, 5, 7),
                            0,
                            alwayslist=1,
                            includefn=self.species_filter)

        def _ff(d):
            out = {'rna': [], 'protein': [], 'genomic': [], 'translation': []}
            for rna, prot, dna in d:
                if rna == '-': rna = None
                if prot == '-': prot = None
                if dna == '-': dna = None
                if rna is not None:
                    out['rna'].append(rna)
                if prot is not None:
                    out['protein'].append(prot)
                if dna is not None:
                    out['genomic'].append(dna)
                if rna and prot:
                    out['translation'].append({'rna': rna, 'protein': prot})
            # remove dup
            for k in out:
                out[k] = normalized_value(out[k])
            # remove empty rna/protein/genomic field
            _out = {}
            for k, v in out.items():
                if v:
                    _out[k] = v
            if _out:
                _out = {self.fieldname: _out}
            return _out

        gene2acc = dict_convert(gene2acc, valuefn=_ff)
        load_done('[%d]' % len(gene2acc))

        if aslist:
            return dict_to_list(gene2acc)
        else:
            return gene2acc
Example #29
def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')
    reflink_file = os.path.join(DATA_FOLDER, species, 'database/refLink.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    ref2exons = []
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        # materialize the zip so the len() check below works on Python 3
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                         [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.append((refseq, {
            'chr': chr,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'exons': exons
        }))
    ref2exons = list2dict(ref2exons, 0)

    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: {refseq: ref2exons[refseq]}}
            else:
                gene2exons[geneid][exons_key][refseq] = ref2exons[refseq]

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
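Each value in the returned gene2exons dict groups refFlat records by RefSeq accession under exons_key. A sketch of one entry, with a made-up accession and coordinates:

# Illustrative only: accession and coordinates are made up.
example_gene2exons_entry = {
    'exons': {
        'NM_000000': {'chr': '12', 'strand': 1,
                      'txstart': 55966769, 'txend': 55972789,
                      'cdsstart': 55966781, 'cdsend': 55972531,
                      'exons': [(55966769, 55966930), (55968008, 55968149)]}
    }
}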
Example #30
def load_genedoc(self):
    """
    Loads gene data from NCBI's refseq2gene.gz file.
    Parses it based on genomic position data and refseq status, using the
    list of taxids from get_ref_microbe_taxids() as a lookup table.
    :return:
    """
    taxids = loadobj(TAXIDS_FILE)
    taxid_set = set(taxids)
    load_start(DATAFILE)

    def _includefn(ld):
        return ld[0] in taxid_set  # match taxid from taxid_set

    cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
    gene2genomic_pos_li = tab2list(DATAFILE, cols_included, header=1,
                                   includefn=_includefn)
    count = 0
    last_id = None
    for gene in gene2genomic_pos_li:
        count += 1
        strand = 1 if gene[5] == '+' else -1
        _id = gene[1]

        mgi_dict = {
            '_id': _id,
            'genomic_pos': {
                'entrezgene': _id,
                'start': int(gene[3]),
                'end': int(gene[4]),
                'chr': gene[2],
                'strand': strand
            }
        }
        if _id != last_id:
            # rows with dup _id will be skipped
            yield mgi_dict
        last_id = _id

    load_done('[%d]' % count)
Example #31
    def _load_ensembl2entrez_li(self):
        """gene_ensembl__xref_entrezgene__dm"""
        CUSTOM_MAPPING_FILE = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__extra.txt')
        if not os.path.exists(CUSTOM_MAPPING_FILE):
            print("Missing extra mapping file, now generating")
            from . import ensembl_ncbi_mapping
            ensembl_ncbi_mapping.main(confirm=False)
        load_start(CUSTOM_MAPPING_FILE)
        extra = tab2dict(CUSTOM_MAPPING_FILE, (0, 1), 0, alwayslist=True)
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt')
        load_start(DATAFILE)
        ensembl2entrez = tab2dict(DATAFILE, (1, 2), 0, includefn=_not_LRG, alwayslist=True)   # [(ensembl_gid, entrez_gid),...]
        # replace with our custom mapping
        for k in extra:
            ensembl2entrez[k] = extra[k]
        # back to list of tuples
        ensembl2entrez_li = []
        for ensembl_id, entrez_ids in ensembl2entrez.items():
            for entrez_id in entrez_ids:
                ensembl2entrez_li.append((ensembl_id, entrez_id))
        load_done('[%d]' % len(ensembl2entrez_li))
        self.ensembl2entrez_li = ensembl2entrez_li
Example #32
    def load(self, aslist=False):
        load_start(self.datafile)
        with open(self.datafile) as df:
            geneid_set = set()
            doc_li = []
            for line in df:
                geneid, ec = line.strip().split('\t')
                if ec.find(',') != -1:
                    # there are multiple EC numbers
                    ec = [str(x) for x in ec.split(',')]
                else:
                    ec = str(ec)
                if geneid not in geneid_set:
                    doc_li.append(dict(_id=geneid, ec=ec))
                    geneid_set.add(geneid)
        load_done('[%d]' % len(doc_li))

        if aslist:
            return doc_li
        else:
            gene_d = dict([(d['_id'], d) for d in doc_li])
            return gene_d
Example #33
def load_genedoc(self):
    """
    Loads gene data from NCBI's refseq2gene.gz file.
    Parses it based on genomic position data and refseq status, using the
    list of taxids from get_ref_microbe_taxids() as a lookup table.
    :return:
    """
    taxids = loadobj(TAXIDS_FILE)
    taxid_set = set(taxids)
    load_start(DATAFILE)

    def _includefn(ld):
        return ld[0] in taxid_set  # match taxid from taxid_set

    cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
    gene2genomic_pos_li = tab2list(DATAFILE, cols_included, header=1,
                                   includefn=_includefn)
    count = 0
    last_id = None
    for gene in gene2genomic_pos_li:
        count += 1
        strand = 1 if gene[5] == '+' else -1
        _id = gene[1]

        mgi_dict = {
            '_id': _id,
            'genomic_pos': {
                'start': int(gene[3]),
                'end': int(gene[4]),
                'chr': gene[2],
                'strand': strand
            }
        }
        if _id != last_id:
            # rows with dup _id will be skipped
            yield mgi_dict
        last_id = _id

    load_done('[%d]' % count)
Example #34
    def load(self, aslist=False):
        load_start(self.datafile)
        with open(self.datafile) as df:
            geneid_set = set()
            doc_li = []
            for line in df:
                geneid, ec = line.strip().split('\t')
                if ec.find(',') != -1:
                    # there are multiple EC numbers
                    ec = [str(x) for x in ec.split(',')]
                else:
                    ec = str(ec)
                if geneid not in geneid_set:
                    doc_li.append(dict(_id=geneid, ec=ec))
                    geneid_set.add(geneid)
        load_done('[%d]' % len(doc_li))

        if aslist:
            return doc_li
        else:
            gene_d = dict([(d['_id'], d) for d in doc_li])
            return gene_d
Example #35
    def load(self, aslist=False):
        load_start(self.datafile)
        if self.species_li:
            _includefn = lambda ld: int(ld[0]) in self.taxid_set and ld[1] != '-'
        else:
            _includefn = lambda ld: ld[1] != '-'
        gene2retired = tab2dict(self.datafile, (1, 2),
                                0,
                                alwayslist=1,
                                includefn=_includefn)
        gene2retired = dict_convert(
            gene2retired,
            valuefn=lambda x: normalized_value([int(xx) for xx in x]))

        gene_d = {}
        for gid, retired in gene2retired.items():
            gene_d[gid] = {'retired': retired}
        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
Example #36
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful because other annotations that were mapped to geneids
       may contain retired gene ids.

       if species_li is None, genes from all species are loaded.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([TAXONOMY[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):

        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2),
                            1,
                            alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure the taxid belongs to species_li and keeps only
    # rows whose mapped-to geneid exists in the gene_info list

    load_done('[%d]' % len(retired2gene))
    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
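Since get_geneid_d returns an int-keyed mapping from current or retired geneids to current geneids, a typical (hypothetical) use is to normalize incoming ids before annotating; the species names and the raw id below are assumptions for illustration:

# Hypothetical usage sketch; species names and the raw id are assumptions.
geneid_d = get_geneid_d(['human', 'mouse'])   # pass None to load all species
raw_geneid = 245794                           # may be a retired geneid
current_geneid = geneid_d.get(raw_geneid)     # current geneid, or None if unknown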
Example #37
    def load(self, aslist=False):
        '''
        loading ncbi "gene_info" file
        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        format of gene_info file:
        #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs
                 map_location description type_of_gene
                 Symbol_from_nomenclature_authority
                 Full_name_from_nomenclature_authority
                 Nomenclature_status Other_designations Modification_date
        (tab is used as a separator, pound sign - start of a comment)

        '''
        load_start(self.datafile)
        gene_d = tab2dict(self.datafile, (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                          key=1,
                          alwayslist=0,
                          includefn=self.species_filter)

        def _ff(d):
            (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
             description, type_of_gene, other_designations,
             modification_date) = d
            out = dict(taxid=int(taxid), symbol=symbol, name=description)
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))
            if locus_tag != '-':
                out['locus_tag'] = locus_tag
            if other_designations != "-":
                out['other_names'] = normalized_value(
                    other_designations.split('|'))

            # when merged, this will become the default timestamp
            out["_timestamp"] = datetime.datetime.strptime(
                modification_date, "%Y%m%d")

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and \
                        xd[0] in ['VGNC', 'HGNC', 'MGI']:
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except:
                    print(repr(x))
                    raise
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly
                if _db.lower() in ['ensembl', 'imgt/gene-db']:
                    # we don't need 'IMGT/GENE-DB" xref either, because they
                    # are mostly the same as gene symbol
                    continue
                # add "MGI:" prefix for MGI ids.
                if _db.lower() == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out

        gene_d = value_convert(gene_d, _ff)

        # add entrezgene field
        for geneid in gene_d:
            d = gene_d[geneid]
            d['entrezgene'] = int(geneid)
            gene_d[geneid] = d

        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
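A single document produced by this loader looks roughly like the following; only the field names come from the _ff() and entrezgene steps above, while the concrete values are illustrative:

import datetime

# Illustrative only: values are made up; field names follow the code above.
example_gene_doc = {
    'taxid': 9606, 'symbol': 'CDK2', 'name': 'cyclin dependent kinase 2',
    'alias': ['CDKN2', 'p33(CDK2)'], 'map_location': '12q13.2',
    'type_of_gene': 'protein-coding', 'HGNC': '1771',
    '_timestamp': datetime.datetime(2023, 1, 1), 'entrezgene': 1017,
}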