Example #1
    def load_ensembl2acc(self):
        """
        loading ensembl to transcripts/proteins data
        """
        # Load all Ensembl gene, transcript and protein IDs
        datafile = os.path.join(self.data_folder,
                                'gene_ensembl__translation__main.txt')
        genefile = os.path.join(self.data_folder,
                                'gene_ensembl__gene__main.txt')

        def _fn(x, eid):
            out = {'gene': eid, 'translation': []}

            def mapping(transcript_id, protein_id):
                trid = transcript_id if transcript_id and transcript_id != '\\N' else None
                pid = protein_id if protein_id and protein_id != '\\N' else None
                if trid and pid:
                    out['translation'].append({"rna": trid, "protein": pid})

            if isinstance(x, list):
                transcript_li = []
                protein_li = []
                for _x in x:
                    if _x[0] and _x[0] != '\\N':
                        transcript_li.append(_x[0])
                    if _x[1] and _x[1] != '\\N':
                        protein_li.append(_x[1])
                    mapping(_x[0], _x[1])

                if transcript_li:
                    out['transcript'] = normalized_value(transcript_li)
                if protein_li:
                    out['protein'] = normalized_value(protein_li)
            else:
                if x[0] and x[0] != '\\N':
                    out['transcript'] = x[0]
                if x[1] and x[1] != '\\N':
                    out['protein'] = x[1]
                mapping(x[0], x[1])

            return out

        ensembl2acc = tab2dict(datafile, (1, 2, 3), 0, includefn=_not_LRG)
        typeofgene = tab2dict(genefile, (1, 8), 0, includefn=_not_LRG)
        #for datadict in tab2dict_iter(datafile, (1, 2, 3), 0, includefn=_not_LRG):
        #    for k in datadict:
        #        datadict[k] = {'ensembl': _fn(datadict[k], k), '__aslistofdict__' : 'ensembl'}
        #    for doc in map_id(datadict,self.ensembl2entrez):
        #        yield doc

        for k in ensembl2acc:
            ensembl2acc[k] = {'ensembl': _fn(ensembl2acc[k], k)}
            if k in typeofgene:
                ensembl2acc[k]['ensembl']['type_of_gene'] = typeofgene[k]

        return self.convert2entrez(ensembl2acc)
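A minimal, dependency-free sketch of the transcript/protein pairing done by _fn above. The input shape (a single (transcript, protein) tuple or a list of them per gene) and the sample IDs are assumptions for illustration only.

def pair_translations(rows, gene_id, missing='\\N'):
    # normalize to a list of (transcript_id, protein_id) tuples
    rows = rows if isinstance(rows, list) else [rows]
    doc = {'gene': gene_id, 'translation': []}
    for trid, pid in rows:
        # keep a pair only when both ids are present and not the NULL marker
        if trid and trid != missing and pid and pid != missing:
            doc['translation'].append({'rna': trid, 'protein': pid})
    return doc

# pair_translations([('ENST0001', 'ENSP0001'), ('ENST0002', '\\N')], 'ENSG0001')
# -> {'gene': 'ENSG0001', 'translation': [{'rna': 'ENST0001', 'protein': 'ENSP0001'}]}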
Example #2
def loaddata(data_folder):
    #GNF1H
    datafile = os.path.join(data_folder, 'gnf', 'GNF1H.ANNO7.LOAD_20130402.tab')
    gene2gnf1h = tab2dict(datafile, (0, 5), 1, header=0, includefn=lambda ld: len(ld) > 5 and ld[5] != '')
    #GNF1m
    datafile = os.path.join(data_folder, 'gnf', 'gnf1m.NEW_ANNO6.LOAD_20130402.tab')
    gene2gnf1m = tab2dict(datafile, (0, 5), 1, header=0, includefn=lambda ld: len(ld) > 5 and ld[5] != '')

    return {'GNF1H': gene2gnf1h,
            'GNF1M': gene2gnf1m}
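As in the other examples here, the includefn callback receives each line already split into fields; a toy check of the filter above (the sample row is made up):

sample_row = ['gnf1h00001', '10001', '-', '-', '-', 'CDK2']
keep = len(sample_row) > 5 and sample_row[5] != ''   # True, so the row is kept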
Example #3
    def load_data(self, data_folder):

        # helper fn to skip lines with LRG records
        def _not_LRG(ld):
            return not ld[1].startswith("LRG_")

        # load mapping ensembl => entrez from Ensembl
        ens2ent_file = os.path.join(data_folder,
                                    'gene_ensembl__xref_entrezgene__dm.txt')
        self.logger.info("Loading Ensembl-to-Entrez mapping file: %s" %
                         ens2ent_file)
        ens2ent = tab2dict(ens2ent_file, (1, 2),
                           0,
                           includefn=_not_LRG,
                           alwayslist=True)
        self.logger.info("# mapping Ensembl => Entrez: %s" % len(ens2ent))
        # load mapping entrez => ensembl from Entrez
        ent2ens_file = os.path.join(data_folder, 'gene2ensembl.gz')
        self.logger.info("Loading Entrez-to-Ensembl mapping file: %s" %
                         ent2ens_file)
        ent2ens = tab2dict(ent2ens_file, (1, 2), 0, alwayslist=True)
        self.logger.info("# mapping Entrez => Ensembl: %s" % len(ent2ens))

        # mutual mapping
        mapping = {}
        for ensembl_id in ens2ent:
            entrez_ids_from_ensembl = ens2ent[ensembl_id]
            for entrez_id in entrez_ids_from_ensembl:
                if ensembl_id in ent2ens.get(entrez_id, []):
                    mapping.setdefault(ensembl_id, set()).add(entrez_id)
        self.logger.info("%d mutual mappings found" % len(mapping))
        for ens, ents in mapping.items():
            sents = sorted(list(ents))
            yield {
                "_id": "%s-%s" % (ens, ",".join(ents)),
                "multiplicity": len(sents),
                "ensembl": ens,
                "entrez": sents,
            }

        # last doc, sort of metadata
        src_doc = self.src_dump.find_one({"_id": self.main_source}) or {}
        release = src_doc["download"]["release"]
        ens_version, ent_version = release.split(":")
        yield {
            "_id": "_meta",
            "ensembl": {
                "file": ens2ent_file,
                "version": ens_version
            },
            "entrez": {
                "file": ent2ens_file,
                "version": ent_version
            },
        }
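A toy, self-contained walk-through of the mutual-mapping step above (all ids are made up): an Ensembl/Entrez pair is kept only when both sources agree.

ens2ent = {'ENSG0001': ['111', '222']}                # from Ensembl
ent2ens = {'111': ['ENSG0001'], '222': ['ENSG0009']}  # from Entrez
mapping = {}
for ensembl_id, entrez_ids in ens2ent.items():
    for entrez_id in entrez_ids:
        if ensembl_id in ent2ens.get(entrez_id, []):
            mapping.setdefault(ensembl_id, set()).add(entrez_id)
# mapping == {'ENSG0001': {'111'}}   ('222' points elsewhere and is dropped)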
Example #4
    def load(self, aslist=False):
        '''
        Load the NCBI "homologene.data" file and
        add a "homologene" field to each gene doc.
        '''
        from biothings.utils.hub_db import get_src_dump
        homo_d = tab2dict(self.datafile, (2, 1), 0, header=0)
        entrez_doc = get_src_dump().find_one({"_id":"entrez"}) or {}
        entrez_dir = entrez_doc.get("data_folder")
        assert entrez_dir, "Can't find Entez data directory"
        DATAFILE = os.path.join(entrez_dir, 'gene_history.gz')
        assert os.path.exists(DATAFILE), "gene_history.gz is missing (entrez_dir: %s)" % entrez_dir
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                                includefn=lambda ld: ld[1] != '-')
        for id in list(homo_d.keys()):
            homo_d[retired2gene.get(id,id)] = homo_d[id]

        with open(self.datafile) as df:
            homologene_d = {}
            doc_li = []
            print()
            geneid_d = get_geneid_d(entrez_dir, self.species_li,
                                    load_cache=False, save_cache=False,
                                    only_for=homo_d)

            for line in df:
                ld = line.strip().split('\t')
                hm_id, tax_id, geneid = [int(x) for x in ld[:3]]
                if (self.taxid_set is None or tax_id in self.taxid_set) and geneid in geneid_d:
                    # for selected species only; also skip geneids that do not
                    # match any existing gene doc. If the original geneid was
                    # retired, it is replaced with the new one when available.
                    geneid = geneid_d[geneid]
                    genes = homologene_d.get(hm_id, [])
                    genes.append((tax_id, geneid))
                    homologene_d[hm_id] = genes

                    doc_li.append(dict(_id=str(geneid), taxid=tax_id,
                                       homologene={'id': hm_id}))

            for i, gdoc in enumerate(doc_li):
                gdoc['homologene']['genes'] = self._sorted_homologenes(
                    set(homologene_d[gdoc['homologene']['id']]))
                doc_li[i] = gdoc

        if aslist:
            return doc_li
        else:
            gene_d = dict([(d['_id'], d) for d in doc_li])
            return gene_d
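The retired-geneid remapping near the top of load() is easy to misread; a toy run with made-up ids shows that entries are re-keyed to the current geneid while the old key is left in place:

homo_d = {101: 'group-a', 202: 'group-b'}
retired2gene = {202: 999}            # 202 was retired and replaced by 999
for gid in list(homo_d.keys()):
    homo_d[retired2gene.get(gid, gid)] = homo_d[gid]
# homo_d == {101: 'group-a', 202: 'group-b', 999: 'group-b'}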
Example #5
def load_broadinstitute_exac_any(one_file, key):
    logging.info("Loading file %s (%s)" % (one_file, key))
    data = tab2dict(one_file, (0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                               14, 15, 16, 17, 18, 19, 20, 21), 0)
    exacs = {}
    for transcript in data:
        tupleexac = data[transcript]
        # remove version in key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {"exac" : 
                {
                    "transcript" : transcript,  # but keep version here
                    "n_exons" : int(tupleexac[0]),
                    "cds_start" : int(tupleexac[1]),
                    "cds_end" : int(tupleexac[2]),
                    "bp" : int(tupleexac[3]),
                    key : {
                        "mu_syn" : float(tupleexac[4]),
                        "mu_mis" : float(tupleexac[5]),
                        "mu_lof" : float(tupleexac[6]),
                        "n_syn" : float(tupleexac[7]),
                        "n_mis" : float(tupleexac[8]),
                        "n_lof" : float(tupleexac[9]),
                        "exp_syn" : float(tupleexac[10]),
                        "exp_mis" : float(tupleexac[11]),
                        "exp_lof" : float(tupleexac[12]),
                        "syn_z" : float(tupleexac[13]),
                        "mis_z" : float(tupleexac[14]),
                        "lof_z" : float(tupleexac[15]),
                        "p_li" : float(tupleexac[16]),
                        "p_rec" : float(tupleexac[17]),
                        "p_null" : float(tupleexac[18])
                        }
                    }
                }
    return exacs
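A small sketch of the key handling above (toy transcript id): the dict key drops the transcript version while the stored document keeps the full, versioned id.

transcript = 'ENST00000407236.7'
exacs = {transcript.split('.')[0]: {'exac': {'transcript': transcript}}}
# exacs['ENST00000407236']['exac']['transcript'] == 'ENST00000407236.7'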
Example #6
def _load_affy(df):
    filename = os.path.split(df)[1]
    rawfile, ext = os.path.splitext(filename)
    if ext.lower() == '.zip':
        df = (df, rawfile)
    dd = tab2dict(df, (0, 7),
                  1,
                  sep=',',
                  header=1,
                  includefn=lambda ld: len(ld) > 7 and ld[7] != '---'
                  and ld[7] != 'gene_assignment')
    # fix for keys like "472 /// 4863" holding multiple geneids
    gene2affy = {}
    for k in dd:
        kk = k.split('///')
        if len(kk) > 1:
            for kkk in kk:
                k4 = kkk.split('//')
                if k4[-1].strip() != '---':
                    dict_apply(gene2affy, k4[-1].strip(), dd[k])
        else:
            k4 = k.split('//')
            if len(k4) > 1:
                if k4[-1].strip() != '---':
                    dict_apply(gene2affy, k4[-1].strip(), dd[k])

    return gene2affy
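A toy trace of the key clean-up above (the annotation string is made up; dict_apply is assumed to accumulate the probeset value under the extracted geneid key):

k = 'NM_0001 // descA // 472 /// NM_0002 // descB // 4863'
geneids = []
for part in k.split('///'):
    fields = part.split('//')
    if fields[-1].strip() != '---':
        geneids.append(fields[-1].strip())
# geneids == ['472', '4863']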
Example #7
 def _load_ensembl_2taxid(self):
     """ensembl2taxid"""
     datafile = os.path.join(self.data_folder,
                             'gene_ensembl__translation__main.txt')
     ensembl2taxid = dict_nodup(
         tab2dict(datafile, (0, 1), 1, includefn=_not_LRG))
     # need to convert taxid to integer here
     ensembl2taxid = value_convert(ensembl2taxid, lambda x: int(x))
     return ensembl2taxid
Example #8
 def load(self, aslist=False):
     uni_d = tab2dict(self.datafile, (0, 1), 0, alwayslist=0)
     DATAFILE = os.path.join(self.data_folder, 'gene_history.gz')
     retired2gene = tab2dict(DATAFILE, (1, 2),
                             1,
                             alwayslist=0,
                             includefn=lambda ld: ld[1] != '-')
     for id in list(uni_d.keys()):
         uni_d[retired2gene.get(id, id)] = uni_d[id]
     geneid_d = get_geneid_d(self.data_folder,
                             self.species_li,
                             load_cache=False,
                             save_cache=False,
                             only_for=uni_d)
     gene2unigene = tab2dict_iter(
         self.datafile, (0, 1),
         0,
         alwayslist=0,
         includefn=lambda ld: int(ld[0]) in geneid_d)
     cnt = 0
     for doc in gene2unigene:
         yield self.format(doc)
         cnt += 1
Example #9
 def _load_ensembl2entrez_li(self):
     """gene_ensembl__xref_entrezgene__dm"""
     CUSTOM_MAPPING_FILE = os.path.join(self.data_folder,
                                        'gene_ensembl__gene__extra.txt')
     if not os.path.exists(CUSTOM_MAPPING_FILE):
         print("Missing extra mapping file, now generating")
         from . import ensembl_ncbi_mapping
         ensembl_ncbi_mapping.main(confirm=False)
     extra = tab2dict(CUSTOM_MAPPING_FILE, (0, 1), 0, alwayslist=True)
     datafile = os.path.join(self.data_folder,
                             'gene_ensembl__xref_entrezgene__dm.txt')
     ensembl2entrez = tab2dict(
         datafile, (1, 2), 0, includefn=_not_LRG,
         alwayslist=True)  # [(ensembl_gid, entrez_gid),...]
     # replace with our custom mapping
     for k in extra:
         ensembl2entrez[k] = extra[k]
     # back to list of tuples
     ensembl2entrez_li = []
     for ensembl_id, entrez_ids in ensembl2entrez.items():
         for entrez_id in entrez_ids:
             ensembl2entrez_li.append((ensembl_id, entrez_id))
     self.ensembl2entrez_li = ensembl2entrez_li
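The final flatten-back-to-tuples step above, shown with made-up ids:

ensembl2entrez = {'ENSG0001': ['111'], 'ENSG0002': ['222', '333']}
ensembl2entrez_li = [(ens, ent)
                     for ens, ents in ensembl2entrez.items()
                     for ent in ents]
# ensembl2entrez_li == [('ENSG0001', '111'), ('ENSG0002', '222'), ('ENSG0002', '333')]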
Example #10
 def _load_ensembl2entrez_li(self, src_name):
     """gene_ensembl__xref_entrezgene__dm"""
     CUSTOM_MAPPING_FILE = os.path.join(self.data_folder,
                                        'gene_ensembl__gene__extra.txt')
     global extra_mapping_lock
     try:
         print("Trying to acquire extra mapping lock")
         extra_mapping_lock.acquire()
         print("Lock acquired")
         if not os.path.exists(CUSTOM_MAPPING_FILE) or os.stat(
                 CUSTOM_MAPPING_FILE).st_size == 0:
             print("Missing extra mapping file, now generating")
             from . import ensembl_ncbi_mapping
             ensembl_ncbi_mapping.main(src_name, confirm=False)
     finally:
         print("Releasing lock")
         extra_mapping_lock.release()
     extra = tab2dict(CUSTOM_MAPPING_FILE, (0, 1), 0, alwayslist=True)
     datafile = os.path.join(self.data_folder,
                             'gene_ensembl__xref_entrezgene__dm.txt')
     ensembl2entrez = tab2dict(
         datafile, (1, 2), 0, includefn=_not_LRG,
         alwayslist=True)  # [(ensembl_gid, entrez_gid),...]
     # replace with our custom mapping
     ##adjusted = {}
     for k in extra:
         ##if k in ensembl2entrez:
         ##    adjusted[k] = {"ensembl2entrez":ensembl2entrez[k],"extra":extra[k]}
         ensembl2entrez[k] = extra[k]
     ##import pickle
     ##pickle.dump(adjusted,open("/tmp/adjusted","wb"))
     # back to list of tuples
     ensembl2entrez_li = []
     for ensembl_id, entrez_ids in ensembl2entrez.items():
         for entrez_id in entrez_ids:
             ensembl2entrez_li.append((ensembl_id, entrez_id))
     self.ensembl2entrez_li = ensembl2entrez_li
Example #11
def load_exons_for_species(data_folder, species, exons_key='exons'):
    refflat_file = os.path.join(data_folder, species,
                                'database/refFlat.txt.gz')
    t0 = time.time()
    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = list(
            zip([int(x) for x in ld[9].split(',') if x],
                [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq, []).append({
            'transcript': refseq,
            'chr': chr,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons
        })

    gene2exons = {}
    reflink_file = os.path.join(data_folder,
                                '../hgFixed/database/refLink.txt.gz')
    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: ref2exons[refseq]}
            else:
                gene2exons[geneid][exons_key].extend(ref2exons[refseq])

    return gene2exons
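The exon-coordinate pairing above relies on refFlat's comma-separated start/end lists (any trailing empty field is dropped by the "if x" test); a toy example:

exon_starts = '1000,2000,3000,'
exon_ends = '1100,2100,3100,'
exons = list(zip([int(x) for x in exon_starts.split(',') if x],
                 [int(x) for x in exon_ends.split(',') if x]))
# exons == [(1000, 1100), (2000, 2100), (3000, 3100)], matching the exon-count column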
Example #12
    def _load_ensembl2name(self):
        """loading ensembl gene to symbol+name mapping"""
        datafile = os.path.join(
            self.data_folder, 'gene_ensembl__gene__main.txt')
        ensembl2name = tab2dict(datafile, (1, 2, 7), 0, includefn=_not_LRG)

        def _fn(x):
            out = {}
            if x[0].strip() not in ['', '\\N']:
                out['symbol'] = x[0].strip()
            if x[1].strip() not in ['', '\\N']:
                _name = SubStr(x[1].strip(), '', ' [Source:').strip()
                if _name:
                    out['name'] = _name
            return out
        ensembl2name = value_convert(ensembl2name, _fn)
        return ensembl2name
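The name clean-up above strips the trailing "[Source:...]" annotation. A plain-Python sketch of that intent (SubStr is assumed to return the substring between the two markers, an empty start marker meaning from the beginning of the string; the example value is illustrative):

raw = 'cyclin dependent kinase 2 [Source:HGNC Symbol;Acc:HGNC:1771]'
name = raw.split(' [Source:')[0].strip()
# name == 'cyclin dependent kinase 2'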
Example #13
 def load_ensembl2pos(self):
     datafile = os.path.join(
         self.data_folder, 'gene_ensembl__gene__main.txt')
     # Column 1 is selected twice: once as the dict key, and once so the
     # gene id is kept inside genomic_pos
     ensembl2pos = dict_nodup(
         tab2dict(datafile, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG))
     ensembl2pos = value_convert(
         ensembl2pos,
         lambda x: {'ensemblgene': x[0], 'chr': x[3], 'start': int(x[1]),
                    'end': int(x[2]), 'strand': int(x[4])})
     ensembl2pos = value_convert(
         ensembl2pos, lambda x: {'genomic_pos': x}, traverse_list=False)
     for datadict in tab2dict_iter(datafile, (1, 1, 3, 4, 5, 6), 0,
                                   includefn=_not_LRG):
         datadict = dict_nodup(datadict)
         datadict = value_convert(
             datadict,
             lambda x: {'ensemblgene': x[0], 'chr': x[3], 'start': int(x[1]),
                        'end': int(x[2]), 'strand': int(x[4])})
         datadict = value_convert(
             datadict,
             lambda x: {'genomic_pos': x, '__aslistofdict__': 'genomic_pos'},
             traverse_list=False)
         for doc in map_id(datadict, self.ensembl2entrez):
             yield doc
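A toy value tuple, mirroring the lambdas above, to make the field order explicit (values are made up; the order (ensemblgene, start, end, chr, strand) follows the lambda's indexing):

x = ('ENSG0001', '12345', '67890', 'X', '1')
pos = {'ensemblgene': x[0], 'chr': x[3], 'start': int(x[1]),
       'end': int(x[2]), 'strand': int(x[4])}
doc = {'genomic_pos': pos, '__aslistofdict__': 'genomic_pos'}
# doc == {'genomic_pos': {'ensemblgene': 'ENSG0001', 'chr': 'X', 'start': 12345,
#                         'end': 67890, 'strand': 1}, '__aslistofdict__': 'genomic_pos'}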
Example #14
def _load_affy(df):
    filename = os.path.split(df)[1]
    rawfile, ext = os.path.splitext(filename)
    if ext.lower() == '.zip':
        df = (df, rawfile)
    dd = tab2dict(df, (0, 18),
                  1,
                  sep=',',
                  header=1,
                  includefn=lambda ld: len(ld) > 18 and ld[18] != '---'
                  and ld[18] != 'Entrez Gene')
    # fix for keys like "472 /// 4863" holding multiple geneids
    gene2affy = {}
    for k in dd:
        if len(k.split(' /// ')) > 1:
            for kk in k.split(' /// '):
                dict_apply(gene2affy, kk.strip(), dd[k])
        else:
            dict_apply(gene2affy, k.strip(), dd[k])
    return gene2affy
Example #15
    def load(self, aslist=False):
        if self.species_li:
            _includefn = lambda ld: int(ld[0]) in self.taxid_set and ld[1] != '-'
        else:
            _includefn = lambda ld: ld[1] != '-'
        gene2retired = tab2dict(self.datafile, (1, 2),
                                0,
                                alwayslist=1,
                                includefn=_includefn)
        gene2retired = dict_convert(
            gene2retired,
            valuefn=lambda x: normalized_value([int(xx) for xx in x]))

        gene_d = {}
        for gid, retired in gene2retired.items():
            gene_d[gid] = {'retired': retired}

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
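normalized_value comes from the shared dataload helpers; its exact behavior is not shown in this example, but the intended effect here is roughly the following hypothetical sketch (collapse a one-element list to a scalar, otherwise keep the list):

def normalized_value_sketch(values):
    # hypothetical stand-in for the real helper, for illustration only
    return values[0] if len(values) == 1 else values

# normalized_value_sketch([1017]) -> 1017
# normalized_value_sketch([1017, 5678]) -> [1017, 5678]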
Example #16
def get_geneid_d(data_folder,
                 species_li=None,
                 load_cache=True,
                 save_cache=True,
                 only_for={}):
    '''Return a dictionary mapping current/retired geneids to the current geneid.
       This is useful when other annotations that were mapped to geneids may
       contain retired gene ids.

       If species_li is None, genes from all species are loaded.

       Note that all ids are of int type.
    '''
    if species_li:
        taxid_set = set(
            [TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)

    # check cache file
    _cache_file = 'geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene_info.gz') and \
       file_newer(_cache_file, 'gene_history.gz'):
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(data_folder, 'gene_info.gz')
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set and (
            only_for and ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: only_for and ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

    DATAFILE = os.path.join(data_folder, 'gene_history.gz')

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2),
                            1,
                            alwayslist=0,
                            includefn=_includefn)
    # the includefn above makes sure the taxid belongs to species_li and keeps
    # only rows whose mapped-to geneid exists in the gene_info list

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key==value ...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
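Per the docstring, the returned dict maps both current and retired geneids to the current geneid, all as ints; a toy lookup (the ids are made up):

out_d = {1017: 1017, 246744: 1017}   # 246744 stands in for a retired id replaced by 1017
out_d.get(246744, 246744)            # -> 1017
out_d.get(999999, 999999)            # -> 999999 (unknown ids fall back to themselves)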
Example #17
def load_pharmgkb(data_folder):
    datafile = os.path.join(data_folder, 'genes.zip')
    gene2pharmgkb = tab2dict((datafile, 'genes.tsv'), (0, 1), 1, header=1, includefn=lambda ld: ld[1] != '')
    fn = lambda value: {'pharmgkb': value}
    gene2pharmgkb = value_convert(gene2pharmgkb, fn, traverse_list=False)
    return gene2pharmgkb
Example #18
def loaddata(data_folder):
    #Snowball array
    datafile = os.path.join(data_folder, 'pigatlas',
                            'snowball_array_annotation.txt')
    gene2snowball = tab2dict(datafile, (0, 1), 1, header=0)
    return {'snowball': gene2snowball}