Exemplo n.º 1
0
 def load(self):
     '''Load GeneRIF records from ``self.datafile``.

     Columns (1, 2, 4) are read through ``tab2dict`` with key index 0 and
     ``alwayslist=1``; every value is then rewritten as
     ``{'generif': [{'pubmed': ..., 'text': ...}, ...]}``.

     :return: the converted dictionary produced by ``dict_convert``.
     '''
     load_start(self.datafile)

     def _to_generif(rows):
         # row[0] goes through self._cvt_pubmed, row[1] is kept as the text
         entries = []
         for row in rows:
             entries.append({'pubmed': self._cvt_pubmed(row[0]),
                             'text': row[1]})
         return {'generif': entries}

     raw = tab2dict(self.datafile, (1, 2, 4), 0, alwayslist=1)
     gene2generif = dict_convert(raw, valuefn=_to_generif)
     load_done('[%d]' % len(gene2generif))
     return gene2generif
Exemplo n.º 2
0
def load_cpdb(__metadata__):
    '''Load gene-to-pathway annotations from the CPDB tab files.

    Reads the mouse, yeast and human ``CPDB_pathways_genes_*.tab`` files
    under ``DATA_FOLDER`` and keeps only the rows whose lower-cased source
    is listed in ``__metadata__['pathway_sources_included']``.

    :param __metadata__: mapping that provides ``'pathway_sources_included'``,
        the collection of (lower-case) pathway source names to import.
    :return: the result of ``dict_convert`` over the per-gene grouping; each
        value is ``{'pathway': {source: ...}}`` whose entries are
        ``{'name': ..., 'id': ...}`` dicts (``'id'`` is left out when the
        file gives the literal string ``'None'``).
    '''
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = [
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'),
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'),
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'),
    ]

    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            # KEGG ids carry a 'path:' prefix that is dropped
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                # the last column holds a comma-separated list of genes
                for gene in ld[-1].split(","):
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _sort_key(entry):
        # dicts have no ordering on Python 3, so sort on explicit fields;
        # 'id' may be absent (see _inner_cvt), hence the '' fallback
        return (entry.get('id', ''), entry['name'])

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                _d[p_source].sort(key=_sort_key)
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))

    return _out
Exemplo n.º 3
0
def load_cpdb(__metadata__):
    '''Load gene-to-pathway annotations from the CPDB tab files.

    Reads the mouse, yeast and human ``CPDB_pathways_genes_*.tab`` files
    under ``DATA_FOLDER`` and keeps only the rows whose lower-cased source
    is listed in ``__metadata__['pathway_sources_included']``.

    :param __metadata__: mapping that provides ``'pathway_sources_included'``,
        the collection of (lower-case) pathway source names to import.
    :return: the result of ``dict_convert`` over the per-gene grouping; each
        value is ``{'pathway': {source: ...}}`` whose entries are
        ``{'name': ..., 'id': ...}`` dicts (``'id'`` is left out when the
        file gives the literal string ``'None'``). Lists of entries are
        sorted by ``'id'``.
    '''
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = [
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'),
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'),
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'),
    ]

    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            # KEGG ids carry a 'path:' prefix that is dropped
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                # the last column holds a comma-separated list of genes
                for gene in ld[-1].split(","):
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # 'id' is omitted by _inner_cvt when the file gives 'None';
                # fall back to '' so such entries sort first instead of
                # raising KeyError
                _d[p_source].sort(key=lambda e: e.get("id", ""))
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))

    return _out
Exemplo n.º 4
0
    def load(self, aslist=False):
        '''Load GO annotations from ``self.datafile``.

        Columns 1-7 are read through ``tab2dict`` (key index 0, filtered by
        ``self.species_filter``). Each row is turned into a record with
        ``id`` and ``term`` plus optional ``evidence``, ``qualifier`` and
        ``pubmed`` (fields given as '-' are left out), and records are
        grouped under 'MF', 'BP' or 'CC'.

        :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
            the dictionary itself.
        :return: ``{key: {'go': {...}}}`` or its ``dict_to_list`` form.
        '''
        load_start(self.datafile)
        raw = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7),
                       0,
                       alwayslist=1,
                       includefn=self.species_filter)
        category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

        def _ff(rows):
            grouped = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in rows:
                short_cat = category_d[gocategory]
                rec = {'id': goid, 'term': goterm}
                if evidence != '-':
                    rec['evidence'] = evidence
                if qualifier != '-':
                    # also smooths out inconsistent capitalisation in the
                    # NCBI data:
                    #   Colocalizes_with -> colocalizes_with
                    #   Contributes_to   -> contributes_to
                    #   Not              -> NOT
                    fixed = qualifier.replace('Co', 'co')
                    rec['qualifier'] = fixed.replace('Not', 'NOT')
                if pubmed != '-':
                    # several ids are '|'-separated; a single id stays an int
                    if '|' in pubmed:
                        rec['pubmed'] = [int(pid) for pid in pubmed.split('|')]
                    else:
                        rec['pubmed'] = int(pubmed)
                grouped.setdefault(short_cat, []).append(rec)
            # a category holding one record is collapsed to that record
            return {cat: (recs[0] if len(recs) == 1 else recs)
                    for cat, recs in grouped.items()}

        gene2go = dict_convert(raw, valuefn=_ff)
        gene_d = {gid: {'go': go} for gid, go in gene2go.items()}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
Exemplo n.º 5
0
    def load(self, aslist=False):
        '''Load GO annotations from ``self.datafile``.

        Columns 1-7 are read through ``tab2dict`` (key index 0, filtered by
        ``self.species_filter``). Rows become records grouped by the short
        category code ('MF', 'BP', 'CC'); a category with exactly one record
        holds that record directly rather than a one-element list.

        :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
            the dictionary itself.
        :return: ``{key: {'go': {...}}}`` or its ``dict_to_list`` form.
        '''
        load_start(self.datafile)
        gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0,
                           alwayslist=1, includefn=self.species_filter)
        category_d = {'Function': 'MF',
                      'Process': 'BP',
                      'Component': 'CC'}

        def _build_record(goid, evidence, qualifier, goterm, pubmed):
            # fields given as '-' are treated as absent and left out
            record = dict(id=goid, term=goterm)
            if evidence != '-':
                record['evidence'] = evidence
            if qualifier != '-':
                # normalise capitalisation found in the NCBI data:
                #   Colocalizes_with -> colocalizes_with
                #   Contributes_to   -> contributes_to
                #   Not              -> NOT
                record['qualifier'] = qualifier.replace('Co', 'co').replace('Not', 'NOT')
            if pubmed != '-':
                pids = [int(pid) for pid in pubmed.split('|')]
                # keep a bare int unless the field was '|'-separated
                record['pubmed'] = pids if '|' in pubmed else pids[0]
            return record

        def _ff(d):
            by_category = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
                code = category_d[gocategory]
                record = _build_record(goid, evidence, qualifier, goterm, pubmed)
                if code not in by_category:
                    by_category[code] = []
                by_category[code].append(record)
            for code, records in by_category.items():
                if len(records) == 1:
                    by_category[code] = records[0]
            return by_category

        gene_d = {}
        for gid, go in dict_convert(gene2go, valuefn=_ff).items():
            gene_d[gid] = {'go': go}
        load_done('[%d]' % len(gene_d))

        if not aslist:
            return gene_d
        return dict_to_list(gene_d)
Exemplo n.º 6
0
    def load(self, aslist=False):
        '''Load the "retired" ids recorded in ``self.datafile``.

        Rows whose field 1 is '-' are skipped; when ``self.species_li`` is
        set, rows are also restricted to those whose field 0 (as int) is in
        ``self.taxid_set``. Values are converted to ints and passed through
        ``normalized_value``.

        :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
            the dictionary itself.
        :return: ``{key: {'retired': ...}}`` or its ``dict_to_list`` form.
        '''
        load_start(self.datafile)

        if self.species_li:
            def _keep(ld):
                return int(ld[0]) in self.taxid_set and ld[1] != '-'
        else:
            def _keep(ld):
                return ld[1] != '-'

        def _as_ints(values):
            return normalized_value([int(v) for v in values])

        raw = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                       includefn=_keep)
        gene2retired = dict_convert(raw, valuefn=_as_ints)

        gene_d = {gid: {'retired': retired}
                  for gid, retired in gene2retired.items()}
        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        return gene_d
Exemplo n.º 7
0
    def load(self, aslist=False):
        '''Load the "retired" ids recorded in ``self.datafile``.

        A row is kept only if its field 1 is not '-' and, when
        ``self.species_li`` is set, its field 0 (as int) belongs to
        ``self.taxid_set``. The collected values are cast to int and passed
        through ``normalized_value``.

        :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
            the dictionary itself.
        :return: ``{key: {'retired': ...}}`` or its ``dict_to_list`` form.
        '''
        load_start(self.datafile)
        # decided once, up front, exactly like picking one of two filters
        restrict_species = bool(self.species_li)

        def _includefn(ld):
            if restrict_species and int(ld[0]) not in self.taxid_set:
                return False
            return ld[1] != '-'

        gene2retired = dict_convert(
            tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                     includefn=_includefn),
            valuefn=lambda ids: normalized_value([int(one) for one in ids]))

        gene_d = {}
        for gid in gene2retired:
            gene_d[gid] = {'retired': gene2retired[gid]}
        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
Exemplo n.º 8
0
    def load(self, aslist=False):
        '''Load rna/protein/genomic accessions from ``self.datafile``.

        Columns (1, 3, 5, 7) are read through ``tab2dict`` (key index 0,
        filtered by ``self.species_filter``). For each key the accessions are
        collected per kind, together with a ``translation`` list pairing the
        rna and protein of rows that have both. Each list goes through
        ``normalized_value`` and empty results are dropped.

        :param aslist: when true, return ``dict_to_list(gene2acc)`` instead
            of the dictionary itself.
        :return: mapping of key to ``{self.fieldname: {...}}`` (or to an
            empty dict when nothing was kept), or its ``dict_to_list`` form.
        '''
        load_start(self.datafile)
        raw = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                       includefn=self.species_filter)

        def _ff(rows):
            rna_li, protein_li, genomic_li, translation_li = [], [], [], []
            for rna, prot, dna in rows:
                # '-' marks a missing accession
                rna = None if rna == '-' else rna
                prot = None if prot == '-' else prot
                dna = None if dna == '-' else dna
                if rna is not None:
                    rna_li.append(rna)
                if prot is not None:
                    protein_li.append(prot)
                if dna is not None:
                    genomic_li.append(dna)
                if rna and prot:
                    translation_li.append({'rna': rna, 'protein': prot})

            collected = {
                'rna': rna_li,
                'protein': protein_li,
                'genomic': genomic_li,
                'translation': translation_li,
            }
            # normalise every list, then keep only the non-empty results
            kept = {}
            for field, values in collected.items():
                values = normalized_value(values)
                if values:
                    kept[field] = values
            return {self.fieldname: kept} if kept else kept

        gene2acc = dict_convert(raw, valuefn=_ff)
        load_done('[%d]' % len(gene2acc))

        if aslist:
            return dict_to_list(gene2acc)
        return gene2acc
Exemplo n.º 9
0
    def load(self, aslist=False):
        '''Load version-less rna/protein/genomic accessions.

        Columns (1, 3, 5, 7) are read through ``tab2dict`` (key index 0,
        filtered by ``self.species_filter``). Accessions given as '-' are
        skipped; for the others everything from the first '.' on (the
        version number) is trimmed. Each list goes through
        ``normalized_value`` and empty results are dropped.

        :param aslist: when true, return ``dict_to_list(gene2acc)`` instead
            of the dictionary itself.
        :return: mapping of key to ``{self.fieldname: {...}}`` (or to an
            empty dict when nothing was kept), or its ``dict_to_list`` form.
        '''
        load_start(self.datafile)
        kinds = ('rna', 'protein', 'genomic')

        def _ff(rows):
            collected = {kind: [] for kind in kinds}
            for rna, prot, dna in rows:
                for kind, acc in zip(kinds, (rna, prot, dna)):
                    if acc != '-':
                        # trim version number after dot
                        collected[kind].append(acc.split('.')[0])
            # remove dup
            normalized = {kind: normalized_value(collected[kind])
                          for kind in kinds}
            # remove empty rna/protein/genomic field
            kept = {kind: value for kind, value in normalized.items() if value}
            if not kept:
                return kept
            return {self.fieldname: kept}

        gene2acc = dict_convert(
            tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                     includefn=self.species_filter),
            valuefn=_ff)
        load_done('[%d]' % len(gene2acc))

        return dict_to_list(gene2acc) if aslist else gene2acc
Exemplo n.º 10
0
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''Return a dictionary mapping current/retired geneids to current geneids.

       This is useful when other annotations were mapped to geneids that may
       include retired gene ids.

       If species_li is None (or empty), genes from all species are loaded;
       otherwise each species is looked up in TAXONOMY and only rows with a
       matching taxid are used.

       If load_cache is true and the cache file 'gene/geneid_d.pyobj' exists
       and passes the file_newer checks against both source files, the
       mapping is read from it. If save_cache is true, a freshly built
       mapping is written back to that file.

       The working directory is switched to DATA_FOLDER while running and is
       always restored on exit, including when an exception is raised.

       Raises AssertionError when the cached mapping was built for a
       different set of taxids than the one requested.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = {TAXONOMY[species] for species in species_li}
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)
    try:
        # check cache file
        _cache_file = 'gene/geneid_d.pyobj'
        if load_cache and os.path.exists(_cache_file) and \
           file_newer(_cache_file, 'gene/gene_info.gz') and \
           file_newer(_cache_file, 'gene/gene_history.gz'):

            print('Loading "geneid_d" from cache file...', end='')
            _taxid_set, out_d = loadobj(_cache_file)
            # explicit raise (not ``assert``) so the check survives ``-O``
            if _taxid_set != taxid_set:
                raise AssertionError(
                    'cached geneid_d was built for a different taxid set')
            print('Done.')
            return out_d

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
        load_start(DATAFILE)
        if species_li:
            def species_filter(ld):
                return int(ld[0]) in taxid_set
        else:
            species_filter = None
        geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
        load_done('[%d]' % len(geneid_li))

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
        load_start(DATAFILE)

        # the filter makes sure the taxid is one of species_li (if given) and
        # keeps only rows whose mapped-to geneid exists in the gene_info list
        if species_li:
            def _includefn(ld):
                return int(ld[0]) in taxid_set and ld[1] in geneid_li
        else:
            def _includefn(ld):
                return ld[1] in geneid_li  # include all species
        retired2gene = tab2dict(DATAFILE, (1, 2),
                                1,
                                alwayslist=0,
                                includefn=_includefn)

        load_done('[%d]' % len(retired2gene))
        # convert key/value to int
        out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
        # every current geneid maps to itself
        for g in geneid_li:
            _g = int(g)
            out_d[_g] = _g

        if save_cache:
            # taxid_set is already None when no species were requested
            dump((taxid_set, out_d), _cache_file)

        return out_d
    finally:
        # restore the caller's working directory on every exit path
        os.chdir(orig_cwd)
Exemplo n.º 11
0
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''Return a dictionary mapping current/retired geneids to current geneids.

       This is useful when other annotations were mapped to geneids that may
       include retired gene ids.

       If species_li is None (or empty), genes from all species are loaded;
       otherwise each species is looked up in taxid_d and only rows with a
       matching taxid are used.

       If load_cache is true and the cache file 'gene/geneid_d.pyobj' exists
       and passes the file_newer checks against both source files, the
       mapping is read from it. If save_cache is true, a freshly built
       mapping is written back to that file.

       The working directory is switched to DATA_FOLDER while running and is
       always restored on exit, including when an exception is raised.

       Raises AssertionError when the cached mapping was built for a
       different set of taxids than the one requested.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = {taxid_d[species] for species in species_li}
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)
    try:
        # check cache file
        _cache_file = 'gene/geneid_d.pyobj'
        if load_cache and os.path.exists(_cache_file) and \
           file_newer(_cache_file, 'gene/gene_info.gz') and \
           file_newer(_cache_file, 'gene/gene_history.gz'):

            print('Loading "geneid_d" from cache file...', end='')
            _taxid_set, out_d = loadobj(_cache_file)
            # explicit raise (not ``assert``) so the check survives ``-O``
            if _taxid_set != taxid_set:
                raise AssertionError(
                    'cached geneid_d was built for a different taxid set')
            print('Done.')
            return out_d

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
        load_start(DATAFILE)
        if species_li:
            def species_filter(ld):
                return int(ld[0]) in taxid_set
        else:
            species_filter = None
        geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
        load_done('[%d]' % len(geneid_li))

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
        load_start(DATAFILE)

        # the filter makes sure the taxid is one of species_li (if given) and
        # keeps only rows whose mapped-to geneid exists in the gene_info list
        if species_li:
            def _includefn(ld):
                return int(ld[0]) in taxid_set and ld[1] in geneid_li
        else:
            def _includefn(ld):
                return ld[1] in geneid_li    # include all species
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0, includefn=_includefn)

        load_done('[%d]' % len(retired2gene))

        out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)    # convert key/value to int
        # every current geneid maps to itself
        for g in geneid_li:
            _g = int(g)
            out_d[_g] = _g

        if save_cache:
            # taxid_set is already None when no species were requested
            dump((taxid_set, out_d), _cache_file)

        return out_d
    finally:
        # restore the caller's working directory on every exit path
        os.chdir(orig_cwd)