def load(self, aslist=False):
    '''
    Build ``{'unigene': ...}`` documents from ``self.datafile``.

    Only rows whose first column, converted with ``int()``, is a key of the
    mapping returned by ``get_geneid_d(self.species_li)`` are kept.

    :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
                   the dictionary itself.
    :return: ``{key: {'unigene': value}}`` for every entry produced by
             ``tab2dict`` (or its ``dict_to_list`` form).
    '''
    load_start(self.datafile)
    print()
    known_geneids = get_geneid_d(self.species_li)

    def _is_known_gene(ld):
        # first column is compared (as int) against the known gene ids
        return int(ld[0]) in known_geneids

    unigene_by_gene = tab2dict(self.datafile, (0, 1), 0, alwayslist=0,
                               includefn=_is_known_gene)
    gene_d = {gid: {'unigene': unigene}
              for gid, unigene in unigene_by_gene.items()}
    load_done('[%d]' % len(gene_d))
    return dict_to_list(gene_d) if aslist else gene_d
def load(self, aslist=False):
    '''
    Build per-gene ``{'go': ...}`` documents from ``self.datafile``.

    Each row contributes one record (``id``, ``term`` and, when not ``'-'``,
    ``evidence``, ``qualifier`` and ``pubmed``) that is grouped under
    ``'MF'``, ``'BP'`` or ``'CC'`` according to its category column.
    A category holding exactly one record stores that record directly
    instead of a one-element list.

    :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
                   the dictionary itself.
    :return: ``{key: {'go': {...}}}`` (or its ``dict_to_list`` form).
    :raises KeyError: if a row's category is not 'Function', 'Process'
                      or 'Component'.
    '''
    load_start(self.datafile)
    rows_by_gene = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0,
                            alwayslist=1, includefn=self.species_filter)
    category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

    def _ff(rows):
        grouped = {}
        for goid, evidence, qualifier, goterm, pubmed, gocategory in rows:
            records = grouped.setdefault(category_d[gocategory], [])
            record = {'id': goid, 'term': goterm}
            if evidence != '-':
                record['evidence'] = evidence
            if qualifier != '-':
                # normalize inconsistent capitalization in the NCBI data:
                #   Colocalizes_with -> colocalizes_with
                #   Contributes_with -> contributes_with
                #   Not              -> NOT
                qualifier = qualifier.replace('Co', 'co')
                record['qualifier'] = qualifier.replace('Not', 'NOT')
            if pubmed != '-':
                # several ids are '|'-separated; a lone id stays a plain int
                if '|' in pubmed:
                    record['pubmed'] = [int(pid) for pid in pubmed.split('|')]
                else:
                    record['pubmed'] = int(pubmed)
            records.append(record)
        # unwrap categories that ended up with a single record
        return {cat: recs[0] if len(recs) == 1 else recs
                for cat, recs in grouped.items()}

    go_by_gene = dict_convert(rows_by_gene, valuefn=_ff)
    gene_d = {gid: {'go': go} for gid, go in go_by_gene.items()}
    load_done('[%d]' % len(gene_d))
    return dict_to_list(gene_d) if aslist else gene_d
def load(self, aslist=False):
    '''
    Parse ``self.datafile`` into ``{'go': ...}`` documents, one per key.

    Rows accepted by ``self.species_filter`` are turned into annotation
    records and bucketed by category abbreviation ('MF', 'BP', 'CC').
    Optional columns holding ``'-'`` are left out of the record, and a
    bucket with a single record is stored as that record rather than a list.

    :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
                   the dictionary itself.
    :return: ``{key: {'go': {...}}}`` (or its ``dict_to_list`` form).
    :raises KeyError: if a row's category is not 'Function', 'Process'
                      or 'Component'.
    '''
    load_start(self.datafile)
    category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

    def _as_pubmed(raw):
        # '|'-separated ids become a list of ints, a single id a plain int
        if '|' in raw:
            return [int(pid) for pid in raw.split('|')]
        return int(raw)

    def _ff(rows):
        by_category = {}
        for goid, evidence, qualifier, goterm, pubmed, gocategory in rows:
            short_category = category_d[gocategory]
            annotation = {'id': goid, 'term': goterm}
            if evidence != '-':
                annotation['evidence'] = evidence
            if qualifier != '-':
                # also fixes capitalization inconsistencies in NCBI data:
                #   Colocalizes_with -> colocalizes_with
                #   Contributes_with -> contributes_with
                #   Not -> NOT
                annotation['qualifier'] = \
                    qualifier.replace('Co', 'co').replace('Not', 'NOT')
            if pubmed != '-':
                annotation['pubmed'] = _as_pubmed(pubmed)
            by_category.setdefault(short_category, []).append(annotation)
        for short_category, annotations in by_category.items():
            if len(annotations) == 1:
                # replacing a value (not a key) is safe while iterating
                by_category[short_category] = annotations[0]
        return by_category

    gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0,
                       alwayslist=1, includefn=self.species_filter)
    gene2go = dict_convert(gene2go, valuefn=_ff)
    gene_d = {}
    for gid in gene2go:
        gene_d[gid] = {'go': gene2go[gid]}
    load_done('[%d]' % len(gene_d))
    if not aslist:
        return gene_d
    return dict_to_list(gene_d)
def load(self, aslist=False):
    '''
    Build ``{'retired': ...}`` documents from ``self.datafile``.

    Rows whose second column is ``'-'`` are always skipped.  When
    ``self.species_li`` is truthy, rows are additionally restricted to
    those whose first column (as int) is in ``self.taxid_set``.
    The collected values are converted to ints and passed through
    ``normalized_value``.

    :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
                   the dictionary itself.
    :return: ``{key: {'retired': value}}`` (or its ``dict_to_list`` form).
    '''
    load_start(self.datafile)

    if self.species_li:
        def _includefn(ld):
            return int(ld[0]) in self.taxid_set and ld[1] != '-'
    else:
        def _includefn(ld):
            return ld[1] != '-'

    def _to_int_value(values):
        return normalized_value([int(v) for v in values])

    retired_by_gene = dict_convert(
        tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                 includefn=_includefn),
        valuefn=_to_int_value)
    gene_d = {gid: {'retired': retired}
              for gid, retired in retired_by_gene.items()}
    load_done('[%d]' % len(gene_d))
    return dict_to_list(gene_d) if aslist else gene_d
def load(self, aslist=False):
    '''
    Collect rna/protein/genomic accessions per key from ``self.datafile``.

    For every key the three accession columns are gathered into the lists
    ``rna``, ``protein`` and ``genomic`` (``'-'`` means "no value"), and
    each row that has both an rna and a protein accession also adds a
    ``{'rna': ..., 'protein': ...}`` entry to ``translation``.  Each list
    goes through ``normalized_value``; empty results are dropped and the
    remainder is nested under ``self.fieldname``.

    :param aslist: when true, return ``dict_to_list(gene2acc)`` instead of
                   the dictionary itself.
    :return: ``{key: {self.fieldname: {...}}}`` (``{}`` for a key with no
             accessions at all), or its ``dict_to_list`` form.
    '''
    load_start(self.datafile)
    rows_by_gene = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                            includefn=self.species_filter)

    def _ff(rows):
        collected = {'rna': [], 'protein': [], 'genomic': [],
                     'translation': []}
        for rna, prot, dna in rows:
            # '-' is the placeholder for a missing accession
            rna = None if rna == '-' else rna
            prot = None if prot == '-' else prot
            dna = None if dna == '-' else dna
            if rna is not None:
                collected['rna'].append(rna)
            if prot is not None:
                collected['protein'].append(prot)
            if dna is not None:
                collected['genomic'].append(dna)
            if rna and prot:
                collected['translation'].append({'rna' : rna,
                                                 'protein' : prot})
        # remove dup, then drop fields that end up empty
        kept = {}
        for field, values in collected.items():
            values = normalized_value(values)
            if values:
                kept[field] = values
        return {self.fieldname: kept} if kept else kept

    gene2acc = dict_convert(rows_by_gene, valuefn=_ff)
    load_done('[%d]' % len(gene2acc))
    return dict_to_list(gene2acc) if aslist else gene2acc
def load(self, aslist=False):
    '''
    Collect version-less rna/protein/genomic accessions per key.

    Every accession other than ``'-'`` is truncated at its first ``'.'``
    (dropping the version suffix) and added to the ``rna``, ``protein`` or
    ``genomic`` list.  Each list goes through ``normalized_value``; empty
    results are dropped and the remainder is nested under
    ``self.fieldname``.

    :param aslist: when true, return ``dict_to_list(gene2acc)`` instead of
                   the dictionary itself.
    :return: ``{key: {self.fieldname: {...}}}`` (``{}`` for a key with no
             accessions at all), or its ``dict_to_list`` form.
    '''
    load_start(self.datafile)
    rows_by_gene = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                            includefn=self.species_filter)
    fields = ('rna', 'protein', 'genomic')

    def _ff(rows):
        collected = {field: [] for field in fields}
        for x1, x2, x3 in rows:
            for field, accession in zip(fields, (x1, x2, x3)):
                if accession != '-':
                    # trim version number after dot
                    collected[field].append(accession.split('.')[0])
        # remove dup, then drop fields that end up empty
        kept = {}
        for field, values in collected.items():
            values = normalized_value(values)
            if values:
                kept[field] = values
        if not kept:
            return kept
        return {self.fieldname: kept}

    gene2acc = dict_convert(rows_by_gene, valuefn=_ff)
    load_done('[%d]' % len(gene2acc))
    return dict_to_list(gene2acc) if aslist else gene2acc
def load(self, aslist=False):
    '''
    loading ncbi "gene_info" file
    This must be called first to create basic gene documents
    with all basic fields, e.g., name, symbol, synonyms, etc.

    format of gene_info file:
    #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome
             map_location description type_of_gene
             Symbol_from_nomenclature_authority
             Full_name_from_nomenclature_authority Nomenclature_status
             Other_designations Modification_date
             (tab is used as a separator, pound sign - start of a comment)

    Columns 0-5, 7-9, 13 and 14 of that layout are read; GeneID (column 1)
    is the key of the resulting dictionary.

    :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
                   the dictionary itself.
    :return: ``{geneid: doc}`` where ``doc`` always has ``taxid``,
             ``symbol``, ``name``, ``_timestamp`` and ``entrezgene``, plus
             the optional fields and one entry per kept dbXref
             (or the ``dict_to_list`` form of that mapping).
    :raises ValueError: if a dbXref does not split into exactly
                        ``db:id`` (the offending value is printed first),
                        or if the modification date is not ``%Y%m%d``.
    '''
    load_start(self.datafile)
    gene_d = tab2dict(self.datafile,
                      (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                      key=1, alwayslist=0, includefn=self.species_filter)

    def _ff(d):
        (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
         description, type_of_gene, other_designations,
         modification_date) = d
        out = dict(taxid=int(taxid), symbol=symbol, name=description)
        # '-' marks an empty column in the source file
        if map_location != '-':
            out['map_location'] = map_location
        if type_of_gene != '-':
            out['type_of_gene'] = type_of_gene
        if synonyms != '-':
            out['alias'] = normalized_value(synonyms.split('|'))
        if locus_tag != '-':
            out['locus_tag'] = locus_tag
        if other_designations != "-":
            out['other_names'] = normalized_value(
                other_designations.split('|'))
        # when merged, this will become the default timestamp
        out["_timestamp"] = datetime.datetime.strptime(
            modification_date, "%Y%m%d")

        for x in dbxrefs.split('|'):
            if x == '-':
                continue
            xd = x.split(':')
            if len(xd) == 3 and xd[0] == xd[1] and \
                    xd[0] in ('VGNC', 'HGNC', 'MGI'):
                # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                xd = xd[1:]
            try:
                _db, _id = xd
            except ValueError:
                # unpacking a list of the wrong length is the only failure
                # possible here; show the raw xref before propagating
                print(repr(x))
                raise
            # we don't need ensembl xref from here, we will get it from
            # Ensembl directly; we don't need 'IMGT/GENE-DB' xref either,
            # because they are mostly the same as gene symbol
            if _db.lower() in ('ensembl', 'imgt/gene-db'):
                continue
            # add "MGI:" prefix for MGI ids.
            if _db.lower() == 'mgi':
                _id = "MGI:" + _id
            out[_db] = _id
        return out

    gene_d = value_convert(gene_d, _ff)

    # add entrezgene field
    for geneid in gene_d:
        d = gene_d[geneid]
        d['entrezgene'] = int(geneid)
        gene_d[geneid] = d

    load_done('[%d]' % len(gene_d))

    if aslist:
        return dict_to_list(gene_d)
    return gene_d
def load(self, aslist=False):
    '''
    loading ncbi "gene_info" file
    This must be called first to create basic gene documents
    with all basic fields, e.g., name, symbol, synonyms, etc.

    format of gene_info file:
    #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome
             map_location description type_of_gene
             Symbol_from_nomenclature_authority
             Full_name_from_nomenclature_authority Nomenclature_status
             Other_designations Modification_date
             (tab is used as a separator, pound sign - start of a comment)

    Columns 0, 1, 2, 4, 5, 7, 8 and 9 of that layout are read; GeneID
    (column 1) is the key of the resulting dictionary.

    :param aslist: when true, return ``dict_to_list(gene_d)`` instead of
                   the dictionary itself.
    :return: ``{geneid: doc}`` where ``doc`` always has ``taxid``,
             ``symbol``, ``name`` and ``entrezgene``, plus the optional
             fields and one entry per kept dbXref (or the
             ``dict_to_list`` form of that mapping).
    :raises ValueError: if a dbXref does not split into exactly
                        ``db:id`` (the offending value is printed first).
    '''
    load_start(self.datafile)
    gene_d = tab2dict(self.datafile, (0, 1, 2, 4, 5, 7, 8, 9), key=1,
                      alwayslist=0, includefn=self.species_filter)

    def _ff(d):
        (
            taxid, symbol, synonyms,
            dbxrefs, map_location,
            description, type_of_gene
        ) = d
        out = dict(taxid=int(taxid), symbol=symbol, name=description)
        # '-' marks an empty column in the source file
        if map_location != '-':
            out['map_location'] = map_location
        if type_of_gene != '-':
            out['type_of_gene'] = type_of_gene
        if synonyms != '-':
            out['alias'] = normalized_value(synonyms.split('|'))

        for x in dbxrefs.split('|'):
            if x == '-':
                continue
            xd = x.split(':')
            if len(xd) == 3 and xd[0] == xd[1] and xd[0] in ('HGNC', 'MGI'):
                # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                xd = xd[1:]
            try:
                _db, _id = xd
            except ValueError:
                # unpacking a list of the wrong length is the only failure
                # possible here; show the raw xref before propagating
                print(x)
                raise
            if _db.lower() in ('ensembl', 'imgt/gene-db'):
                # we don't need ensembl xref from here, we will get it
                # from Ensembl directly; we don't need 'IMGT/GENE-DB' xref
                # either, because they are mostly the same as gene symbol
                continue
            if _db.lower() == 'mgi':
                # add "MGI:" prefix for MGI ids.
                _id = "MGI:" + _id
            out[_db] = _id
        return out

    gene_d = value_convert(gene_d, _ff)

    # add entrezgene field
    for geneid in gene_d:
        d = gene_d[geneid]
        d['entrezgene'] = int(geneid)
        gene_d[geneid] = d

    load_done('[%d]' % len(gene_d))

    if aslist:
        return dict_to_list(gene_d)
    return gene_d