def load_ensembl2interpro(self):
    """Yield entrez-keyed documents mapping genes to InterPro annotations."""
    datafile = os.path.join(self.data_folder,
                            'gene_ensembl__prot_interpro__dm.txt')
    for chunk in tab2dict_iter(datafile, (1, 4, 5, 6), 0):
        chunk = dict_nodup(chunk)
        # optimize with on call/convert: one dict per interpro record
        chunk = value_convert(
            chunk,
            lambda x: {'id': x[0], 'short_desc': x[1], 'desc': x[2]})
        # __aslistofdict__ : merge to 'interpro' as list of dict, not merging
        # keys as list (these are merging instructions for later called
        # merge_struct), i.e.
        #   'interpro': {'a': 1, 'b': 2} and 'interpro': {'a': 3, 'b': 4}
        # should result in:
        #   'interpro': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
        # and NOT:
        #   'interpro': {'a': [1, 3], 'b': [2, 4]}
        chunk = value_convert(
            chunk,
            lambda x: {'interpro': x, '__aslistofdict__': 'interpro'},
            traverse_list=False)
        for doc in map_id(chunk, self.ensembl2entrez):
            yield doc
def load_ensembl_main(self):
    """loading ensembl gene to symbol+name mapping

    Yields one document per Ensembl gene, skipping LRG entries and
    entirely-numeric ids (those would collide with entrez gene ids);
    raises ValueError if more than ERR_THRESHOLD numeric ids are seen.
    """
    def _fn(x):
        # x: (taxid, symbol, description) columns; '\\N' marks NULL
        # NOTE: removed an unused `import logging` that was here
        out = {'taxid': int(x[0])}
        if x[1].strip() not in ['', '\\N']:
            out['symbol'] = x[1].strip()
        if x[2].strip() not in ['', '\\N']:
            # strip the trailing " [Source:...]" annotation from the name
            _name = SubStr(x[2].strip(), '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    skip_count = 0
    datafile = os.path.join(
        self.data_folder, 'gene_ensembl__gene__main.txt')
    for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1,
                                  includefn=_not_LRG):
        datadict = value_convert(datadict, _fn)
        for id, doc in datadict.items():
            if id.isdigit():
                # tolerate a few all-digit ids, but bail out if the file
                # looks systematically wrong
                if skip_count < ERR_THRESHOLD:
                    skip_count += 1
                else:
                    raise ValueError('Too many ensembl ids are entirely numeric')
                self.logger.warning(
                    "Document Skipped: All-digit id {}".format(id))
                continue
            doc['_id'] = id
            yield doc
def load(self):
    """Yield one generif document per gene id."""
    total = 0
    for chunk in tab2dict_iter(self.datafile, (1, 2, 4), 0, alwayslist=1):
        chunk = dict_convert(
            chunk,
            valuefn=lambda vals: {
                'generif': [
                    dict(pubmed=self._cvt_pubmed(rec[0]), text=rec[1])
                    for rec in vals
                ]
            })
        for gene_id, doc in chunk.items():
            total += 1
            doc['_id'] = gene_id
            yield doc
def load_ensembl2pfam(self):
    """Yield entrez-keyed documents mapping genes to Pfam accessions."""
    # (the previous "# Prosite" comment was wrong: this loads the Pfam dump)
    datafile = os.path.join(self.data_folder,
                            'gene_ensembl__prot_pfam__dm.txt')
    for chunk in tab2dict_iter(datafile, (1, 4), 0):
        chunk = dict_nodup(chunk)
        chunk = value_convert(chunk, lambda x: {'pfam': x},
                              traverse_list=False)
        for doc in map_id(chunk, self.ensembl2entrez):
            yield doc
def load(self, aslist=False):
    """Parse the gene2go file and yield {'_id': geneid, 'go': {...}} docs.

    GO annotations are grouped under their category abbreviation
    (MF/BP/CC); a category with a single annotation is collapsed from a
    one-element list to a bare dict.
    """
    gene2go = tab2dict_iter(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0,
                            alwayslist=1, includefn=self.species_filter)
    category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

    def _ff(d):
        out = {}
        for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
            _gocategory = category_d[gocategory]
            _d = out.get(_gocategory, [])
            _rec = dict(id=goid, term=goterm)
            # FIX: use one consistent key for the category abbreviation.
            # Previously 'Function' records got {'category': 'MF'} while
            # 'Process'/'Component' got {'gocategory': ...}; unify on
            # 'gocategory' (the majority key), reusing the lookup above.
            _rec['gocategory'] = _gocategory
            if evidence != '-':
                _rec['evidence'] = evidence
            if qualifier != '-':
                # here I also fixing some inconsistency issues in NCBI data
                # Colocalizes_with -> colocalizes_with
                # Contributes_with -> contributes_with
                # Not -> NOT
                _rec['qualifier'] = qualifier.replace('Co', 'co').replace(
                    'Not', 'NOT')
            if pubmed != '-':
                # multiple pubmed ids are '|'-separated
                if pubmed.find('|') != -1:
                    pubmed = [int(pid) for pid in pubmed.split('|')]
                else:
                    pubmed = int(pubmed)
                _rec['pubmed'] = pubmed
            _d.append(_rec)
            out[_gocategory] = _d
        # collapse single-record categories to a bare dict
        for k in out:
            if len(out[k]) == 1:
                out[k] = out[k][0]
        return out

    for gd in gene2go:
        convd = dict_convert(gd, valuefn=_ff)
        assert len(list(
            convd.items())) == 1, "nope: %s" % list(convd.items())
        gid, go = list(convd.items())[0]
        gene_d = {"_id": gid, "go": go}
        yield gene_d
def load_ensembl2pos(self):
    """Yield entrez-keyed documents with genomic position info per gene.

    Column 1 is requested twice on purpose: once as the dict key, and
    once so the ensembl gene id is available inside 'genomic_pos'.
    """
    datafile = os.path.join(
        self.data_folder, 'gene_ensembl__gene__main.txt')
    # FIX: removed a dead non-iterative tab2dict() pass that built an
    # 'ensembl2pos' dict which was never used afterwards and loaded the
    # whole file into memory; the streaming loop below is the real work.
    for datadict in tab2dict_iter(datafile, (1, 1, 3, 4, 5, 6), 0,
                                  includefn=_not_LRG):
        datadict = dict_nodup(datadict)
        datadict = value_convert(
            datadict,
            lambda x: {'ensemblgene': x[0], 'chr': x[3],
                       'start': int(x[1]), 'end': int(x[2]),
                       'strand': int(x[4])})
        # '__aslistofdict__': merge repeated positions as a list of dicts
        # (instruction consumed by a later merge_struct call)
        datadict = value_convert(
            datadict,
            lambda x: {'genomic_pos': x, '__aslistofdict__': 'genomic_pos'},
            traverse_list=False)
        for doc in map_id(datadict, self.ensembl2entrez):
            yield doc
def load(self, aslist=False):
    """Parse an accession mapping file and yield formatted documents.

    `aslist` is kept for interface compatibility; this method is a
    generator and always yields documents.
    """
    gene2acc = tab2dict_iter(self.datafile, (1, 3, 5, 7), 0,
                             alwayslist=1, includefn=self.species_filter)

    def _ff(d):
        out = {'rna': [], 'protein': [], 'genomic': [], 'translation': []}
        for rna, prot, dna in d:
            # '-' marks a missing accession
            if rna == '-':
                rna = None
            if prot == '-':
                prot = None
            if dna == '-':
                dna = None
            if rna is not None:
                out['rna'].append(rna)
            if prot is not None:
                out['protein'].append(prot)
            if dna is not None:
                out['genomic'].append(dna)
            if rna and prot:
                out['translation'].append({'rna': rna, 'protein': prot})
        # remove dup
        for k in out:
            out[k] = normalized_value(out[k])
        # remove empty rna/protein/genomic field
        _out = {}
        for k, v in out.items():
            if v:
                _out[k] = v
        if _out:
            _out = {self.fieldname: _out}
        return _out

    # FIX: removed commented-out code, an unused counter, and dead
    # trailing `return` statements — inside a generator they only set
    # StopIteration.value on an already-exhausted iterator and were
    # never observable by callers.
    for gd in gene2acc:
        yield self.format(dict_convert(gd, valuefn=_ff))
def load_data(data_folder):
    """Return a generator of Reactome pathway documents keyed by NCBI gene id."""
    datafile = os.path.join(data_folder, 'NCBI2Reactome_All_Levels.txt')
    rows = tab2dict_iter(datafile, (0, 1, 3), 0, header=0, alwayslist=True)

    def convert(source):
        for entry in source:
            # tab2dict_iter emits single-key dicts: {gene_id: [(id, name), ...]}
            assert len(entry) == 1
            gene_id, values = next(iter(entry.items()))
            pathways = [{"id": val[0], "name": val[1]} for val in values]
            # a single pathway is stored as a bare dict, not a list
            if len(pathways) == 1:
                pathways = pathways.pop()
            yield {"_id": gene_id, "pathway": {"reactome": pathways}}

    return convert(rows)
def load_ensembl_main(self):
    """loading ensembl gene to symbol+name mapping"""
    def _fn(x):
        # x: (taxid, symbol, description) columns; '\\N' marks NULL
        # FIX: removed an unused `import logging` that was here
        out = {'taxid': int(x[0])}
        if x[1].strip() not in ['', '\\N']:
            out['symbol'] = x[1].strip()
        if x[2].strip() not in ['', '\\N']:
            # strip the trailing " [Source:...]" annotation from the name
            _name = SubStr(x[2].strip(), '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    datafile = os.path.join(self.data_folder,
                            'gene_ensembl__gene__main.txt')
    for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1,
                                  includefn=_not_LRG):
        datadict = value_convert(datadict, _fn)
        for id, doc in datadict.items():
            doc['_id'] = id
            yield doc
def load(self, aslist=False):
    """Yield formatted gene->unigene mapping documents for the configured species."""
    uni_d = tab2dict(self.datafile, (0, 1), 0, alwayslist=0)
    history_file = os.path.join(self.data_folder, 'gene_history.gz')
    retired2gene = tab2dict(history_file, (1, 2), 1, alwayslist=0,
                            includefn=lambda ld: ld[1] != '-')
    # re-key retired gene ids to their replacement ids; iterate over a
    # snapshot of the keys because the dict is mutated in the loop
    for gene_id in list(uni_d.keys()):
        uni_d[retired2gene.get(gene_id, gene_id)] = uni_d[gene_id]
    geneid_d = get_geneid_d(self.data_folder, self.species_li,
                            load_cache=False, save_cache=False,
                            only_for=uni_d)
    gene2unigene = tab2dict_iter(
        self.datafile, (0, 1), 0, alwayslist=0,
        includefn=lambda ld: int(ld[0]) in geneid_d)
    seen = 0
    for doc in gene2unigene:
        yield self.format(doc)
        seen += 1
def load(self, aslist=False):
    '''
    loading ncbi "gene_info" file

    This must be called first to create basic gene documents
    with all basic fields, e.g., name, symbol, synonyms, etc.

    format of gene_info file:
    #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs
             map_location description type_of_gene
             Symbol_from_nomenclature_authority
             Full_name_from_nomenclature_authority Nomenclature_status
             Other_designations Modification_date
    (tab is used as a separator, pound sign - start of a comment)
    '''
    gene_d = tab2dict_iter(self.datafile,
                           (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                           key=1, alwayslist=0,
                           includefn=self.species_filter)

    def _ff(d):
        (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
         description, type_of_gene, other_designations,
         modification_date) = d
        out = dict(taxid=int(taxid), symbol=symbol, name=description)
        # '-' marks a missing value in gene_info columns
        if map_location != '-':
            out['map_location'] = map_location
        if type_of_gene != '-':
            out['type_of_gene'] = type_of_gene
        if synonyms != '-':
            out['alias'] = normalized_value(synonyms.split('|'))
        if locus_tag != '-':
            out['locus_tag'] = locus_tag
        if other_designations != "-":
            out['other_names'] = normalized_value(
                other_designations.split('|'))
        ### when merged, this will become the default timestamp
        ### as of 2017/12/10, some timestamps can have different formats
        ##if len(modification_date) > 8:
        ##    out["_timestamp"] = datetime.datetime.strptime(modification_date,"%m/%d/%Y %H:%M:%S")
        ##else:
        ##    out["_timestamp"] = datetime.datetime.strptime(modification_date,"%Y%m%d")
        for x in dbxrefs.split('|'):
            if x == '-':
                continue
            xd = x.split(':')
            # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
            if len(xd) == 3 and xd[0] == xd[1] and \
                    xd[0] in ['VGNC', 'HGNC', 'MGI']:
                xd = xd[1:]
            try:
                _db, _id = xd
            # FIX: narrowed from a bare `except:` — only the unpacking
            # above can fail here, and only with ValueError; a bare
            # except would also swallow KeyboardInterrupt/SystemExit
            except ValueError:
                print(repr(x))
                raise
            # we don't need ensembl xref from here, we will get it from
            # Ensembl directly
            if _db.lower() in ['ensembl', 'imgt/gene-db']:
                # we don't need 'IMGT/GENE-DB" xref either, because they
                # are mostly the same as gene symbol
                continue
            # add "MGI:" prefix for MGI ids.
            if _db.lower() == 'mgi':
                _id = "MGI:" + _id
            out[_db] = _id
        return out

    # add entrezgene field
    cnt = 0
    for d in gene_d:
        d = value_convert(d, _ff)
        yield self.format(d)
        cnt += 1