def load_x(idx, fieldname, cvt_fn=None):
    """Build a gene id -> {fieldname: value(s)} mapping from idmapping_selected.tab.gz.

    idx: column index of the target value; it is passed to listitems() together
        with columns 2 and 19 (GeneID and Ensembl gene, per the inline comment).
    fieldname: key under which the value(s) are stored for each gene.
    cvt_fn: optional callable applied to every non-empty target value.

    Rows carrying an Entrez id are keyed by it.  Rows with only an Ensembl id
    are keyed by the Entrez id(s) seen for that Ensembl id elsewhere in the
    file, or by the Ensembl id itself when no such pairing exists.
    List values are sorted before being wrapped.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, 2, 19, idx)   # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            xli.append(value)

    # Ensembl gene id -> list of Entrez ids, from rows where both are present.
    ensembl2geneid = list2dict(
        list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']),
        0, alwayslist=True)

    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(xli2), 0)

    def fn(value):
        # isinstance() works on both Python 2 and 3; types.ListType is
        # Python-2-only.
        return {fieldname: sorted(value) if isinstance(value, list) else value}

    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))
    return gene2x
def loaddata():
    """Load the pigatlas Snowball array annotation file.

    Returns a dict with the single key 'snowball' whose value is what
    tab2dict() builds from columns (0, 1) of the annotation file.
    """
    # Snowball array
    annotation_path = os.path.join(DATA_FOLDER, 'pigatlas',
                                   'snowball_array_annotation.txt')
    load_start(annotation_path)
    snowball_map = tab2dict(annotation_path, (0, 1), 1, header=0)
    load_done('[%d]' % len(snowball_map))

    result = {}
    result['snowball'] = snowball_map
    return result
def load(self):
    """Load self.datafile into a mapping of key -> {'generif': [...]}.

    Each entry of the 'generif' list is a dict with a 'pubmed' value (passed
    through self._cvt_pubmed) and a 'text' value, taken from the first and
    second item of each row returned by tab2dict().
    """
    load_start(self.datafile)
    raw_rows = tab2dict(self.datafile, (1, 2, 4), 0, alwayslist=1)

    def _to_generif(rows):
        entries = []
        for row in rows:
            entries.append({'pubmed': self._cvt_pubmed(row[0]),
                            'text': row[1]})
        return {'generif': entries}

    converted = dict_convert(raw_rows, valuefn=_to_generif)
    load_done('[%d]' % len(converted))
    return converted
def load_cpdb():
    """Parse the three CPDB pathway files into a gene -> {'pathway': {...}} dict.

    Each data row is expected to end with the four tab-separated columns
    pathway name, external id, source, comma-separated gene ids.  For every
    gene the result holds ``{'pathway': {<source lowercased>: {'name': ..,
    'id': ..}}}``.  An id equal to the literal string "None" is not stored,
    and a gene is only created when its first pathway has a real id.

    Compared with the earlier implementation: the input files are closed
    after reading, the header token "entrez_gene_ids" is no longer stored as
    a gene, and empty lines are skipped instead of raising IndexError.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = [
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'),
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'),
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'),
    ]

    arr = {}
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        with open(DATA_FILE, "r") as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    continue
                cols = line.split("\t")
                p_name, p_id, p_source = cols[-4], cols[-3], cols[-2]
                source_key = p_source.lower()
                for gene in cols[-1].split(","):
                    if gene == "entrez_gene_ids":
                        # header row, not a gene
                        continue
                    if gene in arr:
                        pathways = arr[gene]['pathway']
                        # NOTE(review): the membership test uses the raw
                        # source while keys are lowercased, so any source
                        # containing an uppercase letter resets the entry.
                        # Kept as is to preserve the existing output.
                        if p_source not in pathways:
                            pathways[source_key] = {'name': ''}
                        pathways[source_key]['name'] = p_name
                        if p_id != "None":
                            if source_key == "kegg":
                                pathways[source_key]['id'] = p_id.replace("path:", "")
                            else:
                                pathways[source_key]['id'] = p_id
                    elif p_id != "None":
                        arr[gene] = {
                            'pathway': {
                                source_key: {
                                    'name': p_name,
                                    'id': p_id.replace("path:", ""),
                                }
                            }
                        }
    load_done('[%d]' % len(arr))
    return arr
def load_broadinstitute_exac():
    """Merge the three ExAC loaders and re-key the result by gene id.

    Starts from load_broadinstitute_exac_all(), folds in the "nontcga" and
    "nonpsych" sub-documents, then walks the Ensembl translation table and
    moves every entry whose key matches a transcript id to the Entrez id(s)
    mapped to that Ensembl gene (or to the Ensembl gene id itself when there
    is no Entrez mapping).
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()

    def _merge_subset(subset_docs, subset_name):
        # Copy doc["exac"][subset_name] into the matching entry; on any
        # missing key the whole subset document replaces/creates the entry.
        for doc_key, doc in subset_docs.items():
            try:
                exacs[doc_key]["exac"][subset_name] = doc["exac"][subset_name]
            except KeyError:
                exacs[doc_key] = doc

    _merge_subset(load_broadinstitute_exac_nontcga(), "nontcga")
    _merge_subset(load_broadinstitute_exac_nonpsych(), "nonpsych")

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ensembl_parser = ensembl_base.EnsemblParser()
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0,
                               alwayslist=True)

    translation_file = os.path.join(get_data_folder("ensembl"),
                                    "gene_ensembl__translation__main.txt")
    for row in tabfile_feeder(translation_file):
        _, ensid, transid, _ = row
        if transid not in exacs:
            continue
        # pop so no-match means no data in the end
        data = exacs.pop(transid)
        for entrezid in ensembl2entrez.get(ensid, [ensid]):
            exacs[entrezid] = data

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))
    return exacs
def _load_ensembl2entrez_li(self):
    """Read gene_ensembl__xref_entrezgene__dm.txt into self.ensembl2entrez_li.

    The result of tab2list() on columns (1, 2), filtered by _not_LRG, is
    stored on the instance; nothing is returned.
    """
    xref_path = os.path.join(DATA_FOLDER,
                             'gene_ensembl__xref_entrezgene__dm.txt')
    load_start(xref_path)
    # [(ensembl_gid, entrez_gid),...]
    id_pairs = tab2list(xref_path, (1, 2), includefn=_not_LRG)
    load_done('[%d]' % len(id_pairs))
    self.ensembl2entrez_li = id_pairs
def load_ensembl2acc(self):
    """Load Ensembl gene -> transcript/protein accessions.

    Reads gene_ensembl__translation__main.txt and builds, per Ensembl gene,
    ``{'ensembl': {'gene': <id>, 'transcript': ..., 'protein': ...}}``.
    Empty values and the '\\N' placeholder are skipped.  The mapping is then
    handed to self.convert2entrez() and its result returned.
    """
    # Loading all ensembl GeneIDs, TranscriptIDs and ProteinIDs
    DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
    load_start(DATAFILE)
    ensembl2acc = tab2dict(DATAFILE, (1, 2, 3), 0, includefn=_not_LRG)

    def _present(value):
        # True for a real accession: non-empty and not the '\N' placeholder.
        return bool(value) and value != '\\N'

    def _fn(x, eid):
        out = {'gene': eid}
        if isinstance(x, list):
            transcript_li = []
            protein_li = []
            for _x in x:
                if _present(_x[0]):
                    transcript_li.append(_x[0])
                # The protein check must look at the protein column itself;
                # it previously tested the transcript column for emptiness.
                if _present(_x[1]):
                    protein_li.append(_x[1])
            if transcript_li:
                out['transcript'] = normalized_value(transcript_li)
            if protein_li:
                out['protein'] = normalized_value(protein_li)
        else:
            if _present(x[0]):
                out['transcript'] = x[0]
            if _present(x[1]):
                out['protein'] = x[1]
        return out

    for k in ensembl2acc:
        ensembl2acc[k] = {'ensembl': _fn(ensembl2acc[k], k)}

    load_done('[%d]' % len(ensembl2acc))
    return self.convert2entrez(ensembl2acc)
def load_ensembl2pfam(self):
    """Load gene_ensembl__prot_pfam__dm.txt as gene -> {'pfam': value}.

    The de-duplicated tab2dict() result (columns 1 and 4) is wrapped under
    the 'pfam' key and passed through self.convert2entrez().
    """
    # Pfam
    pfam_path = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
    load_start(pfam_path)
    raw_map = tab2dict(pfam_path, (1, 4), 0)
    ensembl2pfam = dict_nodup(raw_map)

    def _wrap(value):
        return {'pfam': value}

    ensembl2pfam = value_convert(ensembl2pfam, _wrap, traverse_list=False)
    load_done('[%d]' % len(ensembl2pfam))
    return self.convert2entrez(ensembl2pfam)
def load_ensembl2pos(self):
    """Load genomic positions from gene_ensembl__gene__main.txt.

    Every value becomes ``{'chr', 'start', 'end', 'strand'}`` (start, end and
    strand converted with int()) and is then wrapped under 'genomic_pos'.
    The mapping is returned after self.convert2entrez().
    """
    # Genomic position
    gene_main_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(gene_main_path)
    ensembl2pos = dict_nodup(
        tab2dict(gene_main_path, (1, 3, 4, 5, 6), 0, includefn=_not_LRG))

    def _to_position(row):
        # row items, in order: start, end, chr, strand
        return {'chr': row[2],
                'start': int(row[0]),
                'end': int(row[1]),
                'strand': int(row[3])}

    def _wrap(value):
        return {'genomic_pos': value}

    ensembl2pos = value_convert(ensembl2pos, _to_position)
    ensembl2pos = value_convert(ensembl2pos, _wrap, traverse_list=False)
    load_done('[%d]' % len(ensembl2pos))
    return self.convert2entrez(ensembl2pos)
def load_ensembl2interpro(self):
    """Load gene_ensembl__prot_interpro__dm.txt as gene -> {'interpro': ...}.

    Each value is turned into ``{'id', 'short_desc', 'desc'}`` from its first
    three items, wrapped under 'interpro', and the mapping is returned after
    self.convert2entrez().
    """
    # Interpro
    interpro_path = os.path.join(DATA_FOLDER,
                                 'gene_ensembl__prot_interpro__dm.txt')
    load_start(interpro_path)
    ensembl2interpro = dict_nodup(tab2dict(interpro_path, (1, 4, 5, 6), 0))

    def _to_record(row):
        return {'id': row[0], 'short_desc': row[1], 'desc': row[2]}

    def _wrap(value):
        return {'interpro': value}

    ensembl2interpro = value_convert(ensembl2interpro, _to_record)
    ensembl2interpro = value_convert(ensembl2interpro, _wrap,
                                     traverse_list=False)
    load_done('[%d]' % len(ensembl2interpro))
    return self.convert2entrez(ensembl2interpro)
def _load_ensembl_2taxid(self):
    """Return the ensembl2taxid mapping with values converted to int.

    Built from columns (0, 1) of gene_ensembl__translation__main.txt,
    filtered by _not_LRG and de-duplicated with dict_nodup().
    """
    translation_path = os.path.join(DATA_FOLDER,
                                    'gene_ensembl__translation__main.txt')
    load_start(translation_path)
    raw_map = tab2dict(translation_path, (0, 1), 1, includefn=_not_LRG)
    ensembl2taxid = dict_nodup(raw_map)
    # need to convert taxid to integer here
    ensembl2taxid = value_convert(ensembl2taxid, int)
    load_done('[%d]' % len(ensembl2taxid))
    return ensembl2taxid
def load_pharmgkb():
    """Load 'genes.tsv' from genes.zip as gene -> {'pharmgkb': value}.

    Rows whose second column is empty are excluded before tab2dict() builds
    the mapping from columns (0, 1).
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    archive_path = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(archive_path)

    def _second_column_present(ld):
        return ld[1] != ''

    def _wrap(value):
        return {'pharmgkb': value}

    gene2pharmgkb = tab2dict((archive_path, 'genes.tsv'), (0, 1), 1,
                             header=1, includefn=_second_column_present)
    gene2pharmgkb = value_convert(gene2pharmgkb, _wrap, traverse_list=False)
    load_done('[%d]' % len(gene2pharmgkb))
    return gene2pharmgkb
def loaddata():
    """Load every annotation file listed in AFFY_ANNOT_FILES.

    Each entry's 'file' template is filled with AFFY_RELEASE, resolved
    under AFFY_DATA_FOLDER and parsed with _load_affy().  Returns a dict
    keyed by the entry's 'name'.
    """
    result = {}
    for entry in AFFY_ANNOT_FILES:
        array_name = entry['name']
        annot_path = os.path.join(AFFY_DATA_FOLDER,
                                  entry['file'] % AFFY_RELEASE)
        load_start(annot_path)
        parsed = _load_affy(annot_path)
        result[array_name] = parsed
        load_done('[%d]' % len(parsed))
    return result
def load_ucsc_exons():
    """Load exon data for every entry found directly under DATA_FOLDER.

    Each directory entry name is treated as a species and passed to
    load_exons_for_species(); the per-species dicts are merged with
    dict.update(), so later species overwrite earlier ones on key clashes.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    species_li = os.listdir(DATA_FOLDER)
    # Single-argument print(...) emits the same text under both the Python 2
    # print statement and the Python 3 print function.
    print("Found {} species folders.".format(len(species_li)))
    t0 = time.time()
    gene2exons = {}
    for species in species_li:
        print('%s ...' % (species,))
        gene2exons.update(load_exons_for_species(species))

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
    return gene2exons
def loaddata():
    """Load the GNF1H and GNF1M annotation tables.

    Both files live under DATA_FOLDER/gnf and are read with tab2dict() on
    columns (0, 5), keeping only rows that have a non-empty sixth column.
    Returns ``{'GNF1H': ..., 'GNF1M': ...}``.
    """
    def _sixth_column_present(ld):
        return len(ld) > 5 and ld[5] != ''

    sources = (
        ('GNF1H', 'GNF1H.ANNO7.LOAD_20130402.tab'),        # GNF1H
        ('GNF1M', 'gnf1m.NEW_ANNO6.LOAD_20130402.tab'),    # GNF1m
    )

    result = {}
    for label, filename in sources:
        annot_path = os.path.join(DATA_FOLDER, 'gnf', filename)
        load_start(annot_path)
        mapping = tab2dict(annot_path, (0, 5), 1, header=0,
                           includefn=_sixth_column_present)
        load_done('[%d]' % len(mapping))
        result[label] = mapping
    return result
def load(self, aslist=False):
    """Load self.datafile as gene id -> {'unigene': value}.

    Only rows whose first column, as an int, is a key of the mapping from
    get_geneid_d(self.species_li) are kept.  With aslist=True the result is
    passed through dict_to_list() before being returned.
    """
    load_start(self.datafile)
    print()
    known_ids = get_geneid_d(self.species_li)

    def _is_known_gene(ld):
        return int(ld[0]) in known_ids

    gene2unigene = tab2dict(self.datafile, (0, 1), 0, alwayslist=0,
                            includefn=_is_known_gene)
    gene_d = dict((gid, {'unigene': unigene})
                  for gid, unigene in gene2unigene.items())
    load_done('[%d]' % len(gene_d))

    return dict_to_list(gene_d) if aslist else gene_d
def load(self, aslist=False):
    """Load gene summaries from self.datafile (two tab-separated columns).

    Each line must hold exactly ``geneid<TAB>summary``; only the first
    line seen for a gene id is kept.

    Returns a list of ``{'_id': geneid, 'summary': text}`` documents when
    aslist is true, otherwise a dict of those documents keyed by '_id'.
    """
    # `unicode` only exists on Python 2; on Python 3 `str` is the text type.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    load_start(self.datafile)
    # open() replaces the Python-2-only file() builtin.
    with open(self.datafile) as df:
        geneid_set = set()
        doc_li = []
        for line in df:
            geneid, summary = line.strip().split('\t')
            if geneid not in geneid_set:
                doc_li.append(dict(_id=geneid, summary=text_type(summary)))
                geneid_set.add(geneid)
    load_done('[%d]' % len(doc_li))

    if aslist:
        return doc_li
    else:
        gene_d = dict([(d['_id'], d) for d in doc_li])
        return gene_d
def _load_ensembl2name(self):
    """Load the Ensembl gene -> symbol/name mapping.

    Values come from columns (1, 2, 7) of gene_ensembl__gene__main.txt.  The
    stripped first item becomes 'symbol' and the stripped second item,
    reduced with SubStr(..., '', ' [Source:'), becomes 'name'.  Empty
    strings and the '\\N' placeholder are skipped.
    """
    gene_main_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(gene_main_path)
    ensembl2name = tab2dict(gene_main_path, (1, 2, 7), 0, includefn=_not_LRG)

    placeholders = ['', '\\N']

    def _fn(x):
        out = {}
        symbol = x[0].strip()
        if symbol not in placeholders:
            out['symbol'] = symbol
        description = x[1].strip()
        if description not in placeholders:
            _name = SubStr(description, '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    ensembl2name = value_convert(ensembl2name, _fn)
    load_done('[%d]' % len(ensembl2name))
    return ensembl2name
def loaddata():
    """Load every annotation file listed in AFFY_ANNOT_FILES.

    A 'file' value containing '%s' is a template: it is filled with
    AFFY_RELEASE_EXTRA when it starts with 'extra', otherwise with
    AFFY_RELEASE.  The file is resolved under AFFY_DATA_FOLDER and parsed
    with _load_affy().  Returns a dict keyed by each entry's 'name'.
    """
    result = {}
    for entry in AFFY_ANNOT_FILES:
        array_name = entry['name']
        filename = entry['file']
        if '%s' in filename:
            if filename.startswith('extra'):
                release = AFFY_RELEASE_EXTRA
            else:
                release = AFFY_RELEASE
            filename = filename % release
        annot_path = os.path.join(AFFY_DATA_FOLDER, filename)
        load_start(annot_path)
        parsed = _load_affy(annot_path)
        result[array_name] = parsed
        load_done('[%d]' % len(parsed))
    return result
def load_cpdb(__metadata__):
    """Parse the CPDB pathway files into gene -> {'pathway': {source: ...}}.

    __metadata__: mapping whose 'pathway_sources_included' entry lists the
        lowercased pathway sources to import; rows from other sources are
        dropped.

    Each pathway becomes ``{'name': ..}`` plus ``'id'`` unless the id is the
    literal string 'None'.  For KEGG a leading 'path:' is stripped from the
    id.  When a gene has several pathways from one source they are sorted.
    """
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = [
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'),
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'),
        os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'),
    ]

    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1,
                                 assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _pathway_sort_key(d):
        # Dicts are not orderable on Python 3.  (len, sorted items) gives
        # the same order Python 2 produced for these dicts: shorter dicts
        # first, then by value of the smallest differing key.
        return (len(d), sorted(d.items()))

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                _d[p_source].sort(key=_pathway_sort_key)
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))

    return _out
def load_ensembl2acc(self):
    """Load Ensembl gene -> transcript/protein accessions with pairings.

    For every gene in gene_ensembl__translation__main.txt this builds
    ``{'ensembl': {'gene', 'translation', 'transcript', 'protein'}}``.
    'translation' lists ``{"rna", "protein"}`` pairs for rows where both ids
    are usable.  Empty values and the '\\N' placeholder are ignored.
    The mapping is returned after self.convert2entrez().
    """
    # Loading all ensembl GeneIDs, TranscriptIDs and ProteinIDs
    translation_path = os.path.join(DATA_FOLDER,
                                    'gene_ensembl__translation__main.txt')
    load_start(translation_path)
    ensembl2acc = tab2dict(translation_path, (1, 2, 3), 0, includefn=_not_LRG)

    def _usable(value):
        # The id itself when it is non-empty and not '\N', otherwise None.
        return value if value and value != '\\N' else None

    def _fn(x, eid):
        out = {'gene': eid, 'translation': []}

        def _pair(rna_id, protein_id):
            if rna_id and protein_id:
                out['translation'].append({"rna": rna_id,
                                           "protein": protein_id})

        if isinstance(x, list):
            transcripts = []
            proteins = []
            for row in x:
                rna_id = _usable(row[0])
                protein_id = _usable(row[1])
                if rna_id:
                    transcripts.append(rna_id)
                if protein_id:
                    proteins.append(protein_id)
                _pair(rna_id, protein_id)
            if transcripts:
                out['transcript'] = normalized_value(transcripts)
            if proteins:
                out['protein'] = normalized_value(proteins)
        else:
            rna_id = _usable(x[0])
            protein_id = _usable(x[1])
            if rna_id:
                out['transcript'] = rna_id
            if protein_id:
                out['protein'] = protein_id
            _pair(rna_id, protein_id)
        return out

    for ensembl_id in ensembl2acc:
        ensembl2acc[ensembl_id] = {
            'ensembl': _fn(ensembl2acc[ensembl_id], ensembl_id)}

    load_done('[%d]' % len(ensembl2acc))
    return self.convert2entrez(ensembl2acc)
def load(self, aslist=False):
    """Load GO annotations from self.datafile as gene id -> {'go': {...}}.

    Annotations are grouped under 'MF', 'BP' or 'CC' according to the
    category column.  Each record holds 'id' and 'term' plus optional
    'evidence', 'qualifier' and 'pubmed' (an int, or a list of ints when the
    field contains '|').  A category with a single record stores that record
    directly instead of a one-element list.  With aslist=True the result is
    passed through dict_to_list().
    """
    load_start(self.datafile)
    gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0, alwayslist=1,
                       includefn=self.species_filter)
    category_d = {'Function': 'MF',
                  'Process': 'BP',
                  'Component': 'CC'}

    def _ff(d):
        grouped = {}
        for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
            short_category = category_d[gocategory]
            record = {'id': goid, 'term': goterm}
            if evidence != '-':
                record['evidence'] = evidence
            if qualifier != '-':
                # here I also fixing some inconsistency issues in NCBI data
                # Colocalizes_with -> colocalizes_with
                # Contributes_with -> contributes_with
                # Not -> NOT
                record['qualifier'] = (qualifier.replace('Co', 'co')
                                                .replace('Not', 'NOT'))
            if pubmed != '-':
                if '|' in pubmed:
                    record['pubmed'] = [int(pid) for pid in pubmed.split('|')]
                else:
                    record['pubmed'] = int(pubmed)
            grouped.setdefault(short_category, []).append(record)
        # unwrap single-record categories
        for category in grouped:
            if len(grouped[category]) == 1:
                grouped[category] = grouped[category][0]
        return grouped

    gene2go = dict_convert(gene2go, valuefn=_ff)
    gene_d = dict((gid, {'go': go}) for gid, go in gene2go.items())
    load_done('[%d]' % len(gene_d))

    return dict_to_list(gene_d) if aslist else gene_d
def load_ucsc_exons():
    """Load exon data for every entry found directly under DATA_FOLDER.

    'Homo_sapiens' and 'Mus_musculus' go through load_exons_for_human() and
    load_exons_for_mouse(); every other entry name is passed to
    load_exons_for_species().  Results are merged with dict.update(), so
    later species overwrite earlier ones on key clashes.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    species_li = os.listdir(DATA_FOLDER)
    # Single-argument print(...) emits the same text under both the Python 2
    # print statement and the Python 3 print function.
    print("Found {} species folders.".format(len(species_li)))
    t0 = time.time()
    gene2exons = {}
    for species in species_li:
        print('%s ...' % (species,))
        if species == 'Homo_sapiens':
            gene2exons.update(load_exons_for_human())
        elif species == 'Mus_musculus':
            gene2exons.update(load_exons_for_mouse())
        else:
            gene2exons.update(load_exons_for_species(species))

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
    return gene2exons
def load(self, aslist=False):
    '''
    Load the ncbi "homologene.data" file and build one document per kept
    row, adding a "homologene" field ({'id': ..., 'genes': ...}).

    A row is kept when its taxid is in self.taxid_set (or self.taxid_set is
    None) and its gene id is a key of get_geneid_d(self.species_li).
    Returns the document list when aslist is true, otherwise a dict keyed by
    '_id'.
    '''
    load_start(self.datafile)
    groups = {}      # homologene id -> [(taxid, geneid), ...]
    documents = []
    with open(self.datafile) as df:
        print()
        geneid_d = get_geneid_d(self.species_li)
        for line in df:
            fields = line.strip().split('\t')
            hm_id, tax_id, geneid = [int(x) for x in fields[:3]]
            species_selected = (self.taxid_set is None or
                                tax_id in self.taxid_set)
            # for selected species only
            # and also ignore those geneid does not match any
            # existing gene doc
            if not (species_selected and geneid in geneid_d):
                continue
            # in case of orignal geneid is retired, replaced with the
            # new one, if available.
            geneid = geneid_d[geneid]
            groups.setdefault(hm_id, []).append((tax_id, geneid))
            documents.append({'_id': str(geneid),
                              'taxid': tax_id,
                              'homologene': {'id': hm_id}})

    # attach the full gene group to every document
    for gdoc in documents:
        members = set(groups[gdoc['homologene']['id']])
        gdoc['homologene']['genes'] = self._sorted_homologenes(members)

    load_done('[%d]' % len(documents))

    if aslist:
        return documents
    return dict((doc['_id'], doc) for doc in documents)
def load(self, aslist=False):
    """Load accession data from self.datafile under self.fieldname.

    Each gene's rows of (rna, protein, genomic) are collected into the lists
    'rna', 'protein', 'genomic' and 'translation' (rna/protein pairs).  A
    '-' marks a missing value.  Lists are passed through normalized_value()
    and empty fields are dropped.  With aslist=True the result goes through
    dict_to_list().
    """
    load_start(self.datafile)
    gene2acc = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                        includefn=self.species_filter)

    def _ff(d):
        collected = {'rna': [],
                     'protein': [],
                     'genomic': [],
                     'translation': []}
        for row in d:
            rna, prot, dna = [None if item == '-' else item for item in row]
            for field, item in (('rna', rna),
                                ('protein', prot),
                                ('genomic', dna)):
                if item is not None:
                    collected[field].append(item)
            if rna and prot:
                collected['translation'].append({'rna': rna,
                                                 'protein': prot})
        # remove dup
        for field in collected:
            collected[field] = normalized_value(collected[field])
        # remove empty rna/protein/genomic field
        kept = dict((field, value)
                    for field, value in collected.items() if value)
        if kept:
            return {self.fieldname: kept}
        return kept

    gene2acc = dict_convert(gene2acc, valuefn=_ff)
    load_done('[%d]' % len(gene2acc))

    return dict_to_list(gene2acc) if aslist else gene2acc
def load(self, aslist=False):
    """Load retired gene ids from self.datafile as gene id -> {'retired': ...}.

    Rows whose second column is '-' are skipped; when self.species_li is
    set, rows are also limited to taxids in self.taxid_set.  Retired ids are
    converted to int and passed through normalized_value().  With
    aslist=True the result goes through dict_to_list().
    """
    load_start(self.datafile)

    if self.species_li:
        def _includefn(ld):
            return int(ld[0]) in self.taxid_set and ld[1] != '-'
    else:
        def _includefn(ld):
            return ld[1] != '-'

    def _as_ints(values):
        return normalized_value([int(item) for item in values])

    gene2retired = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                            includefn=_includefn)
    gene2retired = dict_convert(gene2retired, valuefn=_as_ints)

    gene_d = dict((gid, {'retired': retired})
                  for gid, retired in gene2retired.items())
    load_done('[%d]' % len(gene_d))

    return dict_to_list(gene_d) if aslist else gene_d
def load_x(idx, fieldname, cvt_fn=None):
    '''Build a gene id -> {fieldname: value(s)} mapping from
    idmapping_selected.tab.gz.

    idx is 0-based column number of the target value; cvt_fn, when given,
    is applied to every non-empty target value.  Rows without an Entrez id
    are keyed through the Ensembl->Entrez pairs found in the same file, or
    by the Ensembl id itself when no pairing exists.  List values are
    sorted.
    '''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    mapping_path = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(mapping_path)
    t0 = time.time()

    triples = []
    for ld in tabfile_feeder(mapping_path, header=1,
                             assert_column_no=VALID_COLUMN_NO):
        # GeneID Ensembl(Gene) target_value
        selected = listitems(ld, 2, 19, idx)
        for expanded in dupline_seperator(dupline=selected, dup_sep='; '):
            triples.append(expanded)

    # Ensembl gene id -> [Entrez ids], from rows where both ids are present
    paired_ids = [(t[1], t[0]) for t in triples
                  if t[0] != '' and t[1] != '']
    ensembl2geneid = list2dict(list_nondup(paired_ids), 0, alwayslist=True)

    keyed_values = []
    for entrez_id, ensembl_id, x_value in triples:
        if not x_value:
            continue
        if cvt_fn:
            x_value = cvt_fn(x_value)
        if entrez_id:
            keyed_values.append((entrez_id, x_value))
        elif ensembl_id:
            mapped_ids = ensembl2geneid.get(ensembl_id, None)
            if mapped_ids:
                for _eid in mapped_ids:
                    keyed_values.append((_eid, x_value))
            else:
                keyed_values.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(keyed_values), 0)

    def _wrap(value):
        if isinstance(value, list):
            return {fieldname: sorted(value)}
        return {fieldname: value}

    gene2x = value_convert(gene2x, _wrap, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))
    return gene2x
def load(self, aslist=False):
    """Load accession data from self.datafile under self.fieldname.

    For each gene the (rna, protein, genomic) rows are gathered into 'rna',
    'protein', 'genomic' and 'translation' lists; '-' means "no value".
    Every list goes through normalized_value(), empty fields are removed and
    what remains is wrapped as {self.fieldname: ...}.  With aslist=True the
    result goes through dict_to_list().
    """
    load_start(self.datafile)
    gene2acc = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                        includefn=self.species_filter)

    def _blank_to_none(item):
        return None if item == '-' else item

    def _ff(d):
        fields = {'rna': [], 'protein': [], 'genomic': [], 'translation': []}
        for rna, prot, dna in d:
            rna = _blank_to_none(rna)
            prot = _blank_to_none(prot)
            dna = _blank_to_none(dna)
            if rna is not None:
                fields['rna'].append(rna)
            if prot is not None:
                fields['protein'].append(prot)
            if dna is not None:
                fields['genomic'].append(dna)
            if rna and prot:
                fields['translation'].append({'rna': rna, 'protein': prot})

        # remove dup
        for name in fields:
            fields[name] = normalized_value(fields[name])

        # remove empty rna/protein/genomic field
        non_empty = {}
        for name, value in fields.items():
            if value:
                non_empty[name] = value
        if not non_empty:
            return non_empty
        return {self.fieldname: non_empty}

    gene2acc = dict_convert(gene2acc, valuefn=_ff)
    load_done('[%d]' % len(gene2acc))

    if aslist:
        return dict_to_list(gene2acc)
    return gene2acc
def load_exons_for_species(species, exons_key='exons'):
    """Load exon structures for one species from refFlat/refLink dumps.

    species: sub-folder of DATA_FOLDER holding 'database/refFlat.txt.gz'
        and 'database/refLink.txt.gz'.
    exons_key: key under which the per-refseq records are stored.

    Returns ``{geneid: {exons_key: {refseq: record}}}`` where each record
    holds 'chr' (without a leading 'chr'), 'strand' (1 or -1), 'txstart',
    'txend', 'cdsstart', 'cdsend' and 'exons', a list of (start, end) int
    pairs.  Refseqs with no gene id, or gene id '0', are skipped.

    Raises AssertionError when the number of parsed exons differs from the
    count column (ld[8]) of the refFlat row.
    """
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')
    reflink_file = os.path.join(DATA_FOLDER, species, 'database/refLink.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    ref2exons = []
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chrom = ld[2]
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        # list() is required on Python 3, where zip() returns a one-shot
        # iterator that has no len() and could not be stored for reuse.
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                         [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.append((refseq, {
            'chr': chrom,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'exons': exons
        }))
    ref2exons = list2dict(ref2exons, 0)

    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: {refseq: ref2exons[refseq]}}
            else:
                gene2exons[geneid][exons_key][refseq] = ref2exons[refseq]

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
def load_genedoc(self):
    """
    Yield one gene document per gene id read from DATAFILE.

    Rows are limited to those whose first column is in the taxid list
    loaded from TAXIDS_FILE.  Each document carries '_id' and a
    'genomic_pos' dict with 'entrezgene', 'start', 'end', 'chr' and
    'strand' (1 for '+', otherwise -1).  A row whose id equals the id of
    the row just before it is skipped.
    :return: generator of dicts
    """
    taxid_set = set(loadobj(TAXIDS_FILE))
    load_start(DATAFILE)

    def _includefn(ld):
        # match taxid from taxid_set
        return ld[0] in taxid_set

    cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
    rows = tab2list(DATAFILE, cols_included, header=1, includefn=_includefn)

    count = 0
    previous_id = None
    for count, row in enumerate(rows, 1):
        if row[5] == '+':
            strand = 1
        else:
            strand = -1
        _id = row[1]
        position = {
            'entrezgene': _id,
            'start': int(row[3]),
            'end': int(row[4]),
            'chr': row[2],
            'strand': strand
        }
        document = {'_id': _id, 'genomic_pos': position}
        if _id != previous_id:
            # rows with dup _id will be skipped
            yield document
        previous_id = _id

    load_done('[%d]' % count)
def _load_ensembl2entrez_li(self):
    """Build self.ensembl2entrez_li from the Ensembl xref file plus overrides.

    The base mapping comes from gene_ensembl__xref_entrezgene__dm.txt.
    Entries found in gene_ensembl__gene__extra.txt replace the base entries
    for the same Ensembl id.  That file is generated through
    ensembl_ncbi_mapping.main() when it does not exist.  The merged mapping
    is stored as a flat list of (ensembl_id, entrez_id) tuples.
    """
    custom_path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__extra.txt')
    if not os.path.exists(custom_path):
        print("Missing extra mapping file, now generating")
        from . import ensembl_ncbi_mapping
        ensembl_ncbi_mapping.main(confirm=False)

    load_start(custom_path)
    overrides = tab2dict(custom_path, (0, 1), 0, alwayslist=True)

    xref_path = os.path.join(DATA_FOLDER,
                             'gene_ensembl__xref_entrezgene__dm.txt')
    load_start(xref_path)
    # [(ensembl_gid, entrez_gid),...]
    merged = tab2dict(xref_path, (1, 2), 0, includefn=_not_LRG,
                      alwayslist=True)

    # replace with our custom mapping
    for ensembl_id in overrides:
        merged[ensembl_id] = overrides[ensembl_id]

    # back to list of tuples
    pairs = [(ensembl_id, entrez_id)
             for ensembl_id, entrez_ids in merged.items()
             for entrez_id in entrez_ids]

    load_done('[%d]' % len(pairs))
    self.ensembl2entrez_li = pairs
def load(self, aslist=False):
    """Load EC numbers from self.datafile (two tab-separated columns).

    Each line must hold exactly ``geneid<TAB>ec``.  An ec field containing
    ',' is split into a list of strings, otherwise it is kept as one
    string.  Only the first line seen for a gene id is kept.

    Returns a list of ``{'_id': geneid, 'ec': ...}`` documents when aslist
    is true, otherwise a dict of those documents keyed by '_id'.
    """
    # `unicode` only exists on Python 2; on Python 3 `str` is the text type.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    load_start(self.datafile)
    # open() replaces the Python-2-only file() builtin.
    with open(self.datafile) as df:
        geneid_set = set()
        doc_li = []
        for line in df:
            geneid, ec = line.strip().split('\t')
            if ec.find(',') != -1:
                # there are multiple EC numbers
                ec = [text_type(x) for x in ec.split(',')]
            else:
                ec = text_type(ec)
            if geneid not in geneid_set:
                doc_li.append(dict(_id=geneid, ec=ec))
                geneid_set.add(geneid)
    load_done('[%d]' % len(doc_li))

    if aslist:
        return doc_li
    else:
        gene_d = dict([(d['_id'], d) for d in doc_li])
        return gene_d
def load_genedoc(self):
    """
    Yield one gene document per gene id read from DATAFILE.

    Rows are limited to those whose first column is in the taxid list
    loaded from TAXIDS_FILE.  Each document carries '_id' and a
    'genomic_pos' dict with 'start', 'end', 'chr' and 'strand' (1 for '+',
    otherwise -1).  A row whose id equals the id of the row just before it
    is skipped.
    :return: generator of dicts
    """
    taxid_set = set(loadobj(TAXIDS_FILE))
    load_start(DATAFILE)

    def _includefn(ld):
        # match taxid from taxid_set
        return ld[0] in taxid_set

    cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
    rows = tab2list(DATAFILE, cols_included, header=1, includefn=_includefn)

    count = 0
    previous_id = None
    for count, row in enumerate(rows, 1):
        if row[5] == '+':
            strand = 1
        else:
            strand = -1
        _id = row[1]
        position = {
            'start': int(row[3]),
            'end': int(row[4]),
            'chr': row[2],
            'strand': strand
        }
        document = {'_id': _id, 'genomic_pos': position}
        if _id != previous_id:
            # rows with dup _id will be skipped
            yield document
        previous_id = _id

    load_done('[%d]' % count)
def load(self, aslist=False):
    """Load EC numbers from self.datafile (two tab-separated columns).

    An ec field containing ',' becomes a list of strings, otherwise it
    stays a single string.  Only the first line seen for a gene id is kept.
    Returns the document list when aslist is true, otherwise a dict keyed by
    '_id'.
    """
    load_start(self.datafile)
    seen_ids = set()
    documents = []
    with open(self.datafile) as df:
        for line in df:
            geneid, ec = line.strip().split('\t')
            # there are multiple EC numbers when a comma is present
            if ',' in ec:
                ec = [str(part) for part in ec.split(',')]
            else:
                ec = str(ec)
            if geneid in seen_ids:
                continue
            documents.append({'_id': geneid, 'ec': ec})
            seen_ids.add(geneid)
    load_done('[%d]' % len(documents))

    if aslist:
        return documents
    return dict((doc['_id'], doc) for doc in documents)
def load(self, aslist=False):
    """Load retired gene ids from self.datafile as gene id -> {'retired': ...}.

    Rows with '-' in the second column are ignored; when self.species_li is
    set only taxids in self.taxid_set are kept.  Retired ids are converted
    to int and passed through normalized_value().  With aslist=True the
    result goes through dict_to_list().
    """
    load_start(self.datafile)

    def _has_target(ld):
        return ld[1] != '-'

    def _has_target_in_species(ld):
        return int(ld[0]) in self.taxid_set and ld[1] != '-'

    _includefn = _has_target_in_species if self.species_li else _has_target

    def _to_int_values(values):
        return normalized_value([int(item) for item in values])

    gene2retired = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                            includefn=_includefn)
    gene2retired = dict_convert(gene2retired, valuefn=_to_int_values)

    gene_d = {}
    for gid, retired in gene2retired.items():
        gene_d[gid] = {'retired': retired}
    load_done('[%d]' % len(gene_d))

    if aslist:
        return dict_to_list(gene_d)
    return gene_d
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful, when other annotations were mapped to geneids may
       contain retired gene ids.

       if species_li is None, genes from all species are loaded.

       load_cache: reuse 'gene/geneid_d.pyobj' when it exists and is newer
           than both gene_info.gz and gene_history.gz.
       save_cache: write the freshly built mapping to that cache file.

       Note that all ids are int type.

       The working directory is switched to DATA_FOLDER while running and is
       always restored, including when an exception is raised (for instance
       when the cached taxid set does not match the requested one).
    '''
    if species_li:
        taxid_set = set([TAXONOMY[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)
    try:
        # check cache file
        _cache_file = 'gene/geneid_d.pyobj'
        if load_cache and os.path.exists(_cache_file) and \
           file_newer(_cache_file, 'gene/gene_info.gz') and \
           file_newer(_cache_file, 'gene/gene_history.gz'):
            print('Loading "geneid_d" from cache file...', end='')
            _taxid_set, out_d = loadobj(_cache_file)
            assert _taxid_set == taxid_set
            print('Done.')
            return out_d

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
        load_start(DATAFILE)
        if species_li:
            def species_filter(ld):
                return int(ld[0]) in taxid_set
        else:
            species_filter = None
        geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
        load_done('[%d]' % len(geneid_li))

        DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
        load_start(DATAFILE)

        if species_li:
            def _includefn(ld):
                return int(ld[0]) in taxid_set and ld[1] in geneid_li
        else:
            def _includefn(ld):
                return ld[1] in geneid_li    # include all species
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                                includefn=_includefn)
        # includefn above makes sure taxid is for species_li and filters out
        # those mapped_to geneid exists in gene_info list

        load_done('[%d]' % len(retired2gene))
        # convert key/value to int
        out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
        for g in geneid_li:
            _g = int(g)
            out_d[_g] = _g

        if save_cache:
            # taxid_set is already None when no species were requested
            dump((taxid_set, out_d), _cache_file)

        return out_d
    finally:
        os.chdir(orig_cwd)
def load(self, aslist=False):
    '''
    loading ncbi "gene_info" file
    This must be called first to create basic gene documents
    with all basic fields, e.g., name, symbol, synonyms, etc.

    format of gene_info file:
    #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs
             map_location description type_of_gene Symbol_from
             nomenclature_authority Full_name_from_nomenclature_authority
             Nomenclature_status Other_designations Modification_date
             (tab is used as a separator, pound sign - start of a comment)

    Returns a dict of gene documents keyed by gene id, or the result of
    dict_to_list() on it when aslist is true.

    Raises ValueError (after printing the offending entry) when a dbXrefs
    entry does not split into exactly "db:id".
    '''
    load_start(self.datafile)
    gene_d = tab2dict(self.datafile, (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                      key=1, alwayslist=0, includefn=self.species_filter)

    def _ff(d):
        (
            taxid, symbol, locus_tag, synonyms,
            dbxrefs, map_location,
            description, type_of_gene, other_designations,
            modification_date
        ) = d
        out = dict(taxid=int(taxid),
                   symbol=symbol,
                   name=description)
        if map_location != '-':
            out['map_location'] = map_location
        if type_of_gene != '-':
            out['type_of_gene'] = type_of_gene
        if synonyms != '-':
            out['alias'] = normalized_value(synonyms.split('|'))
        if locus_tag != '-':
            out['locus_tag'] = locus_tag
        if other_designations != "-":
            out['other_names'] = normalized_value(
                other_designations.split('|'))

        # when merged, this will become the default timestamp
        out["_timestamp"] = datetime.datetime.strptime(
            modification_date, "%Y%m%d")

        for x in dbxrefs.split('|'):
            if x == '-':
                continue
            xd = x.split(':')
            if len(xd) == 3 and xd[0] == xd[1] and \
                    xd[0] in ['VGNC', 'HGNC', 'MGI']:
                # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                xd = xd[1:]
            try:
                _db, _id = xd
            except ValueError:
                # only the tuple unpacking can fail here; show the bad entry
                print(repr(x))
                raise
            # we don't need ensembl xref from here, we will get it from
            # Ensembl directly
            if _db.lower() in ['ensembl', 'imgt/gene-db']:
                # we don't need 'IMGT/GENE-DB" xref either, because they
                # are mostly the same as gene symbol
                continue
            # add "MGI:" prefix for MGI ids.
            if _db.lower() == 'mgi':
                _id = "MGI:" + _id
            out[_db] = _id
        return out

    gene_d = value_convert(gene_d, _ff)

    # add entrezgene field
    for geneid in gene_d:
        d = gene_d[geneid]
        d['entrezgene'] = int(geneid)
        gene_d[geneid] = d

    load_done('[%d]' % len(gene_d))

    if aslist:
        return dict_to_list(gene_d)
    else:
        return gene_d