def _load_ensembl2entrez_li(self):
    """gene_ensembl__xref_entrezgene__dm"""
    custom_mapping_file = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__extra.txt')
    if not os.path.exists(custom_mapping_file):
        print("Missing extra mapping file, now generating")
        from . import ensembl_ncbi_mapping
        ensembl_ncbi_mapping.main(confirm=False)
    load_start(custom_mapping_file)
    extra = tab2dict(custom_mapping_file, (0, 1), 0, alwayslist=True)
    xref_file = os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt')
    load_start(xref_file)
    # {ensembl_gid: [entrez_gid, ...]} from the Ensembl xref dump
    ensembl2entrez = tab2dict(xref_file, (1, 2), 0,
                              includefn=_not_LRG, alwayslist=True)
    # our custom mapping takes precedence over the xref-derived one
    ensembl2entrez.update(extra)
    # flatten back into a list of (ensembl_gid, entrez_gid) tuples
    pairs = [(ensembl_id, entrez_id)
             for ensembl_id, entrez_ids in ensembl2entrez.items()
             for entrez_id in entrez_ids]
    load_done('[%d]' % len(pairs))
    self.ensembl2entrez_li = pairs
def _load_ensembl2entrez_li(self):
    """Load Ensembl gene -> Entrez gene mapping (gene_ensembl__xref_entrezgene__dm).

    Combines the Ensembl xref file with our custom mapping file (generated by
    ensembl_ncbi_mapping when missing) and stores the result on
    self.ensembl2entrez_li as a list of (ensembl_gid, entrez_gid) tuples.
    """
    CUSTOM_MAPPING_FILE = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__extra.txt')
    if not os.path.exists(CUSTOM_MAPPING_FILE):
        print("Missing extra mapping file, now generating")
        from . import ensembl_ncbi_mapping
        ensembl_ncbi_mapping.main(confirm=False)
    load_start(CUSTOM_MAPPING_FILE)
    extra = tab2dict(CUSTOM_MAPPING_FILE, (0, 1), 0, alwayslist=True)
    DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt')
    load_start(DATAFILE)
    # {ensembl_gid: [entrez_gid, ...]}
    ensembl2entrez = tab2dict(DATAFILE, (1, 2), 0,
                              includefn=_not_LRG, alwayslist=True)
    # extend (not replace) with our custom mapping:
    # when all ensemblIDs are resolved to other EntrezIDs different from those
    # defined in the xref file, there can be "orphan" EntrezIDs with no more
    # EnsemblID associated (because they were resolved). Extending keeps the
    # ensembl data for those. setdefault() also protects against KeyError when
    # an ensembl ID appears only in the custom file and not in the xref file.
    for k in extra:
        ensembl2entrez.setdefault(k, []).extend(extra[k])
    # back to a list of (ensembl_gid, entrez_gid) tuples
    ensembl2entrez_li = []
    for ensembl_id, entrez_ids in ensembl2entrez.items():
        for entrez_id in entrez_ids:
            ensembl2entrez_li.append((ensembl_id, entrez_id))
    load_done('[%d]' % len(ensembl2entrez_li))
    self.ensembl2entrez_li = ensembl2entrez_li
def loaddata():
    """Load GNF1H and GNF1M probeset annotations, keyed by gene id."""
    def _load_chip(fn):
        # shared loader: column 0 -> column 5, skipping rows without col 5
        datafile = os.path.join(DATA_FOLDER, 'gnf', fn)
        load_start(datafile)
        chip_d = tab2dict(datafile, (0, 5), 1, header=0,
                          includefn=lambda ld: len(ld) > 5 and ld[5] != '')
        load_done('[%d]' % len(chip_d))
        return chip_d

    # GNF1H first, then GNF1M (same order as the original loads)
    return {'GNF1H': _load_chip('GNF1H.ANNO7.LOAD_20130402.tab'),
            'GNF1M': _load_chip('gnf1m.NEW_ANNO6.LOAD_20130402.tab')}
def loaddata():
    """Load the Pig Atlas "snowball" array annotation, keyed by gene id."""
    datafile = os.path.join(DATA_FOLDER, 'pigatlas',
                            'snowball_array_annotation.txt')
    load_start(datafile)
    gene2snowball = tab2dict(datafile, (0, 1), 1, header=0)
    load_done('[%d]' % len(gene2snowball))
    return {'snowball': gene2snowball}
def load(self):
    """Build {geneid: {'generif': [{'pubmed': ..., 'text': ...}, ...]}}."""
    load_start(self.datafile)
    raw = tab2dict(self.datafile, (1, 2, 4), 0, alwayslist=1)

    def _to_generif(entries):
        rifs = []
        for x in entries:
            # x[0]: pubmed id (normalized via _cvt_pubmed), x[1]: GeneRIF text
            rifs.append({'pubmed': self._cvt_pubmed(x[0]), 'text': x[1]})
        return {'generif': rifs}

    gene2generif = dict_convert(raw, valuefn=_to_generif)
    load_done('[%d]' % len(gene2generif))
    return gene2generif
def load_broadinstitute_exac_any(one_file, key):
    """Load one Broad Institute ExAC constraint file.

    Returns {transcript_id_without_version: {"exac": {...}}} with this file's
    per-transcript constraint metrics nested under `key`.
    """
    print("Loading file %s (%s)" % (one_file, key))
    cols = (0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
            14, 15, 16, 17, 18, 19, 20, 21)
    data = tab2dict(os.path.join(DATA_FOLDER, one_file), cols, 0)
    # metric columns, in file order, starting at value index 4
    metric_names = ("mu_syn", "mu_mis", "mu_lof", "n_syn", "n_mis", "n_lof",
                    "exp_syn", "exp_mis", "exp_lof", "syn_z", "mis_z", "lof_z",
                    "p_li", "p_rec", "p_null")
    exacs = {}
    for transcript, row in data.items():
        metrics = {name: float(val)
                   for name, val in zip(metric_names, row[4:])}
        # strip the version from the key so the dict is easy to search later
        exacs[transcript.split(".")[0]] = {
            "exac": {
                "transcript": transcript,  # but keep the version here
                "n_exons": int(row[0]),
                "cds_start": int(row[1]),
                "cds_end": int(row[2]),
                "bp": int(row[3]),
                key: metrics,
            }
        }
    return exacs
def load_ensembl2acc(self):
    """Load ensembl gene -> transcript/protein accession data.

    Reads gene_ensembl__translation__main.txt (gene, transcript, protein)
    and returns the mapping converted via self.convert2entrez().
    """
    DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
    load_start(DATAFILE)
    ensembl2acc = tab2dict(DATAFILE, (1, 2, 3), 0, includefn=_not_LRG)

    def _fn(x, eid):
        # x: one (transcript, protein) pair, or a list of pairs when the
        # gene has several translations; eid: the ensembl gene id
        out = {'gene': eid}
        if isinstance(x, list):  # was `type(x) is types.ListType` (py2-only)
            transcript_li = []
            protein_li = []
            for _x in x:
                if _x[0] and _x[0] != '\\N':
                    transcript_li.append(_x[0])
                # fix: test the protein field (_x[1]) here -- the original
                # tested _x[0], dropping proteins whenever the transcript
                # column was empty/NULL (compare the other variants of this
                # loader in this file)
                if _x[1] and _x[1] != '\\N':
                    protein_li.append(_x[1])
            if transcript_li:
                out['transcript'] = normalized_value(transcript_li)
            if protein_li:
                out['protein'] = normalized_value(protein_li)
        else:
            if x[0] and x[0] != '\\N':
                out['transcript'] = x[0]
            if x[1] and x[1] != '\\N':
                out['protein'] = x[1]
        return out

    for k in ensembl2acc:
        ensembl2acc[k] = {'ensembl': _fn(ensembl2acc[k], k)}
    load_done('[%d]' % len(ensembl2acc))
    return self.convert2entrez(ensembl2acc)
def load_ensembl2pfam(self):
    """Load ensembl gene -> Pfam domain ids."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
    load_start(datafile)
    mapping = dict_nodup(tab2dict(datafile, (1, 4), 0))
    ensembl2pfam = value_convert(mapping, lambda x: {'pfam': x},
                                 traverse_list=False)
    load_done('[%d]' % len(ensembl2pfam))
    return self.convert2entrez(ensembl2pfam)
def loaddata():
    """Return {'snowball': mapping} for the pig atlas snowball array."""
    snowball_file = os.path.join(DATA_FOLDER, 'pigatlas',
                                 'snowball_array_annotation.txt')
    load_start(snowball_file)
    mapping = tab2dict(snowball_file, (0, 1), 1, header=0)
    load_done('[%d]' % len(mapping))
    return {'snowball': mapping}
def load_ensembl2pos(self):
    """Load genomic position (chr/start/end/strand) per ensembl gene."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(datafile)
    raw = dict_nodup(tab2dict(datafile, (1, 3, 4, 5, 6), 0,
                              includefn=_not_LRG))

    def _to_pos(x):
        # value tuple columns: start, end, chr, strand
        return {'chr': x[2], 'start': int(x[0]),
                'end': int(x[1]), 'strand': int(x[3])}

    ensembl2pos = value_convert(raw, _to_pos)
    ensembl2pos = value_convert(ensembl2pos, lambda x: {'genomic_pos': x},
                                traverse_list=False)
    load_done('[%d]' % len(ensembl2pos))
    return self.convert2entrez(ensembl2pos)
def _load_ensembl_2taxid(self):
    """ensembl2taxid"""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
    load_start(datafile)
    ensembl2taxid = dict_nodup(tab2dict(datafile, (0, 1), 1,
                                        includefn=_not_LRG))
    # taxids come in as strings; convert them to integers
    ensembl2taxid = value_convert(ensembl2taxid, int)
    load_done('[%d]' % len(ensembl2taxid))
    return ensembl2taxid
def load_ensembl2interpro(self):
    """Load ensembl gene -> InterPro domain annotations."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
    load_start(datafile)
    raw = dict_nodup(tab2dict(datafile, (1, 4, 5, 6), 0))

    def _to_interpro(x):
        # value tuple columns: interpro id, short description, description
        return {'id': x[0], 'short_desc': x[1], 'desc': x[2]}

    ensembl2interpro = value_convert(raw, _to_interpro)
    ensembl2interpro = value_convert(ensembl2interpro,
                                     lambda x: {'interpro': x},
                                     traverse_list=False)
    load_done('[%d]' % len(ensembl2interpro))
    return self.convert2entrez(ensembl2interpro)
def load_pharmgkb():
    """Load PharmGKB gene ids -> {geneid: {'pharmgkb': pharmgkb_id}}."""
    print('DATA_FOLDER: ' + DATA_FOLDER)
    datafile = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(datafile)
    # (zip path, member name) tuple lets tab2dict read inside the archive
    gene2pharmgkb = tab2dict((datafile, 'genes.tsv'), (0, 1), 1, header=1,
                             includefn=lambda ld: ld[1] != '')

    def _wrap(value):
        return {'pharmgkb': value}

    gene2pharmgkb = value_convert(gene2pharmgkb, _wrap, traverse_list=False)
    load_done('[%d]' % len(gene2pharmgkb))
    return gene2pharmgkb
def _load_ensembl2entrez_li(self):
    """gene_ensembl__xref_entrezgene__dm"""
    custom_file = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__extra.txt')
    if not os.path.exists(custom_file):
        print("Missing extra mapping file, now generating")
        from . import ensembl_ncbi_mapping
        ensembl_ncbi_mapping.main(confirm=False)
    load_start(custom_file)
    extra = tab2dict(custom_file, (0, 1), 0, alwayslist=True)
    xref_file = os.path.join(DATA_FOLDER,
                             'gene_ensembl__xref_entrezgene__dm.txt')
    load_start(xref_file)
    # {ensembl_gid: [entrez_gid, ...]}
    ensembl2entrez = tab2dict(xref_file, (1, 2), 0,
                              includefn=_not_LRG, alwayslist=True)
    # the custom mapping overrides the xref-derived one
    for ensembl_id, entrez_ids in extra.items():
        ensembl2entrez[ensembl_id] = entrez_ids
    # flatten back to a list of (ensembl_gid, entrez_gid) tuples
    pairs = []
    for ensembl_id, entrez_ids in ensembl2entrez.items():
        pairs.extend((ensembl_id, entrez_id) for entrez_id in entrez_ids)
    load_done('[%d]' % len(pairs))
    self.ensembl2entrez_li = pairs
def _load_affy(df):
    """Load one Affymetrix annotation file -> {geneid: probeset data}.

    Column 18 holds the Entrez gene id; keys like "472 /// 4863" carry
    several gene ids and are split apart.
    """
    filename = os.path.split(df)[1]
    rawfile, ext = os.path.splitext(filename)
    # zip archives are read via a (zip path, member name) tuple
    if ext.lower() == '.zip':
        df = (df, rawfile)
    dd = tab2dict(df, (0, 18), 1, sep=',', header=1,
                  includefn=lambda ld: len(ld) > 18 and
                  ld[18] != '---' and ld[18] != 'Entrez Gene')
    gene2affy = {}
    for key, value in dd.items():
        # single-id keys split into a one-element list, so one loop covers
        # both the "472 /// 4863" case and the plain case
        for geneid in key.split(' /// '):
            dict_apply(gene2affy, geneid.strip(), value)
    return gene2affy
def load(self, aslist=False):
    """Load gene -> unigene mapping restricted to self.species_li."""
    load_start(self.datafile)
    print()
    geneid_d = get_geneid_d(self.species_li)
    gene2unigene = tab2dict(self.datafile, (0, 1), 0, alwayslist=0,
                            includefn=lambda ld: int(ld[0]) in geneid_d)
    gene_d = {gid: {'unigene': unigene}
              for gid, unigene in gene2unigene.items()}
    load_done('[%d]' % len(gene_d))
    return dict_to_list(gene_d) if aslist else gene_d
def load_ensembl2acc(self):
    """Load ensembl gene -> transcript/protein accessions and translations."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
    load_start(datafile)
    ensembl2acc = tab2dict(datafile, (1, 2, 3), 0, includefn=_not_LRG)

    def _clean(value):
        # '\\N' is the dump's NULL marker; normalize it (and '') to None
        return value if value and value != '\\N' else None

    def _fn(x, eid):
        out = {'gene': eid, 'translation': []}

        def record_pair(transcript_id, protein_id):
            # keep an (rna, protein) pair only when both sides are real values
            trid = _clean(transcript_id)
            pid = _clean(protein_id)
            if trid and pid:
                out['translation'].append({"rna": trid, "protein": pid})

        if isinstance(x, list):
            transcript_li = [_x[0] for _x in x if _clean(_x[0])]
            protein_li = [_x[1] for _x in x if _clean(_x[1])]
            for _x in x:
                record_pair(_x[0], _x[1])
            if transcript_li:
                out['transcript'] = normalized_value(transcript_li)
            if protein_li:
                out['protein'] = normalized_value(protein_li)
        else:
            if _clean(x[0]):
                out['transcript'] = x[0]
            if _clean(x[1]):
                out['protein'] = x[1]
            record_pair(x[0], x[1])
        return out

    for k in ensembl2acc:
        ensembl2acc[k] = {'ensembl': _fn(ensembl2acc[k], k)}
    load_done('[%d]' % len(ensembl2acc))
    return self.convert2entrez(ensembl2acc)
def _load_ensembl2name(self):
    """loading ensembl gene to symbol+name mapping"""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(datafile)
    raw = tab2dict(datafile, (1, 2, 7), 0, includefn=_not_LRG)

    def _to_doc(x):
        doc = {}
        symbol = x[0].strip()
        if symbol not in ('', '\\N'):
            doc['symbol'] = symbol
        desc = x[1].strip()
        if desc not in ('', '\\N'):
            # drop the trailing " [Source:...]" provenance annotation
            name = SubStr(desc, '', ' [Source:').strip()
            if name:
                doc['name'] = name
        return doc

    ensembl2name = value_convert(raw, _to_doc)
    load_done('[%d]' % len(ensembl2name))
    return ensembl2name
def load(self, aslist=False):
    """Load gene -> GO annotations from the NCBI gene2go file.

    Returns {geneid: {'go': {'MF'|'BP'|'CC': rec or [rec, ...]}}}, or the
    dict_to_list() form when aslist is True.
    """
    load_start(self.datafile)
    gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0, alwayslist=1,
                       includefn=self.species_filter)
    # map NCBI category labels to the short GO aspect codes used in the docs
    category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

    def _ff(d):
        # d: list of (goid, evidence, qualifier, goterm, pubmed, gocategory);
        # '-' marks a missing field
        out = {}
        for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
            _gocategory = category_d[gocategory]
            _d = out.get(_gocategory, [])
            _rec = dict(id=goid, term=goterm)
            if evidence != '-':
                _rec['evidence'] = evidence
            if qualifier != '-':
                # here I also fixing some inconsistency issues in NCBI data
                # Colocalizes_with -> colocalizes_with
                # Contributes_with -> contributes_with
                # Not -> NOT
                _rec['qualifier'] = qualifier.replace('Co', 'co').replace(
                    'Not', 'NOT')
            if pubmed != '-':
                # '|'-separated list of pubmed ids, or a single id
                if pubmed.find('|') != -1:
                    pubmed = [int(pid) for pid in pubmed.split('|')]
                else:
                    pubmed = int(pubmed)
                _rec['pubmed'] = pubmed
            _d.append(_rec)
            out[_gocategory] = _d
        # collapse single-record categories to a bare dict
        for k in out:
            if len(out[k]) == 1:
                out[k] = out[k][0]
        return out

    gene2go = dict_convert(gene2go, valuefn=_ff)
    gene_d = {}
    for gid, go in gene2go.items():
        gene_d[gid] = {'go': go}
    load_done('[%d]' % len(gene_d))
    if aslist:
        return dict_to_list(gene_d)
    else:
        return gene_d
def load_ensembl2acc(self): """ loading ensembl to transcripts/proteins data """ #Loading all ensembl GeneIDs, TranscriptIDs and ProteinIDs DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt') load_start(DATAFILE) ensembl2acc = tab2dict(DATAFILE, (1, 2, 3), 0, includefn=_not_LRG) def _fn(x, eid): out = {'gene': eid, 'translation' : []} def mapping(transcript_id, protein_id): trid = transcript_id and transcript_id != '\\N' and transcript_id or None pid = protein_id and protein_id != '\\N' and protein_id or None if trid and pid: out['translation'].append({"rna" : trid, "protein" : pid}) if isinstance(x, list): transcript_li = [] protein_li = [] for _x in x: if _x[0] and _x[0] != '\\N': transcript_li.append(_x[0]) if _x[1] and _x[1] != '\\N': protein_li.append(_x[1]) mapping(_x[0],_x[1]) if transcript_li: out['transcript'] = normalized_value(transcript_li) if protein_li: out['protein'] = normalized_value(protein_li) else: if x[0] and x[0] != '\\N': out['transcript'] = x[0] if x[1] and x[1] != '\\N': out['protein'] = x[1] mapping(x[0],x[1]) return out for k in ensembl2acc: ensembl2acc[k] = {'ensembl': _fn(ensembl2acc[k], k)} load_done('[%d]' % len(ensembl2acc)) return self.convert2entrez(ensembl2acc)
def load(self, aslist=False):
    """Load gene -> GO annotations (gene2go file).

    Produces {geneid: {'go': {aspect: record(s)}}} where aspect is one of
    'MF', 'BP', 'CC'; returns dict_to_list(gene_d) when aslist is True.
    """
    load_start(self.datafile)
    gene2go = tab2dict(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0, alwayslist=1,
                       includefn=self.species_filter)
    # NCBI spells out the GO aspect; convert to the usual two-letter codes
    category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

    def _ff(d):
        # build per-aspect record lists; '-' means "field not provided"
        out = {}
        for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
            _gocategory = category_d[gocategory]
            _d = out.get(_gocategory, [])
            _rec = dict(id=goid, term=goterm)
            if evidence != '-':
                _rec['evidence'] = evidence
            if qualifier != '-':
                # here I also fixing some inconsistency issues in NCBI data
                # Colocalizes_with -> colocalizes_with
                # Contributes_with -> contributes_with
                # Not -> NOT
                _rec['qualifier'] = qualifier.replace('Co', 'co').replace('Not', 'NOT')
            if pubmed != '-':
                # single pubmed id, or several separated by '|'
                if pubmed.find('|') != -1:
                    pubmed = [int(pid) for pid in pubmed.split('|')]
                else:
                    pubmed = int(pubmed)
                _rec['pubmed'] = pubmed
            _d.append(_rec)
            out[_gocategory] = _d
        # a category with a single record is stored as a bare dict, not a list
        for k in out:
            if len(out[k]) == 1:
                out[k] = out[k][0]
        return out

    gene2go = dict_convert(gene2go, valuefn=_ff)
    gene_d = {}
    for gid, go in gene2go.items():
        gene_d[gid] = {'go': go}
    load_done('[%d]' % len(gene_d))
    if aslist:
        return dict_to_list(gene_d)
    else:
        return gene_d
def _load_ensembl2name(self):
    """loading ensembl gene to symbol+name mapping"""
    DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(DATAFILE)
    ensembl2name = tab2dict(DATAFILE, (1, 2, 7), 0, includefn=_not_LRG)

    def _fn(x):
        symbol, raw_name = x[0].strip(), x[1].strip()
        out = {}
        # '' and '\\N' both mean "no value" in the dump
        if symbol and symbol != '\\N':
            out['symbol'] = symbol
        if raw_name and raw_name != '\\N':
            # strip the trailing " [Source:...]" suffix from the description
            _name = SubStr(raw_name, '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    ensembl2name = value_convert(ensembl2name, _fn)
    load_done('[%d]' % len(ensembl2name))
    return ensembl2name
def load(self, aslist=False):
    """Load current gene -> retired gene id mapping from gene_history."""
    load_start(self.datafile)
    # skip rows with no current geneid ('-'); optionally restrict by taxid
    if self.species_li:
        def _includefn(ld):
            return int(ld[0]) in self.taxid_set and ld[1] != '-'
    else:
        def _includefn(ld):
            return ld[1] != '-'
    gene2retired = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                            includefn=_includefn)
    # retired ids are ints; de-dup/normalize the list per gene
    gene2retired = dict_convert(
        gene2retired,
        valuefn=lambda x: normalized_value([int(xx) for xx in x]))
    gene_d = {gid: {'retired': retired}
              for gid, retired in gene2retired.items()}
    load_done('[%d]' % len(gene_d))
    return dict_to_list(gene_d) if aslist else gene_d
def load(self, aslist=False):
    """Load gene -> accession mapping (rna/protein/genomic + translations).

    Builds {geneid: {self.fieldname: {'rna': ..., 'protein': ...,
    'genomic': ..., 'translation': [...]}}}; empty sub-fields are dropped.
    Returns dict_to_list(gene2acc) when aslist is True.
    """
    load_start(self.datafile)
    gene2acc = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                        includefn=self.species_filter)

    def _ff(d):
        # d: list of (rna, protein, genomic) accession triplets;
        # '-' marks a missing accession
        out = {
            'rna': [],
            'protein': [],
            'genomic': [],
            'translation': []
        }
        for rna, prot, dna in d:
            if rna == '-': rna = None
            if prot == '-': prot = None
            if dna == '-': dna = None
            if rna is not None:
                out['rna'].append(rna)
            if prot is not None:
                out['protein'].append(prot)
            if dna is not None:
                out['genomic'].append(dna)
            if rna and prot:
                # keep matched rna/protein pairs together
                out['translation'].append({'rna' : rna, 'protein' : prot})
        # remove dup
        for k in out:
            out[k] = normalized_value(out[k])
        # remove empty rna/protein/genomic field
        _out = {}
        for k, v in out.items():
            if v:
                _out[k] = v
        if _out:
            _out = {self.fieldname: _out}
        return _out

    gene2acc = dict_convert(gene2acc, valuefn=_ff)
    load_done('[%d]' % len(gene2acc))
    if aslist:
        return dict_to_list(gene2acc)
    else:
        return gene2acc
def load_ensembl2pos(self):
    """Load per-gene genomic position (chr/start/end/strand) from Ensembl."""
    DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(DATAFILE)
    # Twice 1 because first is the dict key, the second because we need
    # gene id within genomic_pos
    ensembl2pos = dict_nodup(
        tab2dict(DATAFILE, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG))
    ensembl2pos = value_convert(
        ensembl2pos,
        lambda x: {
            'ensemblgene': x[0],
            'chr': x[3],
            'start': int(x[1]),
            'end': int(x[2]),
            'strand': int(x[4])
        })
    # wrap the position record under the 'genomic_pos' key
    ensembl2pos = value_convert(ensembl2pos, lambda x: {'genomic_pos': x},
                                traverse_list=False)
    load_done('[%d]' % len(ensembl2pos))
    return self.convert2entrez(ensembl2pos)
def _load_affy(df):
    """Load one Affymetrix CSV annotation file -> {geneid: probeset data}.

    Column 18 holds the Entrez gene id; rows with '---' or the header value
    'Entrez Gene' are skipped.
    """
    filename = os.path.split(df)[1]
    rawfile, ext = os.path.splitext(filename)
    # zip archives are read via a (zip path, member name) tuple
    if ext.lower() == '.zip':
        df = (df, rawfile)
    dd = tab2dict(df, (0, 18), 1, sep=',', header=1,
                  includefn=lambda ld: len(ld) > 18 and ld[18] != '---' and ld[
                      18] != 'Entrez Gene')
    #fix for keys like "472 /// 4863" for mulitple geneids
    gene2affy = {}
    for k in dd:
        if len(k.split(' /// ')) > 1:
            for kk in k.split(' /// '):
                dict_apply(gene2affy, kk.strip(), dd[k])
        else:
            dict_apply(gene2affy, k.strip(), dd[k])
    return gene2affy
def load_exons_for_species(species, exons_key='exons'):
    """Load per-gene exon structures for one UCSC species.

    Reads refFlat (transcript structures) and refLink (refseq -> geneid) and
    returns {geneid: {exons_key: {refseq: {chr, strand, txstart, txend,
    cdsstart, cdsend, exons}}}}.
    """
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')
    reflink_file = os.path.join(DATA_FOLDER, species, 'database/refLink.txt.gz')
    load_start(refflat_file)
    t0 = time.time()
    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    ref2exons = []
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        # fix: materialize the zip -- on Python 3 zip() returns a lazy
        # iterator, so len(exons) below raises TypeError and the stored
        # 'exons' value would be a one-shot iterator (the other variants of
        # this loader in this file already use list(zip(...)))
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                         [int(x) for x in ld[10].split(',') if x]))
        # exon count column must agree with the parsed start/end lists
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.append((refseq, {
            'chr': chr,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'exons': exons
        }))
    ref2exons = list2dict(ref2exons, 0)
    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        # '0' means "no gene id" in refLink
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: {refseq: ref2exons[refseq]}}
            else:
                gene2exons[geneid][exons_key][refseq] = ref2exons[refseq]
    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
    return gene2exons
def _load_affy(df):
    """Load an Affymetrix gene-level CSV (column 7 = gene_assignment).

    A key may hold several assignments separated by '///'; within one
    assignment the gene id is the last '//'-separated field. '---' ids
    are skipped.
    """
    filename = os.path.split(df)[1]
    rawfile, ext = os.path.splitext(filename)
    # zip archives are read via a (zip path, member name) tuple
    if ext.lower() == '.zip':
        df = (df, rawfile)
    dd = tab2dict(df, (0, 7), 1, sep=',', header=1,
                  includefn=lambda ld: len(ld) > 7 and ld[7] != '---' and
                  ld[7] != 'gene_assignment')
    gene2affy = {}

    def _apply_last_field(assignment, value):
        # take the trailing '//'-separated field as the gene id
        geneid = assignment.split('//')[-1].strip()
        if geneid != '---':
            dict_apply(gene2affy, geneid, value)

    for key, value in dd.items():
        parts = key.split('///')
        if len(parts) > 1:
            for part in parts:
                _apply_last_field(part, value)
        elif len(key.split('//')) > 1:
            _apply_last_field(key, value)
    return gene2affy
def load(self, aslist=False):
    """Load gene -> accession mapping, trimming '.version' suffixes."""
    load_start(self.datafile)
    gene2acc = tab2dict(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                        includefn=self.species_filter)

    def _ff(d):
        # collect accessions per type, dropping '-' placeholders and the
        # version number after the dot
        buckets = {'rna': [], 'protein': [], 'genomic': []}
        for rna, prot, dna in d:
            for field, acc in (('rna', rna), ('protein', prot),
                               ('genomic', dna)):
                if acc != '-':
                    buckets[field].append(acc.split('.')[0])
        # de-dup each list, then drop empty fields entirely
        doc = {}
        for field, accs in buckets.items():
            accs = normalized_value(accs)
            if accs:
                doc[field] = accs
        return {self.fieldname: doc} if doc else doc

    gene2acc = dict_convert(gene2acc, valuefn=_ff)
    load_done('[%d]' % len(gene2acc))
    return dict_to_list(gene2acc) if aslist else gene2acc
def load_broadinstitute_exac_any(one_file, key):
    """Load one Broad Institute ExAC constraint file.

    Returns {transcript_id_without_version: {"exac": {...}}} with this
    file's constraint metrics nested under `key`, so several ExAC files can
    be merged per transcript.
    """
    print("Loading file %s (%s)" % (one_file, key))
    data = tab2dict(os.path.join(DATA_FOLDER, one_file),
                    (0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                     18, 19, 20, 21), 0)
    exacs = {}
    for transcript in data:
        tupleexac = data[transcript]
        # remove version in key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {
            "exac": {
                "transcript": transcript,  # but keep version here
                "n_exons": int(tupleexac[0]),
                "cds_start": int(tupleexac[1]),
                "cds_end": int(tupleexac[2]),
                "bp": int(tupleexac[3]),
                key: {
                    "mu_syn": float(tupleexac[4]),
                    "mu_mis": float(tupleexac[5]),
                    "mu_lof": float(tupleexac[6]),
                    "n_syn": float(tupleexac[7]),
                    "n_mis": float(tupleexac[8]),
                    "n_lof": float(tupleexac[9]),
                    "exp_syn": float(tupleexac[10]),
                    "exp_mis": float(tupleexac[11]),
                    "exp_lof": float(tupleexac[12]),
                    "syn_z": float(tupleexac[13]),
                    "mis_z": float(tupleexac[14]),
                    "lof_z": float(tupleexac[15]),
                    "p_li": float(tupleexac[16]),
                    "p_rec": float(tupleexac[17]),
                    "p_null": float(tupleexac[18])
                }
            }
        }
    return exacs
import os.path import time from biothings.utils.common import timesofar from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder, list2dict) from dataload import get_data_folder # DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot') DATA_FOLDER = os.path.join(get_data_folder('ucsc'), 'goldenPath/currentGenomes') REFLINK_FILE = os.path.join(get_data_folder('ucsc'), 'goldenPath/hgFixed/database/refLink.txt.gz') refseq2gene = tab2dict(REFLINK_FILE, (2, 6), 0, alwayslist=False) def load_exons_for_species(species, exons_key='exons'): refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz') load_start(refflat_file) t0 = time.time() ref2exons = {} for ld in tabfile_feeder(refflat_file, header=0): refseq = ld[1] chr = ld[2] if chr.startswith('chr'): chr = chr[3:] exons = list(zip([int(x) for x in ld[9].split(',') if x], [int(x) for x in ld[10].split(',') if x])) assert len(exons) == int(ld[8]), (len(exons), int(ld[8])) ref2exons.setdefault(refseq,[]).append({ 'transcript' : refseq,
def load(self, aslist=False):
    '''
    loading ncbi "gene_info" file

    This must be called first to create basic gene documents with all basic
    fields, e.g., name, symbol, synonyms, etc.

    format of gene_info file:
    #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs map_location
    description type_of_gene Symbol_from nomenclature_authority
    Full_name_from_nomenclature_authority Nomenclature_status
    Other_designations Modification_date
    (tab is used as a separator, pound sign - start of a comment)
    '''
    load_start(self.datafile)
    gene_d = tab2dict(self.datafile, (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                      key=1, alwayslist=0, includefn=self.species_filter)

    def _ff(d):
        # d: the 10 non-key columns selected above, in file order
        (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
         description, type_of_gene, other_designations,
         modification_date) = d
        out = dict(taxid=int(taxid), symbol=symbol, name=description)
        # '-' marks a missing field in gene_info
        if map_location != '-':
            out['map_location'] = map_location
        if type_of_gene != '-':
            out['type_of_gene'] = type_of_gene
        if synonyms != '-':
            out['alias'] = normalized_value(synonyms.split('|'))
        if locus_tag != '-':
            out['locus_tag'] = locus_tag
        if other_designations != "-":
            out['other_names'] = normalized_value(
                other_designations.split('|'))
        # when merged, this will become the default timestamp
        out["_timestamp"] = datetime.datetime.strptime(
            modification_date, "%Y%m%d")
        for x in dbxrefs.split('|'):
            if x == '-':
                continue
            xd = x.split(':')
            if len(xd) == 3 and xd[0] == xd[1] and \
                    xd[0] in ['VGNC', 'HGNC', 'MGI']:
                # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                xd = xd[1:]
            try:
                _db, _id = xd
            except:
                print(repr(x))
                raise
            # we don't need ensembl xref from here, we will get it from
            # Ensembl directly
            if _db.lower() in ['ensembl', 'imgt/gene-db']:
                # we don't need 'IMGT/GENE-DB" xref either, because they
                # are mostly the same as gene symbol
                continue
            # add "MGI:" prefix for MGI ids.
            if _db.lower() == 'mgi':
                _id = "MGI:" + _id
            out[_db] = _id
        return out

    gene_d = value_convert(gene_d, _ff)
    # add entrezgene field
    for geneid in gene_d:
        d = gene_d[geneid]
        d['entrezgene'] = int(geneid)
        gene_d[geneid] = d
    load_done('[%d]' % len(gene_d))
    if aslist:
        return dict_to_list(gene_d)
    else:
        return gene_d
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful, when other annotations were mapped to geneids may
       contain retired gene ids.
       if species_li is None, genes from all species are loaded.
       Note that all ids are int type.
    '''
    if species_li:
        # restrict to the taxids of the requested species
        taxid_set = set([taxid_d[species] for species in species_li])
    else:
        taxid_set = None
    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)
    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):
        # cache exists and is newer than both source files -> reuse it
        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        # the cache is only valid for the same species selection
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d
    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))
    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure taxid is for species_li and filters out those
    # mapped_to geneid exists in gene_info list
    load_done('[%d]' % len(retired2gene))
    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # every current geneid also maps to itself
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g
    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)
    os.chdir(orig_cwd)
    return out_d
def load(self, aslist=False):
    '''
    loading ncbi "gene_info" file

    This must be called first to create basic gene documents with all basic
    fields, e.g., name, symbol, synonyms, etc.

    format of gene_info file:
    #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome
    map_location description type_of_gene Symbol_from_nomenclature_authority
    Full_name_from_nomenclature_authority Nomenclature_status
    Other_designations Modification_date
    (tab is used as a separator, pound sign - start of a comment)
    '''
    load_start(self.datafile)
    gene_d = tab2dict(self.datafile, (0, 1, 2, 4, 5, 7, 8, 9), key=1,
                      alwayslist=0, includefn=self.species_filter)

    def _ff(d):
        # d: the 7 non-key columns selected above (the geneid key column
        # was already consumed by tab2dict)
        (
            taxid, symbol, synonyms,
            dbxrefs, map_location,
            description, type_of_gene
        ) = d
        out = dict(taxid=int(taxid), symbol=symbol, name=description)
        # '-' marks a missing field in gene_info
        if map_location != '-':
            out['map_location'] = map_location
        if type_of_gene != '-':
            out['type_of_gene'] = type_of_gene
        if synonyms != '-':
            out['alias'] = normalized_value(synonyms.split('|'))
        for x in dbxrefs.split('|'):
            if x == '-':
                continue
            xd = x.split(':')
            if len(xd) == 3 and xd[0] == xd[1] and xd[0] in ['HGNC', 'MGI']:
                xd = xd[1:]  # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
            try:
                _db, _id = xd
            except:
                print(x)
                raise
            if _db.lower() in ['ensembl', 'imgt/gene-db']:
                # we don't need ensembl xref from here, we will get it from Ensembl directly
                continue  # we don't need 'IMGT/GENE-DB" xref either, because they are mostly the same as gene symbol
            if _db.lower() == 'mgi':
                # add "MGI:" prefix for MGI ids.
                _id = "MGI:"+_id
            out[_db] = _id
        return out

    gene_d = value_convert(gene_d, _ff)
    # add entrezgene field
    for geneid in gene_d:
        d = gene_d[geneid]
        d['entrezgene'] = int(geneid)
        gene_d[geneid] = d
    load_done('[%d]' % len(gene_d))
    if aslist:
        return dict_to_list(gene_d)
    else:
        return gene_d
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful, when other annotations were mapped to geneids may
       contain retired gene ids.
       if species_li is None, genes from all species are loaded.
       Note that all ids are int type.
    '''
    if species_li:
        # restrict to the taxids of the requested species
        taxid_set = set([TAXONOMY[species] for species in species_li])
    else:
        taxid_set = None
    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)
    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):
        # cache exists and is newer than both source files -> reuse it
        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        # the cache is only valid for the same species selection
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d
    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))
    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure taxid is for species_li and filters out those
    # mapped_to geneid exists in gene_info list
    load_done('[%d]' % len(retired2gene))
    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # every current geneid also maps to itself
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g
    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)
    os.chdir(orig_cwd)
    return out_d
import os.path import time from biothings.utils.common import timesofar from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder, list2dict) from dataload import get_data_folder # DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot') DATA_FOLDER = os.path.join(get_data_folder('ucsc'), 'goldenPath/currentGenomes') REFLINK_FILE = os.path.join(get_data_folder('ucsc'), 'goldenPath/hgFixed/database/refLink.txt.gz') refseq2gene = tab2dict(REFLINK_FILE, (2, 6), 0, alwayslist=False) def load_exons_for_species(species, exons_key='exons'): refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz') load_start(refflat_file) t0 = time.time() ref2exons = {} for ld in tabfile_feeder(refflat_file, header=0): refseq = ld[1] chr = ld[2] if chr.startswith('chr'): chr = chr[3:] exons = list( zip([int(x) for x in ld[9].split(',') if x],