def _iterfile(self):
    """Internal method. Do not use.

    Iterates over the GAD (all.txt) and NHGRI GWAS catalog
    (gwascatalog.txt) files under self.inname and yields Clnsnp
    namedtuples (snp, gene, phen, asnsta, pubid, refdb, refdbid).
    As a side effect, populates the snp-id lookup caches
    self.gadrs and self.nhgrirs with [GENE, pubmed_id, phenotype].
    """
    self.gadrs = {}    # snp id -> [GENE, pubmed id, phenotype] (GAD)
    self.nhgrirs = {}  # snp id -> [GENE, pubmed id, phenotype] (NHGRI)
    fields = """snp gene phen asnsta pubid refdb refdbid""".split()
    clnsnp = namedtuple('Clnsnp', fields)
    for filename in os.listdir(self.inname):
        #print filename
        infile = os.path.join(self.inname, filename)
        if filename == 'all.txt':
            # GAD dump: skip the header row (contains 'Association(Y/N)')
            # and keep rows with a non-'N' association flag (col 1), a
            # gene list (col 8), a phenotype (col 2) and a numeric
            # pubmed id (col 13). rs ids are parsed out of col 28.
            with anyopen.openfile(infile) as stream:
                for rec in csv.reader(stream, delimiter='\t'):
                    if len(rec) > 1:
                        if 'Association(Y/N)' in rec:
                            continue
                        if rec[1] != 'N' and rec[8] and rec[2] and \
                                rec[13].isdigit():
                            refdbid = rec[0]
                            genes = rec[8].split(',')
                            phen = rec[2]
                            snps = self._parsersId(rec[28])
                            if rec[1]:
                                asnsta = rec[1]
                            else:
                                asnsta = 'U'  # 'U' = unknown association status
                            pubid = rec[13]
                            for gene in genes:
                                gene = gene.strip()
                                if gene:
                                    if snps:
                                        for snp in snps:
                                            # NOTE(review): later records with the
                                            # same snp overwrite this cache entry.
                                            self.gadrs[snp] = [
                                                gene.upper(), pubid,
                                                phen.lower().strip()]
                                            yield clnsnp._make((
                                                snp, gene, phen, asnsta,
                                                pubid, 'GAD', refdbid))
                                    else:
                                        # No rs id parsed; yield with snp=None.
                                        yield clnsnp._make((
                                            None, gene, phen, asnsta,
                                            pubid, 'GAD', refdbid))
        elif filename == 'gwascatalog.txt':
            # NHGRI GWAS catalog: col 14 = reported gene ( ' - ' means
            # none), col 21 = comma-separated snp ids, col 7 = trait,
            # col 1 = pubmed id, col 0 = record id.
            with anyopen.openfile(infile) as stream:
                for rec in csv.reader(stream, delimiter='\t'):
                    if len(rec) > 1:
                        if rec[14] != ' - ':
                            # Skip the header row (either casing).
                            if 'Date Added to Catalog' in rec or 'DATE ADDED TO CATALOG' in rec:
                                continue
                            for snpid in rec[21].split(','):
                                self.nhgrirs[snpid] = [rec[14].upper(),
                                                       rec[1],
                                                       rec[7].lower().strip()]
                                yield clnsnp._make((snpid, rec[14], rec[7],
                                                    'Y', rec[1], 'NHGRI',
                                                    rec[0]))
def _extract_go_method(self):
    """Read the GO method header line from self.inname and record a row
    in the go_method table.

    The first line of the file is expected to be
    '#<method_id>\\t<obo_file>\\t<goa_file>\\t<method_desc>' (the
    leading '#' is stripped below).
    """
    fp = anyopen.openfile(self.inname)
    # Header line: drop the leading '#', strip the newline, split on tabs.
    method_id, obo_file, goa_file, method_desc = fp.next()[1:].rstrip().split('\t')
    val_field = ','.join(['?']*5)
    gominsert = 'insert into go_method values (%s)'%val_field
    # NOTE(review): execute() is called with no parameter tuple even
    # though the statement has 5 placeholders, and the four values
    # parsed above are never used. This looks like a bug -- presumably
    # the parsed fields should be bound here. TODO confirm the intended
    # go_method schema (5 columns vs 4 parsed fields) before fixing.
    self.curs.execute(gominsert)
    fp.close()
def load_interval(self):
    """Load GERP element interval data.

    Scans the directory containing self.gerpfiles for per-chromosome
    '*_elems.txt' files and returns a pair:

      d        -- {chrom: (starts, ends)}, the sorted interval
                  coordinates transposed into parallel columns
      d_scores -- {(chrom, start, end): (score, pvalue)}

    Every parsed interval line is counted in self.entry_cnt.
    """
    d = {}
    d_scores = {}
    chrom = None
    self.entry_cnt = 0
    gerp_dir = os.path.dirname(self.gerpfiles)
    for filename in os.listdir(gerp_dir):
        if '_elems.txt' in filename:
            # e.g. 'hg19_chr12_elems.txt' -> 'chr12' -> '12'
            chrom = filename.split('_')[1].strip('chr')
            gerp_file = os.path.join(gerp_dir, filename)
            for line in anyopen.openfile(gerp_file, 'r'):
                if line[:1] == '#':  # comment line
                    continue
                self.entry_cnt += 1
                # Fixed: the (unused) element-length column previously
                # shadowed the builtin len(); renamed to elem_len.
                p1, p2, elem_len, score, pvalue = line.split()
                d_scores[(chrom, int(p1), int(p2))] = (score, pvalue)
                try:
                    pos = d[chrom]
                except KeyError:
                    pos = []
                    d[chrom] = pos
                p1 = int(p1)
                p2 = int(p2)
                pos.append((p1, p2))
    for key in d:
        v = d[key]
        v.sort()
        d[key] = zip(*v)  # transpose [(s, e), ...] -> (starts, ends)
    return d, d_scores
def store_variant_citations(variant_citation_fn): print 'storing variant citations ...' # linked_ids = py_struct(allele_id=[], # variation_id=[], # rs=[]) linked_ids = {} tmp_fn = '%s.tmp' % variant_citation_fn cmd = "cut -f1,2,3 %s | sort -r -k1,1 -k2,2n -k3,3n | uniq > %s" % ( variant_citation_fn, tmp_fn) runcmd(cmd) fp = anyopen.openfile(tmp_fn, 'rt') head = fp.next()[:-1] if head.startswith('#'): head = head[1:] ntuple = namedtuple('ntuple', head.split('\t')) for i in fp: rec = i[:-1] linked_id = ntuple._make(rec.split('\t')) linked_ids[linked_id.AlleleID] = linked_id.VariationID fp.close() os.unlink(tmp_fn) print 'Done.' return linked_ids
def store_submission_summary_fn2(submit_fn): print 'storing submission summary file ...' fp = anyopen.openfile(submit_fn, 'rt') submissions = {} read_heads = False for rec in fp: rec = rec[:-1] # print 'rec:',rec if not read_heads and rec.endswith('SubmittedGeneSymbol'): rec = rec[1:] heads = rec.split('\t') # print 'heads:',heads subm = namedtuple('subm', heads) read_heads = True elif read_heads: subm_rec = subm._make(rec.split('\t')) if subm_rec.VariationID not in submissions: submissions[subm_rec.VariationID] = py_struct( collection_methods=[]) for cmethod in subm_rec.CollectionMethod.split(';'): submissions[subm_rec.VariationID].collection_methods.append( cmethod.replace(' ', '_')) fp.close() print 'Done.' return submissions
def _iterfile(self):
    """Internal method. Do not use.

    Yields every tab-delimited record of every file under self.inname
    as a plain tuple.
    """
    for filename in os.listdir(self.inname):
        self.logger.info('Processing %s..' % filename)
        path = os.path.join(self.inname, filename)
        with anyopen.openfile(path) as stream:
            for row in csv.reader(stream, delimiter='\t'):
                yield tuple(row)
def records(self):
    """Iterate over the records of utrdb.

    A terminator line is recognized by self.delimit appearing within
    its first three characters; everything accumulated since the
    previous terminator is yielded as one record string.
    """
    stream = anyopen.openfile(self.filename, 'r')
    temp = ""
    for line in stream:
        if self.delimit not in line[:3]:
            temp += line
        else:
            yield temp
            temp = ""
    # Fixed: a trailing record not followed by a terminator line used
    # to be silently dropped; also close the stream when done.
    if temp.strip():
        yield temp
    stream.close()
def _iterfile(self):
    """Internal method. Do not use.

    Yields funsim(prod1, prod2, score, denominator) namedtuples, one
    per tab-delimited row of self.inname.
    """
    fields = 'prod1 prod2 score denominator'.split()
    funsim = namedtuple('funsim', fields)
    # Fixed: use a context manager so the stream is closed even when
    # the consumer abandons the generator mid-iteration (the trailing
    # close() call was unreachable in that case).
    with anyopen.openfile(self.inname) as stream:
        for rec in csv.reader(stream, delimiter='\t'):
            yield funsim._make(rec)
def _iterfile(self):
    """Internal method. Do not use.

    Yields clinvt namedtuples for every complete (full column count)
    tab-delimited row of self.tsv, skipping the header line.
    """
    fields = "gene,nt_change,pt_change,other_mapping,alias,tx,region,rep_class,inf_class,source,last_eval,last_upd,url,comment".split(
        ",")
    F = len(fields)
    clinvt = namedtuple('clinvt', fields)
    # Fixed: use a context manager so the stream is closed even when
    # the consumer abandons the generator; next() replaces the
    # Python-2-only .next() call.
    with anyopen.openfile(self.tsv, 'rt') as stream:
        next(stream)  # skip header
        for rec in csv.reader(stream, delimiter='\t'):
            if len(rec) == F:  # ignore short/malformed rows
                yield clinvt._make(rec)
def get_GO_seeds(self, seed_rate):
    '''
    Collect genes associated with diseases whose HPO matching score is
    relatively high.

    seed_rate -- fraction (0..1) of positively-scoring diseases whose
                 genes are taken as GO seeds; the file is assumed to be
                 ordered best-score-first.

    Returns a deduplicated list of gene symbols.
    '''
    job_name = 'get_GO_seeds'
    msg = 'collecting genes associated with diseases [%s] showing high HPO matching' % self.hpo2disease_fn
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
    # First pass: count the diseases whose score is > 0.
    fp = anyopen.openfile(self.hpo2disease_fn)
    num_omim = 0
    for i in fp:
        if i[0] == '#':
            continue
        omim, genes, score = i.rstrip().split('\t')
        score = float(score)
        if score > 0.:
            num_omim += 1
    fp.close()
    # Second pass: take genes from the top seed_rate fraction.
    t = 0
    T = round(num_omim * seed_rate)
    fp = anyopen.openfile(self.hpo2disease_fn)
    go_seeds = []
    for i in fp:
        if i[0] == '#':
            continue
        # Fixed off-by-one: 't > T' previously admitted T+1 diseases
        # while the log message below reports T.
        if t >= T:
            break
        omim, genes, score = i.rstrip().split('\t')
        go_seeds.extend(genes.split(','))
        t += 1
    fp.close()
    go_seeds = list(set(go_seeds))
    msg = 'total [%d] genes are chosen for GO seeds in [%d] out of [%d] diseases\n' % (
        len(go_seeds), T, num_omim)
    msg += 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
    return go_seeds
def _iterfile(self):
    """Internal method. Do not use.

    Yields Refgene namedtuples for every data row of the UCSC refGene
    dump; rows whose first column (bin) is not numeric are skipped.
    """
    columns = ("bin name chrom strand txStart txEnd cdsStart cdsEnd "
               "exonCount exonStarts exonEnds score name2 cdsStartStat "
               "cdsEndStat exonFrames").split()
    refgene = namedtuple('Refgene', columns)
    with anyopen.openfile(self.inname) as stream:
        for row in csv.reader(stream, delimiter='\t'):
            if not row[0].isdigit():
                continue
            yield refgene._make(row)
def _iterfile(self):
    """Internal method. Do not use.

    Yields one mirna namedtuple per tab-delimited row of self.inname,
    skipping rows whose first field contains a '#'.
    """
    names = ("mir_acc mirna gene_id gene_sym trans_id ext_trans_id "
             "mirna_alig alig gene_alig mirna_start mirna_end gene_start "
             "gene_end genome_coord conserv align_score seed_cat energy "
             "mirsvr_score").split()
    mirna = namedtuple('mirna', names)
    with anyopen.openfile(self.inname) as stream:
        for row in csv.reader(stream, delimiter='\t'):
            if '#' in row[0]:
                continue
            yield mirna._make(row)
def _iterfile(self):
    """Internal method. Do not use.

    Iterates over the tab-delimited domain file (self.inname) and, for
    each unique genomic region, yields (domain, vcounts) where vcounts
    accumulates ClinVar (and, when self.hgmd_on, HGMD) variant-type
    tallies for the region.
    """
    fields = "ucsc_gene chrom start_bp end_bp gene domain_id domain_desc".split(
    )
    domtup = namedtuple('domtup', fields)
    clnStat = ClinvarDB()
    hgmdStat = HgmdDB()
    stream = anyopen.openfile(self.inname)
    stream.next()  # skip header
    processed_regions = {}  # 'chrom_start_end' -> True (dedupe regions)
    for rec in csv.reader(stream, delimiter='\t'):
        # Normalize the chromosome name to GRCh naming; skip rows that
        # cannot be mapped.
        rec[1] = to_grach(rec[1])
        if rec[1] is None:
            continue
        domain = domtup._make(rec)
        region_key = '%s_%s_%s' % (domain.chrom, domain.start_bp,
                                   domain.end_bp)
        if region_key in processed_regions:
            continue
        processed_regions[region_key] = True
        # query to clinvar
        vartypes = clnStat.count_lofs(goi_tup=domain)
        # search for hgmd
        if self.hgmd_on:
            vartypes = hgmdStat.count_lofs(domain, vartypes)
        # Accumulate counts: vcounts[i][j] sums the j-th variant type
        # of the i-th significance bucket; the last bucket of each
        # csigs entry is excluded ([:-1]).
        vcounts = [[0, 0], [0, 0], [0, 0]]
        for csigs in vartypes.itervalues():
            for i, csig in enumerate(csigs[:-1]):
                for j, vtype in enumerate(csig):
                    vcounts[i][j] += vtype
        yield domain, vcounts
    stream.close()
    clnStat.conn.close()
    if self.hgmd_on:
        hgmdStat.conn.close()
def __init__(self, filename, sampleids=None, strict=False):
    # Open the (possibly compressed) VCF and parse its meta/header
    # section immediately.
    #   filename  -- path to the VCF file
    #   sampleids -- optional subset of sample names to expose;
    #                defaults to every sample found in the header
    #   strict    -- stored for downstream parsing behavior
    self.filename = filename
    self.strict = strict
    self.stream = anyopen.openfile(filename, 'rt')
    # '##' meta-information lines, grouped by key.
    self.meta = {'INFO': {}, 'FORMAT': {}, 'FILTER': {}, 'ALT': {},
                 'SAMPLE': {}, 'contig': {}}
    self.samples = []
    self._headerlines = []
    self.parsemeta()  # fills self.meta / self.samples / self._headerlines
    if sampleids:
        # Column indices (into self.samples) of the requested samples.
        self._sampleids = [self.samples.index(s) for s in sampleids]
    else:
        self._sampleids = list(range(len(self.samples)))
def uniprot_to_gene(self):
    """Populate self.uni2gene with a uniprot-accession -> refGene
    symbol mapping read from the GOA file (self.goaname)."""
    self.logger.info('Having a dictionary of mapping uniprot to refGene...')
    seen = set()
    stream = anyopen.openfile(self.goaname)
    for rec in csv.reader(stream, delimiter='\t'):
        if rec[0][0] == '!':
            continue  # GOA comment line
        pair = '%s:%s' % (rec[1], rec[2])  # uniprot,refGene
        if pair not in seen:
            seen.add(pair)
            self.uni2gene[rec[1]] = rec[2]
    stream.close()
def getphenotype(self, mimid):
    """Retrieve the OMIM phenotype title for the given OMIM id via the
    NCBI EUtils esummary service.

    Prefers the 'Title' item and falls back to the first 'AltTitles'
    item; returns None when neither is found.
    """
    URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" + \
        "?db=omim&id=%s" % (mimid)
    phenotype = None
    stream = anyopen.openfile(URL)
    for raw in stream:
        raw = raw.strip()
        if not raw:
            continue
        if 'Item Name="Title"' in raw:
            # Extract the element text between '>' and '<'.
            phenotype = raw.split('>')[1].split('<')[0].strip()
            if phenotype:
                break
        elif 'Item Name="AltTitles"' in raw:
            phenotype = raw.split('>')[1].split('<')[0].strip()
            break
    return phenotype
def _register_goa(self): self.logger.info('Importing map of uniprot to refGene ...') self.logger.info('Input files: %s' % self.goaname) stream = anyopen.openfile(self.goaname) pairs = {} n = 0 entries = [] entry_cnt = 0 buffer2 = 7000000 val_field = ','.join(['?'] * 2) goinsert = 'insert into uniprot_to_refgene values (%s)' % val_field for rec in csv.reader(stream, delimiter='\t'): if rec[0][0] == '!': continue else: pair = '%s:%s' % (rec[1], rec[2]) if pair in pairs: continue else: pairs[pair] = True entries.append((rec[1], rec[2])) n += 1 if n > buffer2: entry_cnt += buffer2 self.curs.executemany(goinsert, entries) n = 0 entries = [] print 'importing %d row' % entry_cnt if entries: entry_cnt += len(entries) self.curs.executemany(goinsert, entries) print 'importing %d row' % entry_cnt self.conn.commit() print 'done.' pairs = None stream.close()
def load_hg(self, chrom, inname=None):
    """Load the sequence of a single chromosome from a FASTA file.

    chrom  -- chromosome name, with or without a 'chr' prefix
    inname -- FASTA path; defaults to self.inname

    Stores the lowercased sequence in self.chrom_seq and returns it
    (None when the chromosome is not found).
    """
    self.chrom_seq = None
    inname = inname or self.inname
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    stream = anyopen.openfile(inname)
    for seqi in FastaStream(stream):
        header = seqi[0].rstrip().split()[0]
        # '>chrN' -> 'N'; otherwise just drop the leading '>'.
        header = header[4:] if 'chr' in header else header[1:]
        if header == chrom:
            # Join the sequence lines, dropping each trailing newline.
            self.chrom_seq = ''.join(
                [s[:-1] for s in seqi[1:]]).lower()
            break
    return self.chrom_seq
def to_file(rows, Header, out_fn, fmode='wb'):
    """Write *rows* to *out_fn*, preceded by a '#'-prefixed header.

    Header may be a single string or a sequence of column names
    (joined with tabs). Decimal-valued columns, detected from the
    first row, are reformatted before writing.
    """
    #check if out_fn can be writable
    out = anyopen.openfile(out_fn, fmode)
    if isinstance(Header, basestring):
        headStr = Header
    else:
        headStr = lib_utils.joined(Header, '\t')
    out.write('#%s\n' % headStr)
    if len(rows) > 0:
        decimal_idx = get_decimal_idx(rows[0])
        fix_record = len(decimal_idx) > 0
        for row in rows:
            row = list(row)
            if fix_record:
                row = reformat_fields(row, decimal_idx)
            out.write('%s\n' % lib_utils.joined(row, '\t'))
    out.close()
def _iterfile(self):
    """Internal method. Do not use.

    Parses a UniProt flat file (self.inname) and yields one
    (accession, gene, [phenotype MIM ids]) triple per entry; entries
    are terminated by '//' lines.
    """
    mimids = set()
    stream = anyopen.openfile(self.inname)
    acnum = None
    gene = None
    for line in stream:
        if line[:2] == '//':
            # End of entry: emit and reset per-entry state.
            yield acnum, gene, list(mimids)
            mimids = set()
            # Fixed: gene was never cleared at the record boundary, so
            # an entry lacking a 'GN   Name=' line silently inherited
            # the previous entry's gene symbol.
            gene = None
        elif line[:2] == 'ID':
            acnum = None
        elif line[:2] == 'AC':
            if acnum is not None:
                continue  # keep only the first (primary) accession
            acnum = line[2:].split(';')[0].strip()
        elif line[:10] == 'GN   Name=':
            gene = line.split(';')[0].split('=')[1].strip()
        elif line[:9] == 'DR   MIM;':
            # Only MIM cross-references flagged as phenotype entries.
            if 'phenotype' in line:
                line = line.strip()
                mim_id = line.split(';')[1].strip()  # renamed: no longer shadows builtin id
                mimids.add(int(mim_id))
def store_variant_summary(variant_summary_fn, linked_ids):
    '''
    Parse ClinVar's variant_summary file into a dict keyed by RCV
    accession.

    For each GRCh37 row, records the refSeq transcript and HGVS c./p.
    changes (parsed out of the Name column), the last-evaluated date
    (normalized to YYYY-MM-DD), the review status, and the allele /
    variation ids (the latter resolved through *linked_ids*, an
    AlleleID -> VariationID map).
    :return: {rcv_accession: py_struct(...)}
    '''
    print 'storing variant summary ...'
    fp = anyopen.openfile(variant_summary_fn, 'rt')
    # Resolve column positions from the header row so upstream column
    # reordering does not break parsing.
    head_col = fp.next().split('\t')
    alleleid_i = head_col.index('#AlleleID')
    name_i = head_col.index('Name')
    rcvacc_i = head_col.index('RCVaccession')
    date_i = head_col.index('LastEvaluated')
    assembly_i = head_col.index('Assembly')
    review_i = head_col.index('ReviewStatus')
    # rsid_i = 9
    vars_to_summuary = {}
    for i in fp:
        itms = i.split('\t')
        if itms[assembly_i] == 'GRCh37':
            if not itms[rcvacc_i].strip():
                continue  # no RCV accession to key on
            allele_id = itms[alleleid_i]
            variation_id = None
            if allele_id in linked_ids:
                variation_id = linked_ids[allele_id]
            rcv_ids = itms[rcvacc_i].split(';')
            # if itms[rsid_i] == '-1':
            #     rcv_ids = itms[rcvacc_i].split(';')
            # else:
            #     rcv_ids = ['rs%s'%itms[rsid_i]]
            for rcv_id in rcv_ids:
                if rcv_id not in vars_to_summuary:
                    vars_to_summuary[rcv_id] = py_struct(name=None,
                                                         REFTX=None,
                                                         HGVSc=None,
                                                         HGVSp=None,
                                                         DATE=None,
                                                         REV=None,
                                                         CLNMETHOD=None,
                                                         allele_id=None,
                                                         variation_id=None)
                vars_to_summuary[rcv_id].allele_id = allele_id
                if variation_id:
                    vars_to_summuary[rcv_id].variation_id = variation_id
                # Name looks like 'NM_x(GENE):c.123A>G (p.Lys41Arg)';
                # capture transcript, c. change and p. change.
                mObj = re.search(r'(.+)\([\w]+\):c\.(.+)\s+\(p\.(.+)\)',
                                 itms[name_i])
                if mObj:
                    vars_to_summuary[rcv_id].REFTX = mObj.group(1)
                    vars_to_summuary[rcv_id].HGVSc = 'c.%s' % mObj.group(2)
                    vars_to_summuary[rcv_id].HGVSp = 'p.%s' % mObj.group(3)
                else:
                    #to handle a case where there is no aa hgvs
                    mObj = re.search(r'(.+)\([\w]+\):c\.(.+)', itms[name_i])
                    if mObj:
                        vars_to_summuary[rcv_id].REFTX = mObj.group(1)
                        vars_to_summuary[
                            rcv_id].HGVSc = 'c.%s' % mObj.group(2)
                if itms[date_i] != '-':
                    # Dates arrive as e.g. 'Mar 14,2016'; normalize to
                    # YYYY-MM-DD ('01' when the month is unrecognized).
                    [mon_date, year] = itms[date_i].split(',')
                    mon2, date2 = mon_date.split()
                    mon2 = month_to_num(mon2)
                    if mon2:
                        month2 = mon2
                    else:
                        month2 = '01'
                    vars_to_summuary[rcv_id].DATE = '%s-%s-%s' % (
                        year.strip(), month2, date2.strip())
                vars_to_summuary[rcv_id].REV = itms[review_i].replace(
                    ' ', '_')
    fp.close()
    print 'Done.'
    return vars_to_summuary
def _iterfile(self):
    """Internal method. Do not use.

    Iterates over per-chromosome dbNSFP files in self.indir and yields
    one Nsfp namedtuple per (transcript, aa position) of each variant
    row, carrying SIFT, PolyPhen2(HVAR), CADD and Interpro
    annotations. The column layout is taken from each file's '#chr'
    header row.

    NOTE(review): `fields` (the Nsfp schema) is not defined in this
    method -- assumed to come from enclosing scope; TODO confirm.
    """
    nsfp_data = namedtuple('Nsfp', fields)
    for filename in os.listdir(self.indir):
        if 'chr' in os.path.splitext(filename)[1]:
            #if '.chr1' == os.path.splitext(filename)[1]: #debug
            self.logger.info('Processing File : %s' % filename)
            stream = anyopen.openfile(os.path.join(self.indir, filename))
            for rec in csv.reader(stream, delimiter='\t'):
                if rec[0] == '#chr':
                    # Header row: sanitize column names into valid
                    # namedtuple identifiers.
                    h = []
                    for idx, e in enumerate(rec):
                        if idx == 0:
                            h.append('chr')
                        elif idx == 8:
                            h.append('pos')
                        else:
                            e = e.strip()
                            e = e.replace('(', '_')
                            e = e.replace(')', '_')
                            e = e.replace('-', '_')
                            e = e.replace('+', '')
                            e = e.replace('1000', 'K1')
                            h.append(e)
                    rt = namedtuple('rt', h)
                else:
                    rec_tup = rt._make(tuple(rec))
                    # print 'pos1:%s|hg19-pos1:%s'%(rec_tup.pos_1_based_,rec_tup.pos)
                    # if rec_tup.pos == '949523':
                    #     debug = 1
                    # CADD raw and phred scores
                    if rec_tup.CADD_raw != '.':
                        cadd_raw = float(rec_tup.CADD_raw)
                        cadd_phred = float(rec_tup.CADD_phred)
                    else:
                        cadd_raw, cadd_phred = (None, None)
                    # Interpro domain
                    if rec_tup.Interpro_domain != '.':
                        # Keep only the domain name (text before '('),
                        # then make it comma-joinable.
                        domains = [
                            e.split('(')[0].strip()
                            for e in rec_tup.Interpro_domain.split(';') if e
                        ]
                        domains = [
                            e.replace(' ', '').replace(',', '_')
                            for e in domains
                        ]
                        domains = ','.join(domains)
                    else:
                        domains = None
                    # ENSEMBL TRANSCRIPT IDS
                    trans_ids = [
                        trans.strip()
                        for trans in rec_tup.Ensembl_transcriptid.split(';')
                    ]
                    # AMINO ACID POSITIONS IN TRANSCRIPTS
                    trans_aapos = [
                        int(pos.strip()) for pos in rec_tup.aapos.split(';')
                    ]
                    # Broadcast a single aa position across all transcripts.
                    if len(trans_ids) > 1 and len(trans_aapos) == 1:
                        trans_aapos = trans_aapos * len(trans_ids)
                    # GENE NAME
                    if rec_tup.genename != '.':
                        gene = rec_tup.genename.strip()
                    else:
                        gene = None
                    # UNIPROT ACC AND AMINO ACID POSITION IN UNIPROT
                    if rec_tup.Uniprot_acc_Polyphen2.strip() != '.':
                        unip_acs = [
                            acc.strip()
                            for acc in rec_tup.Uniprot_acc_Polyphen2.split(';')
                        ]
                        unp_aapos = [
                            int(pos.strip())
                            for pos in rec_tup.Uniprot_aapos_Polyphen2.split(';')
                        ]
                    else:
                        # NOTE(review): unp_aapos is not reset here; if a
                        # later row has PolyPhen2 predictions but no
                        # Uniprot acc, the zip below would reuse a stale
                        # unp_aapos (or NameError on the first such row).
                        # TODO confirm the data guarantees acc/pred
                        # always co-occur.
                        unip_acs = None
                    # SIFT SCORE / PREDICTION
                    sifteff = {}
                    if rec_tup.SIFT_score.strip() != '.':
                        sift_score = [
                            float(score.strip())
                            for score in rec_tup.SIFT_score.split(';')
                            if score != '.'
                        ]
                        # sift_pos = [int(e.split(':')[1][1:-1])
                        #             for e in rec_tup.aapos_SIFT.split(';')]
                        # Keep the lowest (most damaging) score per aa pos.
                        for ss, key in zip(sift_score, trans_aapos):
                            flag = False
                            if key not in sifteff:
                                sifteff[key] = [ss]
                                flag = True
                            else:
                                if ss < sifteff[key][0]:
                                    sifteff[key] = [ss]
                                    flag = True
                            if flag is True:
                                if ss < 0.05:
                                    # Damaging
                                    sifteff[key].append('D')
                                else:
                                    # Tolerated
                                    sifteff[key].append('T')
                    # POLYPHEN2(HUMVAR) SCORE / PREDICTION
                    if rec_tup.Polyphen2_HVAR_score.strip() != '.':
                        pp2_hvar_score = [
                            score.strip()
                            for score in rec_tup.Polyphen2_HVAR_score.split(';')
                        ]
                        pp2_hvar_pred = [
                            pred.strip()
                            for pred in rec_tup.Polyphen2_HVAR_pred.split(';')
                        ]
                    else:
                        pp2_hvar_score = None
                        pp2_hvar_pred = None
                    # UNIPROT ACC EFFECT PREDICTION MAP
                    hvareff = {}
                    if pp2_hvar_pred:
                        # Keep the highest (most deleterious) HVAR score
                        # per uniprot aa position.
                        for pos, acc, hvscore, hvpred in \
                                zip(unp_aapos, unip_acs, pp2_hvar_score,
                                    pp2_hvar_pred):
                            if hvscore != '.':
                                hvscore = float(hvscore)
                                if pos not in hvareff:
                                    hvareff[pos] = [hvscore, hvpred, acc]
                                else:
                                    if hvscore > hvareff[pos][0]:
                                        hvareff[pos][0] = hvscore
                                        hvareff[pos][1] = hvpred
                    # YIELD THE ROW FOR DB INSERTION
                    for tid, pos in zip(trans_ids, trans_aapos):
                        if pos in sifteff:
                            ss, sp = sifteff[pos]
                        else:
                            ss, sp = (None, None)
                        if pos in hvareff:
                            eff = hvareff[pos]
                            yield nsfp_data._make(
                                (rec_tup.chr, int(rec_tup.pos), rec_tup.ref,
                                 rec_tup.alt, rec_tup.aaref, rec_tup.aaalt,
                                 gene, tid, pos, eff[2], ss, sp, eff[0],
                                 eff[1], cadd_raw, cadd_phred, domains))
                        else:
                            # Without a PolyPhen2 entry, only emit rows
                            # that carry at least one score.
                            if ss is not None or cadd_raw is not None:
                                yield nsfp_data._make(
                                    (rec_tup.chr, int(rec_tup.pos),
                                     rec_tup.ref, rec_tup.alt, rec_tup.aaref,
                                     rec_tup.aaalt, gene, tid, pos, None, ss,
                                     sp, None, None, cadd_raw, cadd_phred,
                                     domains))