def get_dbscSNV_ant(chrom, pos, ref, alt):
    '''Checks if the variant affects splicing'''
    f = '/dbscSNV/dbscSNV1.1.chr' + chrom + '.gz'
    fo = Tabix(f)
    for rec in fo.query(chrom, pos - 1, pos + 1):
        if int(rec[1]) == pos and rec[2] == ref and rec[3] == alt:
            ada_score, rf_score = rec[-2], rec[-1].strip()
            # dbscSNV encodes missing scores as '.'; normalize them to ''
            if not rf_score or rf_score == '.':
                rf_score = ''
            if not ada_score or ada_score == '.':
                return '', rf_score
            return float(ada_score), rf_score
    return '', ''
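# Hedged usage sketch (not part of the original module): one way the dbscSNV lookup
# above could be used. The 0.6 cutoff is the threshold commonly cited for the dbscSNV
# ada/rf scores and is an assumption here, not a value taken from this codebase.
def _example_splice_check(chrom, pos, ref, alt):
    ada_score, rf_score = get_dbscSNV_ant(chrom, pos, ref, alt)
    if ada_score != '' and float(ada_score) > 0.6:
        return 'predicted splice-altering'
    return 'no splice-altering evidence'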
def _get_coord_depth(invcf, capture_file_path, genelist_config, patient_capture_config):
    # Load the genelist
    gl = _load_genelist(genelist_config)
    # Load the patient capture file
    pc = _load_patient_capture(patient_capture_config)
    # Load the captured coordinates for each sample
    CAP = {'v2': '4-Hopkins_clinical_panel_capture_v2paper.bed',
           'v1b': '4-Hopkins_clinical_panel_capture_v1b.bed'}
    cap_exon_mean = {}
    vcfo = Tabix(invcf)
    samples = _get_sample_ids(invcf)
    for cver, pl in pc.items():
        capture_file = os.path.join(capture_file_path, CAP[cver])
        cap_gene_coords = _load_capture(capture_file, gl)
        for gene, cap_exons in cap_gene_coords.items():
            for idx, exon in enumerate(cap_exons):
                exno = idx + 1
                chrom, sp, ep = exon
                key = (chrom, sp, ep, exno)
                temp = {}
                for rec in vcfo.query(chrom, sp, ep):
                    gt_info_raw = rec[9:]
                    for sid, gt_info in zip(samples, gt_info_raw):
                        if sid not in pl:
                            continue
                        if sid not in temp:
                            temp[sid] = []
                        if gt_info == './.':
                            temp[sid].append(0)
                        else:
                            gt_d = gt_info.split(':')
                            if len(gt_d) <= 3:
                                temp[sid].append(int(gt_d[-1]))
                            else:
                                temp[sid].append(int(gt_d[2]))
                # Compute the mean
                for sid, val in temp.items():
                    m = np.average(val)
                    if sid not in cap_exon_mean:
                        cap_exon_mean[sid] = {}
                    if gene not in cap_exon_mean[sid]:
                        cap_exon_mean[sid][gene] = {}
                    cap_exon_mean[sid][gene][key] = m
    return cap_exon_mean
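# Hedged usage sketch (assumption, not from the original code): walking the nested
# dictionary returned by _get_coord_depth above to flag exons whose mean depth falls
# below an arbitrary example cutoff of 20x.
def _example_flag_low_coverage(cap_exon_mean, min_depth=20.0):
    low = []
    for sid, genes in cap_exon_mean.items():
        for gene, exons in genes.items():
            for (chrom, sp, ep, exno), mean_dp in exons.items():
                if mean_dp < min_depth:
                    low.append((sid, gene, exno, chrom, sp, ep, mean_dp))
    return low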
def gerp(vf, af, name="gerp"):
    print "inside gerp"
    v = BedTool(vf)
    t = Tabix(af)
    results = {}
    for var in v:
        try:
            result = 0.0
            num = 0
            for res in t.query(var.chrom, var.start, var.end):
                result += float(res[4])
                num += 1
            if num > 0:
                results[var.name] = result / num
        except:
            pass
    print "exit gerp"
    return Series(results, name=name)
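# Hedged usage sketch (assumption, not part of the original module): calling gerp()
# above with placeholder file paths and joining the named Series it returns onto a
# per-variant pandas DataFrame indexed by variant name.
def _example_annotate_gerp(variant_table, vf='variants.bed', af='gerp_scores.tsv.gz'):
    scores = gerp(vf, af, name='gerp')
    return variant_table.join(scores)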
samples = []
results = {}
for ln in open(sys.argv[2]):
    sample, fn = ln.rstrip().split("\t")
    samples.append(sample)
    results[sample] = {}
    print fn
    tab = Tabix(fn)
    for pos in positions:
        search = "%s:%d-%d" % (pos[0], pos[1], pos[1])
        try:
            itr = tab.fetch(search)
            rec = itr.next()
        except StopIteration:
            print "can't find: %s" % (search,)
            continue
        cols = rec.split("\t")
        record = {}
        record['REF'] = cols[3]
        record['ALT'] = cols[4]
        record['QUAL'] = float(cols[5])
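# Hedged sketch (assumption, not from the original script): the loop above expects
# `positions` to be an iterable of (chrom, pos) tuples defined earlier in the script.
# A helper like the one below could build it from a tab-separated sites file
# (the filename argument is a placeholder).
def _example_load_positions(sites_fn):
    positions = []
    for ln in open(sites_fn):
        chrom, pos = ln.rstrip().split("\t")[:2]
        positions.append((chrom, int(pos)))
    return positions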
def _get_coord_depth(invcf, capture_file_path, patient_capture_config, sample_ids, qgene):
    # Load the patient capture file
    pc = _load_patient_capture(patient_capture_config, sample_ids)
    print pc
    # Load the captured coordinates for each sample
    CAP = {'v2': '4-Hopkins_clinical_panel_capture_v2.bed',
           'v1b': '4-Hopkins_clinical_panel_capture_v1b.bed'}
    vcfo = Tabix(invcf)
    samples = _get_sample_ids(invcf)
    coord_depth = {}
    for cver, pl in pc.items():
        if cver == 'v2':
            continue
        capture_file = os.path.join(capture_file_path, CAP[cver])
        cap_exons = _load_capture(capture_file, qgene)
        ex_cnt = len(cap_exons)
        for idx, exon in enumerate(cap_exons):
            exno = idx + 1
            chrom, sp, ep = exon
            midpos = range(sp, ep)[(ep - sp) / 2]
            for rec in vcfo.query(chrom, sp, ep):
                pos = int(rec[1])
                gt_info_raw = rec[9:]
                tsid = []
                for sid, gt_info in zip(samples, gt_info_raw):
                    if sid not in pl:
                        continue
                    tsid.append(sid)
                    if sid not in coord_depth:
                        coord_depth[sid] = [[], [], []]
                    if gt_info == './.':
                        dp = 0
                    else:
                        gt_d = gt_info.split(':')
                        if len(gt_d) <= 3:
                            dp = int(gt_d[-1])
                        else:
                            dp = int(gt_d[2])
                    coord_depth[sid][0].append(pos)
                    coord_depth[sid][1].append(dp)
                    if pos == midpos:
                        coord_depth[sid][2].append('CR-' + str(exno))
                    else:
                        coord_depth[sid][2].append('')
            if idx + 1 == ex_cnt:
                continue
            # Pad a zero-depth gap between consecutive exons
            for sid in tsid:
                n = 105
                for fp in range(ep + 1, ep + 1 + n):
                    coord_depth[sid][0].append(fp)
                    coord_depth[sid][1].append(0)
                    coord_depth[sid][2].append('')
    return coord_depth
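# Hedged usage sketch (assumption, not part of the original module): plotting the
# per-sample position/depth/label lists returned by _get_coord_depth above with
# matplotlib. The figure layout and output filename are illustrative only.
import matplotlib.pyplot as plt

def _example_plot_depth(coord_depth, sid, out_png='depth.png'):
    pos, dp, labels = coord_depth[sid]
    plt.figure(figsize=(12, 3))
    plt.plot(pos, dp, lw=0.8)
    for p, d, lab in zip(pos, dp, labels):
        if lab:  # exon midpoints are tagged 'CR-<exon number>'
            plt.annotate(lab, (p, d), fontsize=6, rotation=90)
    plt.xlabel('Genomic position')
    plt.ylabel('Read depth')
    plt.savefig(out_png)
    plt.close()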
def _generate_TSV(outdir, samples, capcoord):
    """Internal Method.

    Generates two tsv files (one for SNVs and the other for indels) that contain
    the variants reported in the 1000 Genomes Project Phase 3 dataset for the
    given genomic regions.

    Args:
        outdir (str): Path where the TSV files will be created
        samples (list): List of samples for which the variants will be extracted
                        from the 1000 Genomes dataset.
        capcoord (list): List of BED-format coordinates wrapped as tuples,
                         e.g. [(chrom, start_pos, end_pos), ...]

    Returns:
        None
    """
    out_snpfile = os.path.join(outdir, 'snp_control_variants_all.tsv')
    outsnp = open(out_snpfile, 'w')
    out_indelfile = os.path.join(outdir, 'indel_control_variants_all.tsv')
    outindel = open(out_indelfile, 'w')
    chroms = [str(e) for e in range(1, 23)] + ['X', 'Y']
    hflag = False
    for chrom in chroms:
        if chrom == 'X':
            fp1 = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz'
            ofp1 = os.path.join(outdir, 'ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz')
            fp2 = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz.tbi'
            ofp2 = os.path.join(outdir, 'ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz.tbi')
        else:
            fp1 = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz' % chrom
            ofp1 = os.path.join(outdir, 'ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz' % chrom)
            fp2 = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi' % chrom
            ofp2 = os.path.join(outdir, 'ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi' % chrom)
        if not os.path.exists(ofp1):
            cmd = "wget %s -O %s" % (fp1, ofp1)
            print cmd
            os.system(cmd)
        if not os.path.exists(ofp2):
            cmd = "wget %s -O %s" % (fp2, ofp2)
            print cmd
            os.system(cmd)
        for line in os.popen("zcat %s | head -260 | grep '#CHROM'" % ofp1):
            line = line.strip()
            h = line.split('\t')[:9]
            kgsamples = line.split('\t')[9:]
            if not hflag:
                h += samples
                outsnp.write('\t'.join(h) + '\n')
                outindel.write('\t'.join(h) + '\n')
                hflag = True
        vcfo = Tabix(ofp1)
        for coord in capcoord:
            if coord[0] != chrom:
                continue
            for rec in vcfo.query(*coord):
                if '<' in rec[4] or ',' in rec[4]:
                    continue
                if len(rec[3]) == len(rec[4]):  # to make SNV control set
                    out = outsnp
                else:  # to make Indel control set
                    out = outindel
                sv = []
                d = dict(zip(kgsamples, rec[9:]))
                temp = set()
                for sid in samples:
                    sv.append(d[sid])
                    temp.add(d[sid])
                # Skip sites where every requested sample is homozygous reference
                if len(temp) == 1 and list(temp)[0] in ['0|0', '0/0']:
                    continue
                out.write('\t'.join(rec[:9] + sv) + '\n')
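# Hedged usage sketch (assumption): how _generate_TSV above might be invoked. The
# sample IDs are real 1000 Genomes identifiers; the capture coordinates below are
# placeholders, not coordinates used by this pipeline.
def _example_generate_controls(outdir):
    samples = ['HG00096', 'HG00097']
    capcoord = [('1', 1000000, 1000500), ('X', 2000000, 2000500)]
    _generate_TSV(outdir, samples, capcoord)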
def get_varcnt_gp(invcf, antvcf, capcoord, samples, varcnt, vao):
    '''Returns a dictionary where the keys are sample ids and the values are a
    list of length 11 whose elements are the counts of -

    * Nocall
    * VarLQ
    * GTLQ
    * REF allele
    * HetALT
    * HomALT
    * Ts
    * Tv
    * Common Variants (Based on 1000 Genomes)
    * Rare Variants (Based on 1000 Genomes)
    * Novel Variants (Based on 1000 Genomes)
    '''
    vcfsamples = get_vcfsamples(invcf)
    for sid in samples:
        # NoCall, VarLQ, GTLQ, REF, HetALT, HomALT, Ts, Tv, Common, Rare, Novel
        varcnt[sid] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    vcfo = Tabix(invcf)
    antvcfo = Tabix(antvcf)
    for coord in capcoord:
        try:
            qdata = vcfo.query(*coord)
        except:
            qdata = None
        if qdata is None:
            continue
        for rec in qdata:
            filter = rec[6]
            gts = rec[9:]
            trans = vao.retrieve_genedef(rec[0], int(rec[1]), int(rec[1]))
            if not trans:  # indicating its an intergenic region
                continue
            for sid, gtinfo in zip(vcfsamples, gts):
                if sid not in samples:
                    continue
                if gtinfo == '.' or './.' in gtinfo:  # No Call
                    varcnt[sid][0] += 1
                elif filter != 'PASS':  # VarLQ
                    varcnt[sid][1] += 1
                elif '0/0' in gtinfo:  # REF
                    varcnt[sid][3] += 1
                else:
                    gt, gq = gtinfo.split(':')[:2]
                    gq = int(gq)
                    a1, a2 = gt.split('/')
                    a1, a2 = int(a1), int(a2)
                    ref, alt = rec[3], rec[4].split(',')[a2 - 1]
                    ct = get_change_type(ref, alt)
                    arec = antvcfo.query(rec[0], int(rec[1]) - 1, int(rec[1]))
                    com, rare, novel = False, False, False
                    for e in arec:
                        if int(rec[1]) == int(e[1]):
                            kgaf = 0.0
                            if 'KGDB' in e[7]:
                                kgaf = float(e[7].split('KGAF=')[1].split(';')[0].split(',')[a2 - 1])
                            exacaf = 0.0
                            if 'EXACDB' in e[7]:
                                exacaf = float(e[7].split('EXACAF=')[1].split(';')[0].split(',')[a2 - 1])
                            if kgaf == 0.0 and exacaf == 0.0:
                                novel = True
                            elif kgaf < 0.05 and exacaf < 0.05:
                                rare = True
                            else:
                                com = True
                            break
                    if gq < 30:  # GTLQ
                        varcnt[sid][2] += 1
                    elif a1 != a2:  # HetALT
                        varcnt[sid][4] += 1
                        if ct == 'ts':
                            varcnt[sid][6] += 1
                        elif ct == 'tv':
                            varcnt[sid][7] += 1
                        if com:
                            varcnt[sid][8] += 1
                        elif rare:
                            varcnt[sid][9] += 1
                        elif novel:
                            varcnt[sid][10] += 1
                            print sid, rec[0], rec[1], rec[3], rec[4]
                    elif a1 == a2:  # HomALT
                        varcnt[sid][5] += 1
                        if ct == 'ts':
                            varcnt[sid][6] += 1
                        elif ct == 'tv':
                            varcnt[sid][7] += 1
                        if com:
                            varcnt[sid][8] += 1
                        elif rare:
                            varcnt[sid][9] += 1
                        elif novel:
                            varcnt[sid][10] += 1
                            print sid, rec[0], rec[1], rec[3], rec[4]
    return varcnt
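# Hedged sketch (assumption, not the original helper): get_varcnt_gp above relies on
# a get_change_type(ref, alt) helper that is not shown in this file. A minimal version
# that labels single-nucleotide changes as transitions ('ts') or transversions ('tv')
# could look like this; anything else (e.g. indels) falls through to ''.
def _example_get_change_type(ref, alt):
    transitions = set([('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')])
    if len(ref) == 1 and len(alt) == 1 and ref != alt:
        return 'ts' if (ref.upper(), alt.upper()) in transitions else 'tv'
    return ''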