def filter_snvs(in_fhand, out_fhand, filters, filtered_fhand=None,
                log_fhand=None, reader_kwargs=None):
    '''It filters an input vcf.

    The input fhand has to be uncompressed. The original file could be a
    gzipped file, but in that case it has to be opened with gzip.open before
    sending it to this function.

    in_fhand -- uncompressed file handle with the VCF to filter.
    out_fhand -- handle that receives the SNVs left in packet[PASSED].
    filters -- re-iterable collection of callables; each one takes a packet
               and returns a packet.
    filtered_fhand -- optional handle that receives the SNVs left in
                      packet[FILTERED_OUT].
    log_fhand -- optional handle handed to _write_log with the counts.
    reader_kwargs -- optional dict with extra keyword arguments for
                     VCFReader. It is copied, never modified.
    '''
    # Work on a copy so the dict owned by the caller is not mutated.
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
    # The input fhand to this function cannot be compressed
    reader_kwargs.update({'compressed': False,
                          'filename': 'pyvcf_bug_workaround'})
    reader = VCFReader(in_fhand, **reader_kwargs)

    # Both writers share a header-only reader as template.
    template_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=template_reader)
    if filtered_fhand:
        filtered_writer = VCFWriter(filtered_fhand,
                                    template_reader=template_reader)
    else:
        filtered_writer = None

    packets = group_in_filter_packets(reader.parse_snvs(),
                                      SNPS_PER_FILTER_PACKET)
    # NOTE(review): the total starts at 0.01 instead of 0, presumably so that
    # _write_log never divides by zero -- confirm against _write_log.
    tot_snps = 0.01
    passed_snps = OrderedDict()
    broken_pipe = False
    for packet in packets:
        tot_snps += len(packet[PASSED]) + len(packet[FILTERED_OUT])
        for filter_ in filters:
            packet = filter_(packet)
            # SNVs still passing after each filter, accumulated per class.
            filter_name = filter_.__class__.__name__
            if filter_name not in passed_snps:
                passed_snps[filter_name] = 0
            passed_snps[filter_name] += len(packet[PASSED])

        for snv in packet[PASSED]:
            if not _safe_write_snv(writer, snv):
                broken_pipe = True
                break
        if filtered_writer:
            for snv in packet[FILTERED_OUT]:
                if not _safe_write_snv(filtered_writer, snv):
                    broken_pipe = True
                    break
        if broken_pipe:
            break

    if log_fhand:
        _write_log(log_fhand, tot_snps, passed_snps)

    writer.flush()
    # The filtered-out SNVs must reach their handle too.
    if filtered_writer:
        filtered_writer.flush()
def run_genotype_filters(in_fhand, out_fhand, gt_filters, plots_dir=None,
                         reader_kwargs=None):
    '''It applies the genotype filters to every SNV and writes the result.

    in_fhand -- uncompressed file handle with the input VCF.
    out_fhand -- handle that receives the processed SNVs.
    gt_filters -- re-iterable collection of callables; each one takes a SNV
                  and returns a SNV. They are applied in order.
    plots_dir -- accepted for interface compatibility; not used here.
    reader_kwargs -- optional dict with extra keyword arguments for
                     VCFReader. It is copied, never modified.

    Writing stops silently when the output pipe has been closed; any other
    IOError is re-raised.
    '''
    import errno

    # Work on a copy so the dict owned by the caller is not mutated.
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
    reader_kwargs['filename'] = 'pyvcf_bug_workaround'
    reader_kwargs['compressed'] = False
    reader = VCFReader(in_fhand, **reader_kwargs)

    templa_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=templa_reader)

    for snv in reader.parse_snvs():
        for mapper in gt_filters:
            snv = mapper(snv)
        try:
            writer.write_snv(snv)
        except IOError as error:
            # The pipe could be already closed. errno is checked first
            # because the message text depends on the locale.
            if error.errno == errno.EPIPE or 'Broken pipe' in str(error):
                break
            raise
def test_cons_recomb(self):
    '''WeirdRecombFilter: kept SNVs, recombination classes and log text.'''
    path = os.path.join(TEST_DATA_DIR, 'scaff000025.vcf.gz')

    # First run: default thresholds.
    parsed = VCFReader(open(path)).parse_snvs()
    recomb_filter = WeirdRecombFilter(pop_type='ril_self')
    kept = list(recomb_filter.filter_snvs(parsed))
    assert len(kept) == 258
    assert recomb_filter.not_fitted_counter['no close region left'] == 10
    plot_fhand = NamedTemporaryFile(suffix='.png')
    recomb_filter.plot_recomb_at_0_dist_hist(plot_fhand)
    for kind, expected in (('ok', 245), ('ok_conf_is_None', 13),
                           ('not_ok', 14)):
        assert len(recomb_filter.recomb_rates[kind]) == expected

    # Second run: explicit max_zero_dist_recomb and no alpha_recomb_0.
    parsed = VCFReader(open(path)).parse_snvs()
    recomb_filter = WeirdRecombFilter(pop_type='ril_self',
                                      max_zero_dist_recomb=0.07,
                                      alpha_recomb_0=None)
    kept = list(recomb_filter.filter_snvs(parsed))
    assert len(kept) == 266
    assert recomb_filter.not_fitted_counter['no close region left'] == 10
    plot_fhand = NamedTemporaryFile(suffix='.png')
    recomb_filter.plot_recomb_at_0_dist_hist(plot_fhand)
    for kind, expected in (('ok', 0), ('ok_conf_is_None', 266),
                           ('not_ok', 6)):
        assert len(recomb_filter.recomb_rates[kind]) == expected

    log_fhand = StringIO()
    recomb_filter.write_log(log_fhand)
    assert 'SNVs processed: 282' in log_fhand.getvalue()
def test_high_variable_region_filter(self):
    '''HighVariableRegion tagging for several thresholds and windows.'''

    def annotate(max_variability, window_in_bp):
        # Run a fresh filter over a fresh read of the test VCF.
        snvs = VCFReader(open(VCF_PATH),
                         min_calls_for_pop_stats=1).parse_snvs()
        region_filter = HighVariableRegion(max_variability=max_variability,
                                           window_in_bp=window_in_bp,
                                           ref_fpath=REF_PATH)
        return region_filter, list(region_filter(snvs))

    region_filter, annotated = annotate(0.02, 101)
    assert region_filter.name in annotated[0].filters
    assert region_filter.name not in annotated[3].filters

    region_filter, annotated = annotate(0.05, 101)
    assert region_filter.name not in annotated[0].filters
    assert region_filter.name == 'hv0.05'
    expected_desc = 'The region has more than 5 snvs per 101 bases'
    assert expected_desc in region_filter.description

    region_filter, annotated = annotate(0.003, 11)
    assert region_filter.name in annotated[0].filters

    region_filter, annotated = annotate(0.003, 101)
    assert region_filter.name in annotated[0].filters
def test_get_snpcaller(self):
    '''The reader reports the SNP caller expected for each sample file.'''
    expected_callers = (
        ('sample.vcf.gz', VARSCAN),
        ('gatk_sample.vcf.gz', GATK),
        ('freebayes_sample.vcf.gz', FREEBAYES),
        ('generic.vcf.gz', GENERIC),
    )
    for fname, caller in expected_callers:
        vcf_fhand = open(join(TEST_DATA_DIR, fname))
        assert VCFReader(fhand=vcf_fhand).snpcaller == caller
def test_vcf_writer(self):
    '''Every SNV read from vari_filter.vcf is written to a temporary file.'''
    in_fhand = open(join(TEST_DATA_DIR, 'vari_filter.vcf'))
    vcf_reader = VCFReader(fhand=in_fhand)
    tmp_fhand = NamedTemporaryFile()
    vcf_writer = VCFWriter(tmp_fhand, vcf_reader)

    for record in vcf_reader.parse_snvs():
        vcf_writer.write_snv(record)
    vcf_writer.flush()

    # Read the file back through its path to see what reached the disk.
    written = open(tmp_fhand.name).read()
    assert 'CUUC00027_TC01' in written
    vcf_writer.close()
def plot_haplotypes(vcf_fhand, plot_fhand, genotype_mode=REFERENCE,
                    filter_alleles_gt=FILTER_ALLELES_GT):
    '''It draws one subplot per sample with the alleles found along the VCF.

    vcf_fhand -- handle with the VCF to read.
    plot_fhand -- handle to which the figure is printed (dpi=300).
    genotype_mode -- accepted for interface compatibility; not used here.
    filter_alleles_gt -- passed through to _get_alleles for every call.

    Raises ValueError if the VCF yields no SNVs or no samples, because
    there would be nothing to draw.
    '''
    reader = VCFReader(vcf_fhand)

    # collect data
    # The samples, in the order of the calls of the first SNV.
    genotypes = None
    samples = []
    for snv in reader.parse_snvs():
        if genotypes is None:
            genotypes = {}
            for call in snv.calls:
                genotypes[call.sample] = []
                samples.append(call.sample)

        for call in snv.calls:
            alleles = _get_alleles(call, filter_alleles_gt=filter_alleles_gt)
            genotypes[call.sample].append(alleles)

    if not samples:
        raise ValueError('The VCF has no SNVs or no samples to plot')

    # draw
    n_samples = len(samples)
    # One inch per 100 SNVs, clamped to [8, 100]. Floor division keeps the
    # result an integer whatever the Python version.
    n_snvs = len(genotypes[samples[-1]])
    xsize = min(max(n_snvs // 100, 8), 100)
    ysize = min(n_samples * 2, 100)

    fig = Figure(figsize=(xsize, ysize))

    for index, sample in enumerate(samples):
        # Subplot numbers are 1-based.
        axes = fig.add_subplot(n_samples, 1, index + 1)
        axes.set_title(sample)
        y_data = genotypes[sample]
        x_data = [i + 1 for i in range(len(y_data))]
        x_data, y_data = _flatten_data(x_data, y_data)

        axes.plot(x_data, y_data, marker='o', linestyle='None',
                  markersize=3.0, markeredgewidth=0, markerfacecolor='red')
        # Leave a small margin so the extreme markers are not clipped.
        ylim = axes.get_ylim()
        ylim = ylim[0] - 0.1, ylim[1] + 0.1
        axes.set_ylim(ylim)
        # Real booleans are used for the tick flags instead of the
        # 'on'/'off' strings.
        axes.tick_params(axis='x', bottom=False, top=False, which='both',
                         labelbottom=False)
        axes.tick_params(axis='y', left=True, right=False, labelleft=False)
        axes.set_ylabel(sample)

    canvas = FigureCanvas(fig)
    canvas.print_figure(plot_fhand, dpi=300)
    plot_fhand.flush()
def filter_snvs(in_fhand, out_fhand, filters, filtered_fhand=None,
                log_fhand=None, reader_kwargs=None):
    '''It filters an input vcf.

    The input fhand has to be uncompressed. The original file could be a
    gzipped file, but in that case it has to be opened with gzip.open before
    sending it to this function.

    in_fhand -- uncompressed file handle with the VCF to filter.
    out_fhand -- handle that receives the SNVs left in packet[PASSED].
    filters -- re-iterable collection of callables; each one takes a packet
               and returns a packet.
    filtered_fhand -- optional handle that receives the SNVs left in
                      packet[FILTERED_OUT].
    log_fhand -- optional handle handed to _write_log with the counts.
    reader_kwargs -- optional dict with extra keyword arguments for
                     VCFReader. It is copied, never modified.
    '''
    # Work on a copy so the dict owned by the caller is not mutated.
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
    # The input fhand to this function cannot be compressed
    reader_kwargs.update({'compressed': False,
                          'filename': 'pyvcf_bug_workaround'})
    reader = VCFReader(in_fhand, **reader_kwargs)

    # Both writers share a header-only reader as template.
    template_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=template_reader)
    if filtered_fhand:
        filtered_writer = VCFWriter(filtered_fhand,
                                    template_reader=template_reader)
    else:
        filtered_writer = None

    packets = group_in_filter_packets(reader.parse_snvs(),
                                      SNPS_PER_FILTER_PACKET)
    # NOTE(review): the total starts at 0.01 instead of 0, presumably so that
    # _write_log never divides by zero -- confirm against _write_log.
    tot_snps = 0.01
    passed_snps = OrderedDict()
    broken_pipe = False
    for packet in packets:
        tot_snps += len(packet[PASSED]) + len(packet[FILTERED_OUT])
        for filter_ in filters:
            packet = filter_(packet)
            # SNVs still passing after each filter, accumulated per class.
            filter_name = filter_.__class__.__name__
            if filter_name not in passed_snps:
                passed_snps[filter_name] = 0
            passed_snps[filter_name] += len(packet[PASSED])

        for snv in packet[PASSED]:
            if not _safe_write_snv(writer, snv):
                broken_pipe = True
                break
        if filtered_writer:
            for snv in packet[FILTERED_OUT]:
                if not _safe_write_snv(filtered_writer, snv):
                    broken_pipe = True
                    break
        if broken_pipe:
            break

    if log_fhand:
        _write_log(log_fhand, tot_snps, passed_snps)

    writer.flush()
    # The filtered-out SNVs must reach their handle too.
    if filtered_writer:
        filtered_writer.flush()
def __init__(self, vcf_fpath, gq_threshold=None, dp_threshold=100,
             min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
             remarkable_coverages=None, window_size=WINDOWS_SIZE):
    '''It prepares the readers and the counters and runs the calculation.

    vcf_fpath -- path of the VCF file; opened once for the sequential
                 reader and once, by path, for the pyvcf reader.
    gq_threshold -- stored as 0 when None.
    dp_threshold -- stored as given.
    min_calls_for_pop_stats -- passed to the VCFReader.
    remarkable_coverages -- depths that get their own counter in
                            sample_dp_coincidence; REMARKABLE_DEPTHS when
                            None.
    window_size -- stored as given.
    '''
    # The given coverages used to be ignored and left the local name
    # unbound, so any explicit value failed with a NameError.
    if remarkable_coverages is None:
        remarkable_depths = REMARKABLE_DEPTHS
    else:
        remarkable_depths = remarkable_coverages
    self.remarkable_depths = remarkable_depths

    self._reader = VCFReader(open(vcf_fpath),
                             min_calls_for_pop_stats=min_calls_for_pop_stats)
    self._random_reader = pyvcfReader(filename=vcf_fpath)

    self.window_size = window_size
    self._gq_threshold = 0 if gq_threshold is None else gq_threshold
    self.dp_threshold = dp_threshold
    self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
    self._ac2d = _AlleleCounts2D()

    # Depth 1 always has a counter, besides the remarkable depths.
    self.sample_dp_coincidence = {1: IntCounter()}
    for cov in remarkable_depths:
        self.sample_dp_coincidence[cov] = IntCounter()

    self.called_snvs = 0
    self.called_gts = IntCounter()

    # sample_counter
    self._sample_counters = {}
    for counter_name in SAMPLE_COUNTERS:
        if counter_name not in self._sample_counters:
            self._sample_counters[counter_name] = {}
        for sample in self._reader.samples:
            # Depth and quality counters are split by genotype kind.
            if counter_name in (GT_DEPTHS, GT_QUALS):
                counters = {HOM: IntCounter(), HET: IntCounter()}
            else:
                counters = IntCounter()
            self._sample_counters[counter_name][sample] = counters

    self._snv_counters = {MAFS: IntCounter(),
                          MACS: IntCounter(),
                          MAFS_DP: IntCounter(),
                          SNV_QUALS: IntCounter(),
                          HET_IN_SNP: IntCounter(),
                          SNV_DENSITY: IntCounter(),
                          INBREED_F_IN_SNP: IntCounter(),
                          DEPTHS: IntCounter()}
    self._calculate()
def test_close_to_filter(self):
    '''CloseToSnv tagging of the second record for several settings.'''
    reader = VCFReader(open(FREEBAYES_VCF_PATH), min_calls_for_pop_stats=1)
    records = list(reader.parse_snvs())

    def run_filter(**filter_kwargs):
        # Apply a new 300 nucleotide filter to a copy of the second record.
        record = records[1].copy()
        close_filter = CloseToSnv(distance=300, **filter_kwargs)
        close_filter(record)
        return close_filter, record

    close_filter, record = run_filter(max_maf_depth=None)
    assert close_filter.name in record.filters

    close_filter, record = run_filter(max_maf_depth=0.5)
    assert record.filters is None

    close_filter, record = run_filter(max_maf_depth=0.8)
    assert close_filter.name in record.filters
    assert close_filter.name == 'cs300_0.80'
    expected_desc = ('The snv is closer than 300 nucleotides to another snv, '
                     'with maf:0.80')
    assert expected_desc in close_filter.description

    close_filter, record = run_filter(max_maf_depth=0.8, snv_type='snp')
    assert close_filter.name in record.filters
# Checks filter_calls_by_sample() on SNVs parsed from an inline VCF: keeping
# only sample NA00003 reduces the fifth record from 3 to 2 alleles and leaves
# a single call; reverse=True leaves two calls; an unknown sample name
# ('NA0003') must raise KeyError.
# The final assert looks at the snp_filtered assigned before the try block:
# when KeyError is raised the assignment inside the try never happens.
def test_filter_calls(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,. 20\t17330\t.\tT\tA\t3\tq10\tNS=3;DP=11;AF=0.017\tGT:GQ:DP:HQ\t0|0:49:3:58,50\t0|1:3:5:65,3\t0/0:41:3 20\t1110696\trs6040355\tA\tG,T\t67\tPASS\tNS=2;DP=10;AF=0.333,0.667;AA=T;DB\tGT:GQ:DP:HQ\t1|2:21:6:23,27\t2|1:2:0:18,2\t2/2:35:4 20\t1230237\t.\tT\t.\t47\tPASS\tNS=3;DP=13;AA=T\tGT:GQ:DP:HQ\t0|0:54:7:56,60\t0|0:48:4:51,51\t0/0:61:2 20\t1234567\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t0/1:35:4\t0/2:17:2\t1/1:40:3 20\t1234567\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t./.:35:4\t0/2:17:2\t1/1:40:3 ''' vcf = StringIO(VCF_HEADER + vcf) snps = list(VCFReader(vcf).parse_snvs()) snp = snps[4] assert len(snp.alleles) == 3 snp_filtered = snp.filter_calls_by_sample(samples=('NA00003',)) assert len(snp_filtered.alleles) == 2 snp = snps[1] snp_filtered = snp.filter_calls_by_sample(samples=('NA00003',)) assert len(snp_filtered.calls) == 1 snp = snps[1] snp_filtered = snp.filter_calls_by_sample(samples=('NA00003',), reverse=True) assert len(snp_filtered.calls) == 2 try: snp_filtered = snp.filter_calls_by_sample(samples=('NA0003',), reverse=True) self.fail("KeyError Expected") except KeyError: pass assert len(snp_filtered.calls) == 2
# Checks genotype_counts, genotype_freqs, biallelic_genotype_counts and
# biallelic_genotype_freqs of SNVs parsed from an inline VCF that is read with
# min_calls_for_pop_stats=1. The record whose three calls are all '.' is
# expected to report genotype_counts as None.
def test_genotype_freq(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT N1 N2 N3 20\t1\t.\tG\tA\t2\tq1\tNS=3\tGT\t0|0\t1|0\t1/1 20\t2\t.\tT\tA\t3\tq1\tNS=3\tGT\t0|0\t0|0\t0/. 20\t3\t.\tT\tA\t3\tq1\tNS=3\tGT\t1|1\t1|1\t./. 20\t4\t.\tT\tA\t3\tq1\tNS=3\tGT\t.\t.\t. 20\t5\t.\tT\tA\t3\tq1\tNS=3\tGT\t0|2\t1|1\t1/. 20\t5\t.\tT\tA\t3\tq1\tNS=3\tGT\t0|1\t1|0\t1/. 20\t5\t.\tT\tA\t3\tq1\tNS=3\tGT\t0|0\t0|0\t1/. 20\t5\t.\tT\tA\t3\tq1\tNS=3\tGT\t1|1\t0|0\t1/0 ''' vcf = StringIO(VCF_HEADER2 + vcf) snps = list(VCFReader(vcf, min_calls_for_pop_stats=1).parse_snvs()) assert snps[0].genotype_counts == {(0, 0): 1, (0, 1): 1, (1, 1): 1} assert snps[1].genotype_counts == {(0, 0): 2, (None, 0): 1} assert snps[2].genotype_counts == {(1, 1): 2} assert snps[3].genotype_counts is None self.assertAlmostEqual(snps[0].genotype_freqs[(0, 0)], 0.33333, 4) assert snps[4].biallelic_genotype_counts == (1, 0, 1) assert snps[5].biallelic_genotype_counts == (0, 2, 0) assert snps[6].biallelic_genotype_counts == (2, 0, 0) assert snps[7].biallelic_genotype_counts == (1, 1, 1) self.assertAlmostEqual(snps[7].biallelic_genotype_freqs[0], 0.33333, 4)
# Checks _calculate_r_sqr and _fisher_exact against fixed HaploCount inputs
# (labelled "r examples" by the original author) and then the fisher and r_sqr
# values that calculate_ld_stats returns for two SNVs parsed from an inline
# VCF.
def test_r_example(self): # r examples self.assertAlmostEqual(_calculate_r_sqr(HaploCount(10, 10, 10, 10)), 0) self.assertAlmostEqual(_fisher_exact(HaploCount(10, 10, 10, 10)), 1) self.assertAlmostEqual(_calculate_r_sqr(HaploCount(10, 0, 0, 10)), 1) self.assertAlmostEqual(_calculate_r_sqr(HaploCount(441, 13, 111, 435)), 0.591332576) self.assertAlmostEqual(_fisher_exact(HaploCount(6, 6, 2, 6)), 0.3728506787) self.assertAlmostEqual(_fisher_exact(HaploCount(1, 0, 5, 7)), 0.4615385) self.assertAlmostEqual(_fisher_exact(HaploCount(5, 23, 1, 20)), 0.219157345) vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t./.\t 20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t3/3\t3/3\t3/3\t2/2\t2/2\t3/3\t3/3\t''' vcf = StringIO(VCF_HEADER + vcf) snps = list(VCFReader(vcf).parse_snvs()) ld_stats = calculate_ld_stats(snps[0], snps[1]) self.assertAlmostEqual(ld_stats.fisher, 0.39999999999) self.assertAlmostEqual(ld_stats.r_sqr, 0.49999999)
def _create_reader_from_snv(snv):
    '''It returns a new VCFReader over the file the given snv came from.

    The file is reopened through the name of the original reader's fhand and
    the original min_calls_for_pop_stats setting is kept.
    '''
    source = snv.reader
    return VCFReader(open(source.fhand.name),
                     min_calls_for_pop_stats=source.min_calls_for_pop_stats)
# Runs the filter_vcf_by_ld binary on a temporary VCF file with
# --no_bonferroni_correction and --p_val 0.03 and expects 3 SNVs in the output
# file. A second run adds -l and expects the word 'filtered' in the log file.
# stderr is captured and discarded; the exit status is not checked.
def test_binary(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8 20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t703\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t2003\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t ''' fhand = NamedTemporaryFile() fhand.write(VCF_HEADER + vcf) fhand.flush() out_fhand = NamedTemporaryFile() binary = join(VCF_BIN_DIR, 'filter_vcf_by_ld') cmd = [binary, '-o', out_fhand.name, fhand.name, '--no_bonferroni_correction', '--p_val', '0.03'] process = Popen(cmd, stderr=PIPE) process.communicate() assert len(list(VCFReader(open(out_fhand.name)).parse_snvs())) == 3 log_fhand = NamedTemporaryFile() binary = join(VCF_BIN_DIR, 'filter_vcf_by_ld') cmd = [binary, '-o', out_fhand.name, fhand.name, '--no_bonferroni_correction', '--p_val', '0.03', '-l', log_fhand.name] process = Popen(cmd, stderr=PIPE) process.communicate() assert 'filtered' in open(log_fhand.name).read()
# Checks CapEnzyme(all_enzymes=True) against a two sequence FASTA reference
# written to a temporary file: the filter is named 'cet', its description
# mentions "all" enzymes, the first record is not tagged and the second one is.
def test_cap_enzyme_filter(self): seq_str = '>seq1\nATGATGATGgaaattcATGATGATGTGGGAT\n' seq_str += '>seq2\nATGATGATGATGATGATGTGGGAT\n' fhand = NamedTemporaryFile() fhand.write(seq_str) fhand.flush() vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 seq1\t11\trs6054257\tAA\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,. seq2\t12\t.\tA\tAA\t3\tq10\tNS=3;DP=11;AF=0.017\tGT:GQ:DP:HQ\t0|0:49:3:58,50\t0|1:3:5:65,3\t0/0:41:3 20\t1110696\trs6040355\tA\tG,T\t67\tPASS\tNS=2;DP=10;AF=0.333,0.667;AA=T;DB\tGT:GQ:DP:HQ\t1|2:21:6:23,27\t2|1:2:0:18,2\t2/2:35:4 20\t1230237\t.\tT\t.\t47\tPASS\tNS=3;DP=13;AA=T\tGT:GQ:DP:HQ\t0|0:54:7:56,60\t0|0:48:4:51,51\t0/0:61:2 20\t1234567\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t0/1:35:4\t0/2:17:2\t1/1:40:3 20\t1234567\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t./.:35:4\t0/2:17:2\t1/1:40:3 ''' vcf = StringIO(VCF_HEADER + vcf) snps = list(VCFReader(vcf).parse_snvs()) filter_ = CapEnzyme(all_enzymes=True, ref_fpath=fhand.name) assert filter_.name == 'cet' desc = 'SNV is not a CAP detectable by the enzymes: all' assert desc in filter_.description rec1 = snps[0].copy() filter_(rec1) assert filter_.name not in rec1.filters rec1 = snps[1].copy() filter_(rec1) assert filter_.name in rec1.filters
def test_bin(self):
    '''The filter_vcf_by_weird_segregation binary: help text and a real run.'''
    binary = join(VCF_BIN_DIR, 'filter_vcf_by_weird_segregation')

    # The help is expected on stdout.
    help_process = Popen([binary, '-h'], stderr=PIPE, stdout=PIPE)
    help_text = help_process.communicate()[0]
    assert 'usage' in help_text

    vcf_fpath = os.path.join(TEST_DATA_DIR, 'scaff000025.vcf.gz')
    binary = join(VCF_BIN_DIR, 'filter_vcf_by_weird_segregation')
    samples = ['1_14_1_gbs', '1_17_1_gbs', '1_18_4_gbs', '1_19_4_gbs',
               '1_26_1_gbs', '1_27_1_gbs', '1_2_2_gbs', '1_35_13_gbs',
               '1_3_2_gbs', '1_50_1_gbs', '1_59_1_gbs', '1_63_4_gbs',
               '1_6_2_gbs', '1_70_1_gbs', '1_74_1_gbs', '1_79_1_gbs',
               '1_7_2_gbs', '1_81_10_gbs', '1_86_1_gbs', '1_8_2_gbs',
               '1_91_2_gbs', '1_94_4_gbs', '2_107_1_gbs', '2_10_2_gbs',
               '2_116_1_gbs', '2_11_1_gbs', '2_125_2_gbs', '2_13_1_gbs']
    # Every sample goes in its own -s option, the VCF path comes last.
    cmd = [binary, '-n', '2', '-m', '200']
    for sample in samples:
        cmd.extend(['-s', sample])
    cmd.append(vcf_fpath)

    run_process = Popen(cmd, stderr=PIPE, stdout=PIPE)
    stdout, stderr = run_process.communicate()
    filtered = list(VCFReader(StringIO(stdout)).parse_snvs())
    assert len(filtered) == 273
    assert 'SNVs processed:' in stderr
# Checks that the calls './.' and '1/.' are parsed as int_alleles [] and
# [1, None], that num_called is 7 for the eight sample record, and that
# VCFWriter writes those two calls back as './.' and '1/.'.
def test_het_unknown(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8 20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t./.\t1/.\t ''' vcf = StringIO(VCF_HEADER + vcf) reader = VCFReader(vcf) snps = list(reader.parse_snvs()) snp = snps[0] expected = [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [], [1, None]] assert [call.int_alleles for call in snps[0].calls] == expected assert snp.num_called == 7 out_fhand = StringIO() writer = VCFWriter(out_fhand, reader) for snv in snps: writer.write_snv(snv) assert '1/1\t./.\t1/.' in out_fhand.getvalue()
def test_mafs(self):
    '''Check maf_depth, allele_depths, depth, maf and mac on the test VCFs.

    NOTE(review): the "value - expected < tolerance" assertions are one-sided
    (there is no abs()), so they also pass for any value below the expected
    one. They are kept as they are to preserve what the test accepts.
    '''

    def first_snv(fname):
        # First SNV of a VCF stored in the test data directory.
        vcf_fhand = open(join(TEST_DATA_DIR, fname))
        caller_reader = VCFReader(fhand=vcf_fhand)
        return list(caller_reader.parse_snvs())[0]

    al_depth_fhand = open(join(TEST_DATA_DIR, 'freebayes_al_depth.vcf'))
    snvs = list(VCFReader(al_depth_fhand).parse_snvs())

    assert snvs[0].maf_depth - 0.5 < 0.001
    assert snvs[0].allele_depths == {0: 1, 1: 1}
    assert snvs[0].depth == 2

    assert snvs[1].maf_depth - 1.0 < 0.001
    assert snvs[1].allele_depths == {0: 2, 1: 0}

    assert snvs[4].maf_depth - 0.9890 < 0.001
    assert snvs[4].allele_depths == {0: 90, 1: 1}
    assert snvs[4].depth == 91
    expected_call_mafs = [1, 1, 1, 1, 1, 0.944444]
    for call, expected in zip(snvs[4].calls, expected_call_mafs):
        assert call.maf_depth - expected < 0.001

    assert snvs[0].mac
    # With at least 3 calls required the first SNV has no maf.
    snvs[0].min_calls_for_pop_stats = 3
    assert snvs[0].maf is None
    snvs[3].min_calls_for_pop_stats = 3
    assert snvs[3].maf - 0.75 < 0.0001
    snvs[4].min_calls_for_pop_stats = 3
    assert snvs[4].maf - 1.0 < 0.0001
    assert snvs[0].mac == 2

    # varscan
    varscan_snv = first_snv('sample.vcf.gz')
    varscan_snv.min_calls_for_pop_stats = 1
    assert varscan_snv.maf_depth is None

    # gatk
    gatk_snv = first_snv('gatk_sample.vcf.gz')
    assert 0.7 < gatk_snv.maf_depth < 0.72
    assert 0.7 < gatk_snv.get_call('hib_amarillo').maf_depth < 0.72

    # freebayes
    freebayes_snv = first_snv('freebayes_sample.vcf.gz')
    assert 0.99 < freebayes_snv.maf_depth < 1.01
    assert 0.99 < freebayes_snv.get_call('pep').maf_depth < 1.01
def test_rqtl_writer(self):
    '''RQTLWriter output for the class VCF matches the expected text.'''
    vcf_text = unicode(self.VCF_HEADER + self.vcf)
    parsed = list(VCFReader(StringIO(vcf_text)).parse_snvs())

    out_fhand = StringIO()
    rqtl_writer = RQTLWriter(out_fhand,
                             phys_to_genet_dist=DEF_PHYS_TO_GENET_DIST)
    for snv in parsed:
        rqtl_writer.write(snv)

    assert out_fhand.getvalue() == self.expected
# Checks that _count_biallelic_haplotypes returns None when the first of the
# two SNVs has only './.' calls.
def test_empy_snv(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t./.\t./.\t./.\t./.\t./.\t./. 20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1''' vcf = StringIO(VCF_HEADER + vcf) snps = list(VCFReader(vcf).parse_snvs()) call1 = snps[0].record.samples call2 = snps[1].record.samples counts = _count_biallelic_haplotypes(call1, call2) assert counts is None
# Runs filter_snvs_by_ld (p_val=0.03, bonferroni=False) over four SNVs with
# identical genotypes and expects all four back. The expected pos values
# (1, 702, 2002, 2402) are one less than the POS column of the input VCF.
def test_check_backwards(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8 20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t703\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t2003\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t2403\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t ''' vcf = StringIO(VCF_HEADER + vcf) snps = VCFReader(vcf).parse_snvs() snvs = filter_snvs_by_ld(snps, p_val=0.03, bonferroni=False) assert [s.pos for s in snvs] == [1, 702, 2002, 2402]
# Checks _calc_recomb_rate on pairs of record.samples taken from an inline
# VCF, for the population types 'ril_self' and 'test_cross'. The pair that
# uses the fourth record (first five calls are './.') is expected to give
# None. A second, wider inline VCF is then used for one more 'ril_self' value.
def test_recomb_rate(self): # samples vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8 20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t4\t.\tG\tA\t29\tPASS\tNS=3\tGT\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t 20\t6\t.\tG\tA\t29\tPASS\tNS=3\tGT\t./.\t./.\t./.\t./.\t./.\t0/1\t0/1\t0/1\t 21\t4\t.\tG\tA\t29\tPASS\tNS=3\tGT\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t ''' vcf = StringIO(VCF_HEADER + vcf) snps = list(VCFReader(vcf).parse_snvs()) recomb = _calc_recomb_rate(snps[0].record.samples, snps[1].record.samples, 'ril_self') self.assertAlmostEqual(recomb, 0.0, 3) recomb = _calc_recomb_rate(snps[0].record.samples, snps[2].record.samples, 'ril_self') self.assertAlmostEqual(recomb, 0.375, 3) recomb = _calc_recomb_rate(snps[0].record.samples, snps[2].record.samples, 'test_cross') self.assertAlmostEqual(recomb, 0.5, 3) recomb = _calc_recomb_rate(snps[0].record.samples, snps[3].record.samples, 'test_cross') assert recomb is None vcf = 
'''#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t1_14_1_gbs\t1_17_1_gbs\t1_18_4_gbs\t1_19_4_gbs\t1_26_1_gbs\t1_27_1_gbs1_2_2_gbs\t1_35_13_gbs\t1_3_2_gbs\t1_50_1_gbs\t1_59_1_gbs\t1_63_4_gbs\t1_6_2_gbs\t1_70_1_gbs\t1_74_1_gbs\t1_79_1_gbs\t1_7_2_gbs\t1_81_10_gbs\t1_86_1_gbs\t1_8_2_gbs\t1_91_2_gbs\t1_94_4_gbs\t2_107_1_gbs\t2_10_2_gbs\t2_116_1_gbs\t2_11_1_gbs\t2_125_2_gbs\t2_13_1_gbs\t2_16_3_gbs\t2_21_1_gbs\t2_22A_1_gbs\t2_24_2_gbs\t2_28_2_gbs\t2_31_2_gbs\t2_33_1_gbs\t2_39_3_gbs\t2_43_1_gbs2_5_1_gbs\t2_64_7_gbs\t2_67_2_gbs\t2_6_4_gbs\t2_84_2_gbs\t2_8_3_gbs\t2_95_2_gbs\t4_100B_4_gbs\t4_108_10_gbs\t4_110_11_gbs\t4_111_6_gbs\t4_115B_2_gbs\t4_11B_3_gbs\t4_123B_2_gbs\t4_127_6_gbs\t4_131_1_gbs\t4_136B_3_gbs\t4_136_10_T1_gbs\t4_138B_2_gbs\t4_26_11_gbs\t4_28_4_gbs\t4_33_2_gbs\t4_35_1_gbs\t4_38_2_gbs\t4_39_2_gbs\t4_41B_2_gbs\t4_42_11_gbs\t4_45_2_gbs\t4_53_2_gbs\t4_5_5_gbs\t4_62_4_gbs\t4_64B_1_gbs\t4_65_5_gbs\t4_66_2_gbs\t4_71_2_gbs\t4_72_1_gbs\t4_77_1_gbs\t4_7B_1_gbs\t4_7_2_gbs\t4_81B_2_gbs\t4_82B_4_gbs\t4_85_1_gbs\t4_95_1_gbs\t4_9_1_gbs\t5_14B_1_gbs\t5_15B_1_gbs\t5_18_1_gbs\t5_22_2_gbs\t5_24_2_gbs\t5_25_2_gbs\t5_32_3_gbs\t5_33B_4_gbs\t5_34B_2_gbs\t5_3_1_gbs\t5_40B_2_gbs\t5_49B_2_T1_gbs\t5_57_1_gbs\t5_58_1_gbs\t5_66_1_gbs\t5_80B_2_gbs\tMU_16_5_gbs\tV_196_2_gbs\t1\t2 s7\t4039693\tS7_4039693\tT\tG\t.\tPASS\tIV0=F\tGT\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t./.\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1 
s7\t4028261\tS7_4028261\tC\tT\t.\tPASS\tIV0=F\tGT\t1/1\t1/1\t./.\t0/0\t1/1\t0/0\t./.\t0/0\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t0/0\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t1/1\t0/0\t1/1\t1/1\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t./.\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/1\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0 ''' vcf = StringIO(VCF_HEADER + vcf) reader = VCFReader(vcf) snps = list(reader.parse_snvs()) recomb = _calc_recomb_rate(snps[0].record.samples, snps[1].record.samples, 'ril_self') self.assertAlmostEqual(recomb, 0.8187, 3)
# Checks the ids of parsed SNVs: a single id, an 'id2;id3' field split into
# two ids and a '.' field giving falsy ids. get_or_create_id() is expected to
# return the first id when there is one and 'snp_20_14372' for the record
# without id when called with prefix='snp_'.
def test_id(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT N1 20\t14370\tid1\tG\tA\t29\tPASS\tH2\tGT:GQ:DP:HQ\t0|0:48:1:51,51 20\t14371\tid2;id3\tG\tA\t29\tPASS\tH2\tGT:GQ:DP:HQ\t0|0:48:1:51,51 20\t14372\t.\tG\tA\t29\tPASS\tH2\tGT:GQ:DP:HQ\t0|0:48:1:51,51''' vcf = StringIO(VCF_HEADER + vcf) snps = list(VCFReader(vcf).parse_snvs()) assert snps[0].ids == ['id1'] assert snps[1].ids == ['id2', 'id3'] assert not snps[2].ids assert snps[1].get_or_create_id() == 'id2' assert snps[2].get_or_create_id(prefix='snp_') == 'snp_20_14372'
def test_allele_depths(self):
    '''Per call ref_depth and allele_depths of the first freebayes SNV.'''
    vcf_fhand = open(join(TEST_DATA_DIR, 'freebayes_al_depth.vcf'))
    first_snv = list(VCFReader(vcf_fhand).parse_snvs())[0]

    # None: the call has no depths; otherwise (ref depth, depth of allele 1).
    expected_depths = [None, None, (1, 0), None, None, (0, 1)]
    for call, expected in zip(first_snv.calls, expected_depths):
        if expected is not None:
            ref_depth, alt_depth = expected[0], expected[1]
            assert call.ref_depth == ref_depth
            assert call.allele_depths[1] == alt_depth
            continue
        assert call.ref_depth is None
        assert not call.allele_depths
# Checks that LowEvidenceAlleleFilter(0.99) raises RuntimeError for a SNV
# whose calls only carry the GT field (read with min_calls_for_pop_stats=4).
def test_no_allele_depths(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1''' vcf = StringIO(VCF_HEADER2 + vcf) snps = list(VCFReader(vcf, min_calls_for_pop_stats=4).parse_snvs()) filter_ = LowEvidenceAlleleFilter(0.99) try: snps = [filter_(snp) for snp in snps] self.fail('RuntimeError expected') except RuntimeError: pass
def test_hetegorigot_percent(self):
    '''HeterozigoteInSamples info values for several configurations.'''
    annotator = HeterozigoteInSamples(filter_id=1)
    reader = VCFReader(open(FREEBAYES4_VCF_PATH), min_calls_for_pop_stats=1)
    records = list(reader.parse_snvs())

    def info_value(annotator, record):
        # Annotate a copy of the record and return what was stored for it.
        snv = record.copy()
        annotator(snv)
        return snv.infos[annotator.info_id]

    def find_record(pos, chrom):
        # First record at the given position, or None when there is none.
        for record in records:
            if record.pos == pos and record.chrom == chrom:
                return record
        return None

    assert info_value(annotator, records[0]) == 'True'

    annotator = HeterozigoteInSamples(filter_id=1, gq_threshold=30,
                                      min_num_called=3)
    assert info_value(annotator, records[1]) == 'None'

    annotator = HeterozigoteInSamples(filter_id=1, gq_threshold=50,
                                      min_num_called=8)
    assert info_value(annotator, records[0]) == 'None'

    annotator = HeterozigoteInSamples(filter_id=1, gq_threshold=30,
                                      min_num_called=3,
                                      min_percent_het_gt=30)
    # The first record is annotated here without checking the result.
    annotator(records[0].copy())
    target = find_record(272668159, 'Pepper.v.1.55.chr01')
    if target is not None:
        assert info_value(annotator, target) == 'False'

    annotator = HeterozigoteInSamples(filter_id=1, gq_threshold=30,
                                      min_num_called=3,
                                      min_percent_het_gt=30,
                                      samples=['sample05_gbs',
                                               'sample06_gbs',
                                               'sample07_gbs'])
    target = find_record(228123401, 'Pepper.v.1.55.chr10')
    if target is not None:
        assert info_value(annotator, target) == 'None'
def run_genotype_filters(in_fhand, out_fhand, gt_filters, plots_dir=None,
                         reader_kwargs=None):
    """It applies the genotype filters to every SNV and writes the result.

    in_fhand -- uncompressed file handle with the input VCF.
    out_fhand -- handle that receives the processed SNVs.
    gt_filters -- re-iterable collection of callables; each one takes a SNV
                  and returns a SNV. They are applied in order.
    plots_dir -- accepted for interface compatibility; not used here.
    reader_kwargs -- optional dict with extra keyword arguments for
                     VCFReader. It is copied, never modified.

    Writing stops silently when the output pipe has been closed; any other
    IOError is re-raised.
    """
    import errno

    # Work on a copy so the dict owned by the caller is not mutated.
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
    reader_kwargs["filename"] = "pyvcf_bug_workaround"
    reader_kwargs["compressed"] = False
    reader = VCFReader(in_fhand, **reader_kwargs)

    templa_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=templa_reader)

    for snv in reader.parse_snvs():
        for mapper in gt_filters:
            snv = mapper(snv)
        try:
            writer.write_snv(snv)
        except IOError as error:
            # The pipe could be already closed. errno is checked first
            # because the message text depends on the locale.
            if error.errno == errno.EPIPE or "Broken pipe" in str(error):
                break
            raise
# Checks remove_gt_from_het_calls(): the heterozygous call (1|0) ends up with
# an empty int_alleles list while the two homozygous calls keep their alleles.
def test_het_filter(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,. ''' in_fhand = StringIO(VCF_HEADER + vcf) snps = list(VCFReader(in_fhand).parse_snvs()) exp = [[0, 0], [1, 0], [1, 1]] assert [call.int_alleles for call in snps[0].calls] == exp res = [ call.int_alleles for call in snps[0].remove_gt_from_het_calls().calls ] assert res == [[0, 0], [], [1, 1]]
# Runs filter_snvs_by_ld with p_val=0.001, bonferroni=False and snv_win=3 over
# five SNVs with identical genotypes and expects no SNV to be returned.
def test_cache(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8 20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t703\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t2003\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t3003\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t 20\t3403\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t ''' vcf = StringIO(VCF_HEADER + vcf) snps = VCFReader(vcf).parse_snvs() snvs = filter_snvs_by_ld(snps, p_val=0.001, bonferroni=False, snv_win=3) assert not list(snvs)
# Checks calc_snv_read_pos_stats and calc_snv_read_pos_stats2 on seqs.bam with
# the SNVs of an inline VCF; both are expected to give the same 5' and 3' read
# position counters. The stats of the second call are finally drawn to a
# temporary png with draw_read_pos_stats.
# The inline VCF is given to VCFReader as it is, with no header prepended.
# NOTE(review): the counters are compared through repr(), which depends on the
# iteration order of the underlying containers -- confirm that it is stable.
def test_snv_read_pos_distrib(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 reference1\t187\trs6054257\tAA\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,. reference1\t210\t.\tA\tAA\t3\tq10\tNS=3;DP=11;AF=0.017\tGT:GQ:DP:HQ\t0|0:49:3:58,50\t0|1:3:5:65,3\t0/0:41:3 reference1\t215\trs6040355\tA\tG,T\t67\tPASS\tNS=2;DP=10;AF=0.333,0.667;AA=T;DB\tGT:GQ:DP:HQ\t1|2:21:6:23,27\t2|1:2:0:18,2\t2/2:35:4 reference1\t230\t.\tT\t.\t47\tPASS\tNS=3;DP=13;AA=T\tGT:GQ:DP:HQ\t0|0:54:7:56,60\t0|0:48:4:51,51\t0/0:61:2 reference2\t350\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t0/1:35:4\t0/2:17:2\t1/1:40:3 reference2\t400\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t./.:35:4\t0/2:17:2\t1/1:40:3 ''' snvs = VCFReader(StringIO(vcf)).parse_snvs() bam_fpath = join(TEST_DATA_DIR, 'seqs.bam') sam = pysam.AlignmentFile(bam_fpath) stats = calc_snv_read_pos_stats(sam, snvs) assert 'group1+454' in stats['5_read_pos_counts'].keys() assert '5_read_pos_boxplot' in stats assert '3_read_pos_boxplot' in stats assert repr( stats['5_read_pos_counts'] ) == """{'group1+454': IntCounter({24: 9, 1: 9, 44: 9, 29: 9}), 'group2+454': IntCounter({11: 9, 61: 6, 59: 3})}""" assert repr( stats['3_read_pos_counts'] ) == """{'group1+454': IntCounter({73: 9, 50: 9, 45: 9, 30: 9}), 'group2+454': IntCounter({14: 6, 64: 3, 65: 3, 62: 3, 15: 3})}""" snvs = VCFReader(StringIO(vcf)).parse_snvs() bam_fpath = join(TEST_DATA_DIR, 'seqs.bam') sam = pysam.AlignmentFile(bam_fpath) stats = calc_snv_read_pos_stats2(sam, snvs) assert 'group1+454' in stats['5_read_pos_counts'].keys() assert '5_read_pos_boxplot' in stats assert '3_read_pos_boxplot' in stats assert repr( stats['5_read_pos_counts'] ) == """{'group1+454': IntCounter({24: 9, 1: 9, 44: 9, 29: 9}), 'group2+454': IntCounter({11: 9, 61: 6, 59: 3})}""" assert repr( stats['3_read_pos_counts'] ) == """{'group1+454': IntCounter({73: 9, 50: 9, 45: 9, 30: 9}), 
'group2+454': IntCounter({14: 6, 64: 3, 65: 3, 62: 3, 15: 3})}""" fhand = NamedTemporaryFile(suffix='.png') draw_read_pos_stats(stats, fhand)
# Applies LowEvidenceAlleleFilter(0.99) to two six sample SNVs read with the
# default reader settings. All 12 calls are expected to be uncalled afterwards
# and the filter log to count the 12 of them as 'not_enough_individuals'.
def test_no_geno_no_alle_freq(self): vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t./.\t./.\t./.\t./.\t./.\t./. 20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1''' vcf = StringIO(VCF_HEADER + vcf) snps = list(VCFReader(vcf).parse_snvs()) filter_ = LowEvidenceAlleleFilter(0.99) snps = [filter_(snp) for snp in snps] expected = [False] * 12 res = [call.called for snp in snps for call in snp.calls] assert filter_.log == {'not_enough_individuals': 12, 'tot': 12} assert expected == res
def test_complexity_filter(self):
    '''The first two records are not tagged as low complexity regions.

    It also checks that the annotator is able to draw its histogram
    into a temporary png file.
    '''
    # The SNVs are loaded into a list, so the file can be closed as soon
    # as the parsing is done
    with open(VCF_PATH) as vcf_fhand:
        records = list(VCFReader(vcf_fhand).parse_snvs())

    low_complexity = LowComplexityRegionAnnotator(ref_fpath=REF_PATH)

    # The first record is annotated on a copy
    snp = records[0].copy()
    low_complexity(snp)
    assert low_complexity.name not in snp.filters

    # The second one is annotated in place
    snp1 = records[1]
    low_complexity(snp1)
    assert low_complexity.name not in snp1.filters

    fhand = NamedTemporaryFile(suffix='.png')
    low_complexity.draw_hist(fhand)
def test_amino_change_filter(self):
    '''The amino acid change annotators tag a SNV located inside an ORF.

    The reference and its ORF are written to temporary fasta files.  The
    SNV at position 112 is expected to be tagged by both
    AminoChangeAnnotator, that should report the change 'C->R' in the
    AAC info field, and AminoSeverityChangeAnnotator.
    '''
    # fasta is a line oriented format: the header and the sequence have
    # to be in different lines
    seq_ref = """>SEUC00016_TC01
CACGCTAAACAACGATCATTGTCATCGGTACCGATTGTTACAAGTTGTGTGCAGTGTCGT
GCTATTTGTGTGTACATTCCTTCTAAGATGTCGTCAACAAAGTGGTTGGTGTGTGCGCTA
GTGGTGGTGTGCGTGAGCGTAAGGCAAGCAACATCTGCGCCGGCGCCGCAGGAACAAGAA
TACCCGCCTATGCCCTACGAGTACAAATATGACGTTGAAGATCAAGAGCTTGAAGAGAAA
GCTCTCTACTTCGGAGCCAACGAAGCAGGAGATGCCCAGGGCAAGGTCATCGGAGGATAC
CGAGTTCTCCTCCCCGATGGTCGTCTTATGACCGTCGAGTACAGTGTGGAGGGAGAAAGC
GGTTTCGTTCCCAAAATCACCTTCGAAGACAACGCCAGCCCCTTCGGCAAAGGAAAGTAG
ACCTTATAACGACGCCTACAAGACTGGTACCGCGATCAATTGATACTAGTTCAATTTGAT
TTCTGAATTCTATGCCGTAAAACATTTTCTTTTATTAATTATACCGATTTCGATAAATAG
ACATCTTTACCTACTTAACGAATTTCTCATAGGATTCAGAAGTCGAAACCGAAAAAAGTT
ACTTCAGTTTTCATTAGATTGTAAATGTGTGTAAATTATTATTATTATTATATCAGGGAT
CCTTAAGTTGATATTAGTGGTGATATAAACGATATTTATGAACGACAATCAGGTATCGTC
ACTGGCTTGAGTAATGTTAGAAAAAATATAATTTTACCGAAAGCATTAGTAACTTTTTTC
ACGATTATAATCTCCCATACATACTGTATACTTACGTTACGTATAATAATTTTGATTGTC
TTCATAGTGTACTCTATAATATATGTAGGTGTAGGCAAAACTCATTCGCCAATAAGATAA
TATGTACAGTCAGCGATTTCTAAGATAAATTTGTACCGCAAATATCGAGTTACCGATACT
GTGATCAATTAGAACG"""
    orf_seq = '''>SEUC00016_TC01_orf_seq start=89 end=421 strand=forward
ATGTCGTCAACAAAGTGGTTGGTGTGTGCGCTAGTGGTGGTGTGCGTGAGCGTAAGGCAAGCAACATCTGCGC
CGGCGCCGCAGGAACAAGAATACCCGCCTATGCCCTACGAGTACAAATATGACGTTGAAGATCAAGAGCTTGAA
GAGAAAGCTCTCTACTTCGGAGCCAACGAAGCAGGAGATGCCCAGGGCAAGGTCATCGGAGGATACCGAGTTCT
CCTCCCCGATGGTCGTCTTATGACCGTCGAGTACAGTGTGGAGGGAGAAAGCGGTTTCGTTCCCAAAATCACCT
TCGAAGACAACGCCAGCCCCTTCGGCAAAGGAAAGTAG'''
    # VCF is also line oriented: one record per line
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
SEUC00016_TC01\t112\trs6054257\tT\tC\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.
'''

    ref_fhand = NamedTemporaryFile()
    orf_fhand = NamedTemporaryFile()
    try:
        # The annotators get file paths, so the sequences have to be
        # flushed to disk before they are used
        ref_fhand.write(seq_ref)
        ref_fhand.flush()
        orf_fhand.write(orf_seq)
        orf_fhand.flush()

        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())

        annotator = AminoChangeAnnotator(ref_fpath=ref_fhand.name,
                                         orf_seq_fpath=orf_fhand.name)
        snv = snps[0].copy()
        annotator(snv)
        assert annotator.name in snv.filters
        assert snv.infos['AAC'] == 'C->R'

        annotator = AminoSeverityChangeAnnotator(
            ref_fpath=ref_fhand.name, orf_seq_fpath=orf_fhand.name)
        record = snps[0].copy()
        annotator(record)
        assert annotator.name in record.filters
    finally:
        # Closing the temporary files also removes them
        ref_fhand.close()
        orf_fhand.close()
def __init__(self, vcf_fpath, gq_threshold=None, dp_threshold=100,
             min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
             remarkable_coverages=None, window_size=WINDOWS_SIZE):
    '''It creates all the counters and fills them by reading the VCF.

    vcf_fpath: path to the VCF file. It is opened twice, once to go
        through all the SNVs and once with pyvcfReader to fetch the
        SNVs located in a region.
    gq_threshold: stored as the genotype quality threshold. None is
        taken as 0.
    dp_threshold: stored as the depth threshold.
    min_calls_for_pop_stats: it is passed to the VCFReader.
    remarkable_coverages: depths for which a sample coincidence counter
        is created. REMARKABLE_DEPTHS is used when it is None.
    window_size: stored as the window size.
    '''
    # remarkable_depths used to be bound only when no value was given,
    # so passing remarkable_coverages raised an UnboundLocalError
    if remarkable_coverages is None:
        remarkable_depths = REMARKABLE_DEPTHS
    else:
        remarkable_depths = remarkable_coverages
    self.remarkable_depths = remarkable_depths

    self._reader = VCFReader(open(vcf_fpath),
                             min_calls_for_pop_stats=min_calls_for_pop_stats)
    self._random_reader = pyvcfReader(filename=vcf_fpath)

    self.window_size = window_size
    self._gq_threshold = 0 if gq_threshold is None else gq_threshold
    self.dp_threshold = dp_threshold
    self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
    self._ac2d = _AlleleCounts2D()

    # Depth 1 always has a counter, besides the remarkable depths
    self.sample_dp_coincidence = {1: IntCounter()}
    for cov in remarkable_depths:
        self.sample_dp_coincidence[cov] = IntCounter()

    self.called_snvs = 0
    self.called_gts = IntCounter()

    # sample_counter
    self._sample_counters = {}
    for counter_name in SAMPLE_COUNTERS:
        if counter_name not in self._sample_counters:
            self._sample_counters[counter_name] = {}
        for sample in self._reader.samples:
            # Depths and qualities are kept apart for HOM and HET
            if counter_name in (GT_DEPTHS, GT_QUALS):
                counters = {HOM: IntCounter(), HET: IntCounter()}
            else:
                counters = IntCounter()
            self._sample_counters[counter_name][sample] = counters

    self._snv_counters = {MAFS: IntCounter(),
                          MACS: IntCounter(),
                          MAFS_DP: IntCounter(),
                          SNV_QUALS: IntCounter(),
                          HET_IN_SNP: IntCounter(),
                          SNV_DENSITY: IntCounter(),
                          INBREED_F_IN_SNP: IntCounter(),
                          DEPTHS: IntCounter()}
    self._calculate()
def test_header(self):
    '''The reader exposes the 24 lines of the header of sample.vcf.gz.'''
    # The file is closed as soon as the header has been read
    with open(join(TEST_DATA_DIR, 'sample.vcf.gz')) as varscan:
        header = VCFReader(varscan).header
    assert '##fileformat=VCFv4.1' in header
    assert '#CHROM' in header
    assert len(header.split('\n')) == 24
def test_sliding_window(self):
    '''It checks the SNVs found in the windows of sliding_windows.

    The windows are created with and without a reference fasta, with
    different window sizes, minimum number of SNVs and step.  For every
    case the positions of the SNVs found in the first windows are
    compared with the expected ones.
    '''
    vcf_fpath = join(TEST_DATA_DIR, 'sample_to_window.vcf.gz')
    ref = '>CUUC00007_TC01\nCTGATGCTGATCGTGATCGAGTCGTAGTCTAGTCGATGTCGACG\n'
    ref += '>CUUC00029_TC01\nCTGATGCTGATCGTGATCGAGTCGTAGTCTAGTCGATGTCGAA\n'

    def snp_positions(windows, num_windows):
        'It returns the SNV positions of the first num_windows windows'
        return [[snp.pos for snp in window['snps']]
                for window in windows[:num_windows]]

    def windows_with_ref(**kwargs):
        'It returns the windows created with a new reader and the fasta'
        with open(vcf_fpath) as fhand:
            reader = VCFReader(fhand=fhand)
            return list(reader.sliding_windows(ref_fhand=StringIO(ref),
                                               **kwargs))

    # snps in this vcf [9, 19, 29, 0, 11, 20]
    # without fasta, the same reader is used for both window sizes
    with open(vcf_fpath) as fhand:
        reader = VCFReader(fhand=fhand)
        windows = list(reader.sliding_windows(size=10, min_num_snps=1))
        assert snp_positions(windows, 5) == [[9], [19], [29], [0], [11]]

        windows = list(reader.sliding_windows(size=20, min_num_snps=1))
        assert snp_positions(windows, 2) == [[9, 19], [0, 11]]

    # with fasta
    windows = windows_with_ref(size=10, min_num_snps=1)
    assert snp_positions(windows, 6) == [[9], [19], [29], [0], [11], [20]]

    windows = windows_with_ref(size=20, min_num_snps=1)
    assert snp_positions(windows, 4) == [[9, 19], [29], [0, 11], [20]]

    # we skip windows that have no snps
    windows = windows_with_ref(size=5, min_num_snps=1)
    assert snp_positions(windows, 6) == [[9], [19], [29], [0], [11], [20]]

    # we skip no window
    windows = windows_with_ref(size=5, min_num_snps=0)
    assert snp_positions(windows, 4) == [[], [9], [], [19]]

    # with a step smaller than the window a snp is found in two windows
    windows = windows_with_ref(size=10, min_num_snps=0, step=5)
    assert snp_positions(windows, 4) == [[9], [9], [19], [19]]
class VcfStats(object):
    '''It calculates statistics for the SNVs and samples of a VCF file.

    All the counters are filled by the constructor, which goes through
    every SNV of the file.  The methods and properties just give access
    to the counters already calculated.
    '''

    def __init__(self, vcf_fpath, gq_threshold=None, dp_threshold=100,
                 min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
                 remarkable_coverages=None, window_size=WINDOWS_SIZE):
        '''It creates all the counters and fills them by reading the VCF.

        vcf_fpath: path to the VCF file. It is opened twice, once to go
            through all the SNVs and once with pyvcfReader to fetch the
            SNVs located in a region.
        gq_threshold: calls with a genotype quality lower than this are
            not added to the sample counters. None is taken as 0.
        dp_threshold: only the calls with a depth lower than this are
            added to the genotype quality by depth boxplots.
        min_calls_for_pop_stats: it is passed to the VCFReader.
        remarkable_coverages: depths for which a sample coincidence
            counter is created. REMARKABLE_DEPTHS is used when None.
        window_size: number of positions looked at each side of a SNV
            to calculate the SNV density.
        '''
        # remarkable_depths used to be bound only when no value was given,
        # so passing remarkable_coverages raised an UnboundLocalError
        if remarkable_coverages is None:
            remarkable_depths = REMARKABLE_DEPTHS
        else:
            remarkable_depths = remarkable_coverages
        self.remarkable_depths = remarkable_depths

        self._reader = VCFReader(
            open(vcf_fpath), min_calls_for_pop_stats=min_calls_for_pop_stats)
        self._random_reader = pyvcfReader(filename=vcf_fpath)

        self.window_size = window_size
        self._gq_threshold = 0 if gq_threshold is None else gq_threshold
        self.dp_threshold = dp_threshold
        self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
        self._ac2d = _AlleleCounts2D()

        # Depth 1 always has a counter, besides the remarkable depths
        self.sample_dp_coincidence = {1: IntCounter()}
        for cov in remarkable_depths:
            self.sample_dp_coincidence[cov] = IntCounter()

        self.called_snvs = 0
        self.called_gts = IntCounter()

        # sample_counter
        self._sample_counters = {}
        for counter_name in SAMPLE_COUNTERS:
            if counter_name not in self._sample_counters:
                self._sample_counters[counter_name] = {}
            for sample in self._reader.samples:
                # Depths and qualities are kept apart for HOM and HET
                if counter_name in (GT_DEPTHS, GT_QUALS):
                    counters = {HOM: IntCounter(), HET: IntCounter()}
                else:
                    counters = IntCounter()
                self._sample_counters[counter_name][sample] = counters

        self._snv_counters = {MAFS: IntCounter(),
                              MACS: IntCounter(),
                              MAFS_DP: IntCounter(),
                              SNV_QUALS: IntCounter(),
                              HET_IN_SNP: IntCounter(),
                              SNV_DENSITY: IntCounter(),
                              INBREED_F_IN_SNP: IntCounter(),
                              DEPTHS: IntCounter()}
        self._calculate()

    def _add_depth(self, snp):
        'It counts the depth of the SNV, a missing depth is counted as 0'
        depth = snp.depth
        if depth is None:
            depth = 0
        self._snv_counters[DEPTHS][depth] += 1

    def _add_maf_and_mac(self, snp):
        '''It counts the maf, as a rounded percentage, and the mac.

        Nothing is counted for a value that is None or 0.
        '''
        maf = snp.maf
        if maf:
            maf = int(round(maf * 100))
            self._snv_counters[MAFS][maf] += 1
        mac = snp.mac
        if mac:
            self._snv_counters[MACS][mac] += 1

    def _add_maf_dp(self, snp):
        '''It counts the depth based maf of the SNV and of each call.

        The mafs are stored as rounded percentages. The ones that are
        None are not counted.
        '''
        maf_dp = snp.maf_depth
        if maf_dp is not None:
            self._snv_counters[MAFS_DP][int(round(maf_dp * 100))] += 1
        for call in snp.calls:
            maf_dp = call.maf_depth
            if maf_dp is None:
                continue
            sample = call.sample
            maf_depth = int(round(maf_dp * 100))
            self._sample_counters[MAFS_DP][sample][maf_depth] += 1

    def _add_snv_qual(self, snp):
        'It counts the rounded quality of the SNV, if it has one'
        snv_qual = snp.qual
        if snv_qual is not None:
            self._snv_counters[SNV_QUALS][int(round(snv_qual))] += 1

    def _add_snv_density(self, snp):
        '''It counts how many SNVs are found around the given one.

        The SNVs are fetched from the region that spans window_size
        positions to each side of the SNV.
        '''
        windows_size = self.window_size
        pos = snp.pos
        # The window is clamped at the start of the chromosome. The
        # former test (pos - windows_size > windows_size) also reset to
        # 0 the start of the windows that did fit in the chromosome.
        start = max(pos - windows_size, 0)
        end = pos + windows_size
        chrom = snp.chrom
        # one SNV is subtracted, presumably the SNV being processed that
        # is also found in its own window
        num_snvs = len(list(self._random_reader.fetch(chrom, start, end))) - 1
        self._snv_counters[SNV_DENSITY][num_snvs] += 1

    def _add_snv_het_obs_fraction(self, snp):
        '''It counts the observed heterozygosity and the inbreeding coef.

        Both are stored as rounded percentages. When there is no observed
        heterozygosity the inbreeding coefficient is not counted either.
        '''
        obs_het = snp.obs_het
        if obs_het is None:
            return
        self._snv_counters[HET_IN_SNP][int(round(obs_het * 100))] += 1
        inbreed_coef = snp.inbreed_coef
        if inbreed_coef is None:
            return
        inbreed_coef = int(round(inbreed_coef * 100))
        self._snv_counters[INBREED_F_IN_SNP][inbreed_coef] += 1

    @staticmethod
    def _num_samples_higher_equal_dp(depth, snp):
        'It returns the number of called samples with at least that depth'
        n_samples = 0
        for call in snp.calls:
            if not call.called:
                continue
            if call.depth >= depth:
                n_samples += 1
        return n_samples

    def _calculate(self):
        'It goes through all the SNVs of the file filling the counters'
        for snp in self._reader.parse_snvs():
            self._add_maf_dp(snp)
            self._add_maf_and_mac(snp)
            self._add_snv_qual(snp)
            self._add_snv_density(snp)
            self._add_snv_het_obs_fraction(snp)
            self._add_depth(snp)

            for depth, counter in self.sample_dp_coincidence.items():
                n_samples = self._num_samples_higher_equal_dp(depth, snp)
                counter[n_samples] += 1

            n_gt_called = 0
            for call in snp.calls:
                if not call.called:
                    continue
                n_gt_called += 1
                sample_name = call.sample
                ref_depth = call.ref_depth
                acs = call.alt_sum_depths
                gt_type = call.gt_type
                gt_broud_type = HET if call.is_het else HOM
                depth = call.depth
                gt_qual = call.gt_qual

                if depth is not None and depth < self.dp_threshold:
                    self._gt_qual_depth_counter[gt_broud_type].append(
                        depth, gt_qual)

                sample_counters = self._sample_counters
                # CHECK THIS. This is an special case where the only info
                # we have is the genotype
                if gt_qual is None:
                    sample_counters[GT_TYPES][sample_name][gt_type] += 1
                    if depth is not None:
                        depths = sample_counters[GT_DEPTHS][sample_name]
                        depths[gt_broud_type][depth] += 1
                elif gt_qual >= self._gq_threshold:
                    sample_counters[GT_TYPES][sample_name][gt_type] += 1
                    quals = sample_counters[GT_QUALS][sample_name]
                    quals[gt_broud_type][gt_qual] += 1
                    depths = sample_counters[GT_DEPTHS][sample_name]
                    depths[gt_broud_type][depth] += 1
                    # NOTE(review): the nesting of this call had to be
                    # rebuilt from a source without indentation. It is
                    # assumed to belong to the calls that pass the
                    # quality threshold - confirm.
                    self._ac2d.add(rc=ref_depth, acs=acs,
                                   gt=call.int_alleles, gq=gt_qual)
            self.called_gts[n_gt_called] += 1
            self.called_snvs += 1

    def _get_sample_counter(self, kind, sample=None, gt_broud_type=None):
        '''It returns the counter of the given kind.

        With a sample the counter of that sample is returned, otherwise
        the counters of all the samples are added up in a new counter.
        gt_broud_type (HOM or HET) selects one of the two counters kept
        by the kinds that split the genotypes.
        '''
        counters = self._sample_counters[kind]
        if sample is not None:
            if gt_broud_type is None:
                return counters[sample]
            return counters[sample][gt_broud_type]

        all_counters = IntCounter()
        for sample_counter in counters.values():
            if gt_broud_type is None:
                all_counters += sample_counter
            else:
                all_counters += sample_counter[gt_broud_type]
        return all_counters

    def macs(self):
        'It returns the counter of the SNV macs'
        return self._snv_counters[MACS]

    def mafs(self):
        'It returns the counter of the SNV mafs'
        return self._snv_counters[MAFS]

    def mafs_dp(self, sample=None):
        'It returns the depth based maf counter of the SNVs or of a sample'
        if sample is None:
            return self._snv_counters[MAFS_DP]
        return self._get_sample_counter(MAFS_DP, sample)

    def gt_depths(self, gt_broud_type, sample=None):
        'It returns the depth counter of the HOM or HET genotypes'
        return self._get_sample_counter(GT_DEPTHS, sample,
                                        gt_broud_type=gt_broud_type)

    def gt_quals(self, gt_broud_type, sample=None):
        'It returns the quality counter of the HOM or HET genotypes'
        return self._get_sample_counter(GT_QUALS, sample,
                                        gt_broud_type=gt_broud_type)

    def heterozigosity_for_sample(self, sample):
        '''It returns the fraction of HET genotypes of the sample.

        It returns 0 when the sample has no genotype counted.
        '''
        sample_gt_types = self._get_sample_counter(GT_TYPES, sample)
        het_gt = sample_gt_types[HET]
        all_gts = sample_gt_types.count
        # NOTE(review): with Python 2 this is an integer division unless
        # the module enables the true division - confirm
        try:
            heterozigosity = het_gt / all_gts
        except ZeroDivisionError:
            heterozigosity = 0
        return heterozigosity

    def gt_types(self, sample=None):
        'It returns the genotype type counter of a sample or of all'
        return self._get_sample_counter(GT_TYPES, sample)

    @property
    def samples(self):
        'The samples of the reader'
        return self._reader.samples

    @property
    def min_calls_for_pop_stats(self):
        'The min_calls_for_pop_stats of the reader'
        return self._reader.min_calls_for_pop_stats

    @property
    def snv_density(self):
        'The counter of the number of SNVs found around each SNV'
        return self._snv_counters[SNV_DENSITY]

    @property
    def snv_quals(self):
        'The counter of the SNV qualities'
        return self._snv_counters[SNV_QUALS]

    @property
    def het_by_snp(self):
        'The counter of the observed heterozygosity of the SNVs'
        return self._snv_counters[HET_IN_SNP]

    @property
    def inbreeding_by_snp(self):
        'The counter of the inbreeding coefficient of the SNVs'
        return self._snv_counters[INBREED_F_IN_SNP]

    @property
    def allelecount2d(self):
        'The _AlleleCounts2D filled with the calls'
        return self._ac2d

    @property
    def gt_depths_by_gt_and_qual(self):
        'The genotype quality by depth boxplots for HOM and HET'
        return self._gt_qual_depth_counter

    @property
    def depths(self):
        'The counter of the SNV depths'
        return self._snv_counters[DEPTHS]
def filter_vcf(self, vcf_fpath, min_samples=DEF_MIN_CALLS_FOR_POP_STATS):
    '''It yields the SNVs that segregate like the SNVs close to them.

    vcf_fpath: path to the VCF file. It is opened twice, once to go
        through all the SNVs and once to fetch the SNVs of the windows.
    min_samples: it is passed to the reader as min_calls_for_pop_stats.

    tot_snps is increased for every SNV read and passed_snps for every
    SNV yielded.  The SNVs are yielded as they were read, without the
    calls filtered by sample.
    '''
    snv_fhand = open(vcf_fpath)
    random_fhand = open(vcf_fpath)
    try:
        reader = VCFReader(snv_fhand, min_calls_for_pop_stats=min_samples)
        snvs = reader.parse_snvs()
        random_reader = VCFReader(random_fhand)

        for snv_1 in snvs:
            self.tot_snps += 1
            plot_fhand = self._open_debug_plot(snv_1)
            try:
                passed = self._passes_segregation_test(snv_1, random_reader,
                                                       plot_fhand)
            finally:
                # The plot file is closed for every SNV, even for the
                # ones that could not be tested
                if plot_fhand is not None:
                    plot_fhand.close()
            if passed:
                self.passed_snps += 1
                yield snv_1
    finally:
        # It also runs when the generator is closed before the end
        snv_fhand.close()
        random_fhand.close()

def _open_debug_plot(self, snv):
    'It opens the debug plot file of the SNV, or returns None if no plot_dir'
    if not self.plot_dir:
        return None
    chrom = str(snv.chrom)
    fname = chrom + '_' + str(snv.pos) + '.png'
    chrom_dir = pjoin(self.plot_dir, chrom)
    if not exists(chrom_dir):
        mkdir(chrom_dir)
    return open(pjoin(chrom_dir, fname), 'w')

def _passes_segregation_test(self, snv_1, random_reader, plot_fhand):
    '''It returns True if the SNV segregates like the SNVs around it.

    The genotype counts of the SNV are compared with the ones of the SNVs
    found in the two windows located at both sides of it.  False is
    returned when the SNV fails and also when there are not enough SNVs
    to do the test.  When plot_fhand is given the debug plot of the
    tested SNVs is drawn in it.
    '''
    loc = snv_1.pos
    debug_plot_info = []

    # window located before the snv, it can not start before position 0
    win_1_start = max(loc - (self.win_width / 2), 0)
    win_1_end = max(loc - (self.win_mask_width / 2), 0)
    if win_1_end != 0:
        snvs_win_1 = random_reader.fetch_snvs(snv_1.chrom,
                                              start=int(win_1_start),
                                              end=int(win_1_end))
    else:
        snvs_win_1 = []

    # window located after the snv
    win_2_start = loc + (self.win_mask_width / 2)
    win_2_end = loc + (self.win_width / 2)
    snvs_win_2 = random_reader.fetch_snvs(snv_1.chrom, start=win_2_start,
                                          end=win_2_end)

    snvs_in_win = list(chain(snvs_win_1, snvs_win_2))
    if len(snvs_in_win) > self.num_snvs_check:
        snvs_in_win = random.sample(snvs_in_win, self.num_snvs_check)
    if len(snvs_in_win) < self.min_num_snvs_check_in_win:
        # Not enough snps to check
        return False

    if self.samples is not None:
        snv_1 = snv_1.filter_calls_by_sample(self.samples)
    exp_cnts = snv_1.biallelic_genotype_counts
    if exp_cnts is None:
        return False

    test_values = []
    for snv_2 in snvs_in_win:
        if self.samples is not None:
            snv_2 = snv_2.filter_calls_by_sample(self.samples)
        obs_cnts = snv_2.biallelic_genotype_counts
        if obs_cnts is None:
            continue
        test_values.append(_fisher_extact_rxc(obs_cnts, exp_cnts))
        if plot_fhand:
            # one item per test value, in the same order
            debug_plot_info.append({'pos': snv_2.pos,
                                    'AA': obs_cnts[0],
                                    'Aa': obs_cnts[1],
                                    'aa': obs_cnts[2],
                                    'close_snp': True})

    tot_checked = len(test_values)
    if tot_checked < self.min_num_snvs_check_in_win:
        # few snps can be tested for segregation
        return False

    if tot_checked > 0:
        # The alpha is divided by the number of tests done. It used to be
        # calculated before knowing that there was any test, so a
        # ZeroDivisionError was raised when no SNV could be tested.
        alpha2 = self.alpha / tot_checked
        results = []
        for idx, val in enumerate(test_values):
            result = False if val is None else val > alpha2
            results.append(result)
            if plot_fhand:
                debug_plot_info[idx]['result'] = result
        failed_freq = results.count(False) / tot_checked
        passed = self.max_failed_freq > failed_freq
    else:
        failed_freq = None
        passed = False

    if failed_freq is not None:
        self._failed_freqs.append(failed_freq)

    if plot_fhand:
        debug_plot_info.append({'pos': snv_1.pos,
                                'AA': exp_cnts[0],
                                'Aa': exp_cnts[1],
                                'aa': exp_cnts[2],
                                'result': passed,
                                'close_snp': False})
        self._plot_segregation_debug(debug_plot_info, plot_fhand)
    return passed