def filter_snvs(in_fhand, out_fhand, filters, filtered_fhand=None,
                log_fhand=None, reader_kwargs=None):
    '''Filter the SNVs of an input VCF, writing the ones that pass.

    The input fhand has to be uncompressed. The original file could be a
    gzipped file, but in that case it has to be opened with gzip.open before
    sending it to this function.

    Arguments:
        in_fhand: file handle of the VCF to read; handed to VCFReader.
        out_fhand: file handle that receives the SNVs found in
            packet[PASSED] once every filter has run.
        filters: iterable of callables. Each one is called with a packet and
            must return a packet; the log counters are keyed by the class
            name of each filter.
        filtered_fhand: optional file handle. When given, the SNVs found in
            packet[FILTERED_OUT] are written to it.
        log_fhand: optional file handle. When given, the counters are passed
            to _write_log together with it.
        reader_kwargs: optional dict with extra keyword arguments for
            VCFReader. 'compressed' and 'filename' are always overridden.
            The dict received is not modified.
    '''
    # Work on a copy: the forced settings below must not leak into the
    # dict owned by the caller.
    reader_kwargs = {} if reader_kwargs is None else dict(reader_kwargs)

    # The input fhand to this function cannot be compressed
    reader_kwargs.update({'compressed': False,
                          'filename': 'pyvcf_bug_workaround'})

    reader = VCFReader(in_fhand, **reader_kwargs)

    # The writers take their header from a reader built on the input header.
    template_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=template_reader)
    if filtered_fhand:
        filtered_writer = VCFWriter(filtered_fhand,
                                    template_reader=template_reader)
    else:
        filtered_writer = None

    packets = group_in_filter_packets(reader.parse_snvs(),
                                      SNPS_PER_FILTER_PACKET)

    # NOTE(review): the count starts at 0.01 instead of 0, presumably so
    # that _write_log never divides by zero on an empty input -- confirm.
    tot_snps = 0.01
    passed_snps = OrderedDict()
    broken_pipe = False
    for packet in packets:
        tot_snps += len(packet[PASSED]) + len(packet[FILTERED_OUT])

        # The filters are chained: each one receives the packet returned by
        # the previous one, and the SNVs still in PASSED are tallied.
        for filter_ in filters:
            packet = filter_(packet)
            filter_name = filter_.__class__.__name__
            if filter_name not in passed_snps:
                passed_snps[filter_name] = 0
            passed_snps[filter_name] += len(packet[PASSED])

        # A falsy result from _safe_write_snv is treated as a closed output
        # pipe, so no more SNVs are sent to that writer.
        for snv in packet[PASSED]:
            if not _safe_write_snv(writer, snv):
                broken_pipe = True
                break

        if filtered_writer:
            for snv in packet[FILTERED_OUT]:
                if not _safe_write_snv(filtered_writer, snv):
                    broken_pipe = True
                    break

        if broken_pipe:
            break

    if log_fhand:
        _write_log(log_fhand, tot_snps, passed_snps)

    writer.flush()
def test_het_unknown(self):
    '''Check genotypes with unknown alleles when parsing and writing.

    The record holds eight genotypes: four 0/0, two 1/1, one ./. and one
    1/. . The fully unknown call must be parsed as an empty allele list and
    the partially unknown one as [1, None]; both have to be written back as
    they came.
    '''
    # NOTE(review): the literal is kept exactly as found in the source; any
    # line breaks it originally contained may have been lost -- confirm.
    body = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8 20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t./.\t1/.\t '''
    reader = VCFReader(StringIO(VCF_HEADER + body))
    parsed = list(reader.parse_snvs())
    first = parsed[0]

    # Parsing: one allele list per sample, in column order.
    observed = [call.int_alleles for call in first.calls]
    assert observed == [[0, 0], [0, 0], [0, 0], [0, 0],
                        [1, 1], [1, 1], [], [1, None]]
    assert first.num_called == 7

    # Writing: the unknown genotypes must come out unchanged.
    sink = StringIO()
    writer = VCFWriter(sink, reader)
    for record in parsed:
        writer.write_snv(record)
    assert '1/1\t./.\t1/.' in sink.getvalue()
def test_vcf_writer(self):
    '''Write every SNV of vari_filter.vcf and look for a sample name.

    All the SNVs parsed from the test file are sent through a VCFWriter
    into a temporary file, which is then read back to check that it
    contains 'CUUC00027_TC01'.
    '''
    # The with blocks close the input file and the handle used to read the
    # result back, even when the assertion fails.
    with open(join(TEST_DATA_DIR, 'vari_filter.vcf')) as varscan:
        reader = VCFReader(fhand=varscan)
        out_fhand = NamedTemporaryFile()
        writer = VCFWriter(out_fhand, reader)
        try:
            for snv in reader.parse_snvs():
                writer.write_snv(snv)
            # The file is read back through its name, so the content has
            # to be flushed first.
            writer.flush()
            with open(out_fhand.name) as written:
                assert 'CUUC00027_TC01' in written.read()
        finally:
            # Close the writer whether or not the check passed.
            writer.close()
def run_genotype_filters(in_fhand, out_fhand, gt_filters, reader_kwargs=None):
    '''Run every SNV of a VCF through the genotype filters and write it.

    Arguments:
        in_fhand: file handle of the VCF to read; handed to VCFReader.
        out_fhand: file handle that receives the resulting SNVs.
        gt_filters: iterable of callables. Each one is called with an SNV
            and must return the SNV to hand to the next one.
        reader_kwargs: optional dict with extra keyword arguments for
            VCFReader. 'filename' and 'compressed' are always overridden.
            The dict received is not modified.

    Raises:
        IOError: when writing fails for any reason other than a broken
            pipe.
    '''
    # Work on a copy: the forced settings below must not leak into the
    # dict owned by the caller.
    reader_kwargs = {} if reader_kwargs is None else dict(reader_kwargs)
    reader_kwargs['filename'] = 'pyvcf_bug_workaround'
    reader_kwargs['compressed'] = False

    reader = VCFReader(in_fhand, **reader_kwargs)

    # The writer takes its header from a reader built on the input header.
    template_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=template_reader)

    for snv in reader.parse_snvs():
        # The filters are chained: each one receives the SNV returned by
        # the previous one.
        for mapper in gt_filters:
            snv = mapper(snv)
        try:
            writer.write_snv(snv)
        except IOError as error:
            # The pipe could be already closed
            if 'Broken pipe' in str(error):
                break
            raise