def test_VariantQueryable_to_vcf(tmp_path):
    """Export every queried variant to a VCF file and re-read it.

    The export is requested with ``remove_samples=True`` and
    ``clean_info=True``; the re-opened file must list no samples.
    """
    source = MultiSampleVCF(vcf_file)
    out_path = str(tmp_path / 'a.vcf')

    source.query_all().to_vcf(out_path, remove_samples=True, clean_info=True)

    # Re-open the written file to inspect what actually landed on disk.
    written = MultiSampleVCF(out_path)
    assert len(written.samples) == 0
def test_VariantQueryable_to_sample_csv(tmp_path):
    """Dump the queried variants to a sample CSV and compare its content.

    The CSV is read back with pandas and must equal the expected
    variant / sample / genotype table exactly.
    """
    expected = pd.DataFrame({
        'variant': ['chr1:4:T>C', 'chr1:25:AACG>GA'],
        'sample': ['NA00003', 'NA00002'],
        'genotype': [3, 3]
    })

    csv_path = str(tmp_path / 'sample.csv')
    queryable = MultiSampleVCF(vcf_file).query_all()
    queryable.to_sample_csv(csv_path)

    # Round-trip through the file so the on-disk format is what gets checked.
    written = pd.read_csv(csv_path)
    pd.testing.assert_frame_equal(written, expected)
def variant_queryable():
    """Build a ``VariantIntervalQueryable`` over the test VCF.

    It holds two (variants, interval) pairs: two chr1 variants (one with
    ``filter='q10'``) inside ``chr1:10-20``, and one chr2 variant inside
    ``chr2:110-200``.
    """
    chr1_group = (
        [
            Variant('chr1', 12, 'A', 'T'),
            Variant('chr1', 18, 'A', 'C', filter='q10'),
        ],
        Interval('chr1', 10, 20),
    )
    chr2_group = (
        [
            Variant('chr2', 120, 'AT', 'AAAT'),
        ],
        Interval('chr2', 110, 200),
    )

    vcf = MultiSampleVCF(vcf_file)
    return VariantIntervalQueryable(vcf, [chr1_group, chr2_group])
def test_VariantQueryable_batch_iter():
    """Check how ``batch_iter`` splits the queried variants per batch size.

    Batch size 1 gives three batches, size 2 gives two batches holding two
    and one variants, and size 10 gives a single batch whose interval is
    ``chr1:3-25`` with all three variants.
    """

    def collect(batch_size):
        # A fresh VCF handle and query per run, exactly as each scenario needs.
        queryable = MultiSampleVCF(vcf_file).query_all()
        return list(queryable.batch_iter(batch_size=batch_size))

    # One variant per batch.
    singles = collect(1)
    assert len(singles) == 3

    # Two variants per batch: the last batch holds the remainder.
    pairs = collect(2)
    assert len(pairs) == 2
    assert len(pairs[0].variant_intervals[0][0]) == 2
    assert len(pairs[1].variant_intervals[0][0]) == 1

    # Batch size larger than the number of variants: everything in one batch.
    whole = collect(10)
    assert len(whole) == 1

    variants, interval = whole[0].variant_intervals[0]
    assert interval == Interval('chr1', 3, 25)
    assert len(variants) == 3