예제 #1
0
def test_phase_ped_sample(tmpdir, sample_set):
    """Phasing with --ped plus --sample must match phasing with --sample only.

    The selected samples are a subset of the trio, so the pedigree
    information is expected to be ignored and both runs should yield the
    same phasing for every selected individual.
    """
    with_ped_vcf = str(tmpdir.join("output1.vcf"))
    without_ped_vcf = str(tmpdir.join("output2.vcf"))

    # First run receives the pedigree, second run does not.
    for destination, pedigree_args in (
        (with_ped_vcf, {"ped": "tests/data/trio.ped"}),
        (without_ped_vcf, {}),
    ):
        run_whatshap(
            phase_input_files=[ped_samples_bamfile],
            variant_file="tests/data/ped_samples.vcf",
            output=destination,
            samples=sample_set,
            **pedigree_args,
        )

    for destination in (with_ped_vcf, without_ped_vcf):
        assert os.path.isfile(destination)

    tables_with_ped = list(VcfReader(with_ped_vcf, phases=True))
    tables_without_ped = list(VcfReader(without_ped_vcf, phases=True))

    assert len(tables_with_ped) == 1 and len(tables_without_ped) == 1
    table_with_ped = tables_with_ped[0]
    table_without_ped = tables_without_ped[0]

    for individual in sample_set:
        assert_phasing(
            table_with_ped.phases_of(individual),
            table_without_ped.phases_of(individual),
        )
예제 #2
0
def test_ped_sample(sample_set, tmp_path):
    """Genotyping with --ped plus --sample must match genotyping with --sample only.

    The selected samples are a subset of the trio, so the pedigree
    information is expected to be ignored and the genotype likelihoods of
    both runs must be identical.
    """
    with_ped_vcf = tmp_path / "output1.vcf"
    without_ped_vcf = tmp_path / "output2.vcf"

    # First run receives the pedigree, second run does not.
    for destination, pedigree_args in (
        (with_ped_vcf, {"ped": "tests/data/trio.ped"}),
        (without_ped_vcf, {}),
    ):
        run_genotype(
            phase_input_files=[ped_samples_bamfile],
            variant_file="tests/data/ped_samples.vcf",
            output=destination,
            samples=sample_set,
            **pedigree_args,
        )

    for destination in (with_ped_vcf, without_ped_vcf):
        assert os.path.isfile(destination)

    tables_with_ped = list(
        VcfReader(with_ped_vcf, phases=True, genotype_likelihoods=True)
    )
    tables_without_ped = list(
        VcfReader(without_ped_vcf, phases=True, genotype_likelihoods=True)
    )
    assert (len(tables_with_ped) == 1) and (len(tables_without_ped) == 1)
    table_with_ped = tables_with_ped[0]
    table_without_ped = tables_without_ped[0]

    for individual in sample_set:
        likelihood_pairs = zip(
            table_with_ped.genotype_likelihoods_of(individual),
            table_without_ped.genotype_likelihoods_of(individual),
        )
        for gl_with_ped, gl_without_ped in likelihood_pairs:
            print(gl_with_ped, gl_without_ped)
            assert gl_with_ped.log10_probs() == gl_without_ped.log10_probs()
예제 #3
0
파일: test_vcf.py 프로젝트: gdv/whatshap
def test_read_region_subsets():
    """Fetch two sub-regions of chr1 and check the variants that are returned."""
    reader = VcfReader("tests/data/haplotag_1.vcf.gz", indels=True)
    table = reader.fetch_regions(
        "chr1",
        [(1069570, 1070690), (1074910, 1076152)],
    )

    assert table.chromosome == "chr1"
    assert len(table.variants) == 8

    # Spot-check the alleles of the variant at index 5.
    sixth_variant = table.variants[5]
    assert sixth_variant.reference_allele == "CG"
    assert table.variants[5].alternative_allele == "C"
예제 #4
0
파일: test_vcf.py 프로젝트: gdv/whatshap
def test_read_genotype_likelihoods():
    """Read a two-sample VCF with genotype likelihoods and compare to known values."""
    tables = list(VcfReader("tests/data/genotype-likelihoods.vcf", genotype_likelihoods=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "chrA"
    assert table.samples == ["sample1", "sample2"]
    assert len(table.variants) == 4

    # Genotypes are stored per sample, in the order of table.samples.
    assert len(table.genotypes) == 2
    for row, canonic_indices in enumerate(([2, 1, 1, 1], [1, 0, 0, 1])):
        assert list(table.genotypes[row]) == canonic_index_list_to_biallelic_gt_list(
            canonic_indices
        )

    gl0 = GenotypeLikelihoods([-2.1206, -0.8195, -0.07525])
    gl1 = GenotypeLikelihoods([-10.3849, 0, -5.99143])
    gl2 = GenotypeLikelihoods([-2.1, -0.8, -0.8])
    gl3 = GenotypeLikelihoods([0, -10.0, -0.6])

    # Expected likelihoods per sample; None marks a variant without likelihoods.
    expected_by_sample = [
        ("sample1", [gl0, gl2, None, gl0]),
        ("sample2", [gl1, gl3, None, gl1]),
    ]

    for sample, _ in expected_by_sample:
        assert len(table.genotype_likelihoods_of(sample)) == 4

    for sample, expected_gls in expected_by_sample:
        for actual_gl, expected_gl in zip(table.genotype_likelihoods_of(sample), expected_gls):
            assert_genotype_likelihoods(actual_gl, expected_gl)
예제 #5
0
파일: test_vcf.py 프로젝트: ekg/graphappy
def test_read_multisample_vcf():
    """Read a two-chromosome, two-sample VCF and verify the first table in detail."""
    tables = list(VcfReader("tests/data/multisample.vcf"))
    assert len(tables) == 2
    table_a, table_b = tables

    assert table_b.chromosome == "chrB"
    assert table_b.samples == ["sample1", "sample2"]

    assert table_a.chromosome == "chrA"
    assert len(table_a.variants) == 3
    assert table_a.samples == ["sample1", "sample2"]

    # (reference allele, alternative allele) for each variant, in file order
    expected_alleles = [("A", "T"), ("C", "G"), ("G", "T")]
    for position, (ref, alt) in enumerate(expected_alleles):
        assert table_a.variants[position].reference_allele == ref
        assert table_a.variants[position].alternative_allele == alt

    # Genotypes must agree whether looked up by row index or by sample name.
    expected_indices = ([1, 1, 1], [1, 1, 0])
    assert len(table_a.genotypes) == 2
    for row, indices in enumerate(expected_indices):
        assert list(table_a.genotypes[row]) == canonic_index_list_to_biallelic_gt_list(indices)

    for sample, indices in zip(("sample1", "sample2"), expected_indices):
        assert list(table_a.genotypes_of(sample)) == canonic_index_list_to_biallelic_gt_list(
            indices
        )
예제 #6
0
파일: test_vcf.py 프로젝트: ekg/graphappy
def test_read_tetraploid_unphased():
    """Read an unphased tetraploid VCF and verify alleles and genotypes.

    The observed and expected genotypes are printed before the final
    assertion so that a failure shows both lists in the captured output.
    """
    tables = list(
        VcfReader("tests/data/polyploid.chr22.unphased.vcf", phases=False))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "chr22"
    assert table.samples == ["HG00514_NA19240"]
    assert len(table.variants) == 8

    # (reference allele, alternative allele) of the first four variants
    expected_alleles = [("A", "C"), ("G", "A"), ("G", "T"), ("G", "C")]
    for position, (ref, alt) in enumerate(expected_alleles):
        assert table.variants[position].reference_allele == ref
        assert table.variants[position].alternative_allele == alt

    # Build the expectation once, with ploidy 4, so that the printed values
    # are exactly the ones the assertion compares against.
    expected_genotypes = canonic_index_list_to_biallelic_gt_list(
        [3, 2, 0, 3, 3, 1, 1, 1], 4)

    print("Got:")
    for genotype in table.genotypes[0]:
        print(genotype)
    print("Exp:")
    # Previously a misspelled loop variable made this loop print the last
    # observed genotype repeatedly instead of the expected ones.
    for genotype in expected_genotypes:
        print(genotype)
    assert table.genotypes[0] == expected_genotypes
예제 #7
0
파일: test_vcf.py 프로젝트: gdv/whatshap
def test_read_tetraploid_phased():
    """Read a phased tetraploid VCF and compare the phasing of its single sample."""
    tables = list(VcfReader("tests/data/polyploid.chr22.phased.vcf", phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "chr22"
    assert table.samples == ["HG00514_NA19240"]
    assert len(table.variants) == 8

    def phased(block_id, phase):
        # All expected calls carry no phasing quality.
        return VariantCallPhase(block_id=block_id, phase=phase, quality=None)

    # One entry per variant; None marks a variant without phase information.
    expected_phase = [
        phased(20000000, (1, 0, 1, 1)),
        phased(20000000, (1, 0, 1, 0)),
        None,
        phased(20000000, (1, 0, 1, 1)),
        phased(20001000, (1, 0, 1, 1)),
        phased(20001000, (0, 0, 0, 1)),
        phased(20001000, (0, 0, 0, 1)),
        phased(20001000, (0, 0, 0, 1)),
    ]

    # Dump both sides so a failure shows them in the captured output.
    for heading, calls in (("Got:", table.phases[0]), ("Exp:", expected_phase)):
        print(heading)
        for call in calls:
            print(call)

    assert list(table.phases[0]) == expected_phase
예제 #8
0
def test_phase_three_individuals(algorithm):
    """Phase three individuals without a pedigree and check each sample's phasing."""
    with TemporaryDirectory() as workdir:
        phased_vcf = workdir + '/output.vcf'
        read_list = workdir + '/readlist.tsv'
        run_whatshap(
            phase_input_files=[trio_bamfile],
            variant_file='tests/data/trio.vcf',
            read_list_filename=read_list,
            output=phased_vcf,
            algorithm=algorithm,
        )
        for produced in (phased_vcf, read_list):
            assert os.path.isfile(produced)

        tables = list(VcfReader(phased_vcf, phases=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == '1'
        assert len(table.variants) == 5
        assert table.samples == ['HG004', 'HG003', 'HG002']

        phase1 = VariantCallPhase(60906167, 0, None)
        phase3 = VariantCallPhase(60907394, 0, None)
        # Expected phasing per sample; None marks an unphased variant.
        expected_phasing = [
            ('HG004', [None, phase3, phase3, phase3, None]),
            ('HG003', [phase1, None, phase1, None, None]),
            ('HG002', [None, None, None, None, None]),
        ]
        for sample, expected in expected_phasing:
            assert_phasing(table.phases_of(sample), expected)
예제 #9
0
def test_genotype_log_likelihoods_given(tmp_path):
    """Genotype a trio whose input VCF already carries GL values.

    Both the genotyped VCF and the prior output are read back; in each of
    them the GL values must not be ``-1, -1, -1`` and GQ must not be 100
    (the comment in the test states that the input GLs should have been
    replaced).
    """
    outvcf = tmp_path / "output_gl_log.vcf"
    outpriors = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_log_likelihoods.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        gt_qual_threshold=0,
        prioroutput=outpriors,
    )
    for outfile in [outvcf, outpriors]:
        assert os.path.isfile(outfile)
        tables = list(
            VcfReader(outfile, phases=True, genotype_likelihoods=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == "1"
        assert len(table.variants) == 5
        assert table.samples == ["HG004", "HG003", "HG002"]

        # check if GL likelihoods were replaced
        # The context manager closes the file handle once the records are read.
        with VariantFile(outfile) as vcf_reader:
            print(list(vcf_reader.header.samples), outfile)
            for record in vcf_reader:
                for call in record.samples.values():
                    GL = call.get("GL", None)
                    GQ = call.get("GQ", None)
                    print("GL:", GL, "GQ", GQ)
                    # Normalize to a list before comparing: a tuple-valued GL
                    # never equals a list literal, which would make this check
                    # pass unconditionally. A missing GL is still accepted.
                    assert GL is None or list(GL) != [-1, -1, -1]
                    assert GQ != 100
예제 #10
0
def test_blockcut_sensitivities(tmp_path):
    """Ensure that the block cut sets are monotone to the sensitivity.

    Runs polyphase once per sensitivity 0..5 and checks that every block
    id found at one sensitivity is also present at the next higher one.
    """
    block_ids_by_sensitivity = []
    for sensitivity in range(6):
        outvcf = tmp_path / "output{}.vcf".format(sensitivity)
        run_polyphase(
            phase_input_files=["tests/data/polyploid.chr22.42M.12k.bam"],
            variant_file="tests/data/polyploid.chr22.42M.12k.vcf",
            ploidy=4,
            ignore_read_groups=True,
            block_cut_sensitivity=sensitivity,
            output=outvcf,
        )
        assert os.path.isfile(outvcf)

        tables = list(VcfReader(outvcf, phases=True))
        assert len(tables) == 1

        # Collect the distinct block ids of all phased variants.
        block_starts = {
            call.block_id
            for call in tables[0].phases_of("HG00514_NA19240")
            if call is not None
        }
        block_ids_by_sensitivity.append(block_starts)
        print(block_starts)

    # Compare each sensitivity level with its successor.
    consecutive_levels = zip(block_ids_by_sensitivity, block_ids_by_sensitivity[1:])
    for lower_level, higher_level in consecutive_levels:
        assert lower_level.issubset(higher_level)
예제 #11
0
def test_phase_three_individuals(algorithm, tmpdir):
    """Phase three individuals without a pedigree and check each sample's phasing."""
    phased_vcf = str(tmpdir.join("output.vcf"))
    read_list = str(tmpdir.join("readlist.tsv"))
    run_whatshap(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        read_list_filename=read_list,
        output=phased_vcf,
        algorithm=algorithm,
    )
    for produced in (phased_vcf, read_list):
        assert os.path.isfile(produced)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    phase1 = VariantCallPhase(60906167, (0, 1), None)
    phase3 = VariantCallPhase(60907394, (0, 1), None)
    # Expected phasing per sample; None marks an unphased variant.
    expected_phasing = [
        ("HG004", [None, phase3, phase3, phase3, None]),
        ("HG003", [phase1, None, phase1, None, None]),
        ("HG002", [None, None, None, None, None]),
    ]
    for sample, expected in expected_phasing:
        assert_phasing(table.phases_of(sample), expected)
예제 #12
0
def test_genotyping_specific_chromosome():
    """Genotype a two-chromosome trio VCF while restricting to one chromosome.

    For both the genotyped VCF and the prior output, the chromosome that
    was not requested must carry no genotype likelihoods, while the
    requested one must carry some.
    """
    for requested_chromosome in ['1', '2']:
        with TemporaryDirectory() as tempdir:
            outvcf = tempdir + '/output.vcf'
            outpriors = tempdir + '/priors.vcf'
            run_genotype(
                phase_input_files=[trio_bamfile],
                variant_file='tests/data/trio-two-chromosomes.vcf',
                output=outvcf,
                ped='tests/data/trio.ped',
                genmap='tests/data/trio.map',
                chromosomes=[requested_chromosome],
                prioroutput=outpriors,
            )

            for outfile in [outvcf, outpriors]:
                assert os.path.isfile(outfile)

                tables = list(VcfReader(outfile, genotype_likelihoods=True))

                assert len(tables) == 2
                for table in tables:
                    assert len(table.variants) == 5
                    assert table.samples == ['HG004', 'HG003', 'HG002']

                # Position of the table that was skipped.
                # NOTE(review): relies on tables[0] being chromosome '1' and
                # tables[1] being chromosome '2' - confirm against the input VCF.
                skipped_index = 1 if requested_chromosome == '1' else 0
                skipped_table = tables[skipped_index]
                genotyped_table = tables[1 - skipped_index]

                # should be no genotype likelihoods for skipped chromosomes
                # (these comparisons previously lacked `assert` and so
                # verified nothing)
                for s in skipped_table.samples:
                    assert skipped_table.genotype_likelihoods_of(s) == [None] * 5
                    assert genotyped_table.genotype_likelihoods_of(s) != [None] * 5
예제 #13
0
def test_genotyping_one_of_three_individuals(tmp_path):
    """Genotype only HG003 and check that HG002 and HG004 keep uniform likelihoods."""
    genotyped_vcf = tmp_path / "output.vcf"
    priors_vcf = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        output=genotyped_vcf,
        samples=["HG003"],
        prioroutput=priors_vcf,
    )

    for produced in [genotyped_vcf, priors_vcf]:
        assert os.path.isfile(produced)

        tables = list(
            VcfReader(produced, phases=True, genotype_likelihoods=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == "1"
        assert len(table.variants) == 5
        assert table.samples == ["HG004", "HG003", "HG002"]

        # The samples that were not selected (HG002 and HG004) must carry
        # the default likelihood log10(1/3) for every genotype.
        uniform_log10 = math.log10(1 / 3.0)
        unselected_likelihoods = [
            table.genotype_likelihoods_of("HG002"),
            table.genotype_likelihoods_of("HG004"),
        ]
        for sample_likelihoods in unselected_likelihoods:
            for variant_likelihoods in sample_likelihoods:
                for log10_prob in variant_likelihoods.log10_probs():
                    assert pytest.approx(uniform_log10) == log10_prob
예제 #14
0
def test_phase_trio_paired_end_reads(tmp_path):
    """Phase a trio from paired-end reads and check blocks and phasing per sample."""
    phased_vcf = tmp_path / "output-paired_end.vcf"
    run_whatshap(
        phase_input_files=[trio_paired_end_bamfile],
        variant_file="tests/data/paired_end.sorted.vcf",
        output=phased_vcf,
        ped="tests/data/trio_paired_end.ped",
        genmap="tests/data/trio.map",
    )
    assert os.path.isfile(phased_vcf)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 3
    assert table.samples == ["mother", "father", "child"]

    # Expected number of phased blocks per sample.
    for sample, block_count in (("mother", 1), ("father", 0), ("child", 1)):
        assert table.num_of_blocks_of(sample) == block_count

    phase0 = VariantCallPhase(80050, (0, 1), None)
    phase1 = VariantCallPhase(80050, (1, 0), None)

    # Expected phasing per sample; None marks an unphased variant.
    expected_phasing = [
        ("mother", [phase1, phase1, phase0]),
        ("father", [None, None, None]),
        ("child", [None, None, phase1]),
    ]
    for sample, expected in expected_phasing:
        assert_phasing(table.phases_of(sample), expected)
예제 #15
0
def test_duplicate_read(algorithm, expected_block, tmp_path):
    """Phase the short-genome data with a duplicate-read BAM and check block ids.

    Per the original author's note, this closely mirrors the preceding
    test_phased_blocks test, but with just a single read covering a
    homozygous site; because the run is in full-genotyping mode, hapchat
    phases that site regardless of any genotype likelihood.
    """
    phased_vcf = tmp_path / "output.vcf"
    run_whatshap(
        phase_input_files=[short_duplicate_bamfile],
        variant_file="tests/data/short-genome/short.vcf",
        ignore_read_groups=True,
        distrust_genotypes=True,
        include_homozygous=True,
        output=phased_vcf,
        algorithm=algorithm,
    )
    assert os.path.isfile(phased_vcf)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "chr1"
    assert len(table.variants) == 5
    assert table.samples == ["sample"]

    # Block id of every variant; unphased variants map to None.
    observed_blocks = []
    for call in table.phases_of("sample"):
        observed_blocks.append(None if call is None else call.block_id)
    assert observed_blocks == expected_block
예제 #16
0
def test_read_genotype_likelihoods():
    """Read a two-sample VCF with genotype likelihoods and compare to known values."""
    reader = VcfReader('tests/data/genotype-likelihoods.vcf',
                       genotype_likelihoods=True)
    tables = list(reader)
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == 'chrA'
    assert table.samples == ['sample1', 'sample2']
    assert len(table.variants) == 4

    # Genotypes are stored per sample, in the order of table.samples.
    assert len(table.genotypes) == 2
    for row, expected_genotypes in enumerate(([2, 1, 1, 1], [1, 0, 0, 1])):
        assert list(table.genotypes[row]) == expected_genotypes

    gl0 = GenotypeLikelihoods(-2.1206, -0.8195, -0.07525)
    gl1 = GenotypeLikelihoods(-10.3849, 0, -5.99143)
    gl2 = GenotypeLikelihoods(-2.1, -0.8, -0.8)
    gl3 = GenotypeLikelihoods(0, -10.0, -0.6)

    # Expected likelihoods per sample; None marks a variant without likelihoods.
    expected_by_sample = [
        ('sample1', [gl0, gl2, None, gl0]),
        ('sample2', [gl1, gl3, None, gl1]),
    ]

    for sample, _ in expected_by_sample:
        assert len(table.genotype_likelihoods_of(sample)) == 4

    for sample, expected_gls in expected_by_sample:
        actual_gls = table.genotype_likelihoods_of(sample)
        for actual_gl, expected_gl in zip(actual_gls, expected_gls):
            assert_genotype_likelihoods(actual_gl, expected_gl)
예제 #17
0
def test_phase_trio_dont_merge_blocks(tmpdir):
    """Phase a trio with genetic haplotyping disabled and check blocks and phasing."""
    phased_vcf = str(tmpdir.join("output-merged-blocks.vcf"))
    run_whatshap(
        phase_input_files=[trio_merged_bamfile],
        variant_file="tests/data/trio-merged-blocks.vcf",
        output=phased_vcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        genetic_haplotyping=False,
    )
    assert os.path.isfile(phased_vcf)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 8
    assert table.samples == ["HG002", "HG003", "HG004"]

    # Expected number of phased blocks per sample.
    for sample, block_count in (("HG004", 2), ("HG003", 1), ("HG002", 1)):
        assert table.num_of_blocks_of(sample) == block_count

    phase1 = VariantCallPhase(752566, (1, 0), None)
    phase2_0 = VariantCallPhase(853954, (0, 1), None)
    phase2_1 = VariantCallPhase(853954, (1, 0), None)

    # Expected phasing per sample; None marks an unphased variant.
    expected_phasing = [
        ("HG004", [phase1, phase1, phase1, None, phase2_1, phase2_1, phase2_1, phase2_1]),
        ("HG003", [None, None, None, None, phase2_0, phase2_0, phase2_0, phase2_1]),
        ("HG002", [None, None, None, None, None, None, None, phase2_1]),
    ]
    for sample, expected in expected_phasing:
        assert_phasing(table.phases_of(sample), expected)
예제 #18
0
def test_phase_specific_chromosome():
    """Phase a two-chromosome trio VCF while restricting to one chromosome.

    The table of the requested chromosome must show the expected phasing;
    the table of the other chromosome must be unphased for every sample.
    """
    for requested_chromosome in ['1', '2']:
        with TemporaryDirectory() as workdir:
            phased_vcf = workdir + '/output.vcf'
            run_whatshap(
                phase_input_files=[trio_bamfile],
                variant_file='tests/data/trio-two-chromosomes.vcf',
                output=phased_vcf,
                ped='tests/data/trio.ped',
                genmap='tests/data/trio.map',
                chromosomes=[requested_chromosome],
            )
            assert os.path.isfile(phased_vcf)

            tables = list(VcfReader(phased_vcf, phases=True))
            assert len(tables) == 2
            for table in tables:
                assert len(table.variants) == 5
                assert table.samples == ['HG004', 'HG003', 'HG002']

                # Pick the expected phasing (HG004, HG003, HG002) for this table.
                if table.chromosome != requested_chromosome:
                    # Not requested: nothing may be phased.
                    expected_phasing = [
                        [None, None, None, None, None],
                        [None, None, None, None, None],
                        [None, None, None, None, None],
                    ]
                elif requested_chromosome == '1':
                    phase0 = VariantCallPhase(60906167, 0, None)
                    expected_phasing = [
                        [phase0, phase0, phase0, phase0, phase0],
                        [phase0, None, phase0, phase0, phase0],
                        [None, phase0, None, None, None],
                    ]
                else:
                    phase0 = VariantCallPhase(60906167, 0, None)
                    phase1 = VariantCallPhase(60906167, 1, None)
                    expected_phasing = [
                        [phase0, None, None, None, phase1],
                        [phase0, None, None, None, None],
                        [None, None, None, None, phase0],
                    ]

                for sample, expected in zip(('HG004', 'HG003', 'HG002'), expected_phasing):
                    assert_phasing(table.phases_of(sample), expected)
예제 #19
0
def test_genotyping_specific_chromosome(chromosome, tmp_path):
    """Genotype a two-chromosome trio VCF while restricting to one chromosome.

    For both the genotyped VCF and the prior output, the chromosome that
    was not requested must carry no genotype likelihoods, while the
    requested one must carry some.
    """
    genotyped_vcf = tmp_path / "output.vcf"
    priors_vcf = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio-two-chromosomes.vcf",
        output=genotyped_vcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        chromosomes=[chromosome],
        prioroutput=priors_vcf,
    )
    for produced in [genotyped_vcf, priors_vcf]:
        assert os.path.isfile(produced)
        tables = list(VcfReader(produced, genotype_likelihoods=True))
        assert len(tables) == 2
        for table in tables:
            assert len(table.variants) == 5
            assert table.samples == ["HG004", "HG003", "HG002"]

        # Position of the skipped table: 1 when chromosome "1" was
        # requested, otherwise 0.
        skipped_position = 1 if chromosome == "1" else 0
        skipped_table = tables[skipped_position]
        genotyped_table = tables[1 - skipped_position]

        # should be no genotype likelihoods for skipped chromosomes
        for sample in skipped_table.samples:
            assert skipped_table.genotype_likelihoods_of(sample) == [None] * 5
            assert genotyped_table.genotype_likelihoods_of(sample) != [None] * 5
예제 #20
0
    def __init__(
        self,
        bam_or_vcf_paths,
        reference,
        numeric_sample_ids,
        ignore_read_groups,
        indels,
        **kwargs  # passed to ReadSetReader constructor
    ):
        """Set up readers for the given mix of BAM and VCF input files.

        bam_or_vcf_paths -- input paths, split into BAM and VCF paths by
            self._split_input_file_list
        reference -- reference passed to self._open_reference (when truthy)
            and to open_readset_reader
        numeric_sample_ids -- stored and passed on to open_readset_reader
        ignore_read_groups -- stored on the instance
        indels -- passed to every VcfReader that is created
        kwargs -- additional keyword arguments passed on to
            open_readset_reader
        """
        bam_paths, vcf_paths = self._split_input_file_list(bam_or_vcf_paths)
        self._bam_paths = bam_paths
        self._vcf_paths = vcf_paths

        # TODO exit stack!
        self._numeric_sample_ids = numeric_sample_ids
        if reference:
            self._fasta = self._open_reference(reference)
        else:
            self._fasta = None

        # One phase-aware reader per VCF input.
        self._vcf_readers = [
            VcfReader(vcf_path, indels=indels, phases=True)
            for vcf_path in self._vcf_paths
        ]
        self._ignore_read_groups = ignore_read_groups

        self._readset_reader = open_readset_reader(
            self._bam_paths, reference, numeric_sample_ids, **kwargs,
        )

        # Without VCF inputs there is nothing to load, so start with an
        # empty list. Otherwise None means uninitialized, call .read_vcf()
        # first.
        self._vcfs = None if self._vcf_readers else []
예제 #21
0
def test_phase_trio_use_ped_samples():
    """Phase a trio plus an extra 'orphan' sample, with and without use_ped_samples.

    The trio members are expected to be phased the same way in both runs;
    'orphan' is expected to be phased only when use_ped_samples is False.
    """
    with TemporaryDirectory() as workdir:
        for ped_samples in [True, False]:
            phased_vcf = workdir + '/output_ped_samples.vcf'
            read_list = workdir + '/readlist.tsv'
            run_whatshap(
                phase_input_files=[ped_samples_bamfile],
                variant_file='tests/data/ped_samples.vcf',
                read_list_filename=read_list,
                output=phased_vcf,
                ped='tests/data/trio.ped',
                genmap='tests/data/trio.map',
                use_ped_samples=ped_samples,
            )
            for produced in (phased_vcf, read_list):
                assert os.path.isfile(produced)

            tables = list(VcfReader(phased_vcf, phases=True))
            assert len(tables) == 1
            table = tables[0]
            assert table.chromosome == '1'
            assert len(table.variants) == 5
            assert table.samples == ['HG004', 'HG003', 'HG002', 'orphan']

            phase0 = VariantCallPhase(60906167, 0, None)
            phase1 = VariantCallPhase(60907394, 0, None)

            if ped_samples:
                orphan_expected = [None, None, None, None, None]
            else:
                orphan_expected = [None, phase1, phase1, phase1, None]

            # Expected phasing per sample; None marks an unphased variant.
            expected_phasing = [
                ('HG004', [phase0, phase0, phase0, phase0, phase0]),
                ('HG003', [phase0, None, phase0, phase0, phase0]),
                ('HG002', [None, phase0, None, None, None]),
                ('orphan', orphan_expected),
            ]
            for sample, expected in expected_phasing:
                assert_phasing(table.phases_of(sample), expected)
예제 #22
0
def test_genotype_log_likelihoods_given():
    """Genotype a trio whose input VCF already carries GL values.

    Both the genotyped VCF and the prior output are read back; in each of
    them GL must differ from [-1, -1, -1] and GQ must differ from 100.
    """
    with TemporaryDirectory() as workdir:
        genotyped_vcf = workdir + '/output_gl_log.vcf'
        priors_vcf = workdir + '/priors.vcf'
        run_genotype(
            phase_input_files=[trio_bamfile],
            variant_file='tests/data/trio_genotype_log_likelihoods.vcf',
            output=genotyped_vcf,
            ped='tests/data/trio.ped',
            genmap='tests/data/trio.map',
            gt_qual_threshold=0,
            prioroutput=priors_vcf,
        )

        for produced in [genotyped_vcf, priors_vcf]:
            assert os.path.isfile(produced)

            tables = list(VcfReader(produced, phases=True, genotype_likelihoods=True))
            assert len(tables) == 1
            table = tables[0]
            assert table.chromosome == '1'
            assert len(table.variants) == 5
            assert table.samples == ['HG004', 'HG003', 'HG002']

            # check if GL likelihoods were replaced
            raw_reader = vcf.Reader(filename=produced)
            print(raw_reader.samples, produced)
            for record in raw_reader:
                for call in record.samples:
                    # Fields missing from the call data default to None.
                    likelihoods = getattr(call.data, 'GL', None)
                    quality = getattr(call.data, 'GQ', None)
                    print('GL:', likelihoods, 'GQ', quality)
                    assert likelihoods != [-1, -1, -1]
                    assert quality != 100
예제 #23
0
def test_genotype_likelihoods_given(tmp_path):
    """Genotype a trio whose input VCF carries PL values.

    In the output every call must have PL equal to (None, None, None) and
    a GL value that is not None.
    """
    genotyped_vcf = tmp_path / "output_gl.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_likelihoods.vcf",
        output=genotyped_vcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
    )
    assert os.path.isfile(genotyped_vcf)

    tables = list(
        VcfReader(genotyped_vcf, phases=True, genotype_likelihoods=True)
    )
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    # check if PL likelihoods (that were present before) are deleted
    raw_reader = VariantFile(genotyped_vcf)
    for record in raw_reader:
        for call in record.samples.values():
            phred_likelihoods = call.get("PL", None)
            log_likelihoods = call.get("GL", None)
            print("GL:", log_likelihoods, "PL:", phred_likelihoods)
            assert phred_likelihoods == (None, None, None)
            assert log_likelihoods is not None
예제 #24
0
def test_phase_trio_distrust_genotypes(tmpdir):
    """Phase a trio with distrust_genotypes enabled and check each sample's phasing."""
    phased_vcf = str(tmpdir.join("output_gl.vcf"))
    read_list = str(tmpdir.join("readlist.tsv"))
    run_whatshap(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_likelihoods.vcf",
        read_list_filename=read_list,
        output=phased_vcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        distrust_genotypes=True,
    )
    for produced in (phased_vcf, read_list):
        assert os.path.isfile(produced)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    phase0 = VariantCallPhase(60906167, (0, 1), None)
    # Expected phasing per sample; None marks an unphased variant.
    expected_phasing = [
        ("HG004", [None, phase0, phase0, phase0, None]),
        ("HG003", [phase0, None, phase0, phase0, phase0]),
        ("HG002", [phase0, None, phase0, phase0, phase0]),
    ]
    for sample, expected in expected_phasing:
        assert_phasing(table.phases_of(sample), expected)
예제 #25
0
파일: test_vcf.py 프로젝트: gdv/whatshap
def test_read_phased_vcf():
    """Read phased VCFs (HP-tagged and PS-tagged) and check both must parse alike.

    Each file is expected to yield two tables (chrA and chrB) with the
    same genotypes and phasing, whether accessed by row index or by
    sample name.
    """
    sample_names = ("sample1", "sample2")
    to_genotypes = canonic_index_list_to_biallelic_gt_list

    for filename in ["tests/data/phased-via-HP.vcf", "tests/data/phased-via-PS.vcf"]:
        print("Testing", filename)
        tables = list(VcfReader(filename, phases=True))
        assert len(tables) == 2
        table_a, table_b = tables

        # Basic shape of both tables.
        for table, chromosome, variant_count in (
            (table_a, "chrA", 4),
            (table_b, "chrB", 2),
        ):
            assert table.chromosome == chromosome
            assert len(table.variants) == variant_count
            assert table.samples == ["sample1", "sample2"]

        # Genotypes, given as canonic indices per sample.
        for table, indices_per_sample in (
            (table_a, ([1, 2, 1, 1], [1, 1, 1, 1])),
            (table_b, ([0, 1], [1, 2])),
        ):
            assert len(table.genotypes) == 2
            for row, indices in enumerate(indices_per_sample):
                assert list(table.genotypes[row]) == to_genotypes(indices)
            for sample, indices in zip(sample_names, indices_per_sample):
                assert list(table.genotypes_of(sample)) == to_genotypes(indices)

        # Phasing of chrA; None marks an unphased variant.
        print(table_a.phases)
        assert len(table_a.phases) == 2
        expected_phases_a = (
            [
                None,
                None,
                VariantCallPhase(block_id=300, phase=(1, 0), quality=23),
                VariantCallPhase(block_id=300, phase=(0, 1), quality=42),
            ],
            [
                VariantCallPhase(block_id=100, phase=(0, 1), quality=10),
                VariantCallPhase(block_id=100, phase=(1, 0), quality=20),
                VariantCallPhase(block_id=300, phase=(0, 1), quality=30),
                VariantCallPhase(block_id=300, phase=(0, 1), quality=None),
            ],
        )
        for row, expected in enumerate(expected_phases_a):
            assert list(table_a.phases[row]) == expected
        for sample, expected in zip(sample_names, expected_phases_a):
            assert list(table_a.phases_of(sample)) == expected

        # chrB carries no phasing at all.
        assert len(table_b.phases) == 2
        for row in range(2):
            assert list(table_b.phases[row]) == [None, None]
        for sample in sample_names:
            assert list(table_b.phases_of(sample)) == [None, None]
예제 #26
0
파일: test_vcf.py 프로젝트: ekg/graphappy
def test_inconsistent_ploidy_phased():
    """Reading a phased VCF with inconsistent ploidy must raise PloidyError."""
    ploidy_error_raised = False
    try:
        reader = VcfReader("tests/data/polyploid.chr22.inconsistent.vcf",
                           phases=True)
        # Consume the reader fully so that every record is parsed.
        list(reader)
    except PloidyError:
        ploidy_error_raised = True
    assert ploidy_error_raised
예제 #27
0
파일: test_vcf.py 프로젝트: gdv/whatshap
def test_vcf_without_index(tmp_path):
    """Fetching from a compressed VCF copied without its index must raise VcfIndexMissing."""
    import shutil

    # Copy only the .vcf.gz file; no index file is copied along with it.
    unindexed_copy = tmp_path / "file.vcf.gz"
    shutil.copy("tests/data/haplotag_1.vcf.gz", unindexed_copy)

    with raises(VcfIndexMissing), VcfReader(unindexed_copy) as reader:
        list(reader.fetch("chr1"))
예제 #28
0
def test_read_phased():
    """Read a small phased single-sample VCF and verify alleles and genotypes."""
    tables = list(VcfReader('tests/data/phasedinput.vcf', phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == 'ref'
    assert table.samples == ['sample']
    assert len(table.variants) == 2

    # (reference allele, alternative allele) for each variant, in file order
    for position, (ref, alt) in enumerate([('A', 'C'), ('G', 'T')]):
        assert table.variants[position].reference_allele == ref
        assert table.variants[position].alternative_allele == alt

    # Both variants of the only sample must have genotype 1.
    sample_genotypes = table.genotypes[0]
    assert sample_genotypes[0] == table.genotypes[0][1] == 1
예제 #29
0
파일: test_vcf.py 프로젝트: gdv/whatshap
def test_read_phased():
    """Read a small phased single-sample VCF and verify alleles and genotypes."""
    tables = list(VcfReader("tests/data/phasedinput.vcf", phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "ref"
    assert table.samples == ["sample"]
    assert len(table.variants) == 2

    # (reference allele, alternative allele) for each variant, in file order
    for position, (ref, alt) in enumerate([("A", "C"), ("G", "T")]):
        assert table.variants[position].reference_allele == ref
        assert table.variants[position].alternative_allele == alt

    # Both variants of the only sample must share the genotype that
    # corresponds to canonic index 1.
    sample_genotypes = table.genotypes[0]
    assert sample_genotypes[0] == table.genotypes[0][1] == canonic_index_to_biallelic_gt(1)
예제 #30
0
def test_read_phased_vcf():
    """Read phased VCFs (HP-tagged and PS-tagged) and check both must parse alike.

    Each file is expected to yield two tables (chrA and chrB) with the
    same genotypes and phasing, whether accessed by row index or by
    sample name.
    """
    sample_names = ('sample1', 'sample2')

    for filename in ['tests/data/phased-via-HP.vcf', 'tests/data/phased-via-PS.vcf']:
        print('Testing', filename)
        tables = list(VcfReader(filename, phases=True))
        assert len(tables) == 2
        table_a, table_b = tables

        # Basic shape of both tables.
        for table, chromosome, variant_count in (
            (table_a, 'chrA', 4),
            (table_b, 'chrB', 2),
        ):
            assert table.chromosome == chromosome
            assert len(table.variants) == variant_count
            assert table.samples == ['sample1', 'sample2']

        # Genotypes per sample.
        for table, genotypes_per_sample in (
            (table_a, ([1, 2, 1, 1], [1, 1, 1, 1])),
            (table_b, ([0, 1], [1, 2])),
        ):
            assert len(table.genotypes) == 2
            for row, expected in enumerate(genotypes_per_sample):
                assert list(table.genotypes[row]) == expected
            for sample, expected in zip(sample_names, genotypes_per_sample):
                assert list(table.genotypes_of(sample)) == expected

        # Phasing of chrA; None marks an unphased variant.
        print(table_a.phases)
        assert len(table_a.phases) == 2
        expected_phases_a = (
            [
                None,
                None,
                VariantCallPhase(block_id=300, phase=1, quality=23),
                VariantCallPhase(block_id=300, phase=0, quality=42),
            ],
            [
                VariantCallPhase(block_id=100, phase=0, quality=10),
                VariantCallPhase(block_id=100, phase=1, quality=20),
                VariantCallPhase(block_id=300, phase=0, quality=30),
                VariantCallPhase(block_id=300, phase=0, quality=None),
            ],
        )
        for row, expected in enumerate(expected_phases_a):
            assert list(table_a.phases[row]) == expected
        for sample, expected in zip(sample_names, expected_phases_a):
            assert list(table_a.phases_of(sample)) == expected

        # chrB carries no phasing at all.
        assert len(table_b.phases) == 2
        for row in range(2):
            assert list(table_b.phases[row]) == [None, None]
        for sample in sample_names:
            assert list(table_b.phases_of(sample)) == [None, None]