Пример #1
0
def test_phase_with_phased_blocks(tmpdir):
    """Check that ``ignore_read_groups`` does not alter phased-blocks output.

    whatshap is run twice on the same reads, pre-phased blocks and variants,
    once without and once with ``ignore_read_groups=True``.  The non-header
    lines of the two output VCFs must match exactly, including their count.
    """

    def _record_lines(path):
        # Close the file deterministically instead of leaking the handle.
        with open(path, "r") as handle:
            return [line for line in handle if line[0] != "#"]

    outvcf1 = str(tmpdir.join("output1.vcf"))
    outvcf2 = str(tmpdir.join("output2.vcf"))
    # run whatshap without --ignore-read-groups option
    run_whatshap(
        phase_input_files=[
            "tests/data/phased-blocks.reads.bam",
            "tests/data/phased-blocks.blocks.vcf",
        ],
        variant_file="tests/data/phased-blocks.variants.vcf",
        output=outvcf1,
    )
    # run whatshap with --ignore-read-groups option
    run_whatshap(
        phase_input_files=[
            "tests/data/phased-blocks.reads.bam",
            "tests/data/phased-blocks.blocks.vcf",
        ],
        variant_file="tests/data/phased-blocks.variants.vcf",
        output=outvcf2,
        ignore_read_groups=True,
    )
    # the results should be identical
    lines1 = _record_lines(outvcf1)
    lines2 = _record_lines(outvcf2)

    # zip() stops at the shorter input, so compare the lengths explicitly;
    # otherwise missing or extra records would go unnoticed.
    assert len(lines1) == len(lines2)
    for l1, l2 in zip(lines1, lines2):
        assert l1 == l2
Пример #2
0
def test_phase_trio_dont_merge_blocks(tmpdir):
    """Phase the merged-blocks trio with ``genetic_haplotyping=False``.

    The output must hold one table for chromosome "1" with 8 variants,
    two blocks for HG004 and one block each for HG003 and HG002, and the
    per-sample phasings spelled out below.
    """
    outvcf = str(tmpdir.join("output-merged-blocks.vcf"))
    run_whatshap(
        phase_input_files=[trio_merged_bamfile],
        variant_file="tests/data/trio-merged-blocks.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        genetic_haplotyping=False,
    )
    assert os.path.isfile(outvcf)

    parsed = list(VcfReader(outvcf, phases=True))
    assert len(parsed) == 1
    table = parsed[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 8
    assert table.samples == ["HG002", "HG003", "HG004"]
    for sample, block_count in (("HG004", 2), ("HG003", 1), ("HG002", 1)):
        assert table.num_of_blocks_of(sample) == block_count

    first_block = VariantCallPhase(752566, (1, 0), None)
    second_block_01 = VariantCallPhase(853954, (0, 1), None)
    second_block_10 = VariantCallPhase(853954, (1, 0), None)
    expected_phasings = (
        ("HG004", [first_block] * 3 + [None] + [second_block_10] * 4),
        ("HG003", [None] * 4 + [second_block_01] * 3 + [second_block_10]),
        ("HG002", [None] * 7 + [second_block_10]),
    )
    for sample, expected in expected_phasings:
        assert_phasing(table.phases_of(sample), expected)
Пример #3
0
def test_phase_ped_sample(tmpdir, sample_set):
    """Compare phasing with ``ped`` + ``samples`` against ``samples`` alone.

    Running with --ped and --sample on a subset of the trio should give the
    same results as running with only --sample: the trio information should
    be ignored.  Each run must yield exactly one table, and the phasing of
    every individual in ``sample_set`` must agree between the two outputs.
    """
    with_ped_vcf = str(tmpdir.join("output1.vcf"))
    without_ped_vcf = str(tmpdir.join("output2.vcf"))

    run_whatshap(
        phase_input_files=[ped_samples_bamfile],
        variant_file="tests/data/ped_samples.vcf",
        output=with_ped_vcf,
        ped="tests/data/trio.ped",
        samples=sample_set,
    )
    run_whatshap(
        phase_input_files=[ped_samples_bamfile],
        variant_file="tests/data/ped_samples.vcf",
        output=without_ped_vcf,
        samples=sample_set,
    )

    assert os.path.isfile(with_ped_vcf)
    assert os.path.isfile(without_ped_vcf)

    with_ped_tables = list(VcfReader(with_ped_vcf, phases=True))
    without_ped_tables = list(VcfReader(without_ped_vcf, phases=True))

    assert len(with_ped_tables) == 1 and len(without_ped_tables) == 1
    with_ped = with_ped_tables[0]
    without_ped = without_ped_tables[0]

    for individual in sample_set:
        phases_with_ped = with_ped.phases_of(individual)
        phases_without_ped = without_ped.phases_of(individual)
        assert_phasing(phases_with_ped, phases_without_ped)
Пример #4
0
def test_duplicate_read(algorithm, expected_block, tmp_path):
    """Phase the short duplicate-read BAM and compare per-variant block ids.

    This test is very similar to the previous test_phased_blocks test,
    except that there is just a single read this time, with homozygous
    site.  Still, since hapchat would rather phase this homozygous site,
    since the context is full genotyping, it does so, regardless of any
    genotype likelihood.  See above test for more details.
    """
    outvcf = tmp_path / "output.vcf"
    run_whatshap(
        phase_input_files=[short_duplicate_bamfile],
        variant_file="tests/data/short-genome/short.vcf",
        ignore_read_groups=True,
        distrust_genotypes=True,
        include_homozygous=True,
        output=outvcf,
        algorithm=algorithm,
    )
    assert os.path.isfile(outvcf)

    parsed = list(VcfReader(outvcf, phases=True))
    assert len(parsed) == 1
    table = parsed[0]
    assert table.chromosome == "chr1"
    assert len(table.variants) == 5
    assert table.samples == ["sample"]

    # Unphased variants (None) stay None; phased ones contribute their block id.
    block_ids = []
    for phase in table.phases_of("sample"):
        if phase is None:
            block_ids.append(None)
        else:
            block_ids.append(phase.block_id)
    assert block_ids == expected_block
Пример #5
0
def test_quartet2():
    """Run pedigree phasing on the quartet2 data, discarding the output.

    There are no assertions; the test passes if ``run_whatshap`` returns
    without raising.
    """
    variants = "tests/data/quartet2.vcf"
    pedigree = "tests/data/quartet2.ped"
    run_whatshap(
        variant_file=variants,
        phase_input_files=[quartet2_bamfile],
        ped=pedigree,
        output="/dev/null",
    )
Пример #6
0
def test_phase_three_individuals(algorithm, tmpdir):
    """Phase the three trio samples without a pedigree file.

    Both the output VCF and the read list must be written.  The single
    resulting table (chromosome "1", 5 variants) must carry the phasings
    listed below for HG004, HG003 and HG002.
    """
    outvcf = str(tmpdir.join("output.vcf"))
    outreadlist = str(tmpdir.join("readlist.tsv"))
    run_whatshap(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        read_list_filename=outreadlist,
        output=outvcf,
        algorithm=algorithm,
    )
    for written in (outvcf, outreadlist):
        assert os.path.isfile(written)

    parsed = list(VcfReader(outvcf, phases=True))
    assert len(parsed) == 1
    table = parsed[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    block_a = VariantCallPhase(60906167, (0, 1), None)
    block_b = VariantCallPhase(60907394, (0, 1), None)
    expected_phasings = (
        ("HG004", [None, block_b, block_b, block_b, None]),
        ("HG003", [block_a, None, block_a, None, None]),
        ("HG002", [None, None, None, None, None]),
    )
    for sample, expected in expected_phasings:
        assert_phasing(table.phases_of(sample), expected)
Пример #7
0
def test_with_reference(algorithm, expected_vcf, tmpdir):
    """Phase the PacBio data with a reference and compare the VCF verbatim.

    This tests also whether lowercase reference FASTA files work: if
    lowercase and uppercase are treated differently, then the output is
    slightly different from the expected.

    Note: because hapchat has a different dynamic programming scheme, it
    may phase some variants differently, e.g., the variant at site 11221 of
    phased.vcf.  It also phases each heterozygous site, even if the scores
    (in the DP table) of its (two) possible phasings are identical -- such
    is the case for sites 13300 and 14324 of phased.vcf.  It is for this
    reason that we have a second phased_hapchat.vcf which is different in
    these above three sites.  Whether or not this is a desired behaviour is
    subject to discussion -- possible handling (i.e., avoiding the phasing
    of) sites with identical phasing scores is a possible future work, etc.
    """
    out = str(tmpdir.join("out.vcf"))
    run_whatshap(
        phase_input_files=["tests/data/pacbio/pacbio.bam"],
        variant_file="tests/data/pacbio/variants.vcf",
        reference="tests/data/pacbio/reference.fasta",
        output=out,
        write_command_line_header=False,  # for easier VCF comparison
        algorithm=algorithm,
    )
    print("out:", out)

    with open(expected_vcf) as expected_file:
        expected_text = expected_file.read()
    with open(out) as actual_file:
        actual_text = actual_file.read()

    assert actual_text == expected_text, "VCF output not as expected"
Пример #8
0
def test_one_variant(algorithm):
    """Run phasing on the one-read BAM and the one-variant VCF.

    Output is discarded to /dev/null; the test passes if no exception is
    raised.
    """
    reads = ["tests/data/oneread.bam"]
    variants = "tests/data/onevariant.vcf"
    run_whatshap(
        phase_input_files=reads,
        variant_file=variants,
        output="/dev/null",
        algorithm=algorithm,
    )
Пример #9
0
def test_phase_trio_paired_end_reads(tmp_path):
    """Pedigree-phase the paired-end trio data and verify blocks and phases.

    The output must hold one table for chromosome "1" with 3 variants,
    one block each for mother and child, none for father, and the phasings
    listed below.
    """
    outvcf = tmp_path / "output-paired_end.vcf"
    run_whatshap(
        phase_input_files=[trio_paired_end_bamfile],
        variant_file="tests/data/paired_end.sorted.vcf",
        output=outvcf,
        ped="tests/data/trio_paired_end.ped",
        genmap="tests/data/trio.map",
    )
    assert os.path.isfile(outvcf)

    parsed = list(VcfReader(outvcf, phases=True))
    assert len(parsed) == 1
    table = parsed[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 3
    assert table.samples == ["mother", "father", "child"]
    for sample, block_count in (("mother", 1), ("father", 0), ("child", 1)):
        assert table.num_of_blocks_of(sample) == block_count

    ref_first = VariantCallPhase(80050, (0, 1), None)
    alt_first = VariantCallPhase(80050, (1, 0), None)
    expected_phasings = (
        ("mother", [alt_first, alt_first, ref_first]),
        ("father", [None, None, None]),
        ("child", [None, None, alt_first]),
    )
    for sample, expected in expected_phasings:
        assert_phasing(table.phases_of(sample), expected)
Пример #10
0
def test_phase_trio_distrust_genotypes(tmpdir):
    """Pedigree-phase the trio with ``distrust_genotypes=True``.

    The input is the genotype-likelihood trio VCF.  Both the output VCF and
    the read list must be written, and the single table (chromosome "1",
    5 variants) must carry the phasings listed below.
    """
    outvcf = str(tmpdir.join("output_gl.vcf"))
    outreadlist = str(tmpdir.join("readlist.tsv"))
    run_whatshap(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_likelihoods.vcf",
        read_list_filename=outreadlist,
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        distrust_genotypes=True,
    )
    for written in (outvcf, outreadlist):
        assert os.path.isfile(written)

    parsed = list(VcfReader(outvcf, phases=True))
    assert len(parsed) == 1
    table = parsed[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    block = VariantCallPhase(60906167, (0, 1), None)
    expected_phasings = (
        ("HG004", [None, block, block, block, None]),
        ("HG003", [block, None, block, block, block]),
        ("HG002", [block, None, block, block, block]),
    )
    for sample, expected in expected_phasings:
        assert_phasing(table.phases_of(sample), expected)
Пример #11
0
def test_default_output(algorithm):
    """Output to stdout: run without passing an explicit ``output`` argument."""
    reads = ["tests/data/oneread.bam"]
    variants = "tests/data/onevariant.vcf"
    run_whatshap(
        phase_input_files=reads,
        variant_file=variants,
        algorithm=algorithm,
    )
Пример #12
0
def test_readgroup_without_sample_name(algorithm):
    """Run with ``ignore_read_groups=True`` on the readgroup-without-sample BAM.

    Output is discarded to /dev/null; the test passes if no exception is
    raised.
    """
    reads = ["tests/data/oneread-readgroup-without-sample.bam"]
    variants = "tests/data/onevariant.vcf"
    run_whatshap(
        phase_input_files=reads,
        variant_file=variants,
        output="/dev/null",
        ignore_read_groups=True,
        algorithm=algorithm,
    )
Пример #13
0
def test_with_reference_and_indels(algorithm):
    """Run phasing on the PacBio data with a reference and ``indels=True``.

    No ``output`` argument is passed and nothing is asserted; the test
    passes if no exception is raised.
    """
    reads = ["tests/data/pacbio/pacbio.bam"]
    variants = "tests/data/pacbio/variants.vcf"
    reference_fasta = "tests/data/pacbio/reference.fasta"
    run_whatshap(
        phase_input_files=reads,
        variant_file=variants,
        reference=reference_fasta,
        indels=True,
        algorithm=algorithm,
    )
Пример #14
0
def test_bam_without_readgroup(algorithm):
    """Run with ``ignore_read_groups=True`` on the no-readgroup BAM.

    Output is discarded to /dev/null; the test passes if no exception is
    raised.
    """
    reads = ["tests/data/no-readgroup.bam"]
    variants = "tests/data/onevariant.vcf"
    run_whatshap(
        phase_input_files=reads,
        variant_file=variants,
        output="/dev/null",
        ignore_read_groups=True,
        algorithm=algorithm,
    )
Пример #15
0
def test_full_genotyping(algorithm):
    """Run the one-read / one-variant case with ``full_genotyping=True``.

    Output is discarded to /dev/null; the test passes if no exception is
    raised.
    """
    reads = ["tests/data/oneread.bam"]
    variants = "tests/data/onevariant.vcf"
    run_whatshap(
        phase_input_files=reads,
        variant_file=variants,
        output="/dev/null",
        full_genotyping=True,
        algorithm=algorithm,
    )
Пример #16
0
def test_one_variant_cram(algorithm):
    """Run the one-variant case with CRAM input and an explicit reference.

    Output is discarded to /dev/null; the test passes if no exception is
    raised.
    """
    reads = ["tests/data/oneread.cram"]
    reference_fasta = "tests/data/oneread-ref.fasta"
    variants = "tests/data/onevariant.vcf"
    run_whatshap(
        phase_input_files=reads,
        reference=reference_fasta,
        variant_file=variants,
        output="/dev/null",
        algorithm=algorithm,
    )
Пример #17
0
def test_ignore_read_groups(algorithm):
    """Run the PacBio data with a reference and ``ignore_read_groups=True``.

    Output is discarded to /dev/null; the test passes if no exception is
    raised.
    """
    variants = "tests/data/pacbio/variants.vcf"
    reads = ["tests/data/pacbio/pacbio.bam"]
    reference_fasta = "tests/data/pacbio/reference.fasta"
    run_whatshap(
        variant_file=variants,
        phase_input_files=reads,
        reference=reference_fasta,
        ignore_read_groups=True,
        output="/dev/null",
        algorithm=algorithm,
    )
Пример #18
0
def test_vcf_with_missing_headers(algorithm):
    """Expect ``CommandLineError`` for the VCF with missing headers.

    Since pysam 0.16, this type of invalid VCF is no longer accepted.
    """
    reads = ["tests/data/oneread.bam"]
    invalid_vcf = "tests/data/missing-headers.vcf"
    with raises(CommandLineError):
        run_whatshap(
            phase_input_files=reads,
            variant_file=invalid_vcf,
            output="/dev/null",
            algorithm=algorithm,
        )
Пример #19
0
def test_requested_sample_not_found(algorithm):
    """Expect ``CommandLineError`` when the sample ``DOES_NOT_EXIST`` is requested."""
    reads = ["tests/data/oneread.bam"]
    variants = "tests/data/onevariant.vcf"
    requested = ["DOES_NOT_EXIST"]
    with raises(CommandLineError):
        run_whatshap(
            phase_input_files=reads,
            variant_file=variants,
            output="/dev/null",
            samples=requested,
            algorithm=algorithm,
        )
Пример #20
0
def test_with_read_merging(algorithm):
    """Run the PacBio data with a reference and ``read_merging=True``.

    Output is discarded to /dev/null; the test passes if no exception is
    raised.
    """
    reads = ["tests/data/pacbio/pacbio.bam"]
    variants = "tests/data/pacbio/variants.vcf"
    reference_fasta = "tests/data/pacbio/reference.fasta"
    run_whatshap(
        phase_input_files=reads,
        variant_file=variants,
        reference=reference_fasta,
        output="/dev/null",
        read_merging=True,
        algorithm=algorithm,
    )
Пример #21
0
def test_phase_trio_zero_distance(tmp_path):
    """Pedigree-phase the trio with the zero-genetic-distance map.

    Only checks that the output VCF file exists afterwards.
    """
    outvcf = tmp_path / "output.vcf"
    zero_distance_map = "tests/data/zero-genetic-distance.map"
    run_whatshap(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap=zero_distance_map,
    )
    output_written = os.path.isfile(outvcf)
    assert output_written
Пример #22
0
def test_phase_trio_hapchat():
    """Expect ``CommandLineError`` when combining ``ped`` with ``algorithm="hapchat"``.

    The first argument of the raised error must contain
    "cannot do pedigree phasing".
    """
    with raises(CommandLineError) as excinfo:
        run_whatshap(
            phase_input_files=[trio_bamfile],
            variant_file="tests/data/trio.vcf",
            output="/dev/null",
            ped="tests/data/trio.ped",
            algorithm="hapchat",
        )
    message = excinfo.value.args[0]
    assert "cannot do pedigree phasing" in message
Пример #23
0
def test_wrong_chromosome(algorithm, tmp_path):
    """Expect ``CommandLineError`` for the wrong-chromosome VCF with the short BAM."""
    outvcf = tmp_path / "output.vcf"
    mismatched_vcf = "tests/data/short-genome/wrongchromosome.vcf"
    with raises(CommandLineError):
        run_whatshap(
            phase_input_files=[short_bamfile],
            ignore_read_groups=True,
            variant_file=mismatched_vcf,
            output=outvcf,
            algorithm=algorithm,
        )
Пример #24
0
def test_cram_no_reference(algorithm):
    """Expect ``CommandLineError`` for CRAM input without a reference.

    This needs to fail because CRAM requires a reference, but it was not
    given.

    If REF_PATH is not set, pysam/htslib tries to retrieve the reference
    from EBI via the internet, so REF_PATH is pointed at a non-existent
    location for the duration of the call.
    """
    # Remember the previous value so the override does not leak into other
    # tests running in the same process.
    previous_ref_path = os.environ.get("REF_PATH")
    os.environ["REF_PATH"] = "/does/not/exist"
    try:
        with raises(CommandLineError):
            run_whatshap(
                phase_input_files=["tests/data/oneread.cram"],
                variant_file="tests/data/onevariant.vcf",
                output="/dev/null",
                algorithm=algorithm,
            )
    finally:
        if previous_ref_path is None:
            os.environ.pop("REF_PATH", None)
        else:
            os.environ["REF_PATH"] = previous_ref_path
Пример #25
0
def test_do_not_phase_duplicate_position(algorithm, tmpdir):
    """Ensure HP tag is added only to first of duplicate positions.

    The output of phasing the duplicate-positions VCF must contain 4
    records, and no record whose start position was already seen in an
    earlier record may have "HP" in its format fields.
    """
    tmpvcf = str(tmpdir.join("duplicate-positions-phased.vcf"))
    run_whatshap(
        phase_input_files=["tests/data/oneread.bam"],
        variant_file="tests/data/duplicate-positions.vcf",
        output=tmpvcf,
        algorithm=algorithm,
    )
    seen_positions = set()
    # Use the context manager so the VCF handle is closed even when an
    # assertion fails; the records are only inspected while it is open.
    with pysam.VariantFile(tmpvcf) as vcf:
        records = list(vcf)
        assert len(records) == 4
        for record in records:
            assert not (record.start in seen_positions and "HP" in record.format)
            seen_positions.add(record.start)
Пример #26
0
def test_ps_tag(algorithm, expected_lines, tmpdir):
    """Phase the trio VCF with ``tag="PS"`` and compare the first five records.

    The first five non-header lines of the output must equal the first five
    entries of ``expected_lines``.
    """
    out = str(tmpdir.join("out.vcf"))
    run_whatshap(
        variant_file="tests/data/trio.vcf",
        phase_input_files=["tests/data/trio.pacbio.bam"],
        output=out,
        tag="PS",
        algorithm=algorithm,
    )
    record_lines = []
    with open(out) as vcf_file:
        for text in vcf_file:
            if text.startswith("#"):
                continue
            record_lines.append(text)

    # TODO This is quite an ugly way to test phased VCF writing (see parametrization)
    for index in range(5):
        assert record_lines[index] == expected_lines[index]
Пример #27
0
def test_distrust_genotypes_assertion(tmp_path):
    """Phase the dist-geno data with ``indels=False`` and check NA12878.

    The output must hold one table for chromosome "chr1" in which the
    second and fourth variants of NA12878 are phased in the block starting
    at 23824647 and the other two are unphased.
    """
    outvcf = tmp_path / "output.vcf"
    run_whatshap(
        indels=False,
        phase_input_files=[dist_geno_bamfile],
        variant_file="tests/data/test_dist_geno.vcf",
        output=outvcf,
    )
    assert os.path.isfile(outvcf)

    parsed = list(VcfReader(outvcf, phases=True))
    assert len(parsed) == 1
    table = parsed[0]
    assert table.chromosome == "chr1"

    block = VariantCallPhase(23824647, (0, 1), None)
    expected = [None, block, None, block]
    assert_phasing(table.phases_of("NA12878"), expected)
Пример #28
0
def test_phase_quartet_recombination_breakpoints(expect_recombination,
                                                 parameters, tmp_path):
    """Pedigree-phase the quartet and check phasing and the recombination list.

    ``parameters`` is forwarded to ``run_whatshap`` as extra keyword
    arguments.  ``expect_recombination`` selects the expected phase of the
    last HG003 variant and whether the recombination list must contain two
    entries after its first line (HG002 and HG005) or only that first line.
    """
    outvcf = tmp_path / "output-recombination_breaks.vcf"
    outlist = tmp_path / "output.recomb"
    run_whatshap(
        phase_input_files=[recombination_breaks_bamfile],
        variant_file="tests/data/quartet.vcf.gz",
        output=outvcf,
        ped="tests/data/recombination_breaks.ped",
        recombination_list_filename=outlist,
        **parameters,
    )
    assert os.path.isfile(outvcf)

    tables = list(VcfReader(outvcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 4
    assert table.samples == ["HG002", "HG005", "HG003", "HG004"]
    assert table.num_of_blocks_of("HG002") == 0
    assert table.num_of_blocks_of("HG005") == 0
    assert table.num_of_blocks_of("HG003") == 1
    assert table.num_of_blocks_of("HG004") == 0

    phase0 = VariantCallPhase(68735304, (0, 1), None)
    phase1 = VariantCallPhase(68735304, (1, 0), None)

    assert_phasing(table.phases_of("HG002"), [None, None, None, None])
    assert_phasing(table.phases_of("HG005"), [None, None, None, None])
    if expect_recombination:
        assert_phasing(table.phases_of("HG003"),
                       [phase0, phase0, None, phase1])
    else:
        assert_phasing(table.phases_of("HG003"),
                       [phase0, phase0, None, phase0])
    assert_phasing(table.phases_of("HG004"), [None, None, None, None])

    # Read via a context manager so the file handle is closed promptly.
    with open(outlist) as recomb_file:
        lines = recomb_file.readlines()
    if expect_recombination:
        assert len(lines) == 3
        assert lines[1] == "HG002 1 68735433 68738308 0 1 0 0 3\n"
        assert lines[2] == "HG005 1 68735433 68738308 0 1 0 0 3\n"
    else:
        assert len(lines) == 1
Пример #29
0
def test_genetic_haplotyping(tmp_path):
    """Phase from the pedigree alone (no read input) and check the result.

    ``phase_input_files`` is empty.  The output must hold one table for
    chromosome "1" with 3 variants and five samples, the block counts and
    phasings listed below, and a recombination list whose single data row
    reports sampleC on chromosome "1" between positions 31295 and 102596.
    """
    outvcf = tmp_path / "output.vcf"
    outrecomb = tmp_path / "utput.recomb"
    run_whatshap(
        variant_file="tests/data/genetic-haplotyping.vcf",
        phase_input_files=[],
        ped="tests/data/genetic-haplotyping.ped",
        output=outvcf,
        recombination_list_filename=outrecomb,
    )
    tables = list(VcfReader(outvcf, phases=True))

    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 3
    assert table.samples == [
        "sampleA", "sampleB", "sampleC", "sampleD", "sampleE"
    ]
    assert table.num_of_blocks_of("sampleA") == 1
    assert table.num_of_blocks_of("sampleB") == 1
    assert table.num_of_blocks_of("sampleC") == 0
    assert table.num_of_blocks_of("sampleD") == 1
    assert table.num_of_blocks_of("sampleE") == 1

    phase0 = VariantCallPhase(10327, (0, 1), None)
    phase1 = VariantCallPhase(10327, (1, 0), None)

    assert_phasing(table.phases_of("sampleA"), [phase0, phase0, phase1])
    assert_phasing(table.phases_of("sampleB"), [phase0, None, None])
    assert_phasing(table.phases_of("sampleC"), [None, None, None])
    assert_phasing(table.phases_of("sampleD"), [phase0, None, phase1])
    assert_phasing(table.phases_of("sampleE"), [phase0, phase0, None])

    # Read via a context manager so the file handle is closed promptly.
    with open(outrecomb) as recomb_file:
        lines = [row.split() for row in recomb_file]
    assert len(lines) == 2
    # The first line supplies the column names (with a leading "#").
    Fields = namedtuple("Fields", [f.strip("#\n") for f in lines[0]])
    recomb = Fields(*lines[1])
    print(recomb)
    assert recomb.child_id == "sampleC"
    assert recomb.chromosome == "1"
    assert recomb.position1 == "31295"
    assert recomb.position2 == "102596"
Пример #30
0
def test_phase_specific_chromosome(chromosome, tmp_path):
    """Pedigree-phase only the requested chromosome of a two-chromosome VCF.

    The output must contain two tables with 5 variants each.  The table
    whose chromosome equals ``chromosome`` ("1" or "2") must carry the
    phasings listed for it; any other table must be entirely unphased.
    """
    outvcf = tmp_path / "output.vcf"
    run_whatshap(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio-two-chromosomes.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        chromosomes=[chromosome],
    )
    assert os.path.isfile(outvcf)

    parsed = list(VcfReader(outvcf, phases=True))
    assert len(parsed) == 2
    for table in parsed:
        assert len(table.variants) == 5
        assert table.samples == ["HG004", "HG003", "HG002"]

        if table.chromosome == "1" and "1" == chromosome:
            fwd = VariantCallPhase(60906167, (0, 1), None)
            expected_phasings = (
                ("HG004", [fwd, fwd, fwd, fwd, fwd]),
                ("HG003", [fwd, None, fwd, fwd, fwd]),
                ("HG002", [None, fwd, None, None, None]),
            )
        elif table.chromosome == "2" and "2" == chromosome:
            fwd = VariantCallPhase(60906167, (0, 1), None)
            rev = VariantCallPhase(60906167, (1, 0), None)
            expected_phasings = (
                ("HG004", [fwd, None, None, None, rev]),
                ("HG003", [fwd, None, None, None, None]),
                ("HG002", [None, None, None, None, fwd]),
            )
        else:
            # Tables for chromosomes that were not requested stay unphased.
            expected_phasings = (
                ("HG004", [None, None, None, None, None]),
                ("HG003", [None, None, None, None, None]),
                ("HG002", [None, None, None, None, None]),
            )

        for sample, expected in expected_phasings:
            assert_phasing(table.phases_of(sample), expected)