Exemplo n.º 1
0
def test_gt_quality_threshold(threshold, tmpdir):
    thres = 1 - 10**(-threshold / 10.0)

    out_vcf = str(tmpdir.join("out.vcf"))
    priors_vcf = str(tmpdir.join("priors.vcf"))
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        output=out_vcf,
        gt_qual_threshold=threshold,
        indels=False,
        prioroutput=priors_vcf,
    )

    for path in [out_vcf, priors_vcf]:
        for record in VariantFile(path):
            for call in record.samples.values():
                likelihoods = call["GL"]
                genotype = call["GT"]
                if genotype == (None, ):
                    genotype = None
                else:
                    genotype = genotype[0] + genotype[1]
                gt = likeliest_genotype(likelihoods[0], likelihoods[1],
                                        likelihoods[2], thres)
                # print(likelihoods[0], likelihoods[1], likelihoods[2], gt, genotype, thres)
                assert gt == genotype
Exemplo n.º 2
0
def test_genotype_likelihoods_given(tmp_path):
    outvcf = tmp_path / "output_gl.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_likelihoods.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
    )
    assert os.path.isfile(outvcf)
    tables = list(VcfReader(outvcf, phases=True, genotype_likelihoods=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    # check if PL likelihoods (that were present before) are deleted
    vcf_reader = VariantFile(outvcf)
    # print(list(vcf_reader.samples), outvcf)
    for record in vcf_reader:
        for call in record.samples.values():
            PL = call.get("PL", None)
            GL = call.get("GL", None)
            print("GL:", GL, "PL:", PL)
            assert PL == (None, None, None)
            assert GL is not None
Exemplo n.º 3
0
def test_genotype_log_likelihoods_given(tmp_path):
    outvcf = tmp_path / "output_gl_log.vcf"
    outpriors = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_log_likelihoods.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        gt_qual_threshold=0,
        prioroutput=outpriors,
    )
    for outfile in [outvcf, outpriors]:
        assert os.path.isfile(outfile)
        tables = list(
            VcfReader(outfile, phases=True, genotype_likelihoods=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == "1"
        assert len(table.variants) == 5
        assert table.samples == ["HG004", "HG003", "HG002"]

        # check if GL likelihoods were replaced
        vcf_reader = VariantFile(outfile)
        print(list(vcf_reader.header.samples), outfile)
        for record in vcf_reader:
            for call in record.samples.values():
                GL = call.get("GL", None)
                GQ = call.get("GQ", None)
                print("GL:", GL, "GQ", GQ)
                assert GL != [-1, -1, -1]
                assert GQ != 100
Exemplo n.º 4
0
def test_bam_without_readgroup():
    run_genotype(
        phase_input_files=["tests/data/no-readgroup.bam"],
        variant_file="tests/data/onevariant.vcf",
        output="/dev/null",
        ignore_read_groups=True,
    )
Exemplo n.º 5
0
def test_genotyping_specific_chromosome(chromosome, tmp_path):
    outvcf = tmp_path / "output.vcf"
    outpriors = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio-two-chromosomes.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        chromosomes=[chromosome],
        prioroutput=outpriors,
    )
    for outfile in [outvcf, outpriors]:
        assert os.path.isfile(outfile)
        tables = list(VcfReader(outfile, genotype_likelihoods=True))
        assert len(tables) == 2
        for table in tables:
            assert len(table.variants) == 5
            assert table.samples == ["HG004", "HG003", "HG002"]

        index = 0
        if chromosome == "1":
            index = 1

        # should be no genotype likelihoods for skipped chromosomes
        for s in tables[index].samples:
            assert tables[index].genotype_likelihoods_of(s) == [None] * 5
            assert tables[not index].genotype_likelihoods_of(s) != [None] * 5
Exemplo n.º 6
0
def test_ped_sample(sample_set, tmp_path):
    # running with --ped and --sample on subset of trio,
    # should give same results as running with only --sample
    # the trio information should be ignored
    outvcf1 = tmp_path / "output1.vcf"
    outvcf2 = tmp_path / "output2.vcf"
    run_genotype(
        phase_input_files=[ped_samples_bamfile],
        variant_file="tests/data/ped_samples.vcf",
        output=outvcf1,
        ped="tests/data/trio.ped",
        samples=sample_set,
    )
    run_genotype(
        phase_input_files=[ped_samples_bamfile],
        variant_file="tests/data/ped_samples.vcf",
        output=outvcf2,
        samples=sample_set,
    )
    assert os.path.isfile(outvcf1)
    assert os.path.isfile(outvcf2)
    tables1 = list(VcfReader(outvcf1, phases=True, genotype_likelihoods=True))
    tables2 = list(VcfReader(outvcf2, phases=True, genotype_likelihoods=True))
    assert (len(tables1) == 1) and (len(tables2) == 1)
    table1, table2 = tables1[0], tables2[0]

    for individual in sample_set:
        for var1, var2 in zip(
                table1.genotype_likelihoods_of(individual),
                table2.genotype_likelihoods_of(individual),
        ):
            print(var1, var2)
            assert var1.log10_probs() == var2.log10_probs()
Exemplo n.º 7
0
def test_genotyping_one_of_three_individuals(tmp_path):
    outvcf = tmp_path / "output.vcf"
    outpriors = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        output=outvcf,
        samples=["HG003"],
        prioroutput=outpriors,
    )

    for outfile in [outvcf, outpriors]:
        assert os.path.isfile(outfile)

        tables = list(
            VcfReader(outfile, phases=True, genotype_likelihoods=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == "1"
        assert len(table.variants) == 5
        assert table.samples == ["HG004", "HG003", "HG002"]

        # there should be no genotype predictions for HG003/HG002
        default_l = math.log10(1 / 3.0)
        for l in [
                table.genotype_likelihoods_of("HG002"),
                table.genotype_likelihoods_of("HG004"),
        ]:
            for var in l:
                for v in var.log10_probs():
                    assert pytest.approx(default_l) == v
Exemplo n.º 8
0
def test_no_indels(tmpdir, priors):
    prioroutput = str(tmpdir.join("priors.vcf")) if priors else None
    outvcf = str(tmpdir.join("output_gl.vcf"))
    run_genotype(
        phase_input_files=["tests/data/pacbio/pacbio.bam"],
        variant_file="tests/data/pacbio/variants.vcf",
        reference="tests/data/pacbio/reference.fasta",
        output=outvcf,
        indels=False,
        nopriors=not priors,
        prioroutput=prioroutput,
    )
    result_vcfs = [outvcf]
    if priors:
        result_vcfs.append(prioroutput)

    # make sure indels not genotyped (also in priors.vcf if computed)
    for o_vcf in result_vcfs:
        vcf_reader = VariantFile(o_vcf)
        default_l = math.log10(1 / 3.0)

        for record in vcf_reader:
            if len(record.alts[0]) != len(record.ref):
                for call in record.samples.values():
                    for v in call["GL"]:
                        assert pytest.approx(default_l) == v
Exemplo n.º 9
0
def test_requested_sample_not_found():
    with pytest.raises(CommandLineError):
        run_genotype(
            phase_input_files=["tests/data/oneread.bam"],
            variant_file="tests/data/onevariant.vcf",
            output="/dev/null",
            samples=["DOES_NOT_EXIST"],
        )
Exemplo n.º 10
0
def test_wrong_chromosome(tmp_path):
    outvcf = tmp_path / "output.vcf"
    with pytest.raises(CommandLineError):
        run_genotype(
            phase_input_files=[short_bamfile],
            ignore_read_groups=True,
            variant_file="tests/data/short-genome/wrongchromosome.vcf",
            output=outvcf,
        )
Exemplo n.º 11
0
def test_empty_format_field(tmp_path):
    outvcf = tmp_path / "output_empty_format.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/empty_format.vcf",
        output=outvcf,
        gt_qual_threshold=0,
    )

    # check if sample fields now contain information
    assert os.path.isfile(outvcf)
    vcf_reader = VariantFile(outvcf)
    for record in vcf_reader:
        for sample, call in record.samples.items():
            assert set(call) == {"GT", "GL", "GQ"}
Exemplo n.º 12
0
def test_multiallelic(tmpdir):
    outvcf = str(tmpdir.join("output_multi.vcf"))
    run_genotype(
        phase_input_files=["tests/data/pacbio/pacbio.bam"],
        variant_file="tests/data/multiallelic.vcf",
        reference="tests/data/pacbio/reference.fasta",
        output=outvcf,
        indels=False,
    )
    vcf_reader = VariantFile(outvcf)
    for record in vcf_reader:
        n_alleles = len(record.alts) + 1
        if n_alleles > 1:
            for call in record.samples.values():
                assert len(call["GL"]) == ((n_alleles + 1) * n_alleles) / 2
Exemplo n.º 13
0
def test_phase_trio_paired_end_reads(tmp_path):
    outvcf = tmp_path / "output-paired_end.vcf"
    run_genotype(
        phase_input_files=[trio_paired_end_bamfile],
        variant_file="tests/data/paired_end.sorted.vcf",
        output=outvcf,
        ped="tests/data/trio_paired_end.ped",
        genmap="tests/data/trio.map",
    )
    assert os.path.isfile(outvcf)
    tables = list(VcfReader(outvcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 3
    assert table.samples == ["mother", "father", "child"]
Exemplo n.º 14
0
def test_adding_constant(constant, tmpdir):
    priors_raw_vcf = str(tmpdir.join("output.raw_priors.vcf"))
    outvcf_raw_vcf = str(tmpdir.join("output_raw.vcf"))
    priors_const_vcf = str(tmpdir.join("output.const_priors.vcf"))
    outvcf_const_vcf = str(tmpdir.join("output_raw.vcf"))

    # run genotyping without adding constant to priors
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        prioroutput=priors_raw_vcf,
        output=outvcf_raw_vcf,
        indels=False,
    )

    # run genotyping with modified priors
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        prioroutput=priors_const_vcf,
        output=outvcf_const_vcf,
        indels=False,
        constant=constant,
    )

    # check if priors were modified properly
    with pysam.VariantFile(priors_raw_vcf) as f:
        records_raw = list(f)
    with pysam.VariantFile(priors_const_vcf) as f:
        records_const = list(f)

    assert len(records_raw) == len(records_const)

    for record_raw, record_const in zip(records_raw, records_const):
        likelihoods_raw = extract_likelihoods(record_raw)
        likelihoods_const = extract_likelihoods(record_const)
        norm_sum = likelihoods_raw[0] + likelihoods_raw[1] + likelihoods_raw[
            2] + 3.0 * constant
        # print(float(likelihoods_const[0]), float((likelihoods_raw[0] + constant)/norm_sum))
        # print(float(likelihoods_const[1]), float((likelihoods_raw[1] + constant)/norm_sum))
        # print(float(likelihoods_const[2]), float((likelihoods_raw[2] + constant)/norm_sum))

        for j in range(3):
            assert (pytest.approx(likelihoods_const[j],
                                  1e-5) == (likelihoods_raw[j] + constant) /
                    norm_sum)
Exemplo n.º 15
0
def test_genotyping_trio(tmp_path):
    outvcf = tmp_path / "output.vcf"
    outpriors = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        prioroutput=outpriors,
    )

    for outfile in [outvcf, outpriors]:
        assert os.path.isfile(outfile)

        tables = list(VcfReader(outfile, phases=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == "1"
        assert len(table.variants) == 5
        assert table.samples == ["HG004", "HG003", "HG002"]
Exemplo n.º 16
0
def test_use_ped_samples(tmp_path):
    outvcf = tmp_path / "output_ped_samples.vcf"
    run_genotype(
        phase_input_files=[ped_samples_bamfile],
        variant_file="tests/data/ped_samples.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        use_ped_samples=True,
    )
    assert os.path.isfile(outvcf)

    tables = list(VcfReader(outvcf, phases=True, genotype_likelihoods=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002", "orphan"]

    default_l = math.log10(1 / 3.0)
    for var in table.genotype_likelihoods_of("orphan"):
        for v in var.log10_probs():
            assert pytest.approx(default_l) == v
Exemplo n.º 17
0
def test_one_variant():
    run_genotype(
        phase_input_files=["tests/data/oneread.bam"],
        variant_file="tests/data/onevariant.vcf",
        output="/dev/null",
    )
Exemplo n.º 18
0
def test_with_reference():
    run_genotype(
        phase_input_files=["tests/data/pacbio/pacbio.bam"],
        variant_file="tests/data/pacbio/variants.vcf",
        reference="tests/data/pacbio/reference.fasta",
    )
Exemplo n.º 19
0
def test_default_output():
    """Output to stdout"""
    run_genotype(
        phase_input_files=["tests/data/oneread.bam"],
        variant_file="tests/data/onevariant.vcf",
    )