Example #1
# counts and expfile are supplied by pytest parametrization (decorator not shown in this excerpt)
def test_convert_counts(tmp_path, counts, expfile):
    csvfile = str(tmp_path / "out.csv")
    result = TypingResult(fromfile=data_file("prof/deep-filt-clean.json"))
    result.dump_csv(csvfile, "MySample", counts=counts)
    observed = pd.read_csv(csvfile)
    expected = pd.read_csv(data_file(expfile))
    assert observed.equals(expected)
Example #2
def test_pipe_gbr_usc10(tmp_path):
    hg38 = str(tmp_path / "hg38-placeholder.fasta")
    copyfile(data_file("refr/usc10-refr.fna"), hg38)
    run(["bwa", "index", hg38])
    arglist = [
        "pipe",
        data_file("refr/usc10-refr.fna"),
        data_file("def/usc10-offsets.tsv"),
        data_file(""),
        "gbr-usc",
        "--workdir",
        str(tmp_path),
        "--threads",
        "1",
        "--hg38",
        hg38,
    ]
    args = microhapulator.cli.get_parser().parse_args(arglist)
    microhapulator.cli.pipe.main(args)
    expected = SimulatedProfile(fromfile=data_file("prof/gbr-usc10-sim.json"))
    observed = TypingResult(fromfile=tmp_path / "analysis" / "gbr-usc" /
                            "gbr-usc-type.json")
    diff = list(mhapi.diff(observed, expected))
    assert len(diff) == 0
    assert (tmp_path / "report.html").is_file()
    expected = pd.read_csv(data_file("gbr-usc-summary.tsv"), sep="\t")
    observed = pd.read_csv(tmp_path / "analysis" / "summary.tsv", sep="\t")
    assert observed.equals(expected)
Example #3
def test_filter_dupl_marker():
    config = pd.read_csv(data_file("filters-redundant.csv"),
                         sep=None,
                         engine="python")
    result = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    message = "filter config file contains duplicate entries for some markers"
    with pytest.raises(ValueError, match=message):
        result.filter(static=5, dynamic=0.02, config=config)
Example #4
def test_typing_rate():
    result = TypingResult(fromfile=data_file("prof/two-contrib-even.json"))
    rates = result.typing_rate()
    assert rates.TypedReads.head(5).to_list() == [3427, 2653, 3241, 4105, 3819]
    assert rates.TotalReads.head(5).to_list() == [4550, 4540, 4539, 4531, 4538]
    expected = [0.753187, 0.584361, 0.714034, 0.905981, 0.841560]
    observed = rates.TypingRate.head(5).to_list()
    for exp, obs in zip(expected, observed):
        assert exp == pytest.approx(obs)
Example #5
def test_filter_missing_column():
    config = pd.read_csv(data_file("filters-missing.csv"),
                         sep=None,
                         engine="python")
    result = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    with pytest.raises(
            ValueError,
            match=r"filter config file missing column\(s\): Static"):
        result.filter(static=5, dynamic=0.02, config=config)
Example #6
def test_haplotypes():
    simprof = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typeprof = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert simprof.haplotypes("BoGuSlOcUs") == set()
    assert typeprof.haplotypes("BoGuSlOcUs") == set()
    assert simprof.haplotypes("MHDBL000135") == set(["G,C,T", "G,T,C"])
    assert typeprof.haplotypes("MHDBL000135") == set(["G,C,T", "G,T,C"])
    assert simprof.haplotypes("MHDBL000135", index=0) == set(["G,C,T"])
    assert simprof.haplotypes("MHDBL000135", index=1) == set(["G,T,C"])
    assert typeprof.haplotypes("MHDBL000135", index=0) == set()
Example #7
def test_type_cli_simple(tmp_path):
    outfile = str(tmp_path / "typing-result.json")
    arglist = [
        "type",
        "--out",
        outfile,
        data_file("pashtun-sim/tiny-panel.tsv"),
        data_file("pashtun-sim/aligned-reads.bam"),
    ]
    args = microhapulator.cli.get_parser().parse_args(arglist)
    microhapulator.cli.type.main(args)
    result = TypingResult(fromfile=outfile)
    assert result.haplotypes("mh13KK-218") == set()
    assert result.data["markers"]["mh13KK-218"]["typing_result"] == {
        "C,T,C,G": 1,
        "C,T,T,T": 1,
        "G,T,C,T": 1,
        "T,A,C,T": 1,
        "T,G,C,T": 3,
        "T,G,T,T": 2,
        "T,T,A,T": 5,
        "T,T,C,A": 1,
        "T,T,C,C": 2,
        "T,T,C,G": 2,
        "T,T,C,T": 1178,
        "T,T,G,T": 2,
        "T,T,T,A": 2,
        "T,T,T,G": 6,
        "T,T,T,T": 1170,
    }
    assert result.haplotypes("mh21KK-320") == set()
    assert result.data["markers"]["mh21KK-320"]["typing_result"] == {
        "G,A,A,A": 1,
        "G,A,C,A": 3,
        "G,A,G,A": 3,
        "G,A,T,A": 1075,
        "G,A,T,C": 1,
        "G,A,T,G": 1,
        "G,A,T,T": 2,
        "G,C,C,A": 1,
        "G,C,T,A": 4,
        "G,G,A,A": 2,
        "G,G,A,T": 1,
        "G,G,C,A": 1075,
        "G,G,C,C": 3,
        "G,G,C,G": 12,
        "G,G,C,T": 5,
        "G,G,T,A": 4,
        "G,T,C,A": 1,
        "T,G,C,A": 1,
    }
Example #8
def test_filter_simple():
    bam = data_file("pashtun-sim/aligned-reads.bam")
    tsv = data_file("pashtun-sim/tiny-panel.tsv")
    observed = mhapi.type(bam, tsv)
    observed.filter(static=10, dynamic=0.05)
    expected = TypingResult(fromfile=data_file("pashtun-sim/test-output.json"))
    assert observed == expected
Example #9
def test_dist_even_mixture():
    with microhapulator.open(data_file("murica/x-obs-genotype.json"),
                             "r") as fh:
        p1 = TypingResult(fh)
    p2 = SimulatedProfile.populate_from_bed(
        data_file("murica/x-sim-genotype.bed"))
    assert mhapi.dist(p1, p2) == 0
    assert p1 == p2
Example #10
def test_filter_cli_config(tmp_path):
    unfiltered = data_file("prof/deep-raw.json")
    filtered = str(tmp_path / "genotype-call.json")
    arglist = [
        "filter",
        unfiltered,
        "--out",
        filtered,
        "--static",
        "5",
        "--dynamic",
        "0.02",
        "--config",
        data_file("filters.csv"),
    ]
    args = microhapulator.cli.get_parser().parse_args(arglist)
    microhapulator.cli.filter.main(args)
    observed = TypingResult(fromfile=filtered)
    expected = TypingResult(fromfile=data_file("prof/deep-filt.json"))
    assert observed == expected
Example #11
def test_filter_cli(tmp_path):
    unfiltered = str(tmp_path / "typing-result.json")
    filtered = str(tmp_path / "genotype-call.json")
    arglist = [
        "type",
        "--out",
        unfiltered,
        data_file("pashtun-sim/tiny-panel.tsv"),
        data_file("pashtun-sim/aligned-reads.bam"),
    ]
    args = microhapulator.cli.get_parser().parse_args(arglist)
    microhapulator.cli.type.main(args)
    arglist = [
        "filter", "--out", filtered, "--static", "5", "--dynamic", "0.05",
        unfiltered
    ]
    args = microhapulator.cli.get_parser().parse_args(arglist)
    microhapulator.cli.filter.main(args)
    observed = TypingResult(fromfile=filtered)
    expected = TypingResult(fromfile=data_file("pashtun-sim/test-output.json"))
    assert observed == expected
Example #12
def main(args):
    result = TypingResult(fromfile=args.result)
    config = None
    if args.config:
        config = pd.read_csv(args.config, sep=None, engine="python")
    result.filter(static=args.static, dynamic=args.dynamic, config=config)
    result.dump(args.out)
Example #13
def type(bamfile, markertsv, minbasequal=10, max_depth=1e6):
    """Perform haplotype calling

    :param str bamfile: path of a BAM file containing NGS reads aligned to marker reference sequences and sorted
    :param str markertsv: path of a TSV file containing marker metadata, specifically the offset of each SNP for every marker in the panel
    :param int minbasequal: minimum base quality (PHRED score) to be considered reliable for haplotype calling; default is 10, corresponding to Q10, i.e., 90% probability that the base call is correct
    :param float max_depth: maximum permitted read depth
    :returns: an unfiltered catalog of haplotype counts for each marker (a *typing result*)
    :rtype: microhapulator.profile.TypingResult
    """
    check_index(bamfile)
    bam = pysam.AlignmentFile(bamfile, "rb")
    markers = load_marker_definitions(markertsv)
    offsets = defaultdict(list)
    for n, row in markers.iterrows():
        offsets[row.Marker].append(row.Offset)
    cross_check_marker_ids(bam.references, offsets.keys(), "read alignments",
                           "marker definitions")
    haplotype_caller = tally_haplotypes(bam,
                                        offsets,
                                        minbasequal=minbasequal,
                                        max_depth=max_depth)
    result = TypingResult()
    for locusid, cov_by_pos, htcounts, ndiscarded in haplotype_caller:
        result.record_coverage(locusid, cov_by_pos, ndiscarded=ndiscarded)
        for haplotype, count in htcounts.items():
            result.record_haplotype(locusid, haplotype, count)
    return result
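
A minimal usage sketch for the type API documented above, assuming the module layout used elsewhere in these examples (microhapulator.api imported as mhapi); the input paths and filter thresholds below are illustrative placeholders, not values taken from the project.

# Sketch: call the type API, then filter and serialize the result,
# mirroring the calls shown in Examples #8, #12, and #14.
# The input paths are hypothetical placeholders.
import microhapulator.api as mhapi

result = mhapi.type("aligned-reads.bam", "marker-definitions.tsv", minbasequal=10)
result.filter(static=5, dynamic=0.02)  # apply static and dynamic detection thresholds
result.dump("typing-result.json")      # write the TypingResult to a JSON file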
Example #14
def test_filter_config_file():
    config = pd.read_csv(data_file("filters.csv"), sep=None, engine="python")
    result = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    result.filter(static=5, dynamic=0.02, config=config)
    assert len(result.haplotypes("mh01XYZ-1")) == 8
    assert len(result.haplotypes("mh02XYZ-2")) == 2
    assert len(result.haplotypes("mh02XYZ-3")) == 2
Example #15
def test_sim_obs_profile_not_equal():
    simprof1 = SimulatedProfile.populate_from_bed(
        data_file("gttest-mismatch1.bed.gz"))
    assert simprof1 is not None
    assert simprof1 != 42
    assert simprof1 != 3.14159
    assert simprof1 != "A,C,C,T"

    typeprof1 = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert simprof1 != typeprof1
    assert typeprof1 != simprof1
    assert typeprof1 != 1985
    assert typeprof1 != 98.6

    simprof2 = SimulatedProfile.populate_from_bed(
        data_file("gttest-mismatch2.bed.gz"))
    assert simprof1 != simprof2
    assert simprof2 != typeprof1
    assert typeprof1 != simprof2

    typeprof2 = TypingResult(fromfile=data_file("prof/gttest-altered.json"))
    assert typeprof1 != typeprof2
Example #16
def main(args):
    result = TypingResult(fromfile=args.input)
    chisq, data = mhapi.interlocus_balance(
        result,
        include_discarded=args.discarded,
        terminal=not args.quiet,
        tofile=args.figure,
        title=args.title,
        figsize=args.figsize,
        dpi=args.dpi,
    )
    print(f"Extent of imbalance (chi-square statistic): {chisq:.4f}")
    if args.csv:
        data.to_csv(args.csv, index=False)
Example #17
def main(args):
    result = TypingResult(fromfile=args.input)
    tstat, data = mhapi.heterozygote_balance(
        result,
        tofile=args.figure,
        title=args.title,
        figsize=args.figsize,
        dpi=args.dpi,
        dolabels=args.labels,
        absolute=args.absolute,
    )
    print(f"Extent of imbalance (t-statistic): {tstat:.4f}")
    if args.csv:
        data.to_csv(args.csv, index=False)
Example #18
def main(args):
    result = TypingResult(fromfile=args.result)
    result.dump_csv(args.out,
                    args.sample,
                    counts=args.counts,
                    fix_homo=args.fix_homo)
Example #19
def test_dist_log_mixture():
    p1 = TypingResult(data_file("murica/y-obs-genotype.json"))
    p2 = SimulatedProfile.populate_from_bed(
        data_file("murica/y-sim-genotype.bed"))
    assert mhapi.dist(p1, p2) == 19
    assert p1 != p2
Example #20
# gt1, gt2, and dist are supplied by pytest parametrization (decorator not shown in this excerpt)
def test_dist_gujarati(gt1, gt2, dist):
    r1 = TypingResult(data_file(gt1))
    r2 = TypingResult(data_file(gt2))
    assert mhapi.dist(r1, r2) == dist
Example #21
def test_sim_obs_profile_equality():
    simprof = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typeprof = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert simprof == typeprof
    assert typeprof == simprof
Example #22
def test_haploindexes():
    simprof = SimulatedProfile.populate_from_bed(
        data_file("gttest-mismatch1.bed.gz"))
    assert simprof.haploindexes() == set([0, 1])
    typeprof = TypingResult(data_file("pashtun-sim/test-output.json"))
    assert typeprof.haploindexes() == set()