def test_convert_counts(tmp_path, counts, expfile):
    """Dump a typing result to CSV and compare it with a stored table.

    The profile in ``prof/deep-filt-clean.json`` is written to a temporary
    CSV file under the sample name ``MySample`` with the given ``counts``
    setting; the file is then read back and must equal the table stored
    in ``expfile``.
    """
    out_path = str(tmp_path / "out.csv")
    profile = TypingResult(fromfile=data_file("prof/deep-filt-clean.json"))
    profile.dump_csv(out_path, "MySample", counts=counts)
    observed_table = pd.read_csv(out_path)
    expected_table = pd.read_csv(data_file(expfile))
    assert observed_table.equals(expected_table)
def test_pipe_gbr_usc10(tmp_path):
    """Run the ``pipe`` command end to end on the ``gbr-usc`` sample.

    The USC10 reference is copied to a placeholder file in the temporary
    directory and indexed with ``bwa index`` so it can be passed as
    ``--hg38``. After the pipeline runs, the typing result must show no
    differences from the stored simulated profile, an HTML report must
    exist, and the summary table must equal the stored one.
    """
    placeholder = str(tmp_path / "hg38-placeholder.fasta")
    copyfile(data_file("refr/usc10-refr.fna"), placeholder)
    run(["bwa", "index", placeholder])

    positionals = [
        "pipe",
        data_file("refr/usc10-refr.fna"),
        data_file("def/usc10-offsets.tsv"),
        data_file(""),
        "gbr-usc",
    ]
    options = [
        "--workdir",
        str(tmp_path),
        "--threads",
        "1",
        "--hg38",
        placeholder,
    ]
    cli_args = microhapulator.cli.get_parser().parse_args(positionals + options)
    microhapulator.cli.pipe.main(cli_args)

    # Typing result produced by the pipeline vs. the simulated profile.
    sim_profile = SimulatedProfile(fromfile=data_file("prof/gbr-usc10-sim.json"))
    typed_profile = TypingResult(
        fromfile=tmp_path / "analysis" / "gbr-usc" / "gbr-usc-type.json"
    )
    differences = list(mhapi.diff(typed_profile, sim_profile))
    assert len(differences) == 0
    assert (tmp_path / "report.html").is_file()

    # Summary table written by the pipeline vs. the stored table.
    expected_summary = pd.read_csv(data_file("gbr-usc-summary.tsv"), sep="\t")
    observed_summary = pd.read_csv(tmp_path / "analysis" / "summary.tsv", sep="\t")
    assert observed_summary.equals(expected_summary)
def test_filter_dupl_marker():
    """Filtering with the ``filters-redundant.csv`` config must raise ``ValueError``.

    The error message is expected to report duplicate entries for some
    markers in the filter config file.
    """
    filter_config = pd.read_csv(
        data_file("filters-redundant.csv"),
        sep=None,
        engine="python",
    )
    profile = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    expected_error = "filter config file contains duplicate entries for some markers"
    with pytest.raises(ValueError, match=expected_error):
        profile.filter(static=5, dynamic=0.02, config=filter_config)
def test_typing_rate():
    """Check the first five rows of the typing-rate table.

    Typed and total read counts are compared exactly; the typing rates
    are compared approximately, one value at a time.
    """
    profile = TypingResult(fromfile=data_file("prof/two-contrib-even.json"))
    rate_table = profile.typing_rate()

    typed_reads = rate_table.TypedReads.head(5).to_list()
    assert typed_reads == [3427, 2653, 3241, 4105, 3819]
    total_reads = rate_table.TotalReads.head(5).to_list()
    assert total_reads == [4550, 4540, 4539, 4531, 4538]

    expected_rates = [0.753187, 0.584361, 0.714034, 0.905981, 0.841560]
    observed_rates = rate_table.TypingRate.head(5).to_list()
    for want, got in zip(expected_rates, observed_rates):
        assert want == pytest.approx(got)
def test_filter_missing_column():
    """Filtering with the ``filters-missing.csv`` config must raise ``ValueError``.

    The error message is expected to name ``Static`` as the missing
    column.
    """
    filter_config = pd.read_csv(
        data_file("filters-missing.csv"),
        sep=None,
        engine="python",
    )
    profile = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    expected_error = r"filter config file missing column\(s\): Static"
    with pytest.raises(ValueError, match=expected_error):
        profile.filter(static=5, dynamic=0.02, config=filter_config)
def test_haplotypes():
    """Query haplotypes from a simulated profile and from a typing result.

    An unknown marker yields an empty set from both profile types. For
    ``MHDBL000135`` both report the same two haplotypes; with an explicit
    ``index`` the simulated profile returns one haplotype per index while
    the typing result returns an empty set.
    """
    sim_profile = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typed_profile = TypingResult(fromfile=data_file("prof/gttest.json"))

    # Unknown marker identifier.
    assert sim_profile.haplotypes("BoGuSlOcUs") == set()
    assert typed_profile.haplotypes("BoGuSlOcUs") == set()

    # Known marker, all haplotypes.
    assert sim_profile.haplotypes("MHDBL000135") == {"G,C,T", "G,T,C"}
    assert typed_profile.haplotypes("MHDBL000135") == {"G,C,T", "G,T,C"}

    # Known marker, restricted by haplotype index.
    assert sim_profile.haplotypes("MHDBL000135", index=0) == {"G,C,T"}
    assert sim_profile.haplotypes("MHDBL000135", index=1) == {"G,T,C"}
    assert typed_profile.haplotypes("MHDBL000135", index=0) == set()
def test_type_cli_simple(tmp_path):
    """Run the ``type`` command and check the raw haplotype tallies.

    For each of the two markers in the tiny panel, the result (which has
    not been filtered by this test) must report an empty haplotype set
    and the exact per-haplotype counts listed below.
    """
    expected_tallies = {
        "mh13KK-218": {
            "C,T,C,G": 1,
            "C,T,T,T": 1,
            "G,T,C,T": 1,
            "T,A,C,T": 1,
            "T,G,C,T": 3,
            "T,G,T,T": 2,
            "T,T,A,T": 5,
            "T,T,C,A": 1,
            "T,T,C,C": 2,
            "T,T,C,G": 2,
            "T,T,C,T": 1178,
            "T,T,G,T": 2,
            "T,T,T,A": 2,
            "T,T,T,G": 6,
            "T,T,T,T": 1170,
        },
        "mh21KK-320": {
            "G,A,A,A": 1,
            "G,A,C,A": 3,
            "G,A,G,A": 3,
            "G,A,T,A": 1075,
            "G,A,T,C": 1,
            "G,A,T,G": 1,
            "G,A,T,T": 2,
            "G,C,C,A": 1,
            "G,C,T,A": 4,
            "G,G,A,A": 2,
            "G,G,A,T": 1,
            "G,G,C,A": 1075,
            "G,G,C,C": 3,
            "G,G,C,G": 12,
            "G,G,C,T": 5,
            "G,G,T,A": 4,
            "G,T,C,A": 1,
            "T,G,C,A": 1,
        },
    }

    out_path = str(tmp_path / "typing-result.json")
    cli_args = microhapulator.cli.get_parser().parse_args(
        [
            "type",
            "--out",
            out_path,
            data_file("pashtun-sim/tiny-panel.tsv"),
            data_file("pashtun-sim/aligned-reads.bam"),
        ]
    )
    microhapulator.cli.type.main(cli_args)

    typed = TypingResult(fromfile=out_path)
    # Dict insertion order keeps the markers in their original check order.
    for marker, tallies in expected_tallies.items():
        assert typed.haplotypes(marker) == set()
        assert typed.data["markers"][marker]["typing_result"] == tallies
def test_filter_simple():
    """Type the test reads, filter, and compare with the stored output.

    Filtering uses ``static=10`` and ``dynamic=0.05``; the filtered
    result must equal the profile in ``pashtun-sim/test-output.json``.
    """
    reads_bam = data_file("pashtun-sim/aligned-reads.bam")
    panel_tsv = data_file("pashtun-sim/tiny-panel.tsv")
    typed = mhapi.type(reads_bam, panel_tsv)
    typed.filter(static=10, dynamic=0.05)
    reference = TypingResult(fromfile=data_file("pashtun-sim/test-output.json"))
    assert typed == reference
def test_dist_even_mixture():
    """The observed and simulated ``murica/x`` profiles must match.

    The typing result is built from an open file handle rather than from
    a path. The distance between the two profiles must be zero and they
    must compare equal.
    """
    obs_path = data_file("murica/x-obs-genotype.json")
    with microhapulator.open(obs_path, "r") as handle:
        observed_profile = TypingResult(handle)
    simulated_profile = SimulatedProfile.populate_from_bed(
        data_file("murica/x-sim-genotype.bed")
    )
    assert mhapi.dist(observed_profile, simulated_profile) == 0
    assert observed_profile == simulated_profile
def test_filter_cli_config(tmp_path):
    """Run the ``filter`` command with a ``--config`` file.

    The filtered output written by the command must equal the stored
    profile in ``prof/deep-filt.json``.
    """
    raw_path = data_file("prof/deep-raw.json")
    out_path = str(tmp_path / "genotype-call.json")
    command = ["filter", raw_path, "--out", out_path]
    thresholds = ["--static", "5", "--dynamic", "0.02"]
    config_opt = ["--config", data_file("filters.csv")]
    cli_args = microhapulator.cli.get_parser().parse_args(
        command + thresholds + config_opt
    )
    microhapulator.cli.filter.main(cli_args)

    filtered_profile = TypingResult(fromfile=out_path)
    reference = TypingResult(fromfile=data_file("prof/deep-filt.json"))
    assert filtered_profile == reference
def test_filter_cli(tmp_path):
    """Chain the ``type`` and ``filter`` commands through the CLI.

    The typing result written by ``type`` is fed to ``filter``; the
    filtered output must equal ``pashtun-sim/test-output.json``.
    """
    parser_factory = microhapulator.cli.get_parser
    raw_path = str(tmp_path / "typing-result.json")
    out_path = str(tmp_path / "genotype-call.json")

    # Step 1: haplotype calling.
    type_args = parser_factory().parse_args(
        [
            "type",
            "--out",
            raw_path,
            data_file("pashtun-sim/tiny-panel.tsv"),
            data_file("pashtun-sim/aligned-reads.bam"),
        ]
    )
    microhapulator.cli.type.main(type_args)

    # Step 2: filtering of the result written in step 1.
    filter_args = parser_factory().parse_args(
        ["filter", "--out", out_path, "--static", "5", "--dynamic", "0.05", raw_path]
    )
    microhapulator.cli.filter.main(filter_args)

    filtered_profile = TypingResult(fromfile=out_path)
    reference = TypingResult(fromfile=data_file("pashtun-sim/test-output.json"))
    assert filtered_profile == reference
def main(args):
    """Filter a typing result and write the outcome to ``args.out``.

    :param args: parsed command-line arguments; this function reads the
        ``result``, ``config``, ``static``, ``dynamic`` and ``out``
        attributes
    """
    profile = TypingResult(fromfile=args.result)
    # The marker-specific filter table is optional; when supplied, the
    # delimiter is sniffed by the pandas python engine.
    thresholds = (
        pd.read_csv(args.config, sep=None, engine="python") if args.config else None
    )
    profile.filter(static=args.static, dynamic=args.dynamic, config=thresholds)
    profile.dump(args.out)
def type(bamfile, markertsv, minbasequal=10, max_depth=1e6):
    """Perform haplotype calling

    :param str bamfile: path of a BAM file containing NGS reads aligned to marker reference sequences and sorted
    :param str markertsv: path of a TSV file containing marker metadata, specifically the offset of each SNP for every marker in the panel
    :param int minbasequal: minimum base quality (PHRED score) to be considered reliable for haplotype calling; default is 10, corresponding to Q10, i.e., 90% probability that the base call is correct
    :param float max_depth: maximum permitted read depth
    :returns: an unfiltered catalog of haplotype counts for each marker (a *typing result*)
    :rtype: microhapulator.profile.TypingResult
    """
    check_index(bamfile)
    # Open the alignment file in a context manager so the handle is closed
    # even when marker validation or haplotype tallying raises.
    with pysam.AlignmentFile(bamfile, "rb") as bam:
        markers = load_marker_definitions(markertsv)
        # Collect the SNP offsets of each marker, keyed by marker identifier.
        offsets = defaultdict(list)
        for _, row in markers.iterrows():
            offsets[row.Marker].append(row.Offset)
        cross_check_marker_ids(
            bam.references, offsets.keys(), "read alignments", "marker definitions"
        )
        haplotype_caller = tally_haplotypes(
            bam, offsets, minbasequal=minbasequal, max_depth=max_depth
        )
        result = TypingResult()
        # The caller is consumed completely inside the ``with`` block, while
        # the BAM handle is still open.
        for locusid, cov_by_pos, htcounts, ndiscarded in haplotype_caller:
            result.record_coverage(locusid, cov_by_pos, ndiscarded=ndiscarded)
            for haplotype, count in htcounts.items():
                result.record_haplotype(locusid, haplotype, count)
    return result
def test_filter_config_file():
    """Filter with the ``filters.csv`` config and count surviving haplotypes.

    After filtering with ``static=5`` and ``dynamic=0.02`` plus the
    config table, each marker must report the expected number of
    haplotypes.
    """
    filter_config = pd.read_csv(data_file("filters.csv"), sep=None, engine="python")
    profile = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    profile.filter(static=5, dynamic=0.02, config=filter_config)
    expected_counts = (
        ("mh01XYZ-1", 8),
        ("mh02XYZ-2", 2),
        ("mh02XYZ-3", 2),
    )
    for marker, n_haplotypes in expected_counts:
        assert len(profile.haplotypes(marker)) == n_haplotypes
def test_sim_obs_profile_not_equal():
    """Mismatching profiles and unrelated objects must compare unequal.

    Inequality is checked with the operands in both orders, across
    simulated profiles, typing results, and plain Python values.
    """
    sim_a = SimulatedProfile.populate_from_bed(data_file("gttest-mismatch1.bed.gz"))
    # A simulated profile never equals a non-profile value.
    assert sim_a is not None
    assert sim_a != 42
    assert sim_a != 3.14159
    assert sim_a != "A,C,C,T"

    typed_a = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert sim_a != typed_a
    assert typed_a != sim_a
    # Nor does a typing result.
    assert typed_a != 1985
    assert typed_a != 98.6

    sim_b = SimulatedProfile.populate_from_bed(data_file("gttest-mismatch2.bed.gz"))
    assert sim_a != sim_b
    assert sim_b != typed_a
    assert typed_a != sim_b

    typed_b = TypingResult(fromfile=data_file("prof/gttest-altered.json"))
    assert typed_a != typed_b
def main(args):
    """Compute interlocus balance for a typing result and report it.

    The chi-square statistic is printed to standard output; when
    ``args.csv`` is set, the accompanying table is also written there
    without its index.

    :param args: parsed command-line arguments; this function reads the
        ``input``, ``discarded``, ``quiet``, ``figure``, ``title``,
        ``figsize``, ``dpi`` and ``csv`` attributes
    """
    profile = TypingResult(fromfile=args.input)
    options = {
        "include_discarded": args.discarded,
        "terminal": not args.quiet,
        "tofile": args.figure,
        "title": args.title,
        "figsize": args.figsize,
        "dpi": args.dpi,
    }
    chisq, table = mhapi.interlocus_balance(profile, **options)
    print(f"Extent of imbalance (chi-square statistic): {chisq:.4f}")
    if not args.csv:
        return
    table.to_csv(args.csv, index=False)
def main(args):
    """Compute heterozygote balance for a typing result and report it.

    The t-statistic is printed to standard output; when ``args.csv`` is
    set, the accompanying table is also written there without its index.

    :param args: parsed command-line arguments; this function reads the
        ``input``, ``figure``, ``title``, ``figsize``, ``dpi``,
        ``labels``, ``absolute`` and ``csv`` attributes
    """
    profile = TypingResult(fromfile=args.input)
    options = {
        "tofile": args.figure,
        "title": args.title,
        "figsize": args.figsize,
        "dpi": args.dpi,
        "dolabels": args.labels,
        "absolute": args.absolute,
    }
    tstat, table = mhapi.heterozygote_balance(profile, **options)
    print(f"Extent of imbalance (t-statistic): {tstat:.4f}")
    if not args.csv:
        return
    table.to_csv(args.csv, index=False)
def main(args):
    """Load a typing result and write it out in CSV form.

    :param args: parsed command-line arguments; this function reads the
        ``result``, ``out``, ``sample``, ``counts`` and ``fix_homo``
        attributes
    """
    profile = TypingResult(fromfile=args.result)
    profile.dump_csv(
        args.out,
        args.sample,
        counts=args.counts,
        fix_homo=args.fix_homo,
    )
def test_dist_log_mixture():
    """The observed and simulated ``murica/y`` profiles must differ.

    The distance between them is expected to be exactly 19, and the
    profiles must compare unequal.
    """
    observed_profile = TypingResult(data_file("murica/y-obs-genotype.json"))
    simulated_profile = SimulatedProfile.populate_from_bed(
        data_file("murica/y-sim-genotype.bed")
    )
    assert mhapi.dist(observed_profile, simulated_profile) == 19
    assert observed_profile != simulated_profile
def test_dist_gujarati(gt1, gt2, dist):
    """The distance between two stored typing results must equal ``dist``.

    ``gt1`` and ``gt2`` are data-file names resolved with ``data_file``.
    """
    first = TypingResult(data_file(gt1))
    second = TypingResult(data_file(gt2))
    distance = mhapi.dist(first, second)
    assert distance == dist
def test_sim_obs_profile_equality():
    """A simulated profile and its matching typing result compare equal.

    Equality is checked with the operands in both orders.
    """
    sim_profile = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typed_profile = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert sim_profile == typed_profile
    assert typed_profile == sim_profile
def test_haploindexes():
    """Check haplotype indexes for a simulated profile and a typing result.

    The simulated profile must report indexes 0 and 1; the typing result
    must report none.
    """
    sim_profile = SimulatedProfile.populate_from_bed(
        data_file("gttest-mismatch1.bed.gz")
    )
    assert sim_profile.haploindexes() == {0, 1}

    typed_profile = TypingResult(data_file("pashtun-sim/test-output.json"))
    assert typed_profile.haploindexes() == set()