def test_main_mixture(capsys):
    """Run the ``seq`` subcommand on a two-seed 80/20 mixture and inspect stdout.

    The output is expected to hold about 500 four-line records (tolerance of
    25), and the third-from-last output line must match a fixed sequence.
    """
    inputs = [
        data_file("def/yellow-offsets.tsv"),
        data_file("refr/yellow-refr.fasta.gz"),
        data_file("prof/yellow-mix-gt.json"),
    ]
    options = [
        "--seeds", "42", "1776",
        "--proportions", "0.8", "0.2",
        "--num-reads", "500",
    ]
    parsed = microhapulator.cli.get_parser().parse_args(["seq", *options, *inputs])
    microhapulator.cli.seq.main(parsed)

    captured = capsys.readouterr()
    lines = captured.out.strip().split("\n")
    # Four output lines per record.
    record_count = len(lines) / 4
    assert record_count == pytest.approx(500, abs=25)
    assert lines[-3] == (
        "TCAATTCAATTTCTACCCTCAGCATCAAGGCAGGGGTTCATCATAATGGGTATTGGAGGCTCAAAGAAA"
        "ATTTAGGCTCAGCACACACACACACACACACACACACACACAGCGATTTTTAATGCTGGTACAATCACA"
        "GGAGACTGCGACCCAGCCCTCCTCAGCGCCTCGGGTGCTCACGGGCACTCCTGGAGTCTCGGCCACACT"
        "AAGTCCCCCTGGTGGCCACACAGAAGAAGAGGTGGTAAAACTTTCTGGGAGTGAGATCAAAAATTTTAG"
        "GAGTCTAAAAACATACTTTTCTAAG"
    )
def test_convert_counts(tmp_path, counts, expfile):
    """Dump a typing result to CSV and compare it with a reference table.

    ``counts`` is forwarded to ``dump_csv`` and ``expfile`` names the expected
    CSV data file. Both are presumably supplied by a parametrize decorator
    that is not visible in this view.
    """
    destination = str(tmp_path / "out.csv")
    typing = TypingResult(fromfile=data_file("prof/deep-filt-clean.json"))
    typing.dump_csv(destination, "MySample", counts=counts)

    expected = pd.read_csv(data_file(expfile))
    observed = pd.read_csv(destination)
    assert observed.equals(expected)
def test_mixture_failure_modes():
    """Check that ``mhapi.seq`` raises ``ValueError`` for inconsistent mixture settings.

    Three simulated profiles are sequenced with, in turn: too few seeds, too
    many proportions, and proportions so skewed that a contributor gets no
    reads.
    """
    freq_table = microhapulator.load_marker_frequencies(data_file("freq/russ4-freq.tsv"))
    marker_defs = microhapulator.load_marker_definitions(data_file("def/russ4-offsets.tsv"))
    refr_seqs = microhapulator.load_marker_reference_sequences(
        data_file("refr/russ4-refr.fasta.gz")
    )
    contributors = [mhapi.sim(freq_table) for _ in range(3)]

    # (expected error pattern, keyword arguments that trigger it)
    failure_cases = [
        (
            r"number of profiles must match number of seeds",
            {"seeds": [42, 1776]},
        ),
        (
            r"mismatch between contributor number and proportions",
            {"proportions": [0.5, 0.3, 0.1, 0.1]},
        ),
        (
            r"specified proportions result in 0 reads for 1 or more individuals",
            {"totalreads": 500, "proportions": [1, 100, 10000]},
        ),
    ]
    for pattern, kwargs in failure_cases:
        with pytest.raises(ValueError, match=pattern):
            # Iterate so that any lazily raised error surfaces.
            for _ in mhapi.seq(contributors, marker_defs, refr_seqs, **kwargs):
                pass
def test_filter_config_file():
    """Filter a raw typing result with a config table and count surviving haplotypes."""
    filter_table = pd.read_csv(data_file("filters.csv"), sep=None, engine="python")
    typing = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    typing.filter(static=5, dynamic=0.02, config=filter_table)

    expected_counts = {
        "mh01XYZ-1": 8,
        "mh02XYZ-2": 2,
        "mh02XYZ-3": 2,
    }
    for marker, count in expected_counts.items():
        assert len(typing.haplotypes(marker)) == count
def test_unite_basic(momgt, dadgt, kidgt, seed):
    """Unite two parent profiles under a fixed NumPy seed and compare with the expected child.

    The arguments name profile files under ``prof/`` plus the seed. They are
    presumably supplied by a parametrize decorator not visible in this view.
    """
    mom, dad, kid = (
        Profile(fromfile=data_file(f"prof/{filename}"))
        for filename in (momgt, dadgt, kidgt)
    )
    # Seed immediately before uniting so the result is reproducible.
    numpy.random.seed(seed)
    offspring = Profile.unite(mom, dad)
    assert offspring == kid
def test_unite_unshared(capsys):
    """Unite two profiles and check that a marker-exclusion warning reaches stderr."""
    mom = Profile(fromfile=data_file("prof/swedish-mom.json"))
    dad = Profile(fromfile=data_file("prof/swedish-dad.json"))
    # Only the message written to stderr is checked, so the united profile
    # is not bound to a name.
    Profile.unite(mom, dad)
    terminal = capsys.readouterr()
    message = "markers not common to mom and dad profiles are excluded"
    assert message in terminal.err
def test_filter_simple():
    """Type reads from a BAM file, apply static/dynamic filters, and compare with stored output."""
    reads = data_file("pashtun-sim/aligned-reads.bam")
    panel = data_file("pashtun-sim/tiny-panel.tsv")
    expected = TypingResult(fromfile=data_file("pashtun-sim/test-output.json"))

    typed = mhapi.type(reads, panel)
    typed.filter(static=10, dynamic=0.05)
    assert typed == expected
def test_diff_basic():
    """Diff two simulated profiles and check the two reported markers."""
    first = SimulatedProfile(fromfile=data_file("prof/diff-comp-1.json"))
    second = SimulatedProfile(fromfile=data_file("prof/diff-comp-2.json"))
    expected = [
        ("MHDBL000140", {"C,C,A,A"}, {"C,C,T,A"}),
        ("MHDBL000163", {"A,A,G,A,T"}, {"C,G,A,A,T"}),
    ]
    observed = list(mhapi.diff(first, second))
    assert observed == expected
def test_interlocus_balance_basic(capfd):
    """Compute interlocus balance for a profile and check the table, statistic, and printed bar."""
    mixture = Profile(fromfile=data_file("prof/three-contrib-log.json"))
    statistic, observed = mhapi.interlocus_balance(mixture)

    expected = pd.read_csv(data_file("three-contrib-log-balance.csv"))
    assert observed.equals(expected)
    assert statistic == pytest.approx(0.00928395)

    captured = capfd.readouterr()
    bar_line = "MHDBL000212: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 50.00"
    assert bar_line in captured.out
def test_heterozygote_balance_basic(tmp_path):
    """Compute heterozygote balance, checking the statistic, the data table, and the figure file."""
    figure_path = tmp_path / "figure.png"
    single = Profile(fromfile=data_file("prof/single-contrib-2.json"))

    statistic, observed = mhapi.heterozygote_balance(single, tofile=figure_path)
    assert statistic == pytest.approx(3.90845)

    expected = pd.read_csv(data_file("het-balance.tsv"), sep="\t")
    assert observed.equals(expected)
    assert figure_path.is_file()
def test_dist_even_mixture():
    """Compare an observed typing result with a BED-derived simulated profile.

    The two are expected to have distance zero and to compare equal.
    """
    observed_path = data_file("murica/x-obs-genotype.json")
    with microhapulator.open(observed_path, "r") as handle:
        observed = TypingResult(handle)
    simulated = SimulatedProfile.populate_from_bed(data_file("murica/x-sim-genotype.bed"))

    assert mhapi.dist(observed, simulated) == 0
    assert observed == simulated
def test_filter_dupl_marker():
    """Check that filtering with a config containing duplicate marker entries raises ``ValueError``."""
    filter_table = pd.read_csv(
        data_file("filters-redundant.csv"), sep=None, engine="python"
    )
    typing = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    with pytest.raises(
        ValueError,
        match="filter config file contains duplicate entries for some markers",
    ):
        typing.filter(static=5, dynamic=0.02, config=filter_table)
def test_diff_nonmatching_alleles():
    """Diff a strict profile against a relaxed one.

    The expected differences have an empty first set and the extra
    haplotypes in the second set.
    """
    strict = SimulatedProfile(fromfile=data_file("prof/red-strict-profile.json"))
    relaxed = SimulatedProfile(fromfile=data_file("prof/red-relaxed-profile.json"))
    differences = list(mhapi.diff(strict, relaxed))
    print(differences)

    expected = [
        ("mh07CP-004", set(), {"T,T,T,A,T", "A,A,T,A,T"}),
        ("mh09KK-157", set(), {"G,C,C,A,T"}),
    ]
    assert differences == expected
def test_filter_missing_column():
    """Check that a filter config lacking the ``Static`` column is rejected with ``ValueError``."""
    filter_table = pd.read_csv(
        data_file("filters-missing.csv"), sep=None, engine="python"
    )
    typing = TypingResult(fromfile=data_file("prof/deep-raw.json"))
    pattern = r"filter config file missing column\(s\): Static"
    with pytest.raises(ValueError, match=pattern):
        typing.filter(static=5, dynamic=0.02, config=filter_table)
def test_contain_cli(capsys):
    """Run the ``contain`` subcommand on two profiles and check the containment printed to stdout."""
    profile_paths = [
        data_file("prof/one-brit-sim.json"),
        data_file("prof/one-italian-sim.json"),
    ]
    parsed = microhapulator.cli.get_parser().parse_args(["contain", *profile_paths])
    microhapulator.cli.contain.main(parsed)

    captured = capsys.readouterr()
    assert '"containment": 0.4444' in captured.out
def test_haplotypes():
    """Query haplotypes from a simulated profile and a typing result.

    Covers an unknown marker, a known marker, and index-restricted lookups.
    """
    simulated = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typed = TypingResult(fromfile=data_file("prof/gttest.json"))

    # An unknown marker yields an empty set for both profile kinds.
    assert simulated.haplotypes("BoGuSlOcUs") == set()
    assert typed.haplotypes("BoGuSlOcUs") == set()

    both_haplotypes = {"G,C,T", "G,T,C"}
    assert simulated.haplotypes("MHDBL000135") == both_haplotypes
    assert typed.haplotypes("MHDBL000135") == both_haplotypes

    # Index-restricted lookups.
    assert simulated.haplotypes("MHDBL000135", index=0) == {"G,C,T"}
    assert simulated.haplotypes("MHDBL000135", index=1) == {"G,T,C"}
    assert typed.haplotypes("MHDBL000135", index=0) == set()
def test_complex_genotype(capsys):
    """Unmix a mixture profile, sequence the parts, and count the per-individual seed messages on stderr."""
    mixture = Profile(fromfile=data_file("prof/mixture-genotype.json"))
    marker_defs = microhapulator.load_marker_definitions(data_file("def/russ4-offsets.tsv"))
    refr_seqs = microhapulator.load_marker_reference_sequences(
        data_file("refr/russ4-refr.fasta.gz")
    )
    individuals = list(mixture.unmix())
    read_stream = mhapi.seq(individuals, marker_defs, refr_seqs, totalreads=200)
    # Drain the stream; only the stderr messages are inspected.
    for _ in read_stream:
        pass

    captured = capsys.readouterr()
    assert captured.err.count("Individual seed=") == 3
def test_type_filter_threshold():
    """Type the same reads twice and filter with static thresholds of 10 and then 4.

    With the lower threshold, two additional haplotypes are expected at
    MHDBL000018, while MHDBL000156 is unchanged.
    """
    reads = data_file("bam/dyncut-test-reads.bam")
    panel = data_file("def/dyncut-panel.tsv")

    strict = mhapi.type(reads, panel)
    strict.filter(static=10, dynamic=0.005)
    assert strict.haplotypes("MHDBL000018") == {"C,A,C,T,G", "T,G,C,T,G"}
    assert strict.haplotypes("MHDBL000156") == {"T,C,A,C", "T,C,G,G"}

    relaxed = mhapi.type(reads, panel)
    relaxed.filter(static=4, dynamic=0.005)
    assert relaxed.haplotypes("MHDBL000018") == {
        "C,A,C,T,G",
        "T,G,C,T,G",
        "C,A,C,T,A",
        "T,G,C,T,A",
    }
    assert relaxed.haplotypes("MHDBL000156") == {"T,C,A,C", "T,C,G,G"}
def test_locbalance_cli(tmp_path, capfd):
    """Run the ``locbalance`` subcommand with ``--csv`` and check the CSV and the printed summary."""
    csv_path = str(tmp_path / "balance.csv")
    parsed = microhapulator.cli.get_parser().parse_args(
        ["locbalance", "--csv", csv_path, data_file("prof/three-contrib-log.json")]
    )
    microhapulator.cli.locbalance.main(parsed)

    observed = pd.read_csv(csv_path)
    expected = pd.read_csv(data_file("three-contrib-log-balance.csv"))
    assert observed.equals(expected)

    captured = capfd.readouterr()
    print(captured.out)
    expected_lines = (
        "Extent of imbalance (chi-square statistic): 0.0093",
        "MHDBL000212: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 50.00",
    )
    for line in expected_lines:
        assert line in captured.out
def test_diff_cli():
    """Run the ``diff`` subcommand into a temporary file and compare it with the stored expected text."""
    left = data_file("prof/diff-comp-1.json")
    right = data_file("prof/diff-comp-3.json")
    with NamedTemporaryFile(suffix=".json") as tmp:
        parsed = microhapulator.cli.get_parser().parse_args(
            ["diff", "-o", tmp.name, left, right]
        )
        microhapulator.cli.diff.main(parsed)
        # Read the result while the temporary file still exists.
        with microhapulator.open(tmp.name, "r") as handle:
            observed = handle.read().strip()
        with microhapulator.open(data_file("diff-comp-1-3.txt"), "r") as handle:
            expected = handle.read().strip()
        assert observed == expected
def test_dist_cli():
    """Run the ``dist`` subcommand on two profiles and check the JSON written to ``--out``."""
    profile_paths = [
        data_file("prof/gujarati-ind2-gt.json"),
        data_file("prof/gujarati-ind3-gt.json"),
    ]
    with NamedTemporaryFile() as tmp:
        parsed = microhapulator.cli.get_parser().parse_args(
            ["dist", "--out", tmp.name, *profile_paths]
        )
        microhapulator.cli.dist.main(parsed)
        with open(tmp.name, "r") as handle:
            report = json.load(handle)
            assert report == {"hamming_distance": 3}
def test_type_cli_simple(tmp_path):
    """Run the ``type`` subcommand and check the result for two markers.

    For each marker the unfiltered result should report no haplotypes, and
    the stored ``typing_result`` counts should match exactly.
    """
    result_path = str(tmp_path / "typing-result.json")
    parsed = microhapulator.cli.get_parser().parse_args(
        [
            "type",
            "--out",
            result_path,
            data_file("pashtun-sim/tiny-panel.tsv"),
            data_file("pashtun-sim/aligned-reads.bam"),
        ]
    )
    microhapulator.cli.type.main(parsed)
    typed = TypingResult(fromfile=result_path)

    expected_counts = {
        "mh13KK-218": {
            "C,T,C,G": 1,
            "C,T,T,T": 1,
            "G,T,C,T": 1,
            "T,A,C,T": 1,
            "T,G,C,T": 3,
            "T,G,T,T": 2,
            "T,T,A,T": 5,
            "T,T,C,A": 1,
            "T,T,C,C": 2,
            "T,T,C,G": 2,
            "T,T,C,T": 1178,
            "T,T,G,T": 2,
            "T,T,T,A": 2,
            "T,T,T,G": 6,
            "T,T,T,T": 1170,
        },
        "mh21KK-320": {
            "G,A,A,A": 1,
            "G,A,C,A": 3,
            "G,A,G,A": 3,
            "G,A,T,A": 1075,
            "G,A,T,C": 1,
            "G,A,T,G": 1,
            "G,A,T,T": 2,
            "G,C,C,A": 1,
            "G,C,T,A": 4,
            "G,G,A,A": 2,
            "G,G,A,T": 1,
            "G,G,C,A": 1075,
            "G,G,C,C": 3,
            "G,G,C,G": 12,
            "G,G,C,T": 5,
            "G,G,T,A": 4,
            "G,T,C,A": 1,
            "T,G,C,A": 1,
        },
    }
    for marker, counts in expected_counts.items():
        assert typed.haplotypes(marker) == set()
        assert typed.data["markers"][marker]["typing_result"] == counts
def test_diff_large():
    """Diff two simulated profiles that differ at many markers.

    Checks the full ordered list of differing marker names and spot-checks
    three entries.
    """
    first = SimulatedProfile(fromfile=data_file("prof/diff-comp-1.json"))
    second = SimulatedProfile(fromfile=data_file("prof/diff-comp-3.json"))
    differences = list(mhapi.diff(first, second))
    marker_names = [entry[0] for entry in differences]
    print(differences[9], differences[17], differences[21])

    # Numeric suffixes of the expected marker identifiers, in order.
    expected_suffixes = [
        2, 3, 7, 13, 17, 18, 30, 36, 38, 47,
        58, 61, 76, 79, 82, 85, 88, 101, 106, 108,
        111, 112, 122, 124, 128, 129, 135, 136, 138, 140,
        144, 152, 154, 163, 181, 183, 194, 210, 211, 212,
    ]
    assert marker_names == [f"MHDBL{suffix:06d}" for suffix in expected_suffixes]

    spot_checks = {
        9: ("MHDBL000047", set(), {"T,T"}),
        17: ("MHDBL000101", {"C,C,C,T"}, {"T,C,C,C"}),
        21: ("MHDBL000112", {"G,G,A,C"}, set()),
    }
    for position, expected in spot_checks.items():
        assert differences[position] == expected
def test_dist_sim_vs_obs(hdist):
    """Run the ``dist`` subcommand on an observed profile and a simulated one.

    ``hdist`` selects the observed-genotype file and is also the expected
    reported distance. It is presumably supplied by a parametrize decorator
    not visible in this view.
    """
    observed_path = data_file(f"murica/z-obs-genotype-dist{hdist:d}.json")
    simulated_path = data_file("murica/z-sim-genotype.json")
    with NamedTemporaryFile() as tmp:
        parsed = microhapulator.cli.get_parser().parse_args(
            ["dist", "--out", tmp.name, observed_path, simulated_path]
        )
        microhapulator.cli.dist.main(parsed)
        with open(tmp.name, "r") as handle:
            report = json.load(handle)
            assert report == {"hamming_distance": hdist}
def test_main_out_stdout(capsys):
    """Run the ``seq`` subcommand with no output option and count the records on stdout.

    About 100 four-line records are expected, with a tolerance of 5.
    """
    inputs = [
        data_file("def/orange-offsets.tsv"),
        data_file("refr/orange-refr.fasta"),
        data_file("prof/orange-sim-profile.json"),
    ]
    parsed = microhapulator.cli.get_parser().parse_args(
        ["seq", "--num-reads", "100", *inputs]
    )
    microhapulator.cli.seq.main(parsed)

    captured = capsys.readouterr()
    lines = captured.out.strip().split("\n")
    # Four output lines per record.
    record_count = len(lines) / 4
    assert record_count == pytest.approx(100, abs=5)
def test_main(tmp_path):
    """Run the ``sim`` subcommand with a fixed seed and compare the profile with a stored reference."""
    profile_path = str(tmp_path / "profile.json")
    parsed = microhapulator.cli.get_parser().parse_args(
        [
            "sim",
            "--out",
            profile_path,
            "--seed",
            "1985",
            data_file("freq/ceu50-freq.tsv"),
        ]
    )
    microhapulator.cli.sim.main(parsed)

    simulated = SimulatedProfile(fromfile=profile_path)
    reference = SimulatedProfile(fromfile=data_file("prof/bitusa-profile.json"))
    assert simulated == reference
def test_unite_cli():
    """Run the ``unite`` subcommand with a fixed seed and compare the offspring with a stored profile."""
    parent_paths = [
        data_file("prof/green-mom-3-gt.json"),
        data_file("prof/green-dad-3-gt.json"),
    ]
    with NamedTemporaryFile(suffix=".json") as tmp:
        parsed = microhapulator.cli.get_parser().parse_args(
            ["unite", "--seed", "113817", "--out", tmp.name, *parent_paths]
        )
        microhapulator.cli.unite.main(parsed)
        # Load the result while the temporary file still exists.
        offspring = Profile(fromfile=tmp.name)
        reference = Profile(fromfile=data_file("prof/green-kid-3-gt.json"))
        assert offspring == reference
def test_merge_sim_genotypes():
    """Merge three hand-built simulated profiles and check the BED rendering of the result."""
    # One entry per contributor: the two haplotypes at mh11CP-004, then the
    # two haplotypes at mh05KK-123.
    contributor_haplotypes = [
        (("C,G,G", "C,G,G"), ("A,C", "A,T")),
        (("C,T,A", "C,T,G"), ("A,T", "A,T")),
        (("C,G,G", "T,G,G"), ("G,C", "G,T")),
    ]
    contributors = []
    for cp004, kk123 in contributor_haplotypes:
        contributor = SimulatedProfile()
        for marker, alleles in (("mh11CP-004", cp004), ("mh05KK-123", kk123)):
            for index, allele in enumerate(alleles):
                contributor.add(index, marker, allele)
        contributors.append(contributor)

    merged = SimulatedProfile.merge(contributors)
    marker_defs = pd.read_csv(data_file("def/loc2-offsets.tsv"), sep="\t")
    bed_text = merged.bedstr(marker_defs)
    print(bed_text)

    expected_rows = [
        "mh05KK-123\t121\t122\tA|A|A|A|G|G\n",
        "mh05KK-123\t228\t229\tC|T|T|T|C|T\n",
        "mh11CP-004\t162\t163\tC|C|C|C|C|T\n",
        "mh11CP-004\t163\t164\tG|G|T|T|G|G\n",
        "mh11CP-004\t187\t188\tG|G|A|G|G|G\n",
    ]
    assert bed_text == "".join(expected_rows)
def test_bed_error():
    """Check that rendering BED output for a marker missing from the definitions raises ``ValueError``."""
    bogus = SimulatedProfile()
    for index in (0, 1):
        bogus.add(index, "BOGUS", "A,C,C")
    marker_defs = pd.read_csv(data_file("def/loc2-offsets.tsv"), sep="\t")

    pattern = r"unknown marker identifier 'BOGUS'"
    with pytest.raises(ValueError, match=pattern):
        print(bogus.bedstr(marker_defs))
def test_mix_main():
    """Run the ``mix`` subcommand on three gzipped profiles and compare with the stored combined profile."""
    component_paths = [
        data_file(f"prof/green-sim-gt-{number}.json.gz") for number in (1, 2, 3)
    ]
    with NamedTemporaryFile(suffix=".json.gz") as tmp:
        parsed = microhapulator.cli.get_parser().parse_args(
            ["mix", "--out", tmp.name, *component_paths]
        )
        microhapulator.cli.mix.main(parsed)
        # Load the result while the temporary file still exists.
        combined = SimulatedProfile(fromfile=tmp.name)
        reference = SimulatedProfile(
            fromfile=data_file("prof/green-sim-gt-combined.json.gz")
        )
        assert combined == reference