def test_bed_error():
    """bedstr() must raise ValueError naming the unknown marker identifier 'BOGUS'."""
    profile = SimulatedProfile()
    for haploindex in (0, 1):
        profile.add(haploindex, "BOGUS", "A,C,C")
    marker_defs = pd.read_csv(data_file("def/loc2-offsets.tsv"), sep="\t")
    expected_message = r"unknown marker identifier 'BOGUS'"
    with pytest.raises(ValueError, match=expected_message):
        print(profile.bedstr(marker_defs))
def sim(frequencies, seed=None):
    """Simulate a diploid genotype from the specified microhaplotype frequencies

    For each marker, the listed frequencies are rescaled to sum to 1 and one haplotype is drawn
    per haplotype index. Draws are made for haplotype index 0 across all markers (in sorted
    marker order) and then for haplotype index 1. NumPy's global random number generator is
    seeded with `seed`, the seed is recorded in the profile metadata under `HaploSeed`, and a
    one-line progress message is written to standard error.

    :param pandas.DataFrame frequencies: population haplotype frequencies; the `Marker`,
        `Haplotype`, and `Frequency` columns are read
    :param int seed: seed for random number generator; chosen at random when not provided
    :returns: a simulated genotype profile for all markers specified in the haplotype frequencies
    :rtype: microhapulator.profile.SimulatedProfile
    """
    profile = SimulatedProfile(ploidy=2)
    if seed is None:
        seed = np.random.randint(2**32 - 1)
    profile.data["metadata"] = {
        "HaploSeed": seed,
    }
    np.random.seed(seed)
    markers = sorted(frequencies.Marker.unique())
    # Subset and normalize the frequencies once per marker. This work does not depend on the
    # haplotype index, and the total is computed once rather than once per frequency.
    sampling_tables = []
    for marker in markers:
        haplofreqs = frequencies[frequencies.Marker == marker]
        freqs = list(haplofreqs.Frequency)
        total = sum(freqs)
        probabilities = [x / total for x in freqs]
        sampling_tables.append((marker, list(haplofreqs.Haplotype), probabilities))
    # The order of the draws (haplotype index outermost, markers innermost) determines which
    # haplotypes a given seed produces, so it must not change.
    for haploindex in range(2):
        for marker, haplotypes, probabilities in sampling_tables:
            sampled_haplotype = np.random.choice(haplotypes, p=probabilities)
            profile.add(haploindex, marker, sampled_haplotype)
    message = f"simulated microhaplotype variation at {len(markers)} markers"
    print("[MicroHapulator::sim]", message, file=sys.stderr)
    return profile
def test_diff_basic():
    """diff() reports exactly the two markers expected for diff-comp-1 vs diff-comp-2."""
    profile_a = SimulatedProfile(fromfile=data_file("prof/diff-comp-1.json"))
    profile_b = SimulatedProfile(fromfile=data_file("prof/diff-comp-2.json"))
    expected = [
        ("MHDBL000140", {"C,C,A,A"}, {"C,C,T,A"}),
        ("MHDBL000163", {"A,A,G,A,T"}, {"C,G,A,A,T"}),
    ]
    observed = list(mhapi.diff(profile_a, profile_b))
    assert observed == expected
def test_diff_nonmatching_alleles():
    """diff() of the strict vs. relaxed profiles reports two markers, each with an empty
    first set and the listed haplotypes in the second set."""
    strict = SimulatedProfile(fromfile=data_file("prof/red-strict-profile.json"))
    relaxed = SimulatedProfile(fromfile=data_file("prof/red-relaxed-profile.json"))
    observed = list(mhapi.diff(strict, relaxed))
    print(observed)
    expected = [
        ("mh07CP-004", set(), {"T,T,T,A,T", "A,A,T,A,T"}),
        ("mh09KK-157", set(), {"G,C,C,A,T"}),
    ]
    assert observed == expected
def test_diff_large():
    """diff() of diff-comp-1 vs diff-comp-3 reports the 40 expected markers, in order,
    and three spot-checked records carry the expected haplotype sets."""
    profile_a = SimulatedProfile(fromfile=data_file("prof/diff-comp-1.json"))
    profile_b = SimulatedProfile(fromfile=data_file("prof/diff-comp-3.json"))
    records = list(mhapi.diff(profile_a, profile_b))
    observed_loci = [record[0] for record in records]
    print(records[9], records[17], records[21])
    expected_loci = [
        "MHDBL000002", "MHDBL000003", "MHDBL000007", "MHDBL000013",
        "MHDBL000017", "MHDBL000018", "MHDBL000030", "MHDBL000036",
        "MHDBL000038", "MHDBL000047", "MHDBL000058", "MHDBL000061",
        "MHDBL000076", "MHDBL000079", "MHDBL000082", "MHDBL000085",
        "MHDBL000088", "MHDBL000101", "MHDBL000106", "MHDBL000108",
        "MHDBL000111", "MHDBL000112", "MHDBL000122", "MHDBL000124",
        "MHDBL000128", "MHDBL000129", "MHDBL000135", "MHDBL000136",
        "MHDBL000138", "MHDBL000140", "MHDBL000144", "MHDBL000152",
        "MHDBL000154", "MHDBL000163", "MHDBL000181", "MHDBL000183",
        "MHDBL000194", "MHDBL000210", "MHDBL000211", "MHDBL000212",
    ]  # fmt: skip
    assert observed_loci == expected_loci
    # Spot checks: haplotypes only in the second profile, in both, and only in the first
    assert records[9] == ("MHDBL000047", set(), {"T,T"})
    assert records[17] == ("MHDBL000101", {"C,C,C,T"}, {"T,C,C,C"})
    assert records[21] == ("MHDBL000112", {"G,G,A,C"}, set())
def test_main(tmp_path):
    """The `sim` command with seed 1985 writes a profile equal to the stored bitusa profile."""
    profile_path = str(tmp_path / "profile.json")
    argv = [
        "sim",
        "--out",
        profile_path,
        "--seed",
        "1985",
        data_file("freq/ceu50-freq.tsv"),
    ]
    parsed = microhapulator.cli.get_parser().parse_args(argv)
    microhapulator.cli.sim.main(parsed)
    written = SimulatedProfile(fromfile=profile_path)
    reference = SimulatedProfile(fromfile=data_file("prof/bitusa-profile.json"))
    assert written == reference
def test_pipe_gbr_usc10(tmp_path):
    """Run the `pipe` command on the gbr-usc sample and check the typing result, the HTML
    report, and the summary table it leaves in the working directory."""
    # A bwa-indexed copy of the small reference is what gets passed as --hg38
    hg38_standin = str(tmp_path / "hg38-placeholder.fasta")
    copyfile(data_file("refr/usc10-refr.fna"), hg38_standin)
    run(["bwa", "index", hg38_standin])
    argv = [
        "pipe",
        data_file("refr/usc10-refr.fna"),
        data_file("def/usc10-offsets.tsv"),
        data_file(""),
        "gbr-usc",
        "--workdir",
        str(tmp_path),
        "--threads",
        "1",
        "--hg38",
        hg38_standin,
    ]
    parsed = microhapulator.cli.get_parser().parse_args(argv)
    microhapulator.cli.pipe.main(parsed)

    analysis_dir = tmp_path / "analysis"
    expected_profile = SimulatedProfile(fromfile=data_file("prof/gbr-usc10-sim.json"))
    observed_profile = TypingResult(fromfile=analysis_dir / "gbr-usc" / "gbr-usc-type.json")
    mismatches = list(mhapi.diff(observed_profile, expected_profile))
    assert len(mismatches) == 0
    assert (tmp_path / "report.html").is_file()

    expected_summary = pd.read_csv(data_file("gbr-usc-summary.tsv"), sep="\t")
    observed_summary = pd.read_csv(analysis_dir / "summary.tsv", sep="\t")
    assert observed_summary.equals(expected_summary)
def test_mix_main():
    """The `mix` command applied to three profiles writes the stored combined profile."""
    inputs = [
        data_file("prof/green-sim-gt-1.json.gz"),
        data_file("prof/green-sim-gt-2.json.gz"),
        data_file("prof/green-sim-gt-3.json.gz"),
    ]
    with NamedTemporaryFile(suffix=".json.gz") as outfile:
        argv = ["mix", "--out", outfile.name, *inputs]
        parsed = microhapulator.cli.get_parser().parse_args(argv)
        microhapulator.cli.mix.main(parsed)
        # Read the output back while the temporary file still exists
        observed = SimulatedProfile(fromfile=outfile.name)
        expected = SimulatedProfile(fromfile=data_file("prof/green-sim-gt-combined.json.gz"))
        assert observed == expected
def test_profile_roundtrip(tmp_path):
    """A simulated profile dumped to JSON and loaded again equals the original, both as an
    object and in its string form."""
    random_seed = numpy.random.randint(1, 2**32 - 1)
    frequencies = pd.read_csv(data_file("freq/asw5-freq.tsv"), sep="\t")
    original = mhapi.sim(frequencies, seed=random_seed)
    json_path = tmp_path / "profile.json"
    original.dump(json_path)
    reloaded = SimulatedProfile(fromfile=json_path)
    assert original == reloaded
    assert str(original) == str(reloaded)
def test_dist_even_mixture():
    """The observed and simulated "x" genotypes are at distance 0 and compare equal."""
    observed_path = data_file("murica/x-obs-genotype.json")
    with microhapulator.open(observed_path, "r") as handle:
        observed = TypingResult(handle)
    simulated = SimulatedProfile.populate_from_bed(data_file("murica/x-sim-genotype.bed"))
    assert mhapi.dist(observed, simulated) == 0
    assert observed == simulated
def test_haplotypes():
    """haplotypes() returns an empty set for an unknown marker, both haplotypes for a known
    marker, and, when given an index, one haplotype for the simulated profile but an empty
    set for the typing result."""
    simulated = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typed = TypingResult(fromfile=data_file("prof/gttest.json"))
    # Unknown marker
    assert simulated.haplotypes("BoGuSlOcUs") == set()
    assert typed.haplotypes("BoGuSlOcUs") == set()
    # Known marker, all haplotypes
    assert simulated.haplotypes("MHDBL000135") == {"G,C,T", "G,T,C"}
    assert typed.haplotypes("MHDBL000135") == {"G,C,T", "G,T,C"}
    # Known marker, restricted to one haplotype index
    assert simulated.haplotypes("MHDBL000135", index=0) == {"G,C,T"}
    assert simulated.haplotypes("MHDBL000135", index=1) == {"G,T,C"}
    assert typed.haplotypes("MHDBL000135", index=0) == set()
def test_sim_obs_profile_not_equal():
    """Mismatching profiles, and profiles compared with non-profile values, are unequal."""
    sim_a = SimulatedProfile.populate_from_bed(data_file("gttest-mismatch1.bed.gz"))
    assert sim_a is not None
    for non_profile in (42, 3.14159, "A,C,C,T"):
        assert sim_a != non_profile

    typed_a = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert sim_a != typed_a
    assert typed_a != sim_a
    for non_profile in (1985, 98.6):
        assert typed_a != non_profile

    sim_b = SimulatedProfile.populate_from_bed(data_file("gttest-mismatch2.bed.gz"))
    assert sim_a != sim_b
    assert sim_b != typed_a
    assert typed_a != sim_b

    typed_b = TypingResult(fromfile=data_file("prof/gttest-altered.json"))
    assert typed_a != typed_b
def test_main_haplo_seq(tmp_path):
    """The `sim` command with --haplo-seq writes both the stored profile and a haplotype
    FASTA file matching the stored one."""
    profile_path = str(tmp_path / "profile.json")
    fasta_path = str(tmp_path / "haplo.fasta")
    argv = [
        "sim",
        "--seed",
        "293847",
        "--out",
        profile_path,
        "--haplo-seq",
        fasta_path,
        "--sequences",
        data_file("refr/orange-refr.fasta"),
        "--markers",
        data_file("def/orange-offsets.tsv"),
        data_file("freq/asw2-freq.tsv"),
    ]
    parsed = microhapulator.cli.get_parser().parse_args(argv)
    microhapulator.cli.sim.main(parsed)
    written = SimulatedProfile(fromfile=profile_path)
    reference = SimulatedProfile(fromfile=data_file("prof/orange-sim-profile.json"))
    assert written == reference
    assert filecmp.cmp(fasta_path, data_file("orange-haplo.fasta"))
def test_dist_log_mixture():
    """The observed and simulated "y" genotypes are at distance 19 and compare unequal."""
    observed = TypingResult(data_file("murica/y-obs-genotype.json"))
    simulated = SimulatedProfile.populate_from_bed(data_file("murica/y-sim-genotype.bed"))
    assert mhapi.dist(observed, simulated) == 19
    assert observed != simulated
def test_merge_sim_genotypes():
    """Merging three two-haplotype profiles gives the expected BED rendering."""
    # One entry per profile: (marker, (haplotype at index 0, haplotype at index 1))
    genotypes = [
        [("mh11CP-004", ("C,G,G", "C,G,G")), ("mh05KK-123", ("A,C", "A,T"))],
        [("mh11CP-004", ("C,T,A", "C,T,G")), ("mh05KK-123", ("A,T", "A,T"))],
        [("mh11CP-004", ("C,G,G", "T,G,G")), ("mh05KK-123", ("G,C", "G,T"))],
    ]
    individual_profiles = []
    for genotype in genotypes:
        prof = SimulatedProfile()
        for marker, haplotype_pair in genotype:
            for haploindex, haplotype in enumerate(haplotype_pair):
                prof.add(haploindex, marker, haplotype)
        individual_profiles.append(prof)
    merged = SimulatedProfile.merge(individual_profiles)
    marker_defs = pd.read_csv(data_file("def/loc2-offsets.tsv"), sep="\t")
    bed_text = merged.bedstr(marker_defs)
    print(bed_text)
    expected_lines = [
        "mh05KK-123\t121\t122\tA|A|A|A|G|G\n",
        "mh05KK-123\t228\t229\tC|T|T|T|C|T\n",
        "mh11CP-004\t162\t163\tC|C|C|C|C|T\n",
        "mh11CP-004\t163\t164\tG|G|T|T|G|G\n",
        "mh11CP-004\t187\t188\tG|G|A|G|G|G\n",
    ]
    assert bed_text == "".join(expected_lines)
def test_sim_obs_profile_equality():
    """The gttest simulated profile and typing result compare equal in both directions."""
    simulated = SimulatedProfile.populate_from_bed(data_file("gttest.bed.gz"))
    typed = TypingResult(fromfile=data_file("prof/gttest.json"))
    assert simulated == typed
    assert typed == simulated
def test_haploindexes():
    """haploindexes() is {0, 1} for the BED-derived simulated profile and empty for the
    typing result."""
    simulated = SimulatedProfile.populate_from_bed(data_file("gttest-mismatch1.bed.gz"))
    assert simulated.haploindexes() == {0, 1}
    typed = TypingResult(data_file("pashtun-sim/test-output.json"))
    assert typed.haploindexes() == set()
def test_diff2():
    """diff() of the euramer simulated vs. inferred profiles reports a single marker whose
    haplotype appears only in the second profile's set."""
    simulated = SimulatedProfile(fromfile=data_file("prof/euramer-sim-gt.json"))
    inferred = SimulatedProfile(fromfile=data_file("prof/euramer-inf-gt.json"))
    expected = [("MHDBL000018", set(), {"T,G,C,T,A"})]
    observed = list(mhapi.diff(simulated, inferred))
    assert observed == expected
def test_meaning_of_life():
    """Simulating from the ceu50 frequencies with seed 42 reproduces the stored profile."""
    frequencies = pd.read_csv(data_file("freq/ceu50-freq.tsv"), sep="\t")
    simulated = mhapi.sim(frequencies, seed=42)
    reference = SimulatedProfile(fromfile=data_file("prof/meaning-of-life.json.gz"))
    assert simulated == reference
def main(args):
    """Load every profile named in `args.profiles`, merge them, and dump the merged profile
    to the file named by `args.out`."""
    loaded = list(map(SimulatedProfile, args.profiles))
    merged = SimulatedProfile.merge(loaded)
    with mhopen(args.out, "w") as outstream:
        merged.dump(outstream)