def run_simulation(ts, n, Ne, theta, rho):
    """Infer a tree sequence from ``ts`` and date both versions.

    Sample data is built from ``ts``, a tree sequence is inferred from it
    with tsinfer, and ``tsdate.age_inference`` is run on both the input and
    the inferred tree sequence using the same ``theta`` and ``rho``.

    Args:
        ts: Input tree sequence.
        n: Not used by this function.
        Ne: Not used by this function.
        theta: Forwarded to ``tsdate.age_inference`` as ``theta``.
        rho: Forwarded to ``tsdate.age_inference`` as ``rho``.

    Returns:
        A 3-tuple ``(ts, dated input ts, dated inferred ts)``.
    """
    samples = tsinfer.formats.SampleData.from_tree_sequence(ts)
    inferred = tsinfer.infer(samples)
    dating_kwargs = {"theta": theta, "rho": rho}
    dated_true = tsdate.age_inference(ts, **dating_kwargs)
    dated_inferred = tsdate.age_inference(inferred, **dating_kwargs)
    return ts, dated_true, dated_inferred
def test_random_data_inferred_no_simplify(self):
    """Infer without simplification from random data and check genotypes.

    The tree sequence returned by ``self.verify`` must have the same
    genotype matrix as the inferred tree sequence passed into it.
    """
    sample_data = self.get_random_data_example(
        10 * np.arange(10), num_samples=10, seed=2
    )
    unsimplified_ts = tsinfer.infer(sample_data, simplify=False)
    verified_ts = self.verify(unsimplified_ts, 55, 57)
    genotypes_match = np.array_equal(
        verified_ts.genotype_matrix(), unsimplified_ts.genotype_matrix()
    )
    self.assertTrue(genotypes_match)
def test_simple_sim_multi_tree(self):
    """Dating an inferred multi-tree sequence must preserve haplotypes.

    Runs once with and once without site times, and for each dates the
    inferred tree sequence first with ``method="maximization"`` and then
    with the default method.
    """
    sim_ts = msprime.simulate(
        8, mutation_rate=5, recombination_rate=5, random_seed=2
    )
    self.assertGreater(sim_ts.num_trees, 1)
    for use_times in (True, False):
        samples = tsinfer.SampleData.from_tree_sequence(
            sim_ts, use_sites_time=use_times
        )
        inferred = tsinfer.infer(samples)
        # An empty dict selects tsdate's default dating method.
        for method_kwargs in ({"method": "maximization"}, {}):
            dated = tsdate.date(
                inferred, Ne=1, mutation_rate=5, **method_kwargs
            )
            haplotype_pairs = zip(sim_ts.haplotypes(), dated.haplotypes())
            self.assertTrue(
                all(true_h == dated_h for true_h, dated_h in haplotype_pairs)
            )
def test_simple_sim_1_tree(self):
    """Dating an inferred no-recombination simulation preserves haplotypes.

    The check is repeated with ``use_times`` set to True and to False when
    building the sample data.
    """
    sim_ts = msprime.simulate(8, mutation_rate=5, random_seed=2)
    for use_times in (True, False):
        samples = tsinfer.SampleData.from_tree_sequence(
            sim_ts, use_times=use_times
        )
        inferred = tsinfer.infer(samples)
        dated = tsdate.date(inferred, Ne=1, mutation_rate=5)
        haplotype_pairs = zip(sim_ts.haplotypes(), dated.haplotypes())
        self.assertTrue(
            all(true_h == dated_h for true_h, dated_h in haplotype_pairs)
        )
def test_inferred_no_simplify(self):
    """Run ``self.verify`` on an unsimplified inference of a simulation."""
    simulated = msprime.simulate(
        10, recombination_rate=2, mutation_rate=10, random_seed=3
    )
    sample_data = tsinfer.SampleData.from_tree_sequence(
        simulated, use_times=False
    )
    self.verify(tsinfer.infer(sample_data, simplify=False))
def test_two_populations_high_migration_inferred_no_simplify(self):
    """Unsimplified inference keeps the population count of the example.

    After the population-count check, the inferred tree sequence is passed
    to ``self.verify`` together with the sample lists of populations 0 and 1.
    """
    source_ts = self.two_populations_high_migration_example()
    sample_data = tsinfer.SampleData.from_tree_sequence(source_ts)
    inferred_ts = tsinfer.infer(sample_data, simplify=False)
    assert inferred_ts.num_populations == source_ts.num_populations
    per_population_samples = [inferred_ts.samples(0), inferred_ts.samples(1)]
    self.verify(inferred_ts, per_population_samples)
def infer_ts(filename):
    """Infer a tree sequence from the samples read from ``filename``.

    Every tree of the inferred tree sequence is printed as a unicode drawing.

    Args:
        filename: Passed to ``read_samples`` to obtain the sample data.

    Returns:
        The tree sequence returned by ``tsinfer.infer``.
    """
    inferred_ts = tsinfer.infer(read_samples(filename))
    for tree in inferred_ts.trees():
        drawing = tree.draw(format="unicode")
        print(drawing)
    return inferred_ts
def test_inferred(self):
    """Run ``self.verify`` on a tree sequence inferred from a simulation."""
    simulated = msprime.simulate(
        10, recombination_rate=2, mutation_rate=10, random_seed=3
    )
    sample_data = tsinfer.SampleData.from_tree_sequence(simulated)
    self.verify(tsinfer.infer(sample_data))
def test_random_data_inferred_simplify(self):
    """Infer with simplification from random data and check genotypes.

    The tree sequence returned by ``self.verify`` must have the same
    genotype matrix as the inferred tree sequence passed into it.
    """
    sample_data = self.get_random_data_example(
        5 * np.arange(10), num_samples=10, seed=2
    )
    simplified_ts = tsinfer.infer(sample_data, simplify=True)
    verified_ts = self.verify(simplified_ts, 12, 15)
    assert np.array_equal(
        verified_ts.genotype_matrix(), simplified_ts.genotype_matrix()
    )
def test_two_populations_high_migration_inferred(self):
    """Inference keeps the population count of the two-population example.

    After the population-count check, the inferred tree sequence is passed
    to ``self.verify`` together with the sample lists of populations 0 and 1.
    """
    source_ts = self.two_populations_high_migration_example()
    sample_data = tsinfer.SampleData.from_tree_sequence(source_ts)
    inferred_ts = tsinfer.infer(sample_data)
    self.assertEqual(inferred_ts.num_populations, source_ts.num_populations)
    per_population_samples = [inferred_ts.samples(0), inferred_ts.samples(1)]
    self.verify(inferred_ts, per_population_samples)
def test_equivalance(self):
    """Inference with 1 thread and with 5 threads gives equal tree sequences."""
    rho = 2
    sim_ts = msprime.simulate(
        5, mutation_rate=2, recombination_rate=rho, random_seed=2
    )
    # Inputs shared by both inference runs; only num_threads differs.
    shared_inputs = {
        "genotypes": sim_ts.genotype_matrix(),
        "positions": [site.position for site in sim_ts.sites()],
        "sequence_length": sim_ts.sequence_length,
    }
    single_threaded = tsinfer.infer(num_threads=1, **shared_inputs)
    multi_threaded = tsinfer.infer(num_threads=5, **shared_inputs)
    self.assertTreeSequencesEqual(single_threaded, multi_threaded)
def test_inferred_random_data(self):
    """Run ``self.verify`` on an inference from seeded random 0/1 genotypes.

    A 40 x 8 (sites x samples) int8 matrix is drawn after seeding NumPy's
    global generator with 10; site ``j`` is added at position ``j``.
    """
    np.random.seed(10)
    num_sites, num_samples = 40, 8
    genotypes = np.random.randint(
        2, size=(num_sites, num_samples)
    ).astype(np.int8)
    with tsinfer.SampleData() as sample_data:
        for position, site_genotypes in enumerate(genotypes):
            sample_data.add_site(position, site_genotypes)
    inferred_ts = tsinfer.infer(sample_data)
    self.verify(inferred_ts)
def verify_from_inferred(self, remove_leaves):
    """Rebuild an ancestors tree sequence from an inference and re-match.

    An ancestors tree sequence is made from the inferred tree sequence,
    checked with ``tsinfer.check_ancestors_ts``, and then samples are matched
    against it with both the Python and the C engine; each result is passed
    to ``tsinfer.verify``.

    Args:
        remove_leaves: Forwarded to ``tsinfer.make_ancestors_ts``.
    """
    sim_ts = msprime.simulate(
        15, recombination_rate=1, mutation_rate=2, random_seed=3
    )
    sample_data = tsinfer.SampleData.from_tree_sequence(sim_ts)
    inferred_ts = tsinfer.infer(sample_data)
    ancestors_ts = tsinfer.make_ancestors_ts(
        sample_data, inferred_ts, remove_leaves=remove_leaves
    )
    tsinfer.check_ancestors_ts(ancestors_ts)
    for engine in (tsinfer.PY_ENGINE, tsinfer.C_ENGINE):
        rematched_ts = tsinfer.match_samples(
            sample_data, ancestors_ts, engine=engine
        )
        tsinfer.verify(sample_data, rematched_ts)
def infer_from_msprime(simulation):
    """Infer a tree sequence with tsinfer from a simulated tree sequence.

    Each variant of ``simulation`` is added to a new ``tsinfer.SampleData``
    (position, genotypes and alleles), which is then passed to
    ``tsinfer.infer``.

    Args:
        simulation: Object providing ``sequence_length`` and ``variants()``,
            e.g. the output of an msprime simulation.

    Returns:
        The inferred tree sequence.
    """
    with tsinfer.SampleData(
        sequence_length=simulation.sequence_length,
        num_flush_threads=2,
    ) as sample_data:
        for variant in simulation.variants():
            sample_data.add_site(
                variant.site.position, variant.genotypes, variant.alleles
            )
    return tsinfer.infer(sample_data)
def main():
    """Command-line entry point: infer a tree sequence and write it out.

    Loads the samples file named on the command line and either runs the
    full ``tsinfer.infer`` pipeline or, when
    ``--inject-real-ancestors-from-ts`` is given, builds ancestors from that
    tree sequence and runs ancestor and sample matching explicitly. The
    result is dumped to the ``output`` path.

    The ``--verbosity`` and ``--length`` options are parsed but not read
    anywhere in this function.

    Raises:
        ValueError: If the samples path is not a file, or the sample data
            yields no genotypes for inference sites.
    """
    # NOTE(review): the line breaks of this literal are not visible in the
    # reviewed copy; confirm the layout against the original file.
    description = """Simple CLI wrapper for tsinfer tskit version: {} tsinfer version: {}""".format(
        tskit.__version__, tsinfer.__version__
    )
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--verbosity", "-v", action="count", default=0)
    parser.add_argument(
        "samples",
        help="The samples file name, as saved by tsinfer.SampleData.initialise()",
    )
    parser.add_argument(
        "output",
        help="The path to write the output file to",
    )
    parser.add_argument(
        "-l", "--length", default=None, type=int,
        help="The total sequence length",
    )
    parser.add_argument(
        "-t", "--threads", default=1, type=int,
        help="The number of worker threads to use",
    )
    parser.add_argument(
        "-m", "--method", default="C", choices=["C", "P"],
        help="Which implementation to use, [C] (faster) or [P]ython (more debuggable)",
    )
    parser.add_argument(
        "--inject-real-ancestors-from-ts", default=None,
        help="Instead of inferring ancestors, construct known ones from this tree sequence file path",
    )
    parser.add_argument(
        "-V", "--version", action="version", version=description,
    )
    args = parser.parse_args()

    if args.method == "P":
        engine = tsinfer.PY_ENGINE
    else:
        engine = tsinfer.C_ENGINE

    if not os.path.isfile(args.samples):
        raise ValueError("No samples file")
    sample_data = tsinfer.load(args.samples)
    # Only the first item (if any) is pulled from the iterator.
    has_inference_site = any(
        True for _ in sample_data.genotypes(inference_sites=True)
    )
    if not has_inference_site:
        raise ValueError("No inference sites")

    known_ancestors_path = args.inject_real_ancestors_from_ts
    if known_ancestors_path is None:
        ts = tsinfer.infer(
            sample_data, num_threads=args.threads, engine=engine
        )
    else:
        ancestor_data = tsinfer.AncestorData.initialise(
            sample_data, compressor=None
        )
        orig_ts = tskit.load(known_ancestors_path)
        eval_util.build_simulated_ancestors(
            sample_data, ancestor_data, orig_ts
        )
        ancestor_data.finalise()
        ancestors_ts = tsinfer.match_ancestors(
            sample_data, ancestor_data, engine=engine
        )
        ts = tsinfer.match_samples(
            sample_data, ancestors_ts, engine=engine, simplify=True
        )
    ts.dump(args.output)
def run_infer(args):
    """Run full inference for the CLI and write the resulting tree sequence.

    Args:
        args: Parsed command-line namespace; this function reads
            ``progress``, ``samples``, ``num_threads`` and ``output_trees``
            and also hands the whole namespace to ``setup_logging``.
    """
    setup_logging(args)
    monitor = ProgressMonitor(
        enabled=args.progress,
        generate_ancestors=True,
        match_ancestors=True,
        match_samples=True,
    )
    samples = tsinfer.SampleData.load(args.samples)
    inferred_ts = tsinfer.infer(
        samples, progress_monitor=monitor, num_threads=args.num_threads
    )
    destination = get_output_trees_path(args.output_trees, args.samples)
    logger.info("Writing output tree sequence to {}".format(destination))
    inferred_ts.dump(destination)
    summarise_usage()
def test_no_error(self):
    """Each site of the inferred tree sequence has one 0 -> 1 mutation.

    Checked for both the "python" and the "c" method: the number of sites
    and mutations equals the number of input sites, every site has
    ancestral state "0" and a single parentless mutation to "1".
    """
    num_sites = 10
    genotypes, positions = get_random_data_example(5, num_sites)
    for method in ("python", "c"):
        inferred_ts = tsinfer.infer(
            genotypes=genotypes,
            positions=positions,
            sequence_length=num_sites,
            method=method,
        )
        self.assertEqual(inferred_ts.num_sites, num_sites)
        self.assertEqual(inferred_ts.num_mutations, num_sites)
        for site in inferred_ts.sites():
            self.assertEqual(site.ancestral_state, "0")
            self.assertEqual(len(site.mutations), 1)
            only_mutation = site.mutations[0]
            self.assertEqual(only_mutation.derived_state, "1")
            self.assertEqual(only_mutation.parent, -1)
def verify_data_round_trip(self, genotypes, positions, sequence_length=None):
    """Check that inference reproduces the input positions and genotypes.

    Runs for both the "python" and the "C" method.

    Args:
        genotypes: Genotype rows, indexed by site.
        positions: Site positions, indexed by site.
        sequence_length: Sequence length to infer with; when None it
            defaults to the last position plus one.
    """
    if sequence_length is None:
        sequence_length = positions[-1] + 1
    # To debug, enable logging with:
    #   import daiquiri
    #   daiquiri.setup(level="DEBUG")
    for method in ("python", "C"):
        inferred_ts = tsinfer.infer(
            genotypes=genotypes,
            positions=positions,
            sequence_length=sequence_length,
            method=method,
        )
        self.assertEqual(inferred_ts.sequence_length, sequence_length)
        self.assertEqual(inferred_ts.num_sites, len(positions))
        for variant in inferred_ts.variants():
            site_index = variant.index
            self.assertEqual(variant.position, positions[site_index])
            self.assertTrue(
                np.array_equal(genotypes[site_index], variant.genotypes)
            )
def iteration_tsdate(constr_sample_data, constr_sites, Ne, mut_rate,
                     adjust_priors=True):
    """Infer, simplify and date a tree sequence with optionally constrained priors.

    Args:
        constr_sample_data: Sample data passed to ``tsinfer.infer``.
        constr_sites: Mapping from a site position to a time limit. For each
            entry, prior grid cells below the timepoint closest to the limit
            are zeroed for the node carrying that site's mutation. May be
            empty or None, in which case priors are left untouched.
        Ne: Effective population size, passed to ``tsdate.get_dates`` and used
            to scale times by ``2 * Ne``.
        mut_rate: Mutation rate passed to ``tsdate.get_dates``.
        adjust_priors: When False, ``constr_sites`` is ignored.

    Returns:
        ``(inferred_ts, dates)`` where ``dates`` is the first value returned
        by ``tsdate.get_dates`` multiplied by ``2 * Ne``.

    Raises:
        IndexError: If a position in ``constr_sites`` is not a site position
            of the inferred tree sequence.
    """
    iter_infer = tsinfer.infer(constr_sample_data).simplify()
    priors = tsdate.build_prior_grid(iter_infer)
    if adjust_priors and constr_sites:
        # Fetch the tables once instead of on every loop iteration.
        tables = iter_infer.tables
        site_positions = tables.sites.position
        mutation_nodes = tables.mutations.node
        # Scale timepoints with the same 2 * Ne factor applied to the
        # returned dates (previously a hard-coded 20000, i.e. Ne == 10000).
        # NOTE(review): assumes limits are on that 2 * Ne-scaled time axis.
        scaled_timepoints = priors.timepoints * 2 * Ne
        for mut_pos, limit in constr_sites.items():
            # NOTE(review): the site index is used to index the mutation
            # table, which presumes one mutation per site in site order.
            infer_mut_pos = np.where(mut_pos == site_positions)[0][0]
            node = mutation_nodes[infer_mut_pos] - iter_infer.num_samples
            closest_timepoint = np.abs(scaled_timepoints - limit).argmin()
            priors.grid_data[node][:closest_timepoint] = 0
    iter_dates, _, _, _, _ = tsdate.get_dates(
        iter_infer, Ne=Ne, mutation_rate=mut_rate, priors=priors
    )
    return iter_infer, iter_dates * 2 * Ne
def run_infer(args):
    """Run full inference for the CLI and write the resulting tree sequence.

    Args:
        args: Parsed command-line namespace; this function reads ``samples``,
            ``progress``, ``num_threads`` and ``output_trees`` and also hands
            the whole namespace to ``setup_logging``.

    Raises:
        exceptions.FileFormatError: If the samples file cannot be loaded as
            sample data. When the file turns out to be a tree sequence, the
            message explains how to create a sample data file from one.
    """
    setup_logging(args)
    try:
        sample_data = tsinfer.SampleData.load(args.samples)
    except exceptions.FileFormatError as e:
        # Check if the user has tried to infer a tree sequence, a common
        # basic mistake.
        try:
            tskit.load(args.samples)
        except tskit.FileFormatError:
            raise e  # Re-raise the original error
        raise exceptions.FileFormatError(
            "Expecting a sample data file, not a tree sequence (you can create one "
            "via the Python function `tsinfer.SampleData.from_tree_sequence()`)."
        )
    # The sample data is loaded exactly once, in the try block above; the
    # redundant second load that used to follow has been removed.
    ts = tsinfer.infer(
        sample_data,
        progress_monitor=args.progress,
        num_threads=args.num_threads,
    )
    output_trees = get_output_trees_path(args.output_trees, args.samples)
    logger.info("Writing output tree sequence to {}".format(output_trees))
    ts.dump(output_trees)
    summarise_usage()
def test_random_data_inferred_no_simplify(self):
    """Verify an unsimplified inference with samples split 1 / rest."""
    sample_data = self.get_random_data_example(num_sites=20, num_samples=3)
    inferred_ts = tsinfer.infer(sample_data, simplify=False)
    sample_nodes = inferred_ts.samples()
    sample_sets = [sample_nodes[:1], sample_nodes[1:]]
    self.verify(inferred_ts, sample_sets)
def test_infer(self):
    """Run ``self.validate_ts`` on an inference from a multi-site simulation."""
    sim_ts = msprime.simulate(10, mutation_rate=1, random_seed=1)
    assert sim_ts.num_sites > 1
    sample_data = tsinfer.SampleData.from_tree_sequence(sim_ts)
    self.validate_ts(tsinfer.infer(sample_data))
# Toy example: print a small simulated tree, then build six hand-written
# sites into a sample data file, infer a tree sequence from them and print
# the inferred trees and haplotypes.
import os
import sys

import msprime

# Make the package in the parent directory importable before importing it.
sys.path.insert(0, os.path.abspath(".."))
import tsinfer  # noqa

ts = msprime.simulate(5, mutation_rate=0.7, random_seed=10)
tree = ts.first()
print(ts.num_sites)
print(tree.draw(format="unicode"))

# (position, genotypes of the five samples, alleles) for each toy site.
toy_sites = [
    (10, [0, 1, 0, 0, 0], ["A", "T"]),
    (12, [0, 0, 0, 1, 1], ["G", "C"]),
    (23, [0, 1, 1, 0, 0], ["C", "A"]),
    (37, [0, 1, 1, 0, 0], ["G", "C"]),
    (40, [0, 0, 0, 1, 1], ["A", "C"]),
    (50, [0, 1, 0, 0, 0], ["T", "G"]),
]
with tsinfer.SampleData(path="toy.samples") as sample_data:
    for position, genotypes, alleles in toy_sites:
        sample_data.add_site(position, genotypes, alleles)
print(sample_data)

inferred_ts = tsinfer.infer(sample_data)
for tree in inferred_ts.trees():
    print(tree.draw(format="unicode"))
for sample_id, h in enumerate(inferred_ts.haplotypes()):
    print(sample_id, h, sep="\t")
pickle.dump(M, f) with open(os.path.join(write_loc, 'pickles_root_kids_list'), 'wb') as f: pickle.dump(list_of_root_and_kids, f) ## loading example #with open(file_loc + 'pickled_pop_list.pickle', 'rb') as f: # pop_list = pickle.load(f) ########### inference on truncation part ########### # ts infer sd = tsinfer.SampleData.from_tree_sequence(truncated_ts, use_times=False) ts_inferred = tsinfer.infer(sd, simplify=False) ts_inferred = ts_inferred.simplify(filter_sites=False, keep_unary=True) ts_inferred ts_inferred.dump(os.path.join(write_loc, 'inferred_tree.trees')) #Out[43]: <tskit.trees.TreeSequence at 0x1ed55ca9710> X = 18 Y = 20 i = 0 for tree in ts_inferred.trees(): if i > X and i <= Y: display( SVG(tree.draw(height=800, width=2000, tree_height_scale='rank'))) print("Tree {} covers [{:.2f}, {:.2f}); TMRCA = {:.4f}".format(
def evaluate_tsdate_accuracy(
    parameter,
    parameters_arr,
    node_mut=False,
    inferred=True,
    prior_distr="lognorm",
    progress=True,
):
    """Evaluate tsdate estimates over a sweep of one simulation/dating parameter.

    For every value in ``parameters_arr`` and every random seed in 1..5, a
    tree sequence is simulated with msprime, mutations are re-thrown with
    ``msprime.mutate``, the topology is optionally re-inferred with tsinfer,
    and the result is dated with both the "inside_outside" and the
    "maximization" method.

    Args:
        parameter: Name of the swept parameter. ``"sample_size"``,
            ``"length"``, ``"mutation_rate"`` and ``"timepoints"`` change the
            run; any other value leaves all settings at their defaults.
        parameters_arr: Values of the swept parameter.
        node_mut: If True, compare times of non-sample nodes; otherwise
            compare the times of the nodes above each mutation.
        inferred: If True, date a tsinfer-inferred (and simplified) tree
            sequence; otherwise date the simulated one.
        prior_distr: Forwarded as ``prior_distribution`` to
            ``tsdate.build_prior_grid`` (only when sweeping ``"timepoints"``).
        progress: If False, the tqdm progress bar is disabled.

    Returns:
        ``(all_results, prior_distr, inferred, node_mut)`` where
        ``all_results[str(value)]`` holds the lists ``"io"``, ``"max"`` and
        ``"true_times"``, one array appended per random seed.

    Raises:
        ValueError: If both ``node_mut`` and ``inferred`` are true.
    """
    Ne = 10000
    if node_mut and inferred:
        raise ValueError(
            "cannot evaluate node accuracy on inferred tree sequence")
    mutation_rate = 1e-8
    recombination_rate = 1e-8
    all_results = {
        str(value): {"io": [], "max": [], "true_times": []}
        for value in parameters_arr
    }
    random_seeds = range(1, 6)

    inferred_progress = "using tsinfer" if inferred else "true topology"
    if node_mut:
        node_mut_progress = "comparing true and estimated node times"
    else:
        node_mut_progress = "comparing true and estimated mutation times"

    for param in tqdm(
        parameters_arr,
        desc="Testing " + parameter + " " + inferred_progress
        + ". Evaluation by " + node_mut_progress,
        total=len(parameters_arr),
        disable=not progress,
    ):
        # Settings that depend only on the swept value, not on the seed.
        sample_size = param if parameter == "sample_size" else 100
        length = param if parameter == "length" else 1e6
        run_mutation_rate = (
            param if parameter == "mutation_rate" else mutation_rate
        )
        results = all_results[str(param)]

        for random_seed in random_seeds:
            # Simulate once with the final length; previously a 1e6-length
            # simulation was run and thrown away when sweeping "length".
            ts = msprime.simulate(
                sample_size=sample_size,
                Ne=Ne,
                length=length,
                mutation_rate=mutation_rate,
                recombination_rate=recombination_rate,
                random_seed=random_seed,
            )
            mutated_ts = msprime.mutate(
                ts, rate=run_mutation_rate, random_seed=random_seed)

            if inferred:
                sample_data = tsinfer.formats.SampleData.from_tree_sequence(
                    mutated_ts, use_times=False)
                target_ts = tsinfer.infer(sample_data).simplify()
            else:
                target_ts = mutated_ts

            # Arguments shared by both dating methods.
            date_kwargs = {
                "mutation_rate": run_mutation_rate,
                "Ne": Ne,
                "progress": False,
            }
            if parameter == "timepoints":
                date_kwargs["prior"] = tsdate.build_prior_grid(
                    target_ts,
                    timepoints=param,
                    approximate_prior=True,
                    prior_distribution=prior_distr,
                    progress=False,
                )
            io_dated = tsdate.date(
                target_ts, method="inside_outside", **date_kwargs)
            max_dated = tsdate.date(
                target_ts, method="maximization", **date_kwargs)

            if node_mut and not inferred:
                # Compare the times of all nodes after the sample nodes.
                first_non_sample = ts.num_samples
                results["true_times"].append(
                    mutated_ts.tables.nodes.time[first_non_sample:])
                results["io"].append(
                    io_dated.tables.nodes.time[first_non_sample:])
                results["max"].append(
                    max_dated.tables.nodes.time[first_non_sample:])
            else:
                # Compare the time of the node above each mutation.
                results["true_times"].append(
                    mutated_ts.tables.nodes.time[
                        mutated_ts.tables.mutations.node])
                results["io"].append(
                    io_dated.tables.nodes.time[
                        io_dated.tables.mutations.node])
                results["max"].append(
                    max_dated.tables.nodes.time[
                        max_dated.tables.mutations.node])
    return all_results, prior_distr, inferred, node_mut
def test_tsinfer_output(self, small_sd_fixture):
    """``check_ancestors_ts`` raises ValueError for a fully inferred ts."""
    inferred_ts = tsinfer.infer(small_sd_fixture)
    with pytest.raises(ValueError):
        tsinfer.check_ancestors_ts(inferred_ts)
def test_inferred_no_simplify(self, medium_sd_fixture):
    """Run ``self.verify`` on an unsimplified inference of the fixture."""
    unsimplified_ts = tsinfer.infer(medium_sd_fixture, simplify=False)
    self.verify(unsimplified_ts)
def test_inferred(self, medium_sd_fixture):
    """Run ``self.verify`` on a tree sequence inferred from the fixture."""
    inferred_ts = tsinfer.infer(medium_sd_fixture)
    self.verify(inferred_ts)
def test_random_data_inferred(self):
    """Verify an inference from random data with samples split in halves."""
    n = 20
    sample_data = self.get_random_data_example(num_sites=52, num_samples=n)
    inferred_ts = tsinfer.infer(sample_data)
    sample_nodes = inferred_ts.samples()
    half = n // 2
    self.verify(inferred_ts, [sample_nodes[:half], sample_nodes[half:]])
def test_tsinfer_output(self):
    """``check_ancestors_ts`` raises ValueError for a fully inferred ts."""
    sim_ts = msprime.simulate(10, mutation_rate=1, random_seed=1)
    sample_data = tsinfer.SampleData.from_tree_sequence(sim_ts)
    inferred_ts = tsinfer.infer(sample_data)
    with self.assertRaises(ValueError):
        tsinfer.check_ancestors_ts(inferred_ts)