def test_linear_space(self):
    """Date one simulation in linear probability space with two methods.

    A single prior grid (10 timepoints) is built once and reused for a
    default ``tsdate.date`` run and a ``method="maximization"`` run; each
    result is then passed to ``ts_equal_except_times`` with the input.
    """
    # This makes ~1700 trees, and previously caused a failure
    simulated = msprime.simulate(
        sample_size=10,
        length=2e6,
        Ne=10000,
        mutation_rate=1e-8,
        recombination_rate=1e-8,
        random_seed=11,
    )
    grid = tsdate.build_prior_grid(
        simulated, timepoints=10, approximate_priors=None)
    # Keyword arguments common to both dating runs.
    shared = dict(
        Ne=10000, mutation_rate=1e-8, priors=grid, probability_space=LIN)
    default_dated = tsdate.date(simulated, **shared)
    max_dated = tsdate.date(simulated, method="maximization", **shared)
    for result in (default_dated, max_dated):
        self.ts_equal_except_times(simulated, result)
def main():
    """Command-line entry point: date a tree sequence file and save the result.

    Parses the positional arguments ``input``, ``output`` and ``Ne`` plus the
    ``--mutation-rate``, ``--verbosity`` and ``--version`` options, loads the
    input with ``tskit.load``, builds a prior grid with
    ``approximate_priors=True``, dates the tree sequence and dumps it to
    ``output``.

    Raises:
        ValueError: if ``input`` does not name an existing file.
    """
    template = """Simple CLI wrapper for tsdate tskit version: {} tsdate version: {}"""
    description = template.format(tskit.__version__, tsdate.__version__)

    cli_parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli_parser.add_argument('--verbosity', '-v', action='count', default=0)
    cli_parser.add_argument("input", help="The input tree sequence file name")
    cli_parser.add_argument(
        "output", help="The path to write the output tree sequence file to")
    cli_parser.add_argument('Ne', type=float, help="Effective population size")
    cli_parser.add_argument(
        "--mutation-rate", default=1e-8, type=float, help="Mutation rate")
    cli_parser.add_argument(
        "-V", "--version", action='version', version=description)
    options = cli_parser.parse_args()

    # Refuse to continue when the input path is not a regular file.
    if not os.path.isfile(options.input):
        raise ValueError("No input tree sequence file")

    source_ts = tskit.load(options.input)
    grid = tsdate.build_prior_grid(source_ts, approximate_priors=True)
    tsdate.date(
        source_ts,
        options.Ne,
        mutation_rate=options.mutation_rate,
        priors=grid,
    ).dump(options.output)
def test_simple_sim_1_tree(self):
    """Infer and date a small simulation, then compare haplotypes.

    The simulation is converted to sample data with ``use_times`` set to
    ``True`` and then ``False``; each inferred tree sequence is dated and its
    haplotypes are compared pairwise with those of the simulation.
    """
    source_ts = msprime.simulate(8, mutation_rate=5, random_seed=2)
    for use_times in (True, False):
        samples = tsinfer.SampleData.from_tree_sequence(
            source_ts, use_times=use_times)
        dated = tsdate.date(tsinfer.infer(samples), Ne=1, mutation_rate=5)
        haplotype_pairs = zip(source_ts.haplotypes(), dated.haplotypes())
        matches = [before == after for before, after in haplotype_pairs]
        self.assertTrue(all(matches))
def run_date(args):
    """Load a tree sequence, date it with ``tsdate.date`` and dump the result.

    ``args`` is an object exposing the attributes read below:
    ``tree_sequence``, ``output``, ``Ne``, ``mutation_rate``,
    ``recombination_rate``, ``probability_space``, ``method``, ``epsilon``,
    ``num_threads``, ``ignore_oldest`` and ``progress``.

    Raises:
        SystemExit: with an error message if ``tskit.load`` raises
            ``tskit.FileFormatError`` for ``args.tree_sequence``.
    """
    # Function-scope import so the block does not rely on a module-level one.
    import sys

    try:
        ts = tskit.load(args.tree_sequence)
    except tskit.FileFormatError as ffe:
        # sys.exit instead of the bare `exit`, which is injected by the `site`
        # module for interactive use and is absent under `python -S`.
        # The closing quote around the file name was missing in the message.
        sys.exit("Error loading '{}': {}".format(args.tree_sequence, ffe))
    dated_ts = tsdate.date(
        ts,
        args.Ne,
        mutation_rate=args.mutation_rate,
        recombination_rate=args.recombination_rate,
        probability_space=args.probability_space,
        method=args.method,
        eps=args.epsilon,
        num_threads=args.num_threads,
        ignore_oldest_root=args.ignore_oldest,
        progress=args.progress,
    )
    dated_ts.dump(args.output)
def compare_python_api(self, input_ts, cmd, Ne, mutation_rate, method):
    """Check that the CLI ``date`` command and ``tsdate.date`` agree.

    ``input_ts`` is dumped to a temporary directory, dated through
    ``cli.tsdate_main`` using the extra command-line arguments in ``cmd``,
    and reloaded. The node times of that output are then compared with the
    node times from calling ``tsdate.date`` directly with ``Ne``,
    ``mutation_rate`` and ``method``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = pathlib.Path(tmpdir)
        input_filename = workdir / "input.trees"
        output_filename = workdir / "output.trees"
        input_ts.dump(input_filename)
        # Build argv as a list and whitespace-split only `cmd`, so that a
        # temporary directory path containing spaces stays a single argument.
        argv = ["date", str(input_filename), str(output_filename), *cmd.split()]
        cli.tsdate_main(argv)
        output_ts = tskit.load(output_filename)
        dated_ts = tsdate.date(
            input_ts, Ne=Ne, mutation_rate=mutation_rate, method=method)
        # Printed so both time arrays are visible in the captured test output.
        print(dated_ts.tables.nodes.time, output_ts.tables.nodes.time)
        self.assertTrue(
            np.array_equal(
                dated_ts.tables.nodes.time, output_ts.tables.nodes.time))
def test_truncated_ts(self):
    """Date a tree sequence whose samples were truncated.

    The simulation is passed through
    ``utility_functions.truncate_ts_samples`` before dating, and the dated
    result is checked against the truncated input with
    ``ts_equal_except_times``.
    """
    pop_size = 1e2
    mut_rate = 2e-4
    simulated = msprime.simulate(
        10,
        Ne=pop_size,
        length=400,
        recombination_rate=1e-4,
        mutation_rate=mut_rate,
        random_seed=12,
    )
    truncated = utility_functions.truncate_ts_samples(
        simulated, average_span=200, random_seed=123)
    dated = tsdate.date(truncated, Ne=pop_size, mutation_rate=mut_rate)
    # We should ideally test whether *haplotypes* are the same here
    # in case allele encoding has changed. But haplotypes() doesn't currently
    # deal with missing data
    self.ts_equal_except_times(truncated, dated)
def evaluate_tsdate_accuracy(parameter, parameters_arr, node_mut=False,
                             inferred=True, prior_distr='lognorm',
                             progress=True):
    """Simulate, optionally re-infer, and date tree sequences for each value
    of one varied parameter.

    For every value in ``parameters_arr`` and every random seed in 1..5, a
    tree sequence is simulated with msprime, passed through
    ``msprime.mutate``, optionally re-inferred with tsinfer, and dated with
    both the ``'inside_outside'`` and ``'maximization'`` methods.

    :param str parameter: Which setting the values in ``parameters_arr``
        replace. ``'sample_size'``, ``'length'``, ``'mutation_rate'`` and
        ``'timepoints'`` are recognised; any other value leaves every
        setting at its default.
    :param parameters_arr: Sized iterable of values to evaluate.
    :param bool node_mut: If true, collect times of non-sample nodes;
        otherwise collect the times of the nodes above each mutation.
    :param bool inferred: If true, date a tree sequence inferred by tsinfer
        instead of the simulated topology.
    :param str prior_distr: Passed as ``prior_distribution`` when a prior
        grid is built (only for ``parameter == 'timepoints'``).
    :param bool progress: Whether to show the tqdm progress bar.
    :return: ``(all_results, prior_distr, inferred, node_mut)`` where
        ``all_results`` maps ``str(param)`` to a dict with keys ``'io'``,
        ``'max'`` and ``'true_times'``, each a list with one array per seed.
    :raises ValueError: If both ``node_mut`` and ``inferred`` are true.
    """
    Ne = 10000
    if node_mut and inferred:
        raise ValueError(
            "cannot evaluate node accuracy on inferred tree sequence")
    mutation_rate = 1e-8
    recombination_rate = 1e-8
    result_keys = ('io', 'max', 'true_times')
    all_results = {
        str(param): {key: [] for key in result_keys}
        for param in parameters_arr
    }
    random_seeds = range(1, 6)

    inferred_progress = 'using tsinfer' if inferred else 'true topology'
    if node_mut:
        node_mut_progress = 'comparing true and estimated node times'
    else:
        node_mut_progress = 'comparing true and estimated mutation times'
    description = ('Testing ' + parameter + " " + inferred_progress +
                   ". Evaluation by " + node_mut_progress)

    for param in tqdm(parameters_arr, desc=description,
                      total=len(parameters_arr), disable=not progress):
        # Settings that depend only on the varied parameter are resolved once
        # per value rather than once per seed.
        sample_size = param if parameter == 'sample_size' else 100
        length = param if parameter == 'length' else 1e6
        # The same rate is used for adding mutations and for dating.
        rate = param if parameter == 'mutation_rate' else mutation_rate
        results = all_results[str(param)]

        for random_seed in random_seeds:
            # One simulation per seed: the requested length is passed
            # directly, so no default-length simulation is run and discarded.
            ts = msprime.simulate(
                sample_size=sample_size, Ne=Ne, length=length,
                mutation_rate=mutation_rate,
                recombination_rate=recombination_rate,
                random_seed=random_seed)
            mutated_ts = msprime.mutate(
                ts, rate=rate, random_seed=random_seed)

            if inferred:
                sample_data = tsinfer.formats.SampleData.from_tree_sequence(
                    mutated_ts, use_times=False)
                target_ts = tsinfer.infer(sample_data).simplify()
            else:
                target_ts = mutated_ts

            date_kwargs = dict(mutation_rate=rate, Ne=Ne, progress=False)
            if parameter == 'timepoints':
                # NOTE(review): other tsdate call sites spell these keywords
                # `approximate_priors=` and `priors=`; confirm which spelling
                # the installed tsdate version accepts.
                date_kwargs['prior'] = tsdate.build_prior_grid(
                    target_ts, timepoints=param, approximate_prior=True,
                    prior_distribution=prior_distr, progress=False)
            io_dated = tsdate.date(
                target_ts, method='inside_outside', **date_kwargs)
            max_dated = tsdate.date(
                target_ts, method='maximization', **date_kwargs)

            if node_mut and not inferred:
                # Presumably the sample nodes occupy the first `num_samples`
                # rows of the node table, so this keeps the remaining nodes
                # -- TODO confirm.
                first_other = ts.num_samples
                results['true_times'].append(
                    mutated_ts.tables.nodes.time[first_other:])
                results['io'].append(
                    io_dated.tables.nodes.time[first_other:])
                results['max'].append(
                    max_dated.tables.nodes.time[first_other:])
            else:
                # Time of the node each mutation sits above.
                results['true_times'].append(
                    mutated_ts.tables.nodes.time[
                        mutated_ts.tables.mutations.node])
                results['io'].append(
                    io_dated.tables.nodes.time[
                        io_dated.tables.mutations.node])
                results['max'].append(
                    max_dated.tables.nodes.time[
                        max_dated.tables.mutations.node])

    return all_results, prior_distr, inferred, node_mut
def test_simple_sim_1_tree(self):
    """Date a small simulation and check it with ``ts_equal_except_times``."""
    simulated = msprime.simulate(8, mutation_rate=5, random_seed=2)
    self.ts_equal_except_times(
        simulated,
        tsdate.date(simulated, Ne=1, mutation_rate=5),
    )