示例#1
0
def visualise(
    ts,
    recombination_rate,
    error_rate,
    engine="C",
    box_size=8,
    perfect_ancestors=False,
    path_compression=False,
    time_chunking=False,
):
    """
    Run inference on ``ts`` and emit visual and textual diagnostics.

    Sample data is derived from ``ts``. Ancestors are rebuilt from ``ts``
    itself when ``perfect_ancestors`` is true, otherwise generated from the
    samples. An unsimplified inference is handed to ``Visualiser`` and its
    copying paths are drawn under ``tmp__NOBACKUP__/``. A second, simplified
    inference is then passed to ``tsinfer.print_tree_pairs`` along with
    ``ts``, and every site with more than one mutation is printed.

    ``recombination_rate`` and ``error_rate`` are accepted but are not read
    anywhere in this function.
    """
    samples = tsinfer.SampleData.from_tree_sequence(ts)

    if not perfect_ancestors:
        ancestors = tsinfer.generate_ancestors(samples, engine=engine)
    else:
        ancestors = tsinfer.AncestorData(samples)
        tsinfer.build_simulated_ancestors(
            samples, ancestors, ts, time_chunking=time_chunking
        )
        ancestors.finalise()

    # Both matching passes used for the drawing share these options.
    checked_options = {
        "engine": engine,
        "path_compression": path_compression,
        "extended_checks": True,
    }
    ancestors_ts = tsinfer.match_ancestors(samples, ancestors, **checked_options)
    unsimplified_ts = tsinfer.match_samples(
        samples, ancestors_ts, simplify=False, **checked_options
    )

    output_pattern = os.path.join("tmp__NOBACKUP__/", "copying_{}.png")
    Visualiser(
        ts, samples, ancestors, unsimplified_ts, box_size=box_size
    ).draw_copying_paths(output_pattern)

    # Re-run sample matching, this time simplified, for the comparison output.
    simplified_ts = tsinfer.match_samples(
        samples,
        ancestors_ts,
        engine=engine,
        simplify=True,
        path_compression=False,
        stabilise_node_ordering=True,
    )

    tsinfer.print_tree_pairs(ts, simplified_ts, compute_distances=True)
    sys.stdout.flush()
    print("num_sites = ", simplified_ts.num_sites, "num_mutations= ",
          simplified_ts.num_mutations)

    for site in simplified_ts.sites():
        if len(site.mutations) <= 1:
            continue
        mutated_nodes = [mut.node for mut in site.mutations]
        print("Multiple mutations at ", site.id, "over", mutated_nodes)
示例#2
0
 def setUp(self):
     """
     Create a temporary directory holding a full set of inference files.

     A small msprime simulation is converted to a sample file, ancestors
     are generated into an ancestors file, and the ancestors tree sequence
     and final tree sequence are dumped alongside them. The resulting paths
     and the simulated tree sequence are stored on ``self``.
     """
     self.tempdir = tempfile.TemporaryDirectory(prefix="tsinfer_cli_test")
     workdir = pathlib.Path(self.tempdir.name)
     self.sample_file = str(workdir / "input-data.samples")
     self.ancestor_file = str(workdir / "input-data.ancestors")
     self.ancestor_trees = str(workdir / "input-data.ancestors.trees")
     self.output_trees = str(workdir / "input-data.trees")

     self.input_ts = msprime.simulate(
         10, mutation_rate=10, recombination_rate=10, random_seed=10
     )

     samples = tsinfer.SampleData(
         sequence_length=self.input_ts.sequence_length, path=self.sample_file
     )
     for variant in self.input_ts.variants():
         samples.add_site(
             variant.site.position, variant.genotypes, variant.alleles
         )
     samples.finalise()

     # Ancestors are written to disk and then re-read from the file.
     tsinfer.generate_ancestors(
         samples, path=self.ancestor_file, chunk_size=10
     )
     ancestors = tsinfer.load(self.ancestor_file)

     ancestors_ts = tsinfer.match_ancestors(samples, ancestors)
     ancestors_ts.dump(self.ancestor_trees)

     final_ts = tsinfer.match_samples(samples, ancestors_ts)
     final_ts.dump(self.output_trees)
     samples.close()
示例#3
0
    def infer(self, ts, method, path_compression=False):
        """
        Infer a tree sequence from ``ts`` using ancestors simulated from it.

        The variants of ``ts`` are copied into uncompressed sample data,
        ancestors are built from ``ts`` via
        ``tsinfer.build_simulated_ancestors``, and both matching stages run
        with ``extended_checks`` enabled. Returns the simplified result of
        ``tsinfer.match_samples``.
        """
        samples = tsinfer.SampleData.initialise(
            num_samples=ts.num_samples,
            sequence_length=ts.sequence_length,
            compressor=None,
        )
        for variant in ts.variants():
            samples.add_variant(
                variant.site.position, variant.alleles, variant.genotypes
            )
        samples.finalise()

        ancestors = tsinfer.AncestorData.initialise(samples, compressor=None)
        tsinfer.build_simulated_ancestors(samples, ancestors, ts)
        ancestors.finalise()

        # Options common to the ancestor- and sample-matching stages.
        matching = {
            "method": method,
            "path_compression": path_compression,
            "extended_checks": True,
        }
        ancestors_ts = tsinfer.match_ancestors(samples, ancestors, **matching)
        return tsinfer.match_samples(
            samples, ancestors_ts, simplify=True, **matching
        )
示例#4
0
def run_match_samples(sample_data, ancestors_ts, num_threads):
    """
    Match samples against ``ancestors_ts`` without simplifying the result.

    A ``tsinfer.cli.ProgressMonitor`` enabled for the match-samples stage is
    passed through so progress is reported. Returns whatever
    ``tsinfer.match_samples`` returns.
    """
    monitor = tsinfer.cli.ProgressMonitor(enabled=True, match_samples=True)
    options = {
        "num_threads": num_threads,
        "simplify": False,
        "progress_monitor": monitor,
    }
    return tsinfer.match_samples(sample_data, ancestors_ts, **options)
示例#5
0
 def verify_from_source(self, remove_leaves):
     """
     Check that an ancestors tree sequence built from a known tree sequence
     can be used for sample matching with every engine.

     A fixed-seed simulation is converted to sample data, an ancestors tree
     sequence is made from it (with ``remove_leaves`` forwarded), and that
     ancestors tree sequence is checked. Samples are then matched with both
     the Python and C engines, and each result is verified against the
     samples.
     """
     ts = msprime.simulate(15, recombination_rate=1, mutation_rate=2, random_seed=3)
     samples = tsinfer.SampleData.from_tree_sequence(ts)
     ancestors_ts = tsinfer.make_ancestors_ts(
         samples, ts, remove_leaves=remove_leaves)
     tsinfer.check_ancestors_ts(ancestors_ts)
     for engine in [tsinfer.PY_ENGINE, tsinfer.C_ENGINE]:
         final_ts = tsinfer.match_samples(samples, ancestors_ts, engine=engine)
         # Verify inside the loop so that every engine's output is checked,
         # not only the one produced by the last iteration.
         tsinfer.verify(samples, final_ts)
def main():
    """
    Command-line entry point: infer a tree sequence from a samples file.

    Parses the command line, loads the samples file and writes the inferred
    tree sequence to the requested output path. When
    ``--inject-real-ancestors-from-ts`` is given, ancestors are constructed
    from that tree sequence file instead of being inferred.

    Raises:
        ValueError: if the samples file does not exist, or if the sample
            data yields no inference-site genotypes.
    """
    description = """Simple CLI wrapper for tsinfer
        tskit version: {}
        tsinfer version: {}""".format(tskit.__version__, tsinfer.__version__)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--verbosity', '-v', action='count', default=0)
    parser.add_argument(
        "samples",
        help="The samples file name, as saved by tsinfer.SampleData.initialise()")
    parser.add_argument(
        "output",
        help="The path to write the output file to")
    parser.add_argument(
        "-l", "--length", default=None, type=int,
        help="The total sequence length")
    parser.add_argument(
        "-t", "--threads", default=1, type=int,
        help="The number of worker threads to use")
    parser.add_argument(
        "-m", "--method", default="C", choices=['C','P'],
        help="Which implementation to use, [C] (faster) or [P]ython (more debuggable)")
    parser.add_argument(
        "--inject-real-ancestors-from-ts", default=None,
        help="Instead of inferring ancestors, construct known ones from this tree sequence file path")
    parser.add_argument(
        "-V", "--version", action='version', version=description)

    args = parser.parse_args()

    engine = tsinfer.PY_ENGINE if args.method == "P" else tsinfer.C_ENGINE

    if not os.path.isfile(args.samples):
        raise ValueError("No samples file")
    sample_data = tsinfer.load(args.samples)
    # all() over an empty iterator is True, so this detects "no items"
    # without consuming more than the first genotype record.
    if all(False for _ in sample_data.genotypes(inference_sites=True)):
        raise ValueError("No inference sites")
    if args.inject_real_ancestors_from_ts is not None:
        ancestor_data = tsinfer.AncestorData.initialise(sample_data, compressor=None)
        orig_ts = tskit.load(args.inject_real_ancestors_from_ts)
        eval_util.build_simulated_ancestors(sample_data, ancestor_data, orig_ts)
        ancestor_data.finalise()
        # Honour --threads on this path too; previously it was only passed
        # to tsinfer.infer() in the other branch.
        ancestors_ts = tsinfer.match_ancestors(
            sample_data, ancestor_data, num_threads=args.threads, engine=engine)
        ts = tsinfer.match_samples(
            sample_data, ancestors_ts, num_threads=args.threads, engine=engine,
            simplify=True)
    else:
        ts = tsinfer.infer(
            sample_data, num_threads=args.threads, engine=engine)
    ts.dump(args.output)
示例#7
0
def run_match_samples(args):
    """
    CLI handler: match samples to a stored ancestors tree sequence.

    Loads the sample data named by ``args.input``, resolves the ancestors
    and output paths through ``get_ancestors_ts`` / ``get_output_ts``, loads
    the ancestors tree sequence with ``msprime.load``, runs
    ``tsinfer.match_samples`` and dumps the result to the output path.
    """
    setup_logging(args)

    samples = tsinfer.SampleData.load(args.input)
    ancestors_path = get_ancestors_ts(args.ancestors_ts, args.input)
    output_path = get_output_ts(args.output_ts, args.input)

    logger.info("Loading ancestral genealogies from {}".format(ancestors_path))
    ancestors_ts = msprime.load(ancestors_path)

    # Path compression is on unless the user explicitly disabled it.
    use_path_compression = not args.no_path_compression
    inferred = tsinfer.match_samples(
        samples,
        ancestors_ts,
        num_threads=args.num_threads,
        path_compression=use_path_compression,
        progress=args.progress,
    )

    logger.info("Writing output tree sequence to {}".format(output_path))
    inferred.dump(output_path)
def match_samples(samples_fn, inferred_anc_ts, num_threads, r_prob, m_prob,
                  precision, prefix):
    """
    Match the samples stored in ``samples_fn`` against ``inferred_anc_ts``.

    ``r_prob`` and ``m_prob`` are forwarded as the ``recombination`` and
    ``mismatch`` arguments of ``tsinfer.match_samples``. The unsimplified
    result is dumped to ``prefix + ".nosimplify.trees"`` and also returned.
    """
    samples = tsinfer.load(samples_fn)
    options = dict(
        num_threads=num_threads,
        recombination=r_prob,
        mismatch=m_prob,
        precision=precision,
        progress_monitor=True,
        force_sample_times=True,
        simplify=False,
    )
    result_ts = tsinfer.match_samples(samples, inferred_anc_ts, **options)
    result_ts.dump(prefix + ".nosimplify.trees")
    return result_ts
def infer_with_mismatch(
    sample_data,
    path_to_genetic_map,
    ma_mismatch=1,
    ms_mismatch=1,
    precision=15,
    num_threads=1,
    path_compression=True,
    progress_monitor=False,
):
    """
    Run the full tsinfer pipeline with per-site recombination and mismatch
    probabilities derived from a HapMap-format genetic map.

    Parameters:
        sample_data: sample data passed to every tsinfer stage.
        path_to_genetic_map: path read with ``msprime.RateMap.read_hapmap``.
        ma_mismatch: mismatch ratio used when matching ancestors.
        ms_mismatch: mismatch ratio used when matching samples.
        precision, num_threads, path_compression, progress_monitor:
            forwarded unchanged to the tsinfer matching functions
            (``num_threads`` and ``progress_monitor`` also go to
            ``generate_ancestors``).

    Returns:
        The value returned by ``tsinfer.match_samples``.

    The two mismatch ratios were previously accepted but ignored (a ratio of
    1 was hard-coded for both stages). They are now applied, and the default
    values reproduce the earlier behaviour.
    """
    ancestors = tsinfer.generate_ancestors(
        sample_data, num_threads=num_threads, progress_monitor=progress_monitor
    )
    gmap = msprime.RateMap.read_hapmap(
        path_to_genetic_map, sequence_length=ancestors.sequence_length
    )
    positions = ancestors.sites_position[:]
    genetic_dists = tsinfer.Matcher.recombination_rate_to_dist(gmap, positions)
    recombination = tsinfer.Matcher.recombination_dist_to_prob(genetic_dists)
    # Exact zeros are replaced by a tiny positive probability
    # (presumably so the matcher never sees a zero -- confirm).
    recombination[recombination == 0] = 1e-20
    median_dist = np.median(genetic_dists)

    def _mismatch_for(ratio):
        # One constant mismatch probability per site, derived from the given
        # ratio and the median genetic distance.
        return np.full(
            len(positions),
            tsinfer.Matcher.mismatch_ratio_to_prob(ratio, median_dist, 2),
        )

    ancestors_ts = tsinfer.match_ancestors(
        sample_data,
        ancestors,
        recombination=recombination,
        mismatch=_mismatch_for(ma_mismatch),
        precision=precision,
        num_threads=num_threads,
        path_compression=path_compression,
        progress_monitor=progress_monitor,
    )
    return tsinfer.match_samples(
        sample_data,
        ancestors_ts,
        recombination=recombination,
        mismatch=_mismatch_for(ms_mismatch),
        precision=precision,
        num_threads=num_threads,
        path_compression=path_compression,
        progress_monitor=progress_monitor,
    )
示例#10
0
def tsinfer_dev(
    n,
    L,
    seed,
    num_threads=1,
    recombination_rate=1e-8,
    error_rate=0,
    method="C",
    log_level="WARNING",
    debug=True,
    progress=False,
    path_compression=True,
):
    """
    Development driver: simulate ``n`` samples, add genotype error via
    ``generate_samples`` and run the three tsinfer stages on the result.

    ``L`` is multiplied by 10**6 to obtain the simulated sequence length.
    Both the numpy and stdlib random generators are seeded with ``seed``.
    The ancestor data and the inferred edge count are printed.

    ``num_threads``, ``log_level``, ``progress`` and ``path_compression``
    are accepted but not read in this function.
    """
    np.random.seed(seed)
    random.seed(seed)
    sequence_length = int(L * 10**6)

    # Logging setup is currently disabled: daiquiri.setup(level=log_level)

    ts = msprime.simulate(
        n,
        Ne=10**4,
        length=sequence_length,
        recombination_rate=recombination_rate,
        mutation_rate=1e-8,
        random_seed=seed,
    )
    if debug:
        print("num_sites = ", ts.num_sites)
    assert ts.num_sites > 0

    genotype_rows = generate_samples(ts, error_rate)
    samples = tsinfer.SampleData.initialise(
        num_samples=ts.num_samples, sequence_length=ts.sequence_length
    )
    # Every site is recorded with the two alleles "0" and "1".
    for site, row in zip(ts.sites(), genotype_rows):
        samples.add_variant(site.position, ["0", "1"], row)
    samples.finalise()

    ancestors = tsinfer.AncestorData.initialise(samples)
    tsinfer.build_ancestors(samples, ancestors, method=method)
    ancestors.finalise()
    print(ancestors)

    ancestors_ts = tsinfer.match_ancestors(samples, ancestors, method=method)
    inferred = tsinfer.match_samples(samples, ancestors_ts, method=method)
    print("inferred_num_edges = ", inferred.num_edges)
示例#11
0
def run_infer(args):
    """
    CLI handler: run the whole inference pipeline on ``args.input``.

    Builds ancestors from the loaded sample data, matches ancestors and then
    samples, and dumps the resulting tree sequence to the path returned by
    ``get_output_ts``.
    """
    setup_logging(args)
    samples = tsinfer.SampleData.load(args.input)

    ancestors = tsinfer.AncestorData.initialise(samples)
    tsinfer.build_ancestors(samples, ancestors, progress=args.progress)
    ancestors.finalise()

    # Both matching stages take the same threading/progress options.
    stage_options = {
        "num_threads": args.num_threads,
        "progress": args.progress,
    }
    ancestors_ts = tsinfer.match_ancestors(samples, ancestors, **stage_options)
    destination = get_output_ts(args.output_ts, args.input)
    inferred = tsinfer.match_samples(samples, ancestors_ts, **stage_options)

    logger.info("Writing output tree sequence to {}".format(destination))
    inferred.dump(destination)
示例#12
0
文件: cli.py 项目: percyfal/tsinfer
def run_match_samples(args):
    """
    CLI handler: match samples against a saved ancestors tree sequence.

    Loads the sample data from ``args.samples``, resolves the ancestors and
    output paths with the ``get_*_trees_path`` helpers, loads the ancestors
    with ``tskit.load``, runs ``tsinfer.match_samples``, dumps the result
    and finally calls ``summarise_usage``.
    """
    setup_logging(args)

    samples = tsinfer.SampleData.load(args.samples)
    ancestors_path = get_ancestors_trees_path(args.ancestors_trees, args.samples)
    output_path = get_output_trees_path(args.output_trees, args.samples)

    logger.info(f"Loading ancestral genealogies from {ancestors_path}")
    ancestors_ts = tskit.load(ancestors_path)

    # The CLI exposes negative flags; invert them for the API call.
    options = {
        "num_threads": args.num_threads,
        "path_compression": not args.no_path_compression,
        "simplify": not args.no_simplify,
        "progress_monitor": args.progress,
    }
    inferred = tsinfer.match_samples(samples, ancestors_ts, **options)

    logger.info(f"Writing output tree sequence to {output_path}")
    inferred.dump(output_path)
    summarise_usage()
示例#13
0
    def verify_inserted_ancestors(self, ts):
        """
        Assert that ``ts`` survives a round trip through ancestors built
        from it, for both the "P" and "C" methods.

        NOTE: ``ts`` must be an SMC consistent tree sequence.
        """
        samples = formats.SampleData.initialise(
            num_samples=ts.num_samples,
            sequence_length=ts.sequence_length,
            compressor=None,
        )
        for variant in ts.variants():
            samples.add_variant(
                variant.position, variant.alleles, variant.genotypes
            )
        samples.finalise()

        ancestors = formats.AncestorData.initialise(samples, compressor=None)
        tsinfer.build_simulated_ancestors(samples, ancestors, ts)
        ancestors.finalise()

        # Expected genotype matrix of the ancestors tree sequence: one
        # column per ancestor, filled only over that ancestor's
        # [start, end) site range and zero elsewhere.
        num_ancestors = ancestors.num_ancestors
        expected = np.zeros((ancestors.num_sites, num_ancestors), dtype=np.uint8)
        first_site = ancestors.start[:]
        last_site = ancestors.end[:]
        haplotypes = ancestors.ancestor[:]
        for column in range(num_ancestors):
            expected[first_site[column]:last_site[column], column] = haplotypes[column]

        original_genotypes_of = ts.genotype_matrix
        for method in ("P", "C"):
            ancestors_ts = tsinfer.match_ancestors(
                samples, ancestors, method=method
            )
            self.assertEqual(ancestors.num_sites, ancestors_ts.num_sites)
            self.assertEqual(ancestors.num_ancestors, ancestors_ts.num_samples)
            self.assertTrue(
                np.array_equal(ancestors_ts.genotype_matrix(), expected)
            )

            round_tripped = tsinfer.match_samples(
                samples, ancestors_ts, method=method
            )
            self.assertTrue(
                np.array_equal(
                    round_tripped.genotype_matrix(), original_genotypes_of()
                )
            )
示例#14
0
def run(params):
    """
    Run a single inference, with the specified rates

    ``params`` supplies ``rec_rate``, ``precision``, ``cutoff_power``,
    ``trim_oldest``, ``num_threads`` and ``sample_data``. When
    ``params.precision`` is None a precision is derived from the smallest
    recombination rate and the median rate.

    If the sample data has a path (which must end in ".samples"), the
    ancestors, ancestors tree sequence and inferred tree sequence are saved
    next to it under a prefix encoding the run settings, and the KC distance
    to ``<prefix>.trees`` is computed when that file exists. If the sample
    data has no path, nothing is written and the ``kc``, ``ts_size`` and
    ``ts_path`` fields of the result are None.

    Returns a ``Results`` record of the settings used and summary statistics
    of the inferred tree sequence.
    """
    # The first entry of rec_rate is skipped for all summary statistics.
    rho = params.rec_rate[1:]
    base_rec_prob = np.quantile(rho, 0.5)
    ma_mis_rate = ms_mis_rate = 1.0
    if params.precision is None:
        # Smallest recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(rho))))
        # Smallest mean
        av_min = int(
            np.ceil(
                -np.log10(min(1, ma_mis_rate, ms_mis_rate) * base_rec_prob)))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision
    ma_mis = base_rec_prob * ma_mis_rate
    ms_mis = base_rec_prob * ms_mis_rate
    print(
        f"Starting {params.cutoff_power}, trim_oldest={params.trim_oldest}",
        f"with base rho {base_rec_prob:.5g}",
        f"(mean {np.mean(rho):.4g} median {np.quantile(rho, 0.5):.4g}",
        f"min {np.min(rho):.4g}, 2.5% quantile {np.quantile(rho, 0.025):.4g})",
        f"precision {precision}")
    prefix = None
    # Must be bound even when there is no sample path; it is consulted below.
    inf_prefix = None
    if params.sample_data.path is not None:
        assert params.sample_data.path.endswith(".samples")
        prefix = params.sample_data.path[0:-len(".samples")]
        inf_prefix = "{}_rma{}_rms{}_N{}_{}_p{}".format(
            prefix, ma_mis_rate, ms_mis_rate, params.cutoff_power,
            "trim" if params.trim_oldest else "norm", precision)
    start_time = time.process_time()
    anc = tsinfer.generate_ancestors(
        params.sample_data,
        cutoff_power=params.cutoff_power,
        trim_oldest=params.trim_oldest,
        num_threads=params.num_threads,
        path=None if inf_prefix is None else inf_prefix + ".ancestors",
    )
    print(f"GA done (rel_ma_mis:{ma_mis_rate}, rel_ms_mis:{ms_mis_rate})")
    inferred_anc_ts = tsinfer.match_ancestors(
        params.sample_data,
        anc,
        num_threads=params.num_threads,
        precision=precision,
        recombination_rate=params.rec_rate,
        mismatch_rate=ma_mis,
    )
    if inf_prefix is not None:
        # Positional argument: the keyword name of dump()'s first parameter
        # differs between tskit releases.
        inferred_anc_ts.dump(inf_prefix + ".atrees")
    print(f"MA done: abs_ma_mis rate = {ma_mis}")
    inferred_ts = tsinfer.match_samples(params.sample_data,
                                        inferred_anc_ts,
                                        num_threads=params.num_threads,
                                        precision=precision,
                                        recombination_rate=params.rec_rate,
                                        mismatch_rate=ms_mis)
    # Timing stops before any output is written.
    process_time = time.process_time() - start_time
    ts_path = None
    ts_size = None
    if inf_prefix is not None:
        ts_path = inf_prefix + ".trees"
        inferred_ts.dump(ts_path)
        ts_size = os.path.getsize(ts_path)
    print(f"MS done: abs_ms_mis rate = {ms_mis}")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes,
    # weighting every node by the span of the tree it appears in.
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    nc_mean = nc_sum / nc_tot
    # Variance is not adjusted for n.
    nc_var = nc_sum_sq / nc_tot - (nc_mean**2)

    # Calculate KC against the tree sequence stored beside the samples file,
    # if there is one.
    kc = None
    if prefix is not None:
        try:
            kc = simplified_inferred_ts.kc_distance(
                tskit.load(prefix + ".trees"))
        except FileNotFoundError:
            kc = None
    return Results(abs_ma_mis=ma_mis,
                   abs_ms_mis=ms_mis,
                   rel_ma_mis=ma_mis_rate,
                   rel_ms_mis=ms_mis_rate,
                   cutoff_power=params.cutoff_power,
                   trim_oldest=params.trim_oldest,
                   precision=precision,
                   edges=inferred_ts.num_edges,
                   muts=inferred_ts.num_mutations,
                   num_trees=inferred_ts.num_trees,
                   kc=kc,
                   mean_node_children=nc_mean,
                   var_node_children=nc_var,
                   process_time=process_time,
                   ts_size=ts_size,
                   ts_path=ts_path)
示例#15
0
def run(params):
    """
    Run a single inference, with the specified rates

    Intermediate files are stored next to the sample data file and reused on
    later calls: ``<prefix>.ancestors`` and ``<prefix>.atrees`` are loaded
    instead of regenerated when they already exist. The final unsimplified
    tree sequence is written to ``<prefix>.nosimplify.trees``.

    Raises:
        ValueError: if the sample data has no path (every stage reads or
            writes files derived from it), or if ``<prefix>.trees`` already
            exists.

    Returns a ``Results`` record with the rates, precision, tree sequence
    statistics, per-stage process times and output file details.
    """
    if params.sample_data.path is None:
        # Previously this surfaced later as an opaque TypeError from
        # concatenating None with a file suffix.
        raise ValueError("Sample data must be backed by a '.samples' file")
    assert params.sample_data.path.endswith(".samples")
    prefix = params.sample_data.path[0:-len(".samples")]

    start_time = time.process_time()
    ga_start_time = time.process_time()
    if not os.path.isfile(prefix + ".ancestors"):
        anc = tsinfer.generate_ancestors(
            params.sample_data,
            num_threads=params.num_threads,
            path=prefix + ".ancestors",
            progress_monitor=tsinfer.cli.ProgressMonitor(1, 1, 0, 0, 0),
        )
        print(
            f"GA done (ma_mut: {params.ma_mut_rate}, ms_mut: {params.ms_mut_rate})"
        )
    else:
        anc = tsinfer.load(prefix + ".ancestors")
    ga_process_time = time.process_time() - ga_start_time

    anc_w_proxy = anc.insert_proxy_samples(params.sample_data,
                                           allow_mutation=True)
    # If any proxy ancestors were added, save the proxy ancestors file and
    # use it for matching; path compression is switched off in that case.
    if anc_w_proxy.num_ancestors != anc.num_ancestors:
        anc = anc_w_proxy.copy(path=prefix + ".proxy.ancestors")
        anc.finalise()
        path_compression = False
    else:
        path_compression = True

    rec_rate = get_rho(anc, params.filename)
    # The first entry of rec_rate is skipped for all summary statistics.
    rho = rec_rate[1:]
    base_rec_prob = np.quantile(rho, 0.5)
    if params.precision is None:
        # Smallest recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(rho))))
        # Smallest mean
        av_min = int(
            np.ceil(-np.log10(
                min(1, params.ma_mut_rate, params.ms_mut_rate) *
                base_rec_prob)))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision
    print(
        f"Starting {params.ma_mut_rate} {params.ms_mut_rate}",
        f"with base rho {base_rec_prob:.5g}",
        f"(mean {np.mean(rho):.4g} median {np.quantile(rho, 0.5):.4g}",
        f"min {np.min(rho):.4g}, 2.5% quantile {np.quantile(rho, 0.025):.4g})",
        f"precision {precision}")

    ma_start_time = time.process_time()
    if not os.path.isfile(prefix + ".atrees"):
        inferred_anc_ts = tsinfer.match_ancestors(
            params.sample_data,
            anc,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=rec_rate,
            mismatch_rate=base_rec_prob * params.ma_mut_rate,
            path_compression=path_compression,
            progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 1, 0, 0),
        )
        # Positional argument: the keyword name of dump()'s first parameter
        # differs between tskit releases.
        inferred_anc_ts.dump(prefix + ".atrees")
        print(
            f"MA done (ma_mut:{params.ma_mut_rate} ms_mut{params.ms_mut_rate})"
        )
    else:
        inferred_anc_ts = tskit.load(prefix + ".atrees")
    ma_process_time = time.process_time() - ma_start_time

    ms_start_time = time.process_time()
    # NOTE: the existence check is on "<prefix>.trees" while the output is
    # written to "<prefix>.nosimplify.trees"; kept as in the original.
    if os.path.isfile(prefix + ".trees"):
        raise ValueError("Inferred tree sequence already present")
    inferred_ts = tsinfer.match_samples(
        params.sample_data,
        inferred_anc_ts,
        num_threads=params.num_threads,
        precision=precision,
        recombination_rate=rec_rate,
        mismatch_rate=base_rec_prob * params.ms_mut_rate,
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 0, 0, 1),
        force_sample_times=True,
        simplify=False)
    print(f"MS done: ms_mut rate = {params.ms_mut_rate})")
    process_time = time.process_time() - start_time
    ms_process_time = time.process_time() - ms_start_time
    ts_path = prefix + ".nosimplify.trees"
    inferred_ts.dump(ts_path)

    return Results(ma_mut=params.ma_mut_rate,
                   ms_mut=params.ms_mut_rate,
                   precision=precision,
                   edges=inferred_ts.num_edges,
                   muts=inferred_ts.num_mutations,
                   num_trees=inferred_ts.num_trees,
                   process_time=process_time,
                   ga_process_time=ga_process_time,
                   ma_process_time=ma_process_time,
                   ms_process_time=ms_process_time,
                   ts_size=os.path.getsize(ts_path),
                   ts_path=ts_path)
示例#16
0
def tsinfer_dev(
    n,
    L,
    seed,
    num_threads=1,
    recombination_rate=1e-8,
    error_rate=0,
    engine="C",
    log_level="WARNING",
    precision=None,
    debug=True,
    progress=False,
    path_compression=True,
):
    """
    Development driver: simulate ``n`` samples, run the three tsinfer
    stages on the simulated data and verify the result against the samples.

    ``L`` is multiplied by 10**6 to obtain the simulated sequence length.
    Both the numpy and stdlib random generators are seeded with ``seed``.
    The simulation's recombination rate and a fixed mutation rate of 1e-3
    are passed to both matching stages. The number of edges of the
    unsimplified result is printed.

    ``error_rate``, ``log_level``, ``progress`` and ``path_compression``
    are accepted but not read in this function.
    """
    np.random.seed(seed)
    random.seed(seed)
    sequence_length = int(L * 10**6)

    # Logging setup is currently disabled: daiquiri.setup(level=log_level)

    simulated = msprime.simulate(
        n,
        Ne=10**4,
        length=sequence_length,
        recombination_rate=recombination_rate,
        mutation_rate=1e-8,
        random_seed=seed,
    )
    if debug:
        print("num_sites = ", simulated.num_sites)
    assert simulated.num_sites > 0

    samples = tsinfer.SampleData.from_tree_sequence(simulated)

    # Rates shared by the ancestor- and sample-matching stages.
    rates = {
        "recombination_rate": recombination_rate,
        "mutation_rate": 1e-3,  # 1e-15
    }

    ancestors = tsinfer.generate_ancestors(
        samples, engine=engine, num_threads=num_threads
    )
    ancestors_ts = tsinfer.match_ancestors(
        samples,
        ancestors,
        engine=engine,
        path_compression=True,
        extended_checks=False,
        precision=precision,
        **rates,
    )
    inferred = tsinfer.match_samples(
        samples,
        ancestors_ts,
        path_compression=False,
        engine=engine,
        precision=precision,
        simplify=False,
        **rates,
    )

    print("num_edges = ", inferred.num_edges)

    tsinfer.verify(samples, inferred)
示例#17
0
def visualise(
    ts,
    recombination_rate,
    error_rate,
    method="C",
    box_size=8,
    perfect_ancestors=False,
    path_compression=False,
    time_chunking=False,
):
    """
    Run inference on ``ts`` and emit visual and textual diagnostics.

    The variants of ``ts`` are copied into uncompressed sample data.
    Ancestors are rebuilt from ``ts`` itself when ``perfect_ancestors`` is
    true, otherwise built from the samples. An unsimplified inference is
    handed to ``Visualiser`` and its copying paths are drawn under
    ``tmp__NOBACKUP__/``. A second, simplified inference is then passed to
    ``tsinfer.print_tree_pairs`` along with ``ts``, and every site with more
    than one mutation is printed.

    ``recombination_rate`` and ``error_rate`` are accepted but are not read
    anywhere in this function.
    """
    samples = tsinfer.SampleData.initialise(
        num_samples=ts.num_samples,
        sequence_length=ts.sequence_length,
        compressor=None,
    )
    for variant in ts.variants():
        samples.add_variant(
            variant.site.position, variant.alleles, variant.genotypes
        )
    samples.finalise()

    ancestors = tsinfer.AncestorData.initialise(samples, compressor=None)
    if not perfect_ancestors:
        tsinfer.build_ancestors(samples, ancestors, method=method)
    else:
        tsinfer.build_simulated_ancestors(
            samples, ancestors, ts, time_chunking=time_chunking
        )
    ancestors.finalise()

    # Both matching passes used for the drawing share these options.
    checked_options = {
        "method": method,
        "path_compression": path_compression,
        "extended_checks": True,
    }
    ancestors_ts = tsinfer.match_ancestors(samples, ancestors, **checked_options)
    unsimplified_ts = tsinfer.match_samples(
        samples, ancestors_ts, simplify=False, **checked_options
    )

    output_pattern = os.path.join("tmp__NOBACKUP__/", "copying_{}.png")
    Visualiser(
        ts, samples, ancestors, unsimplified_ts, box_size=box_size
    ).draw_copying_paths(output_pattern)

    # Re-run sample matching, this time simplified, for the comparison output.
    simplified_ts = tsinfer.match_samples(
        samples,
        ancestors_ts,
        method=method,
        simplify=True,
        path_compression=False,
        stabilise_node_ordering=True,
    )

    tsinfer.print_tree_pairs(ts, simplified_ts, compute_distances=True)
    sys.stdout.flush()
    print("num_sites = ", simplified_ts.num_sites, "num_mutations= ",
          simplified_ts.num_mutations)

    for site in simplified_ts.sites():
        if len(site.mutations) <= 1:
            continue
        mutated_nodes = [mut.node for mut in site.mutations]
        print("Multiple mutations at ", site.id, "over", mutated_nodes)
def run(params):
    """
    Run a single inference, with the specified rates.

    The ancestors and samples are matched with ``tsinfer``, re-using any
    previously saved intermediate files when ``params.skip_existing`` is set.
    Summary statistics about the inferred tree sequence are collected, stored
    in the top-level metadata of the saved tree sequence (under the
    ``"user_data"`` key) and returned.

    :param params: An object exposing the following attributes, all of which
        are read by this function: ``precision``, ``ma_mis_ratio``,
        ``ms_mis_ratio``, ``sample_file`` (must end in ``.samples``),
        ``anc_file`` (must end in ``.ancestors``), ``skip_existing``,
        ``num_threads``, ``rec_rate``, ``ts_file`` (may be None), ``kc_max``,
        ``kc_max_split``, ``error``, ``seed`` and ``source``.
    :return: A dict of results. If an inferred tree sequence already exists on
        disk (and ``skip_existing`` is set) and carries ``user_data``
        metadata, that stored dict is returned instead and nothing is
        re-inferred.
    :raises AssertionError: If the file names do not have the expected
        extensions, or if a pre-existing ancestors tree sequence contains site
        positions that are absent from the sample data file.
    :raises RuntimeError: If the inferred tree sequence already has non-empty,
        non-JSON top-level metadata.
    """
    precision = params.precision
    logger.info(
        f"Starting {params.ma_mis_ratio} {params.ms_mis_ratio}. Precision {precision}"
    )
    assert params.sample_file.endswith(".samples")
    assert params.anc_file.endswith(".ancestors")
    samples = tsinfer.load(params.sample_file)
    ancestors = tsinfer.load(params.anc_file)
    # Timing starts after the input files are loaded and stops once the
    # samples have been matched (see `process_time` below)
    start_time = time.process_time()
    prefix = params.sample_file[0:-len(".samples")]
    inf_prefix = "{}_rma{:g}_rms{:g}_p{}".format(prefix, params.ma_mis_ratio,
                                                 params.ms_mis_ratio,
                                                 precision)

    ats_path = inf_prefix + ".atrees"
    if params.skip_existing and os.path.exists(ats_path):
        logger.info(
            f"Ancestors ts file {ats_path} already exists, loading that.")
        inferred_anc_ts = tskit.load(ats_path)
        prov = json.loads(inferred_anc_ts.provenances()[-1].record.encode())
        if ancestors.uuid != prov['parameters']['source']['uuid']:
            logger.warning(
                "The loaded ancestors ts does not match the ancestors file. "
                "Checking the site positions, and will abort if they don't match!"
            )
            # We might be re-running this, but the simulation file is the same
            # So double-check that the positions in the ats are a subset of those in the
            # used sample data file
            assert np.all(
                np.isin(inferred_anc_ts.tables.sites.position,
                        samples.sites_position[:]))

    else:
        logger.info(f"MA running: will save to {ats_path}")
        inferred_anc_ts = tsinfer.match_ancestors(
            samples,
            ancestors,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=params.rec_rate,
            mismatch_ratio=params.ma_mis_ratio)
        inferred_anc_ts.dump(ats_path)
        logger.info(f"MA done: mismatch ratio = {params.ma_mis_ratio}")

    ts_path = inf_prefix + ".trees"
    if params.skip_existing and os.path.exists(ts_path):
        logger.info(
            f"Inferred ts file {ts_path} already exists, loading that.")
        inferred_ts = tskit.load(ts_path)
        try:
            user_data = inferred_ts.metadata['user_data']
        except (TypeError, KeyError):
            # Fall through to re-run the sample matching below
            logger.warning(
                f"No metadata in {ts_path}: re-inferring these parameters")
        else:
            try:
                assert np.allclose(params.kc_max, user_data['kc_max'])
            except (KeyError, TypeError):
                pass  # could be NaN e.g. if this is real data
            return user_data

    # Otherwise finish off the inference
    logger.info(
        f"MS running with {params.num_threads} threads: will save to {ts_path}"
    )
    inferred_ts = tsinfer.match_samples(samples,
                                        inferred_anc_ts,
                                        num_threads=params.num_threads,
                                        precision=precision,
                                        recombination_rate=params.rec_rate,
                                        mismatch_ratio=params.ms_mis_ratio)
    process_time = time.process_time() - start_time
    logger.info(f"MS done: mismatch ratio = {params.ms_mis_ratio}")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes,
    # weighting each node by the span of the tree in which it appears
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    arity_mean = nc_sum / nc_tot
    # Population variance (E[x^2] - E[x]^2): not adjusted for n
    arity_var = nc_sum_sq / nc_tot - arity_mean**2

    # These stay as None unless a simulated tree sequence is available
    sim_ts_bytes = sim_ts_min_bytes = None
    kc_poly = kc_split = None

    if params.ts_file is not None:
        try:
            simulated_ts = tskit.load(params.ts_file + ".trees")
            logger.info(f"Calculating KC distances for {ts_path}")
            sim_ts_bytes = simulated_ts.nbytes
            sim_ts_min_bytes = simulated_ts.simplify(
                keep_unary=True,
                reduce_to_site_topology=True,
                filter_sites=False).nbytes
            kc_poly = simplified_inferred_ts.kc_distance(simulated_ts)
            logger.debug("KC poly calculated")
            # Span-weighted mean of the per-tree KC distance, after randomly
            # resolving polytomies in the inferred trees. The seed is tied to
            # the interval so that repeated runs give the same answer.
            kc_split = 0
            for interval, orig_tree, new_tree in simulated_ts.coiterate(
                    simplified_inferred_ts, sample_lists=True):
                kc_split += interval.span * orig_tree.kc_distance(
                    new_tree.split_polytomies(random_seed=int(interval.left),
                                              sample_lists=True))
            kc_split /= simulated_ts.sequence_length
            logger.debug("KC split calculated")
        except FileNotFoundError:
            # Best effort only: without the simulated ts the KC / size
            # statistics are simply reported as None
            logger.debug(
                f"No simulated ts found for {params.ts_file}: skipping KC distances"
            )

    results = {
        'arity_mean': arity_mean,
        'arity_var': arity_var,
        'edges': inferred_ts.num_edges,
        'error': params.error,
        'kc_max_split': params.kc_max_split,
        'kc_max': params.kc_max,
        'kc_poly': kc_poly,
        'kc_split': kc_split,
        'muts': inferred_ts.num_mutations,
        'n': inferred_ts.num_samples,
        'num_sites': inferred_ts.num_sites,
        'num_trees': inferred_ts.num_trees,
        'precision': precision,
        'proc_time': process_time,
        'ma_mis_ratio': params.ma_mis_ratio,
        'ms_mis_ratio': params.ms_mis_ratio,
        'seed': params.seed,
        'sim_ts_min_bytes': sim_ts_min_bytes,
        'sim_ts_bytes': sim_ts_bytes,
        'source': params.source,
        'ts_bytes': inferred_ts.nbytes,
        'ts_path': ts_path,
    }
    # Save the results into the ts metadata - this should allow us to reconstruct the
    # results table should anything go awry, or if we need to add more
    tables = inferred_ts.dump_tables()
    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError(
                "Metadata already exists in the ts, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": results, **tables.metadata}
    tables.tree_sequence().dump(ts_path)
    return results
def run(params):
    """
    Run a single inference, with the specified rates.

    The mismatch rates passed to ``tsinfer`` are the relative rates in
    ``params`` multiplied by the median of the recombination rate(s). If no
    precision is given, one is derived from the smallest nonzero
    recombination rate and the smaller of the two absolute mismatch rates.

    If the sample data has a path, intermediate and final results are saved
    next to it (``.ancestors``, ``.atrees`` and ``.trees`` files sharing a
    parameter-specific prefix). If it has no path, the inference is run purely
    in memory: nothing is written, no KC distance is calculated, and the
    ``ts_size`` and ``ts_path`` fields of the result are None.

    :param params: An object exposing the following attributes, all of which
        are read by this function: ``rec_rate`` (indexed with a boolean mask,
        so presumably a numpy array), ``ma_mis_rate``, ``ms_mis_rate``,
        ``precision`` (may be None), ``sample_data`` (with a ``path``
        attribute that may be None), ``cutoff_exponent`` (may be None) and
        ``num_threads``.
    :return: A ``Results`` instance describing the inferred tree sequence.
    :raises AssertionError: If the sample data path does not end in
        ``.samples``.
    """
    rho = params.rec_rate
    nonzero_rho = rho[rho > 0]
    av_rho = np.quantile(rho, 0.5)
    ma_mis = av_rho * params.ma_mis_rate
    ms_mis = av_rho * params.ms_mis_rate

    if params.precision is None:
        # Smallest nonzero recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(nonzero_rho))))
        # Smallest mean
        av_min = int(np.ceil(-np.log10(min(ma_mis, ms_mis))))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision

    print(
        f"Starting {params.ma_mis_rate} {params.ms_mis_rate}",
        f"with av rho {av_rho:.5g}",
        f"(mean {np.mean(rho):.4g}, median {av_rho:.4g}, ",
        f"nonzero min {np.min(nonzero_rho):.4g}, ",
        f"2.5% quantile {np.quantile(rho, 0.025):.4g}) precision {precision}")

    # Both prefixes stay as None for in-memory sample data: every later use
    # that needs a file name must check for that first
    prefix = None
    inf_prefix = None
    if params.sample_data.path is not None:
        assert params.sample_data.path.endswith(".samples")
        prefix = params.sample_data.path[0:-len(".samples")]
        inf_prefix = "{}_ma{}_ms{}_N{}_p{}".format(prefix, params.ma_mis_rate,
                                                   params.ms_mis_rate,
                                                   params.cutoff_exponent,
                                                   precision)
    start_time = time.process_time()
    extra_params = dict(num_threads=params.num_threads)
    if params.cutoff_exponent is not None:
        extra_params['cutoff_power'] = params.cutoff_exponent
    anc = tsinfer.generate_ancestors(
        params.sample_data,
        path=None if inf_prefix is None else inf_prefix + ".ancestors",
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 1, 0, 0, 0),
        **extra_params,
    )
    print(f"GA done (cutoff exponent: {params.cutoff_exponent})")
    extra_params = dict(
        num_threads=params.num_threads,
        recombination_rate=rho,
        precision=precision,
    )
    inferred_anc_ts = tsinfer.match_ancestors(
        params.sample_data,
        anc,
        mismatch_rate=ma_mis,
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 1, 0, 0),
        **extra_params,
    )
    if inf_prefix is not None:
        inferred_anc_ts.dump(path=inf_prefix + ".atrees")
    print(f"MA done (ma_mis:{ma_mis})")
    inferred_ts = tsinfer.match_samples(
        params.sample_data,
        inferred_anc_ts,
        mismatch_rate=ms_mis,
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 0, 0, 1),
        **extra_params,
    )
    # Timing deliberately excludes saving the result and the statistics below
    process_time = time.process_time() - start_time
    ts_path = None
    ts_size = None
    if inf_prefix is not None:
        ts_path = inf_prefix + ".trees"
        inferred_ts.dump(path=ts_path)
        ts_size = os.path.getsize(ts_path)
    print(f"MS done: ms_mis rate = {ms_mis}")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes,
    # weighting each node by the span of the tree in which it appears
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    nc_mean = nc_sum / nc_tot
    # Population variance (E[x^2] - E[x]^2): not adjusted for n
    nc_var = nc_sum_sq / nc_tot - nc_mean**2

    # Calculate KC against the tree sequence saved alongside the sample data,
    # if there is one
    kc = None
    if prefix is not None:
        try:
            kc = simplified_inferred_ts.kc_distance(
                tskit.load(prefix + ".trees"))
        except FileNotFoundError:
            kc = None
    return Results(abs_ma_mis=ma_mis,
                   abs_ms_mis=ms_mis,
                   rel_ma_mis=params.ma_mis_rate,
                   rel_ms_mis=params.ms_mis_rate,
                   precision=precision,
                   edges=inferred_ts.num_edges,
                   muts=inferred_ts.num_mutations,
                   num_trees=inferred_ts.num_trees,
                   kc=kc,
                   cutoff_exponent=params.cutoff_exponent,
                   mean_node_children=nc_mean,
                   var_node_children=nc_var,
                   process_time=process_time,
                   ts_size=ts_size,
                   ts_path=ts_path)
示例#20
0
def tsinfer_dev(n,
                L,
                seed,
                num_threads=1,
                recombination_rate=1e-8,
                error_rate=0,
                engine="C",
                log_level="WARNING",
                debug=True,
                progress=False,
                path_compression=True):
    """
    Simulate a tree sequence, push it through the tsinfer pipeline and verify
    the inferred result against the sample data.

    The steps are: simulate with msprime, build sample data, generate and
    match ancestors, augment the ancestors using samples 5, 6 and 7, match the
    samples and finally call ``tsinfer.verify``.

    :param n: Sample size handed to ``msprime.simulate``.
    :param L: Sequence length; multiplied by 10**6 and truncated to an int
        before simulating.
    :param seed: Seeds ``numpy.random``, ``random`` and the msprime simulation.
    :param num_threads: Passed to ``tsinfer.generate_ancestors`` only.
    :param recombination_rate: Recombination rate for the simulation.
    :param engine: Passed to every tsinfer call that accepts an engine.
    :param debug: If true, print the number of simulated sites.
    :param error_rate: Not read by this function.
    :param log_level: Not read by this function.
    :param progress: Not read by this function.
    :param path_compression: Not read by this function (path compression is
        hard-coded on for ancestor matching and off for sample matching).
    :raises AssertionError: If the simulation produced no sites.
    """
    np.random.seed(seed)
    random.seed(seed)

    simulation_args = dict(
        Ne=10000,
        length=int(L * 1000000),
        recombination_rate=recombination_rate,
        mutation_rate=1e-8,
        random_seed=seed,
    )
    simulated_ts = msprime.simulate(n, **simulation_args)
    site_count = simulated_ts.num_sites
    if debug:
        print("num_sites = ", site_count)
    assert site_count > 0

    sample_data = tsinfer.SampleData.from_tree_sequence(simulated_ts)

    # Stage 1: build the ancestors and match them against each other
    generated_ancestors = tsinfer.generate_ancestors(
        sample_data, engine=engine, num_threads=num_threads)
    matched_ancestors_ts = tsinfer.match_ancestors(
        sample_data,
        generated_ancestors,
        engine=engine,
        path_compression=True,
        extended_checks=False,
    )

    # Stage 2: augment the ancestors with a fixed handful of samples
    augmented_ancestors_ts = tsinfer.augment_ancestors(
        sample_data, matched_ancestors_ts, [5, 6, 7], engine=engine)

    # Stage 3: thread the samples through and check the outcome
    inferred_ts = tsinfer.match_samples(
        sample_data,
        augmented_ancestors_ts,
        path_compression=False,
        engine=engine,
        simplify=True,
    )
    tsinfer.verify(sample_data, inferred_ts)
示例#21
0
                             alleles=v.alleles,
                             genotypes=v.genotypes[ancient_sample_indices])
ancient_samples.finalise()

#%%
# Infer and date a tree sequence from the modern samples only

# Restrict the source tree sequence to the modern samples, keeping every site
# (filter_sites=False) even if it no longer varies among them
primary_ts = ts.simplify(modern_sample_indices, filter_sites=False)
primary_samples = tsinfer.SampleData.from_tree_sequence(primary_ts)

# Run the three tsinfer stages separately, rather than in one call, so that
# the intermediate ancestors tree sequence is available
ancestors = tsinfer.generate_ancestors(primary_samples)
ancestors_ts = tsinfer.match_ancestors(
    primary_samples, ancestors)  # This only has inference sites

# simplify=False leaves the matched tree sequence unsimplified; it is then
# simplified by hand below so that unary nodes can be kept
primary_inferred_ts = tsinfer.match_samples(primary_samples,
                                            ancestors_ts,
                                            simplify=False)
# Keep only the nodes whose flags are exactly 1 (presumably the tskit sample
# flag - TODO confirm), retaining unary nodes along the way
primary_inferred_ts_simplified = primary_inferred_ts.simplify(
    np.where(primary_inferred_ts.tables.nodes.flags == 1)[0], keep_unary=True)

# NOTE(review): the return value of tsdate.date is discarded here - confirm
# whether the dated tree sequence was meant to be captured in a variable
tsdate.date(primary_inferred_ts_simplified,
            Ne=stable_pop_size,
            mutation_rate=2.5e-5)

#%%
# Rest of the inference: augmenting the older (ancient) samples in

augment_samples = ancient_samples
## Re-inserting the older samples: work on a copy of the ancient sample data
## (presumably so that it can be modified - TODO confirm)
augment_samples = augment_samples.copy()