Example #1
def make_sampledata_compatible(args):
    """
    Make a list of sampledata files compatible with the first file.
    """

    # Load all the sampledata files into a list
    print("Subset sites with {} sampledata files".format(
        len(args.input_sampledata) - 1))
    for index, fn in enumerate(args.input_sampledata):
        fn = fn.rstrip("\n")
        if index == 0:
            target_sd = tsinfer.load(fn)
            print("Loaded First sampledata file")
            continue
        cur_sd = tsinfer.load(fn)
        print("Loaded sampledata file # {}".format(index))
        keep_sites = np.where(
            np.isin(cur_sd.sites_position[:], target_sd.sites_position[:]))[0]
        print("Subsetting to {} sites".format(len(keep_sites)))
        small_cur_sd = cur_sd.subset(sites=keep_sites)
        print("Done with subset")
        newname = fn[:-len(".samples")] + ".subset.samples"
        small_cur_sd_copy = small_cur_sd.copy(newname)
        small_cur_sd_copy.finalise()
        print(
            "Subsetted to {} sites from {}. Output can be found at {}.".format(
                len(keep_sites), fn, newname))
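A minimal usage sketch of the helper above; the file names and the SimpleNamespace stand-in for parsed CLI arguments are hypothetical:

from types import SimpleNamespace

# The first file is the target; every later file is subset to the target's
# site positions and written to "<name>.subset.samples".
args = SimpleNamespace(
    input_sampledata=["chr20_target.samples", "chr20_other.samples"])
make_sampledata_compatible(args)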
def get_ancient_constraints_tgp(args):
    if os.path.exists("all-data/1kg_ancients_only_chr20.samples"):
        ancient_samples = tsinfer.load(
            "all-data/1kg_ancients_only_chr20.samples")
    else:
        ancient_samples = tsinfer.load("all-data/1kg_ancients_chr20.samples")
        print("Subsetting SampleData file to only keep ancient samples")
        ancient_indiv_ids = np.where(
            ancient_samples.individuals_time[:] != 0)[0]
        ancient_sample_ids = np.where(ancient_samples.individuals_time[:][
            ancient_samples.samples_individual] != 0)[0]
        ancient_genos = ancient_samples.sites_genotypes[:]
        ancient_sites = np.where(
            np.any(ancient_genos[:, ancient_sample_ids] == 1, axis=1))[0]
        ancient_samples = ancient_samples.subset(individuals=ancient_indiv_ids,
                                                 sites=ancient_sites)
        copy = ancient_samples.copy("all-data/1kg_ancients_only_chr20.samples")
        copy.finalise()
        print("Subsetted to {} samples and {} sites".format(
            ancient_samples.num_samples, ancient_samples.num_sites))
    genotypes = ancient_samples.sites_genotypes[:]
    positions = ancient_samples.sites_position[:]
    alleles = ancient_samples.sites_alleles[:]
    min_site_times = ancient_samples.min_site_times(individuals_only=True)
    lower_bound = [(pos, allele[0], allele[1], age, np.sum(geno == 1))
                   for pos, allele, age, geno in zip(
                       positions, alleles, min_site_times, genotypes)]
    constraint_df = pd.DataFrame(
        lower_bound,
        columns=[
            "Position",
            "Reference Allele",
            "Alternative Allele",
            "Ancient Bound",
            "Number of Ancients",
        ],
    )
    constraint_df = constraint_df.astype({
        "Position": "int64",
        "Ancient Bound": "float64",
        "Number of Ancients": "int32"
    })
    constraint_df = constraint_df[constraint_df["Ancient Bound"] != 0]
    constraint_df.to_csv("all-data/ancient_constraints.csv")
    try:
        tgp_mut_ests = pd.read_csv("all-data/tgp_mutations.csv", index_col=0)
    except FileNotFoundError:
        raise ValueError(
            "tgp_mutations.csv does not exist. Must run tgp_dates first")
    tgp_muts_constraints = pd.merge(
        tgp_mut_ests,
        constraint_df,
        how="left",
        left_on=[
            "Position", "tsdate_ancestral_allele", "tsdate_derived_allele"
        ],
        right_on=["Position", "Reference Allele", "Alternative Allele"],
    )
    tgp_muts_constraints.to_csv("all-data/tgp_muts_constraints.csv")
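A toy sketch (values hypothetical) of the left join performed above, which keys on position plus both alleles so that only exactly matching variants pick up an ancient bound:

import pandas as pd

tgp = pd.DataFrame({"Position": [100, 200],
                    "tsdate_ancestral_allele": ["A", "C"],
                    "tsdate_derived_allele": ["G", "T"]})
constraints = pd.DataFrame({"Position": [100],
                            "Reference Allele": ["A"],
                            "Alternative Allele": ["G"],
                            "Ancient Bound": [1000.0]})
merged = pd.merge(
    tgp, constraints, how="left",
    left_on=["Position", "tsdate_ancestral_allele", "tsdate_derived_allele"],
    right_on=["Position", "Reference Allele", "Alternative Allele"])
# Rows with no matching constraint get NaN in "Ancient Bound".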
Example #3
def combined_ts_constrained_samples(args):
    modern_samples = tsinfer.load(args.modern)
    high_cov_samples = tsinfer.load(args.high_cov)
    all_ancient_samples = tsinfer.load(args.all_samples)
    dated_hgdp_1kg_sgdp_ts = tskit.load(args.dated_ts)
    sites_time = tsdate.sites_time_from_ts(dated_hgdp_1kg_sgdp_ts)
    # Only look at sites where the same alleles are found in ancients and
    # moderns: the site must be biallelic in moderns and the derived allele
    # must be shared between moderns and ancients.
    alleles_equal = np.full(high_cov_samples.num_sites, False, dtype=bool)
    for index, (modern_alleles, high_cov_alleles,
                all_ancient_alleles) in enumerate(
                    zip(
                        modern_samples.sites_alleles[:],
                        high_cov_samples.sites_alleles[:],
                        all_ancient_samples.sites_alleles[:],
                    )):
        modern_alleles = [i for i in modern_alleles if i]
        high_cov_alleles = [i for i in high_cov_alleles if i]
        all_ancient_alleles = [i for i in all_ancient_alleles if i]
        if modern_alleles == high_cov_alleles == all_ancient_alleles:
            alleles_equal[index] = True

    # Get the ancient bounds from sampledata file of all ancients
    all_ancient_samples_bound = all_ancient_samples.min_site_times(
        individuals_only=True)
    high_cov_samples_bound = high_cov_samples.min_site_times(
        individuals_only=True)
    # Assert that the all-ancients samples file has ancient bounds at least
    # as old as the high-coverage-only file
    assert np.all(all_ancient_samples_bound >= high_cov_samples_bound)
    print(
        "Number of ancient lower bounds (with multiallelic sites): ",
        np.sum(all_ancient_samples_bound != 0),
    )

    # Set time of non-biallelic sites to 0
    all_ancient_samples_bound[~alleles_equal] = 0
    # If args.transversions_only is True, set time of all transversions to 0
    if args.transversions_only:
        transversions = get_transversions(all_ancient_samples)
        all_ancient_samples_bound[~transversions] = 0
    # Constrain the estimated ages from tree sequence with ancient bounds
    constrained_sites_time = np.maximum(sites_time, all_ancient_samples_bound)
    # Add constrained times to sampledata file with moderns and high cov ancients
    dated_samples = tsdate.add_sampledata_times(high_cov_samples,
                                                constrained_sites_time)
    # Record number of constrained sites
    print("Total number of sites: ", sites_time.shape[0])
    print(
        "Number of ancient lower bounds: ",
        np.sum(all_ancient_samples_bound != 0),
    )
    print("Number of corrected times: ",
          np.sum(dated_samples.sites_time[:] != sites_time))
    high_cov_samples_copy = dated_samples.copy(args.output)
    high_cov_samples_copy.finalise()
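The core of the constraint step above is a single elementwise maximum: a site's estimated age is never allowed to be younger than its oldest ancient carrier. A toy illustration with hypothetical values:

import numpy as np

sites_time = np.array([50.0, 500.0, 10.0])    # estimated site ages
ancient_bound = np.array([100.0, 0.0, 0.0])   # oldest ancient carrier per site
constrained = np.maximum(sites_time, ancient_bound)  # -> [100., 500., 10.]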
Example #4
 def __init__(self,
              data_file,
              ancestral_states,
              samples,
              target_samples=None):
     self.data_file = data_file
     self.ancestral_states = ancestral_states
     self.samples = samples
     if target_samples is not None:
         self.target_sites_pos = set(
             tsinfer.load(target_samples).sites_position[:])
     else:
         self.target_sites_pos = None
     self.num_samples = -1
     self.num_sites = 0
     # ancestral states counters.
     self.num_no_ancestral_state = 0
     self.num_low_confidence_ancestral_state = 0
     # Counters for genotypes and sites.
     self.num_unphased = 0
     self.num_missing_data = 0
     self.num_invariant = 0
     self.num_indels = 0
     self.num_non_biallelic = 0
     self.num_singletons = 0
     # (n - 1)-tons
     self.num_nmo_tons = 0
Example #5
def generate_ancestors(samples_fn, num_threads, prefix):
    sample_data = tsinfer.load(samples_fn)
    anc = tsinfer.generate_ancestors(
        sample_data,
        num_threads=num_threads,
        path=prefix + ".ancestors",
        progress_monitor=True,
    )
    if np.any(sample_data.individuals_time[:] != 0):
        anc_w_proxy = anc.insert_proxy_samples(sample_data,
                                               allow_mutation=True)
        anc = anc_w_proxy.copy(path=prefix + ".proxy.ancestors")
        anc.finalise()
    maximum_time = np.max(anc.ancestors_time[:])
    # Hacky way of checking if we used frequency to order ancestors
    if maximum_time < 3:
        anc = anc.truncate_ancestors(0.4,
                                     0.6,
                                     length_multiplier=1,
                                     path=prefix + ".truncated.ancestors")
    else:
        upper_time_limit = maximum_time * 0.6
        lower_time_limit = maximum_time * 0.4
        anc = anc.truncate_ancestors(
            lower_time_limit,
            upper_time_limit,
            length_multiplier=1,
            path=prefix + ".truncated.ancestors",
        )
    return anc
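A hypothetical invocation of the helper above; it writes "<prefix>.ancestors" (plus "<prefix>.proxy.ancestors" when ancient samples are present) and returns the truncated ancestors:

anc = generate_ancestors("chr20.samples", num_threads=4, prefix="chr20")
# Truncated ancestors end up in "chr20.truncated.ancestors".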
Example #6
 def setUp(self):
     self.tempdir = tempfile.TemporaryDirectory(prefix="tsinfer_cli_test")
     self.sample_file = str(
         pathlib.Path(self.tempdir.name, "input-data.samples"))
     self.ancestor_file = str(
         pathlib.Path(self.tempdir.name, "input-data.ancestors"))
     self.ancestor_trees = str(
         pathlib.Path(self.tempdir.name, "input-data.ancestors.trees"))
     self.output_trees = str(
         pathlib.Path(self.tempdir.name, "input-data.trees"))
     self.input_ts = msprime.simulate(10,
                                      mutation_rate=10,
                                      recombination_rate=10,
                                      random_seed=10)
     sample_data = tsinfer.SampleData(
         sequence_length=self.input_ts.sequence_length,
         path=self.sample_file)
     for var in self.input_ts.variants():
         sample_data.add_site(var.site.position, var.genotypes, var.alleles)
     sample_data.finalise()
     tsinfer.generate_ancestors(sample_data,
                                path=self.ancestor_file,
                                chunk_size=10)
     ancestor_data = tsinfer.load(self.ancestor_file)
     ancestors_ts = tsinfer.match_ancestors(sample_data, ancestor_data)
     ancestors_ts.dump(self.ancestor_trees)
     ts = tsinfer.match_samples(sample_data, ancestors_ts)
     ts.dump(self.output_trees)
     sample_data.close()
def setup_sample_file(args):
    """
    Return a Thousand Genomes Project sample data file, the
    corresponding recombination rate array, a prefix to use for files, and None
    """
    filename = args.sample_file
    map = args.genetic_map
    if not filename.endswith(".samples"):
        raise ValueError("Sample data file must end with '.samples'")
    sd = tsinfer.load(filename)
    inference_pos = sd.sites_position[:][sd.sites_inference[:]]

    match = re.search(r'(chr\d+)', filename)
    if match or map is not None:
        if map is not None:
            chr_map = msprime.RecombinationMap.read_hapmap(map)
        else:
            chr = match.group(1)
            print(
                f"Using {chr} from HapMapII_GRCh37 for the recombination map")
            map = stdpopsim.get_species("HomSap").get_genetic_map(
                id="HapMapII_GRCh37")
            if not map.is_cached():
                map.download()
            chr_map = map.get_chromosome_map(chr)
        inference_distances = physical_to_genetic(chr_map, inference_pos)
        d = np.diff(inference_distances)
        rho = np.concatenate(([0.0], d))
    else:
        inference_distances = inference_pos
        d = np.diff(inference_distances)
        rho = np.concatenate(([0.0], d / sd.sequence_length))

    return sd, rho, filename[:-len(".samples")], None
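The physical_to_genetic helper is called above but not shown. A plausible sketch, assuming the legacy msprime.RecombinationMap API used in this example:

import numpy as np

def physical_to_genetic(recombination_map, positions):
    # Map each physical coordinate to its cumulative genetic-map position;
    # the legacy msprime.RecombinationMap exposes this per position.
    return np.array([recombination_map.physical_to_genetic(p)
                     for p in positions])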
Example #8
def split_chromosome(args):
    match = re.search(r"(chr\d+)", args.chrom)
    if match is None:
        raise ValueError("chr must be in filename")
    chrom = match.group(1)
    with open(args.centromeres) as csvfile:
        reader = csv.DictReader(csvfile)
        centromere_positions = list()
        for row in reader:
            if row["chrom"] == chrom:
                centromere_positions.append(int(row["chromStart"]))
                centromere_positions.append(int(row["chromEnd"]))
    start = np.min(centromere_positions)
    end = np.max(centromere_positions)
    split_point = (start + end) / 2
    samples = tsinfer.load(args.input)
    position = samples.sites_position[:]
    print(f"Splitting at {split_point}")
    if args.arm == "p":
        keep_sites = np.where(position < split_point)[0]
        print(f"Keeping {keep_sites.shape[0]} sites")
        arm = samples.subset(sites=keep_sites)
        snipped_samples = arm.copy(path=args.output)
        snipped_samples.data.attrs["sequence_length"] = split_point
    elif args.arm == "q":
        keep_sites = np.where(position > split_point)[0]
        print(f"Keeping {keep_sites.shape[0]} sites")
        arm = samples.subset(sites=keep_sites)
        snipped_samples = arm.copy(path=args.output)
    snipped_samples.finalise()
Example #9
def run_get_dated_samples(args):
    samples = tsinfer.load(args.samples)
    ts = tskit.load(args.ts)
    assert args.samples.endswith(".samples")
    prefix = args.samples[0:-len(".samples")]
    copy = samples.copy(prefix + ".dated.samples")
    copy.sites_time[:] = tsdate.get_sites_time(ts)
    copy.finalise()
Example #10
def remove_moderns_reich(args):
    samples = tsinfer.load(args.input)
    ancients = samples.subset(individuals=np.where(
        samples.individuals_time[:] != 0)[0])
    genos = ancients.sites_genotypes[:]
    sites = np.where(np.sum(genos == 1, axis=1) != 0)[0]
    ancients_pruned = ancients.subset(sites=sites)
    copy = ancients_pruned.copy(args.output)
    copy.finalise()
Example #11
def setup_sample_file(args):
    """
    Return a Thousand Genomes Project sample data file, the
    corresponding recombination rate array, a prefix to use for files, and None
    """
    filename = args.sample_file
    if not filename.endswith(".samples"):
        raise ValueError("Sample data file must end with '.samples'")
    sd = tsinfer.load(filename)
    return sd, filename[:-len(".samples")]
def main():

    description = """Simple CLI wrapper for tsinfer
        tskit version: {}
        tsinfer version: {}""".format(tskit.__version__, tsinfer.__version__)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--verbosity', '-v', action='count', default=0)
    parser.add_argument(
        "samples",
        help="The samples file name, as saved by tsinfer.SampleData.initialise()")
    parser.add_argument(
        "output",
        help="The path to write the output file to")
    parser.add_argument(
        "-l", "--length", default=None, type=int,
        help="The total sequence length")
    parser.add_argument(
        "-t", "--threads", default=1, type=int,
        help="The number of worker threads to use")
    parser.add_argument(
        "-m", "--method", default="C", choices=['C','P'],
        help="Which implementation to use, [C] (faster) or [P]ython (more debuggable)")
    parser.add_argument(
        "--inject-real-ancestors-from-ts", default=None,
        help="Instead of inferring ancestors, construct known ones from this tree sequence file path")
    parser.add_argument(
        "-V", "--version", action='version', version=description)

    args = parser.parse_args()

    engine = tsinfer.PY_ENGINE if args.method == "P" else tsinfer.C_ENGINE

    if not os.path.isfile(args.samples):
        raise ValueError("No samples file")
    sample_data = tsinfer.load(args.samples)
    if all(False for _ in sample_data.genotypes(inference_sites=True)):
        raise ValueError("No inference sites")
    if args.inject_real_ancestors_from_ts is not None:
        ancestor_data = tsinfer.AncestorData.initialise(sample_data, compressor=None)
        orig_ts = tskit.load(args.inject_real_ancestors_from_ts)
        eval_util.build_simulated_ancestors(sample_data, ancestor_data, orig_ts)
        ancestor_data.finalise()
        ancestors_ts = tsinfer.match_ancestors(
            sample_data, ancestor_data, engine=engine)
        ts = tsinfer.match_samples(
            sample_data, ancestors_ts, engine=engine, simplify=True)
    else:
        ts = tsinfer.infer(
            sample_data, num_threads=args.threads, engine=engine)
    ts.dump(args.output)
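A hypothetical command line for the wrapper above (script name assumed):

# python cli.py input.samples output.trees --threads 4 --method C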
Example #13
def add_indiv_times(args):
    """
    Takes samples 'age' in metadata and add to individuals_time[:]
    """
    samples = tsinfer.load(args.input)
    times = samples.individuals_time[:]
    copy = samples.copy(args.output)
    for indiv in samples.individuals():
        if "age" in indiv.metadata:
            times[indiv.id] = int(indiv.metadata["age"])

    copy.individuals_time[:] = times
    copy.finalise()
Example #14
def combined_ts_constrained_samples(args):
    high_cov_samples = tsinfer.load(args.high_cov)
    dated_hgdp_1kg_sgdp_ts = tskit.load(args.dated_ts)
    sites_time = tsdate.sites_time_from_ts(dated_hgdp_1kg_sgdp_ts)
    dated_samples = tsdate.add_sampledata_times(high_cov_samples, sites_time)
    # Record number of constrained sites
    print("Total number of sites: ", sites_time.shape[0])
    print("Number of ancient lower bounds: ",
          np.sum(high_cov_samples.min_site_times(individuals_only=True) != 0))
    print("Number of corrected times: ",
          np.sum(dated_samples.sites_time[:] != sites_time))
    high_cov_samples_copy = dated_samples.copy(args.output)
    high_cov_samples_copy.finalise()
Example #15
def match_ancestors(samples_fn, anc, num_threads, precision, r_prob, m_prob,
                    prefix):
    sample_data = tsinfer.load(samples_fn)
    inferred_anc_ts = tsinfer.match_ancestors(
        sample_data,
        anc,
        num_threads=num_threads,
        precision=precision,
        recombination=r_prob,
        mismatch=m_prob,
        progress_monitor=True,
    )
    inferred_anc_ts.dump(prefix + ".atrees")
    return inferred_anc_ts
Example #16
def run_list(args):
    setup_logging(args)
    # First try to load with tskit.
    ts = None
    try:
        ts = tskit.load(args.path)
    except tskit.FileFormatError:
        pass
    if ts is None:
        tsinfer_file = tsinfer.load(args.path)
        if args.storage:
            print(tsinfer_file.info)
        else:
            print(tsinfer_file)
    else:
        summarise_tree_sequence(args.path, ts)
Example #17
def match_samples(samples_fn, inferred_anc_ts, num_threads, r_prob, m_prob,
                  precision, prefix):
    sample_data = tsinfer.load(samples_fn)
    inferred_ts = tsinfer.match_samples(
        sample_data,
        inferred_anc_ts,
        num_threads=num_threads,
        recombination=r_prob,
        mismatch=m_prob,
        precision=precision,
        progress_monitor=True,
        force_sample_times=True,
        simplify=False,
    )
    ts_path = prefix + ".nosimplify.trees"
    inferred_ts.dump(ts_path)
    return inferred_ts
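A sketch chaining the two matching helpers with the generate_ancestors helper from Example #5 (all parameter values hypothetical; r_prob and m_prob stand for whatever per-site recombination and mismatch probabilities the pipeline expects):

anc = generate_ancestors("chr20.samples", num_threads=4, prefix="chr20")
anc_ts = match_ancestors("chr20.samples", anc, num_threads=4, precision=10,
                         r_prob=1e-8, m_prob=1e-8, prefix="chr20")
ts = match_samples("chr20.samples", anc_ts, num_threads=4, r_prob=1e-8,
                   m_prob=1e-8, precision=10, prefix="chr20")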
def setup_sample_file(base_filename, args, num_threads=1):
    """
    Return a sample data file, the ancestors file, a corresponding recombination rate
    (a single number or a RateMap), a prefix to use for files, and None
    """
    gmap = args.genetic_map
    sd = tsinfer.load(base_filename + ".samples")

    anc = tsinfer.generate_ancestors(
        sd,
        num_threads=num_threads,
        path=base_filename + ".ancestors",
    )
    logger.info("GA done")

    inference_pos = anc.sites_position[:]

    match = re.search(r'(chr\d+)', base_filename)
    if match or gmap is not None:
        if gmap is not None:
            logger.info(f"Using {gmap} for the recombination map")
            rho = intervals.read_hapmap(gmap)
        else:
            chr = match.group(1)
            logger.info(
                f"Using {chr} from HapMapII_GRCh37 for the recombination map")
            gmap = stdpopsim.get_species("HomSap").get_genetic_map(
                id="HapMapII_GRCh37")
            if not gmap.is_cached():
                gmap.download()
            filename = os.path.join(gmap.map_cache_dir,
                                    gmap.file_pattern.format(id=chr))
            rho = intervals.read_hapmap(filename)
    else:
        rho = 1e-8  # shouldn't matter what this is - it is relative to the mismatch ratio

    #if np.any(d==0):
    #    w = np.where(d==0)
    #    raise ValueError("Zero recombination rates at", w, inference_pos[w])

    return sd.path, anc.path, rho, "", None
Example #19
def remove_outliers(args):
    tree_seq = tskit.load(args.ts)
    samples = tsinfer.load(args.samples)
    # Find number of mutations per site
    muts_per_site = np.unique(tree_seq.tables.mutations.site,
                              return_counts=True)
    mean_muts_per_site = np.mean(muts_per_site[1])
    std_muts_per_site = np.std(muts_per_site[1])
    print("Mean number of muts per site: ", mean_muts_per_site)
    print("Std number of muts per site: ", std_muts_per_site)
    # Find outliers: greater than 3 standard deviations from the mean number of mutations
    # per site
    outliers = muts_per_site[1] > mean_muts_per_site + 3 * std_muts_per_site
    # Remove outlier sites from the tree sequence and sampledata files.
    # muts_per_site[0] holds the affected site IDs, so index it directly with
    # the boolean mask rather than wrapping it in np.where().
    outlier_sites = muts_per_site[0][outliers]
    tree_seq = tree_seq.delete_sites(outlier_sites)
    tree_seq.dump(args.output_ts)
    keep_sites = np.setdiff1d(np.arange(samples.num_sites), outlier_sites)
    samples_subset = samples.subset(sites=keep_sites)
    samples_subset_copy = samples_subset.copy(args.output_samples)
    samples_subset_copy.finalise()
    print(" Number of muts removed: ", np.sum(outliers))
Example #20
def run_sequential_augment(args):

    base = ".".join(args.input.split(".")[:-1])

    sample_data = tsinfer.load(args.input)
    num_samples = sample_data.num_samples
    ancestors_ts = tskit.load(base + ".ancestors.trees")

    # Compute the total samples required.
    n = 2
    total = 0
    while n < num_samples // 4:
        total += n
        n *= 2

    np.random.seed(args.seed)
    samples = np.random.choice(np.arange(num_samples),
                               size=total,
                               replace=False)
    np.save(base + ".augmented_samples.npy", samples)

    n = 2
    j = 0
    while n < num_samples // 4:
        augmented_file = base + ".augmented_{}.ancestors.trees".format(n)
        final_file = base + ".augmented_{}.nosimplify.trees".format(n)
        subset = samples[j:j + n]
        subset.sort()
        ancestors_ts = run_augment(sample_data, ancestors_ts, subset,
                                   args.num_threads)
        ancestors_ts.dump(augmented_file)
        j += n
        n *= 2

    final_ts = run_match_samples(sample_data, ancestors_ts, args.num_threads)
    final_ts.dump(final_file)
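For reference, the doubling schedule used above can be computed on its own; a small sketch mirroring the two loops:

def augmentation_schedule(num_samples):
    # Subset sizes double each round while n < num_samples // 4.
    n, sizes = 2, []
    while n < num_samples // 4:
        sizes.append(n)
        n *= 2
    return sizes

augmentation_schedule(100)  # -> [2, 4, 8, 16], i.e. 30 samples in total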
Example #21
def merge_sampledata_files(args):
    samples = []
    for cur_sample in args.input_sampledata:
        samples.append(tsinfer.load(cur_sample))
    merged_samples = samples[0]
    for index, other_samples in enumerate(samples[1:], start=1):
        print("Loaded sampledata file # {}".format(index))
        intersect_sites = np.isin(merged_samples.sites_position[:],
                                  other_samples.sites_position[:])
        other_intersect_sites = np.where(
            np.isin(other_samples.sites_position[:],
                    merged_samples.sites_position[:]))[0]
        other_samples_metadata = other_samples.sites_metadata[:]
        for site_index, site_metadata in zip(
                other_intersect_sites,
                merged_samples.sites_metadata[:][intersect_sites]):
            other_samples_metadata[site_index] = site_metadata
        other_samples_copy = other_samples.copy()
        other_samples_copy.sites_metadata[:] = other_samples_metadata
        other_samples_copy.finalise()
        merged_samples = merged_samples.merge(other_samples_copy)
        print("Merged sampledata file # {}".format(index))
    merged_copy = merged_samples.copy(args.output)
    merged_copy.finalise()
Example #22
def run_build():

    sample_data = tsinfer.load(sys.argv[1])
    ad = tsinfer.generate_ancestors(sample_data)
    print(ad)
Example #23
def run_combine_ukbb_1kg(args):
    ukbb_samples_file = "ukbb_{}.samples".format(args.chromosome)
    tg_ancestors_ts_file = "1kg_{}.trees".format(args.chromosome)
    ancestors_ts_file = "1kg_ukbb_{}.ancestors.trees".format(args.chromosome)
    samples_file = "1kg_ukbb_{}.samples".format(args.chromosome)

    ukbb_samples = tsinfer.load(ukbb_samples_file)
    tg_ancestors_ts = tskit.load(tg_ancestors_ts_file)
    print("Loaded ts:", tg_ancestors_ts.num_nodes, tg_ancestors_ts.num_edges)

    # Subset the sites down to the UKBB sites.
    tables = tg_ancestors_ts.dump_tables()
    ukbb_sites = set(ukbb_samples.sites_position[:])
    ancestors_sites = set(tables.sites.position[:])
    intersecting_sites = ancestors_sites & ukbb_sites

    print("Intersecting sites = ", len(intersecting_sites))
    tables.sites.clear()
    tables.mutations.clear()
    for site in tg_ancestors_ts.sites():
        if site.position in intersecting_sites:
            # Sites must be 0/1 for the ancestors ts.
            site_id = tables.sites.add_row(position=site.position,
                                           ancestral_state="0")
            assert len(site.mutations) == 1
            mutation = site.mutations[0]
            tables.mutations.add_row(site=site_id,
                                     node=mutation.node,
                                     derived_state="1")

    # Reduce this to the site topology now to make things as quick as possible.
    tables.simplify(reduce_to_site_topology=True, filter_sites=False)
    reduced_ts = tables.tree_sequence()
    # Rewrite the nodes so that 0 is one older than all the other nodes.
    nodes = tables.nodes.copy()
    tables.nodes.clear()
    tables.nodes.add_row(flags=1, time=np.max(nodes.time) + 2)
    tables.nodes.append_columns(
        flags=np.bitwise_or(nodes.flags, 1),  # Everything is a sample.
        time=nodes.time + 1,  # Make sure that all times are > 0
        population=nodes.population,
        individual=nodes.individual,
        metadata=nodes.metadata,
        metadata_offset=nodes.metadata_offset)
    # Add one to all node references to account for this.
    tables.edges.set_columns(left=tables.edges.left,
                             right=tables.edges.right,
                             parent=tables.edges.parent + 1,
                             child=tables.edges.child + 1)
    tables.mutations.set_columns(
        node=tables.mutations.node + 1,
        site=tables.mutations.site,
        parent=tables.mutations.parent,
        derived_state=tables.mutations.derived_state,
        derived_state_offset=tables.mutations.derived_state_offset,
        metadata=tables.mutations.metadata,
        metadata_offset=tables.mutations.metadata_offset)

    trees = reduced_ts.trees()
    tree = next(trees)
    left = 0
    root = tree.root
    for tree in trees:
        if tree.root != root:
            tables.edges.add_row(left, tree.interval[0], 0, root + 1)
            root = tree.root
            left = tree.interval[0]
    tables.edges.add_row(left, reduced_ts.sequence_length, 0, root + 1)
    tables.sort()
    ancestors_ts = tables.tree_sequence()
    print("Writing ancestors_ts")
    ancestors_ts.dump(ancestors_ts_file)

    # Now create a new samples file to get rid of the missing sites.
    git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
    git_provenance = {
        "repo": "git@github.com:mcveanlab/treeseq-inference.git",
        "hash": git_hash.decode().strip(),
        "dir": "human-data",
        "notes": "Use the Makefile to download and process the upstream data files",
    }

    n = args.num_individuals
    if n is None:
        n = ukbb_samples.num_individuals
    with tsinfer.SampleData(
            path=samples_file,
            num_flush_threads=4,
            sequence_length=ukbb_samples.sequence_length) as samples:

        iterator = tqdm.tqdm(itertools.islice(
            tqdm.tqdm(ukbb_samples.individuals()), n),
                             total=n)
        for ind in iterator:
            samples.add_individual(ploidy=2,
                                   location=ind.location,
                                   metadata=ind.metadata)

        for variant in tqdm.tqdm(ukbb_samples.variants(),
                                 total=ukbb_samples.num_sites):
            if variant.site.position in intersecting_sites:
                samples.add_site(position=variant.site.position,
                                 alleles=variant.alleles,
                                 genotypes=variant.genotypes[:2 * n],
                                 metadata=variant.site.metadata)

        for timestamp, record in ukbb_samples.provenances():
            samples.add_provenance(timestamp, record)
        samples.record_provenance(command=sys.argv[0],
                                  args=sys.argv[1:],
                                  git=git_provenance)

    print(samples)
def setup_sampledata_from_simulation(prefix,
                                     random_seed,
                                     err=0,
                                     num_threads=1,
                                     cheat_breakpoints=False,
                                     use_sites_time=False,
                                     skip_existing=False):
    """
    Take the results of a simulation and return a sample data file, some reconstructed
    ancestors, a recombination rate array, a suffix to append to the file prefix, and
    the original tree sequence.
    
    If 'err' is 0, we do not inject any errors into the haplotypes. Otherwise
    we add empirical sequencing error and ancestral allele polarity error.

    If 'cheat_breakpoints' is True, multiply the recombination rate at known
    recombination locations from the simulation by 20.

    If 'use_sites_time' is True, use the site times from the simulation.

    If 'skip_existing' is True, and the sample_data file and ancestors file that were
    going to be generated already exist, then skip the actual simulation and just return
    those files and their data.
    """
    suffix = ""
    ts = tskit.load(prefix + ".trees")
    plain_samples = tsinfer.SampleData.from_tree_sequence(
        ts, use_sites_time=use_sites_time)
    if cheat_breakpoints:
        suffix += "cheat_breakpoints"
        logger.info("Cheating by using known breakpoints")
    if use_sites_time:
        suffix += "use_times"
        logger.info("Cheating by using known times")
    if err == 0:
        sd_path = prefix + suffix + ".samples"
        if skip_existing and os.path.exists(sd_path):
            logger.info(
                f"Simulation file {sd_path} already exists, loading that.")
            sd = tsinfer.load(sd_path)
        else:
            sd = plain_samples.copy(path=sd_path)  # Save the samples file
            sd.finalise()
    else:
        logger.info("Adding error")
        suffix += f"_ae{err}"
        sd_path = prefix + suffix + ".samples"
        if skip_existing and os.path.exists(sd_path):
            logger.info(f"Sample file {sd_path} already exists, loading that.")
            sd = tsinfer.load(sd_path)
        else:
            error_file = add_errors(plain_samples,
                                    err,
                                    random_seed=random_seed)
            sd = error_file.copy(path=prefix + suffix + ".samples")
            if use_sites_time:
                # Sites that were originally singletons have time 0, but could have been
                # converted to inference sites when adding error. Give these a nonzero time
                sites_time = sd.sites_time
                sites_time[sites_time == 0] = np.min(
                    sites_time[sites_time > 0]) / 1000.0
                sd.sites_time[:] = sites_time
            sd.finalise()
    for attribute in ('sequence_length', 'num_samples', 'num_sites'):
        if getattr(sd, attribute) != getattr(ts, attribute):
            raise ValueError(
                f"{attribute} differs between original ts and sample_data: "
                f"{getattr(sd, attribute)} vs {getattr(ts, attribute)}")

    anc_path = prefix + suffix + ".ancestors"
    if skip_existing and os.path.exists(anc_path):
        logger.info(f"Ancestors file {anc_path} already exists, loading that.")
        anc = tsinfer.load(anc_path)
    else:
        anc = tsinfer.generate_ancestors(
            sd,
            num_threads=num_threads,
            path=anc_path,
        )
        logger.info("GA done")

    inference_pos = anc.sites_position[:]

    rho = 1e-8  # shouldn't matter what this is - it is relative to the mismatch ratio
    if cheat_breakpoints:
        raise NotImplementedError(
            "Need to make a RateMap with higher r at breakpoints")
        breakpoint_positions = np.array(list(ts.breakpoints()))
        inference_positions = anc.sites_position[:]
        breakpoints = np.searchsorted(inference_positions,
                                      breakpoint_positions)
        # Any after the last inference position must be junked
        # (those before the first inference position make no difference)
        breakpoints = breakpoints[breakpoints != len(rho)]
        rho[breakpoints] *= 20
    return sd.path, anc.path, rho, suffix, ts
def run(params):
    """
    Run a single inference, with the specified rates
    """
    precision = params.precision
    logger.info(
        f"Starting {params.ma_mis_ratio} {params.ms_mis_ratio}. Precision {precision}"
    )
    prefix = None
    assert params.sample_file.endswith(".samples")
    assert params.anc_file.endswith(".ancestors")
    samples = tsinfer.load(params.sample_file)
    ancestors = tsinfer.load(params.anc_file)
    start_time = time.process_time()
    prefix = params.sample_file[0:-len(".samples")]
    inf_prefix = "{}_rma{:g}_rms{:g}_p{}".format(prefix, params.ma_mis_ratio,
                                                 params.ms_mis_ratio,
                                                 precision)

    ats_path = inf_prefix + ".atrees"
    if params.skip_existing and os.path.exists(ats_path):
        logger.info(
            f"Ancestors ts file {ats_path} already exists, loading that.")
        inferred_anc_ts = tskit.load(ats_path)
        prov = json.loads(list(inferred_anc_ts.provenances())[-1].record)
        if ancestors.uuid != prov['parameters']['source']['uuid']:
            logger.warning(
                "The loaded ancestors ts does not match the ancestors file. "
                "Checking the site positions, and will abort if they don't match!"
            )
            # We might be re-running this, but the simulation file is the same
            # So double-check that the positions in the ats are a subset of those in the
            # used sample data file
            assert np.all(
                np.isin(inferred_anc_ts.tables.sites.position,
                        samples.sites_position[:]))

    else:
        logger.info(f"MA running: will save to {ats_path}")
        inferred_anc_ts = tsinfer.match_ancestors(
            samples,
            ancestors,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=params.rec_rate,
            mismatch_ratio=params.ma_mis_ratio)
        inferred_anc_ts.dump(ats_path)
        logger.info(f"MA done: mismatch ratio = {params.ma_mis_ratio}")

    ts_path = inf_prefix + ".trees"
    if params.skip_existing and os.path.exists(ts_path):
        logger.info(
            f"Inferred ts file {ts_path} already exists, loading that.")
        inferred_ts = tskit.load(ts_path)
        try:
            user_data = inferred_ts.metadata['user_data']
            try:
                assert np.allclose(params.kc_max, user_data['kc_max'])
            except (KeyError, TypeError):
                pass  # could be NaN e.g. if this is real data
            return user_data
        except (TypeError, KeyError):
            logging.warning(
                f"No metadata in {ts_path}: re-inferring these parameters")

    # Otherwise finish off the inference
    logger.info(
        f"MS running with {params.num_threads} threads: will save to {ts_path}"
    )
    inferred_ts = tsinfer.match_samples(samples,
                                        inferred_anc_ts,
                                        num_threads=params.num_threads,
                                        precision=precision,
                                        recombination_rate=params.rec_rate,
                                        mismatch_ratio=params.ms_mis_ratio)
    process_time = time.process_time() - start_time
    logger.info(f"MS done: mismatch ratio = {params.ms_mis_ratio}")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    root_lengths = collections.defaultdict(float)
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    arity_mean = nc_sum / nc_tot
    arity_var = nc_sum_sq / nc_tot - arity_mean**2  # can't be bothered to adjust for n

    # Calculate span of root nodes in simplified tree

    sim_ts_bytes = sim_ts_min_bytes = None
    kc_poly = kc_split = None

    if params.ts_file is not None:
        try:
            simulated_ts = tskit.load(params.ts_file + ".trees")
            logger.info(f"Calculating KC distances for {ts_path}")
            sim_ts_bytes = simulated_ts.nbytes
            sim_ts_min_bytes = simulated_ts.simplify(
                keep_unary=True,
                reduce_to_site_topology=True,
                filter_sites=False).nbytes
            kc_poly = simplified_inferred_ts.kc_distance(simulated_ts)
            logger.debug("KC poly calculated")
            kc_split = 0
            for interval, orig_tree, new_tree in simulated_ts.coiterate(
                    simplified_inferred_ts, sample_lists=True):
                kc_split += interval.span * orig_tree.kc_distance(
                    new_tree.split_polytomies(random_seed=int(interval.left),
                                              sample_lists=True))
            kc_split /= simulated_ts.sequence_length
            logger.debug("KC split calculated")
        except FileNotFoundError:
            pass

    results = {
        'arity_mean': arity_mean,
        'arity_var': arity_var,
        'edges': inferred_ts.num_edges,
        'error': params.error,
        'kc_max_split': params.kc_max_split,
        'kc_max': params.kc_max,
        'kc_poly': kc_poly,
        'kc_split': kc_split,
        'muts': inferred_ts.num_mutations,
        'n': inferred_ts.num_samples,
        'num_sites': inferred_ts.num_sites,
        'num_trees': inferred_ts.num_trees,
        'precision': precision,
        'proc_time': process_time,
        'ma_mis_ratio': params.ma_mis_ratio,
        'ms_mis_ratio': params.ms_mis_ratio,
        'seed': params.seed,
        'sim_ts_min_bytes': sim_ts_min_bytes,
        'sim_ts_bytes': sim_ts_bytes,
        'source': params.source,
        'ts_bytes': inferred_ts.nbytes,
        'ts_path': ts_path,
    }
    # Save the results into the ts metadata - this should allow us to reconstruct the
    # results table should anything go awry, or if we need to add more
    tables = inferred_ts.dump_tables()
    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError(
                "Metadata already exists in the ts, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": results, **tables.metadata}
    tables.tree_sequence().dump(ts_path)
    return results
def min_site_times_ancients(args):
    samples = tsinfer.load("all-data/1kg_ancients_noreich_chr20.samples")
    min_times = samples.min_site_times(individuals_only=True)
    df = pd.DataFrame(np.unique(min_times, return_counts=True))
    df.to_csv("data/1kg_ancients_chr20_min_site_times.csv")
Example #27
def main():

    description = """Simple CLI wrapper for tsinfer
        tskit version: {}
        tsinfer version: {}""".format(tskit.__version__, tsinfer.__version__)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--verbosity", "-v", action="count", default=0)
    parser.add_argument(
        "samples",
        help=
        "The samples file name, as saved by tsinfer.SampleData.initialise()",
    )
    parser.add_argument("prefix", help="The prefix of the output filename")
    parser.add_argument(
        "-t",
        "--threads",
        default=1,
        type=int,
        help="The number of worker threads to use",
    )
    parser.add_argument(
        "-s",
        "--step",
        default="infer",
        choices=["GA", "MA", "MS"],
        help=
        "Which step of the algorithm to run: generate ancestors (GA), match ancestors"
        "(MA), or match samples (MS) or all three (infer)",
    )
    parser.add_argument(
        "-m",
        "--genetic-map",
        default=None,
        help=
        "An alternative genetic map to be used for this analysis, in the format"
        "expected by msprime.RateMap.read_hapmap",
    )
    parser.add_argument(
        "-p",
        "--precision",
        default=None,
        type=int,
        help="The precision parameter to pass to the function",
    )

    parser.add_argument("-V",
                        "--version",
                        action="version",
                        version=description)

    args = parser.parse_args()

    if not os.path.isfile(args.samples):
        raise ValueError("No samples file")

    if args.step == "infer":
        anc = generate_ancestors(args.samples, args.threads, args.prefix)
        if args.genetic_map == "None":
            genetic_map = None
        r_prob, m_prob = get_rho(anc, genetic_map, args.prefix)
        inferred_anc_ts = match_ancestors(args.samples, anc, args.threads,
                                          args.precision, r_prob, m_prob)
        match_samples(args.samples, inferred_anc_ts, args.threads, r_prob,
                      m_prob, args.precision)
    if args.step == "GA":
        anc = generate_ancestors(args.samples, args.threads, args.prefix)
    if args.step == "MA":
        anc = tsinfer.load(args.prefix + ".truncated.ancestors")
        if args.genetic_map == "None":
            genetic_map = None
        else:
            genetic_map = args.genetic_map
        r_prob, m_prob = get_rho(anc, genetic_map, args.prefix)
        inferred_anc_ts = match_ancestors(args.samples, anc, args.threads,
                                          args.precision, r_prob, m_prob,
                                          args.prefix)
    if args.step == "MS":
        anc = tsinfer.load(args.prefix + ".truncated.ancestors")
        inferred_anc_ts = tskit.load(args.prefix + ".atrees")
        if args.genetic_map == "None":
            genetic_map = None
        else:
            genetic_map = args.genetic_map
        r_prob, m_prob = get_rho(anc, genetic_map, args.prefix)
        match_samples(
            args.samples,
            inferred_anc_ts,
            args.threads,
            r_prob,
            m_prob,
            args.precision,
            args.prefix,
        )
Example #28
"""
Takes a tsinfer sample file, copies it to a new output file, sets any zero site
times to one, and bins the resulting times to the nearest 10 (unless the time
is <= 1).
"""

import argparse

import numpy as np
import tsinfer
import tskit

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_file",
                        help="A tsinfer sample file ending in '.samples")
    parser.add_argument("output_file",
                        help="A tsinfer sample file ending in '.samples")
    args = parser.parse_args()

    sd = tsinfer.load(args.input_file).copy(path=args.output_file)

    times = sd.sites_time[:]
    times[times > 1] = np.round(times[times > 1], -1)
    times[times == 0] = 1
    sd.sites_time[:] = times
    print(
        "Number of samples:",
        sd.num_samples,
        ". Number of discrete times:",
        len(np.unique(sd.sites_time[:])),
    )
    sd.finalise()
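A hypothetical invocation of the script above (script name assumed):

# python bin_site_times.py chr20.samples chr20.binned.samples
# e.g. site times [0, 1, 14, 278] become [1, 1, 10, 280].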
Example #29
    parser.add_argument("-p",
                        "--percent_of_genome",
                        type=float,
                        default=10,
                        help="The percent of the genome to include")
    parser.add_argument(
        "-s",
        "--genome_start_percent",
        type=int,
        default=0,
        help=
        "The genomic point at which to start the subsample, as a percentage of the"
        " total genome length")
    args = parser.parse_args()

    sd = tsinfer.load(args.input_file)
    num_samples = sd.num_samples if args.num_samples is None else args.num_samples
    assert num_samples <= sd.num_samples
    assert 0 < args.percent_of_genome <= 100
    assert args.percent_of_genome + args.genome_start_percent <= 100

    del_samples = np.random.choice(sd.num_samples,
                                   sd.num_samples - num_samples,
                                   replace=False)

    del_sites = np.ones(sd.num_sites, dtype=bool)
    start_keep = int(args.genome_start_percent / 100.0 * sd.num_sites)
    end_keep = start_keep + int(args.percent_of_genome / 100.0 * sd.num_sites)
    del_sites[np.arange(start_keep, end_keep)] = False

    small_sd = sd.delete(samples=del_samples,
                         # The original line is truncated here; passing the
                         # site indices flagged above is the natural completion.
                         sites=np.where(del_sites)[0])
def simulate_stdpopsim(
    species,
    model,
    contig,
    num_samples,
    mutation_file=None,
    seed=123,
    skip_existing=False,
    num_procs=1,
):
    base_fn = f"{model}_{contig}_n{num_samples}"
    tree_fn = f"{base_fn}_seed{seed}"
    logger.info(
        f"Using {species}:{contig} from stdpopsim using the {model} model")
    if skip_existing and os.path.exists(tree_fn + ".trees"):
        logger.info(
            f"Simulation file {tree_fn}.trees already exists, returning that.")
        return base_fn, tree_fn

    sample_data = None
    species = stdpopsim.get_species(species)
    model = species.get_demographic_model(model)
    num_pops = model.num_sampling_populations
    if num_samples < num_pops or num_samples % num_pops != 0:
        raise ValueError(
            f"num_samples must be an integer multiple of {num_pops} "
            f"(or 2 x {num_pops} if diploid sequencing error is injected)")
    pop_n = num_samples // num_pops
    logger.info(
        f"Simulating {num_pops}x{pop_n} samples, seed {seed}, file prefix '{tree_fn}'."
    )
    contig = species.get_contig(contig)
    l = contig.recombination_map.get_sequence_length()
    if mutation_file is not None:
        logger.debug(f"Loading {mutation_file}")
        sample_data = tsinfer.load(mutation_file)
        if sample_data.sequence_length != l:
            raise ValueError(
                f"Mismatching sequence_length between simulation and {mutation_file}"
            )
        # Reduce mutation rate to 0, as we will insert mutations later
        contig = stdpopsim.Contig(
            mutation_rate=0,
            recombination_map=contig.recombination_map,
            genetic_map=contig.genetic_map,
        )
    r_map = contig.recombination_map
    assert len(r_map.get_rates()) == 2  # Ensure a single rate over chr
    samples = model.get_samples(*([pop_n] * num_pops))
    engine = stdpopsim.get_engine('msprime')
    ts = engine.simulate(model, contig, samples, seed=seed)
    tables = ts.dump_tables()
    if sample_data is not None:
        pos = sample_data.sites_position[:]
        logger.info(
            f"Inserting {len(pos)} mutations at variable sites from {mutation_file}"
        )
        for tree in ts.trees():
            positions = pos[np.logical_and(pos >= tree.interval[0],
                                           pos < tree.interval[1])]
            if len(positions) == 0:
                continue
            muts = list(
                zip(
                    np.random.uniform(0,
                                      tree.total_branch_length,
                                      size=len(positions)), positions))
            muts.sort()
            tot = 0
            # place a mutation on a random branch, proportional to branch length
            try:
                for n in tree.nodes():
                    tot += tree.branch_length(n)
                    while muts[0][0] < tot:
                        _, position = muts.pop(0)
                        s = tables.sites.add_row(position=position,
                                                 ancestral_state="0")
                        tables.mutations.add_row(node=n,
                                                 site=s,
                                                 derived_state="1")
            except IndexError:
                # No more mutations - go to next tree
                continue
        tables.sort()
        logger.debug(
            f"Inserted mutations at density {ts.num_mutations/ts.sequence_length}"
        )
    interval = [int(l * 2 / 20),
                int(l * 2 / 20) + 1e7]  # 10Mb near the start, not centromeric
    tables.keep_intervals([interval])
    tables.trim()
    logger.debug(
        f"Cut down tree seq to  {interval} ({tables.sites.num_rows} sites) for speed"
    )

    # Add info to the top-level metadata
    user_data = {}

    logger.info(
        "Calculating the kc distance of the simulation against a flat tree")
    star_tree = tskit.Tree.generate_star(ts.num_samples,
                                         span=tables.sequence_length,
                                         record_provenance=False)
    user_data['kc_max'] = tables.tree_sequence().kc_distance(
        star_tree.tree_sequence)
    kc_array = []
    max_reps = 100
    ts = tables.tree_sequence()
    logger.info(
        f"Calculating KC distance of the sim against at most {max_reps} * {ts.num_trees}"
        f" random trees using {num_procs} parallel threads. This could take a while."
    )
    seeds = range(seed, seed + max_reps)
    with multiprocessing.Pool(num_procs) as pool:
        for i, kc in enumerate(
                pool.imap_unordered(rnd_kc, zip(itertools.repeat(ts), seeds))):
            kc_array.append(kc)
            if i > 10:
                se_mean = np.std(kc_array, ddof=1) / np.sqrt(i)
                # break if SEM < 1/100th of mean KC. This can take a long time
                if se_mean / np.average(kc_array) < 0.01:
                    logger.info(
                        f"Stopped after {i} replicates as kc_max_split deemed accurate."
                    )
                    break
        user_data['kc_max_split'] = np.average(kc_array)

    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError("Metadata already exists, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": user_data, **tables.metadata}
    tables.tree_sequence().dump(tree_fn + ".trees")
    return base_fn, tree_fn
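A hypothetical call to the simulation helper above, using real stdpopsim identifiers:

base_fn, tree_fn = simulate_stdpopsim(
    species="HomSap",
    model="OutOfAfrica_3G09",  # a real stdpopsim demographic model id
    contig="chr20",
    num_samples=30,            # a multiple of the model's 3 sampling populations
    seed=123,
)
# Writes "OutOfAfrica_3G09_chr20_n30_seed123.trees".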