예제 #1
0
 def test_generate_nucleotides_keep(self):
     """
     Check the ``keep`` option of ``pyslim.generate_nucleotides``.

     Nucleotides are first generated for type-1 mutations, then type-2
     mutations are layered on top and nucleotides are generated again with
     ``keep`` left at its default.  Every type-1 mutation must still carry
     the nucleotide it was given in the first pass; type-2 mutations only
     need a valid nucleotide code.  A final pass with ``keep=False`` is
     checked with the generic verifier.
     """
     ts = msprime.sim_ancestry(4, sequence_length=10, population_size=10)
     ts = pyslim.annotate_defaults(ts, model_type='nonWF', slim_generation=1)
     mts1 = msprime.sim_mutations(
         ts,
         model=msprime.SLiMMutationModel(type=1),
         rate=0.1,
         random_seed=23,
     )
     # NOTE: the tree sequence is deliberately not dumped to disk here; a
     # test should not leave files behind in the working directory.
     nts1 = pyslim.generate_nucleotides(mts1, seed=10, keep=False)
     assert nts1.num_mutations > 0
     self.verify_generate_nucleotides(nts1, check_transitions=False)
     mts2 = msprime.sim_mutations(
         nts1,
         model=msprime.SLiMMutationModel(
             type=2,
             # continue numbering after the mutations already present
             next_id=nts1.num_mutations,
         ),
         rate=0.1,
         random_seed=24,
     )
     # keep defaults to True
     nts2 = pyslim.generate_nucleotides(mts2, seed=12)
     assert nts2.num_mutations > nts1.num_mutations
     # Map each SLiM mutation ID (one comma-separated entry of the derived
     # state) to the nucleotide assigned in the first pass.
     muts1 = {}
     for mut in nts1.mutations():
         for i, md in zip(mut.derived_state.split(","), mut.metadata['mutation_list']):
             muts1[i] = md['nucleotide']
     for mut in nts2.mutations():
         for i, md in zip(mut.derived_state.split(","), mut.metadata['mutation_list']):
             if md['mutation_type'] == 1:
                 # first-pass mutations must be untouched by the second pass
                 assert i in muts1
                 assert muts1[i] == md['nucleotide']
             else:
                 assert md['nucleotide'] in [0, 1, 2, 3]
     nts3 = pyslim.generate_nucleotides(mts2, keep=False, seed=15)
     self.verify_generate_nucleotides(nts3, check_transitions=False)
예제 #2
0
 def test_mutate_model(self):
     """The ``sim_mutations`` provenance record names the mutation model class."""
     base_ts = msprime.simulate(5, random_seed=1)
     mutated = msprime.sim_mutations(base_ts, model="pam")
     # Provenance row 1 is the record written by sim_mutations.
     record = self.decode(mutated.provenance(1).record)
     params = record.parameters
     assert record.schema_version == "1.0.0"
     assert params.command == "sim_mutations"
     assert params.model["__class__"] == "msprime.mutations.PAM"
예제 #3
0
def add_mutations(ts, mut_type, mu_rate, effect_sd, next_id=0):
    """
    Add SLiM-style mutations to ``ts`` and assign each a selection coefficient.

    Mutations are simulated with ``msprime.SLiMMutationModel`` and the
    mutation table is then rebuilt so that every entry of each mutation's
    ``mutation_list`` metadata has ``selection_coeff`` set.  Coefficients are
    drawn once per SLiM mutation ID from a normal distribution with mean 0
    and standard deviation ``effect_sd``, so the same ID always gets the same
    coefficient wherever it appears.

    :param ts: Tree sequence to add mutations to.
    :param mut_type: SLiM mutation type, passed to ``SLiMMutationModel``.
    :param mu_rate: Mutation rate, passed to ``msprime.sim_mutations``.
    :param effect_sd: Standard deviation of the selection coefficients.
    :param next_id: ``next_id`` passed to ``SLiMMutationModel`` (default 0).
    :return: A new tree sequence with the rewritten mutation table.
    """
    # The SLiM model attaches the metadata that the mutations need.
    mut_model = msprime.SLiMMutationModel(type=mut_type, next_id=next_id)
    mts = msprime.sim_mutations(ts, rate=mu_rate, model=mut_model)
    print(f"The tree sequence now has {mts.num_mutations} mutations, at "
          f"{mts.num_sites} distinct sites.")
    # ``mts.tables`` is meant for read-only access; take an editable copy.
    tables = mts.dump_tables()
    tables.mutations.clear()
    # SLiM mutation ID (as it appears in the derived state) -> coefficient
    mut_map = {}
    for m in mts.mutations():
        md_list = m.metadata["mutation_list"]
        # the derived state is a comma-separated list of SLiM mutation IDs,
        # one per metadata entry
        slim_ids = m.derived_state.split(",")
        assert len(slim_ids) == len(md_list)
        for sid, md in zip(slim_ids, md_list):
            if sid not in mut_map:
                mut_map[sid] = np.random.normal(loc=0.0, scale=effect_sd)
            md["selection_coeff"] = mut_map[sid]
        tables.mutations.add_row(site=m.site,
                                 node=m.node,
                                 time=m.time,
                                 derived_state=m.derived_state,
                                 parent=m.parent,
                                 metadata={"mutation_list": md_list})
    assert tables.mutations.num_rows == mts.num_mutations
    # min()/max() raise on an empty mapping, so only report a range when at
    # least one mutation was generated.
    if mut_map:
        print(
            f"The selection coefficients range from {min(mut_map.values()):0.2e}")
        print(f"to {max(mut_map.values()):0.2e}.")
    return tables.tree_sequence()
예제 #4
0
파일: test_popgen.py 프로젝트: aktech/sgkit
def simulate_ts(
    sample_size: int,
    length: int = 100,
    mutation_rate: float = 0.05,
    random_seed: int = 42,
) -> tskit.TreeSequence:
    """
    Produce a small tskit TreeSequence with both recombination and mutations,
    simulated by msprime.

    The ancestry is currently simulated with ploidy=1 so that results stay
    close to those of an older version.  This should eventually be extended
    to cover a range of ploidy values.
    """
    ancestry_options = {
        "ploidy": 1,
        "recombination_rate": 0.01,
        "sequence_length": length,
        "random_seed": random_seed,
    }
    genealogy = msprime.sim_ancestry(sample_size, **ancestry_options)
    # The data must span more than one tree to be a useful fixture.
    assert genealogy.num_trees > 1
    mutated = msprime.sim_mutations(
        genealogy, rate=mutation_rate, random_seed=random_seed
    )
    return mutated
예제 #5
0
 def test_all_fields(self):
     """
     Run ``self.verify`` on a tree sequence whose tables (apart from
     provenances and edges) all carry JSON metadata, and which also has
     top-level metadata.
     """
     demography = msprime.Demography()
     for pop_name, pop_size in (("A", 10_000), ("B", 5_000), ("C", 1_000)):
         demography.add_population(name=pop_name, initial_size=pop_size)
     demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C")
     ancestry = msprime.sim_ancestry(
         samples={"A": 1, "B": 1},
         demography=demography,
         random_seed=42,
         record_migrations=True,
     )
     mutated = msprime.sim_mutations(ancestry, rate=1, random_seed=42)
     tables = mutated.dump_tables()
     for name, table in tables.table_name_map.items():
         if name in ("provenances", "edges"):
             continue
         table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
         # one JSON document per row, tagged with the table name and row index
         packed, offsets = tskit.pack_strings(
             [f'{{"foo":"n_{name}_{row}"}}' for row in range(len(table))]
         )
         # take the columns only after the schema has been set, then
         # overwrite the two metadata columns
         columns = table.asdict()
         columns["metadata"] = packed
         columns["metadata_offset"] = offsets
         table.set_columns(**columns)
     tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
     tables.metadata = "Test metadata"
     self.verify(tables.tree_sequence())
예제 #6
0
 def test_current_ts(self):
     """
     Parsing the ``sim_mutations`` provenance together with the input tree
     sequence hands that tree sequence back under ``"tree_sequence"``.
     """
     ancestry = msprime.sim_ancestry(5, random_seed=1)
     mutated = msprime.sim_mutations(ancestry)
     parsed = msprime.provenance.parse_provenance(mutated.provenance(1), ancestry)
     command, prov = parsed
     assert command == "sim_mutations"
     assert prov["tree_sequence"] == ancestry
예제 #7
0
def run_mutate(args):
    """
    Load the tree sequence named by ``args.tree_sequence``, add mutations to
    it with the options found on ``args``, and write the result to
    ``args.output_tree_sequence``.
    """
    source_ts = tskit.load(args.tree_sequence)
    # Every mutation option is forwarded unchanged from the parsed arguments.
    mutate_options = {
        "rate": args.mutation_rate,
        "random_seed": args.random_seed,
        "keep": args.keep,
        "start_time": args.start_time,
        "end_time": args.end_time,
        "discrete_genome": args.discrete_genome,
    }
    mutated_ts = msprime.sim_mutations(tree_sequence=source_ts, **mutate_options)
    mutated_ts.dump(args.output_tree_sequence)
예제 #8
0
def full_ts():
    """
    Build a tree sequence that has data in every field.

    This duplicates the fixture in tskit's conftest.py, because other test
    suites that use this file do not have that fixture defined.
    """
    demography = msprime.Demography()
    for pop_name in ("A", "B", "C"):
        demography.add_population(initial_size=100, name=pop_name)
    demography.add_population_split(time=10, ancestral="C", derived=["A", "B"])

    ancestry = msprime.sim_ancestry(
        {"A": 5, "B": 5},
        demography=demography,
        random_seed=1,
        sequence_length=10,
        record_migrations=True,
    )
    assert ancestry.num_migrations > 0
    assert ancestry.num_individuals > 0
    mutated = msprime.sim_mutations(ancestry, rate=0.1, random_seed=2)
    assert mutated.num_mutations > 0
    tables = mutated.dump_tables()

    # Rebuild the individuals table so every row has a location and parents.
    tables.individuals.clear()
    for ind in mutated.individuals():
        tables.individuals.add_row(
            flags=0, location=[ind.id, ind.id], parents=[-1, -1]
        )

    for name, table in tables.table_name_map.items():
        if name == "provenances":
            continue
        table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        # one string per row, tagged with the table name and row index
        packed, offsets = tskit.pack_strings(
            [f"n_{name}_{row}" for row in range(len(table))]
        )
        # take the columns only after the schema has been set
        columns = table.asdict()
        columns["metadata"] = packed
        columns["metadata_offset"] = offsets
        table.set_columns(**columns)
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = {"A": "Test metadata"}

    refseq = tables.reference_sequence
    refseq.data = "A" * int(tables.sequence_length)
    refseq.url = "https://example.com/sequence"
    refseq.metadata_schema = tskit.MetadataSchema.permissive_json()
    refseq.metadata = {"A": "Test metadata"}

    # Extra provenance rows so the offset deletion test has enough to work on.
    for length in range(10):
        tables.provenances.add_row(timestamp="x" * length, record="y" * length)
    return tables.tree_sequence()
예제 #9
0
 def test_mutate_map(self):
     """A ``RateMap`` mutation rate is written out in full in the provenance."""
     base_ts = msprime.simulate(5, random_seed=1)
     rate_map = msprime.RateMap(position=[0, 0.5, 1], rate=[0, 1])
     mutated = msprime.sim_mutations(base_ts, rate=rate_map)
     record = self.decode(mutated.provenance(1).record)
     assert record.schema_version == "1.0.0"
     assert record.parameters.command == "sim_mutations"
     recorded_rate = record.parameters.rate
     assert recorded_rate["__class__"] == "msprime.intervals.RateMap"
     # both arrays of the map must match what was passed in
     for field in ("position", "rate"):
         expected = list(getattr(rate_map, field))
         assert recorded_rate[field]["__ndarray__"] == expected
예제 #10
0
def alignment_example(sequence_length, include_reference=True):
    """
    Simulate a five-sample tree sequence with mutations, optionally attaching
    a random nucleotide reference sequence.

    The result is asserted to contain more than five sites.
    """
    ancestry = msprime.sim_ancestry(
        samples=5, sequence_length=sequence_length, random_seed=123
    )
    mutated = msprime.sim_mutations(ancestry, rate=0.1, random_seed=1234)
    tables = mutated.dump_tables()
    if include_reference:
        reference = tskit.random_nucleotides(mutated.sequence_length, seed=1234)
        tables.reference_sequence.data = reference
    result = tables.tree_sequence()
    assert result.num_sites > 5
    return result
예제 #11
0
def run_mutations(args):
    """
    Read the tree sequence at ``args.input``, add mutations to it, and write
    the result to ``args.output``.

    Existing mutations are always kept and a discrete genome is always used;
    the remaining options are taken from ``args``.
    """
    options = {
        "rate": args.mutation_rate,
        "random_seed": args.random_seed,
        "keep": True,
        "start_time": args.start_time,
        "end_time": args.end_time,
        "discrete_genome": True,
        "model": args.model,
    }
    loaded = tskit.load(args.input)
    mutated = msprime.sim_mutations(tree_sequence=loaded, **options)
    mutated.dump(args.output)
예제 #12
0
 def test_just_simulate(self, helper_functions, tmp_path):
     """
     Annotate an msprime simulation with pyslim defaults, pass it through
     ``helper_functions.run_msprime_restart`` as a WF model, and compare the
     loaded result with the un-annotated simulation.
     """
     coalescent_args = {
         "sample_size": 4,
         "Ne": 10,
         "length": 10,
         "mutation_rate": 0.0,
         "recombination_rate": 0.01,
     }
     ts = msprime.sim_mutations(msprime.simulate(**coalescent_args), rate=0.1)
     annotated = pyslim.annotate_defaults(
         ts, model_type="WF", slim_generation=1
     )
     reloaded = helper_functions.run_msprime_restart(annotated, tmp_path, WF=True)
     self.verify_annotated_trees(ts, reloaded)
예제 #13
0
 def test_dont_annotate_mutations(self, helper_functions):
     """
     With ``annotate_mutations=False`` the mutation table must come out of
     ``annotate_defaults_tables`` unchanged.
     """
     sim = msprime.sim_mutations(msprime.sim_ancestry(10), rate=5, random_seed=3)
     assert sim.num_mutations > 0
     tables = sim.dump_tables()
     mutations_before = tables.mutations.copy()
     pyslim.annotate_defaults_tables(
         tables,
         model_type="WF",
         slim_generation=1,
         annotate_mutations=False,
     )
     # The schemas have to be aligned before comparing, because the schema
     # decodes b'' to an empty mutation_list.
     mutations_before.metadata_schema = tables.mutations.metadata_schema
     assert tables.mutations.equals(mutations_before)
예제 #14
0
def ts_fixture():
    """
    Build a tree sequence that has data in every field.
    """
    population_sizes = {
        "A": 10_000,
        "B": 5_000,
        "C": 1_000,
        "D": 500,
        "E": 100,
    }
    demography = msprime.Demography()
    for pop_name, pop_size in population_sizes.items():
        demography.add_population(name=pop_name, initial_size=pop_size)
    demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C")
    ancestry = msprime.sim_ancestry(
        samples={"A": 10, "B": 10},
        demography=demography,
        sequence_length=5,
        random_seed=42,
        record_migrations=True,
        record_provenance=True,
    )
    mutated = msprime.sim_mutations(ancestry, rate=0.001, random_seed=42)
    tables = mutated.dump_tables()

    # Give every individual a location and parents by rewriting the table
    # from a copy of its rows.
    original_individuals = tables.individuals.copy()
    tables.individuals.clear()
    for index, row in enumerate(original_individuals):
        updated = row.replace(
            location=[index, index + 1], parents=[index - 1, index - 1]
        )
        tables.individuals.append(updated)

    for name, table in tables.name_map.items():
        if name == "provenances":
            continue
        table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        # one JSON document per row, tagged with the table name and row index
        packed, offsets = tskit.pack_strings(
            [f'{{"foo":"n_{name}_{row}"}}' for row in range(len(table))]
        )
        # take the columns only after the schema has been set
        columns = table.asdict()
        columns["metadata"] = packed
        columns["metadata_offset"] = offsets
        table.set_columns(**columns)
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"

    # Pad the provenance table so tests have enough rows to work with.
    extra_rows = 3
    for _ in range(extra_rows):
        tables.provenances.add_row(record="A")

    return tables.tree_sequence()
예제 #15
0
 def test_mutate_numpy(self):
     """
     Numpy scalar arguments to ``sim_mutations`` are recorded in the
     provenance as plain values.
     """
     ts = msprime.simulate(5, random_seed=1)
     # Each argument is a numpy scalar obtained by indexing a one-element
     # array.  (``end_time`` previously had its bracket misplaced, which
     # produced a 0-d array instead of a scalar like the others.)
     ts = msprime.sim_mutations(
         ts,
         rate=np.array([2])[0],
         random_seed=np.array([1])[0],
         start_time=np.array([0])[0],
         end_time=np.array([100])[0],
     )
     decoded = self.decode(ts.provenance(1).record)
     assert decoded.schema_version == "1.0.0"
     assert decoded.parameters.command == "sim_mutations"
     assert decoded.parameters.random_seed == 1
     assert decoded.parameters.rate == 2
     assert decoded.parameters.start_time == 0
     assert decoded.parameters.end_time == 100
예제 #16
0
 def test_generate_nucleotides_refseq(self):
     """
     A reference sequence supplied to ``generate_nucleotides`` is stored
     unchanged on the resulting tree sequence.
     """
     ancestry = msprime.sim_ancestry(
         4,
         sequence_length=10,
         population_size=10,
         random_seed=10,
     )
     annotated = pyslim.annotate_defaults(
         ancestry, model_type='nonWF', slim_generation=1
     )
     slim_model = msprime.SLiMMutationModel(type=1)
     mutated = msprime.sim_mutations(
         annotated, model=slim_model, rate=0.5, random_seed=23
     )
     # an all-"A" reference, one base per unit of sequence length
     refseq = "A" * int(mutated.sequence_length)
     with_nucleotides = pyslim.generate_nucleotides(
         mutated, reference_sequence=refseq, seed=6
     )
     self.verify_generate_nucleotides(with_nucleotides, check_transitions=True)
     assert with_nucleotides.reference_sequence.data == refseq
예제 #17
0
 def test_convert_alleles_errors(self):
     """
     ``convert_alleles`` raises ValueError when there is no reference
     sequence, and when the mutations are not nucleotide mutations.
     """
     no_refseq = "must have a valid reference sequence"
     ancestry = msprime.sim_ancestry(4, sequence_length=10, population_size=10)
     # plain msprime output: no reference sequence
     with pytest.raises(ValueError, match=no_refseq):
         _ = pyslim.convert_alleles(ancestry)
     # annotating with SLiM defaults still leaves it without one
     annotated = pyslim.annotate_defaults(
         ancestry, model_type="nonWF", slim_generation=1
     )
     with pytest.raises(ValueError, match=no_refseq):
         _ = pyslim.convert_alleles(annotated)
     mutated = msprime.sim_mutations(
         annotated,
         model=msprime.SLiMMutationModel(type=1),
         rate=0.1,
         random_seed=23,
     )
     assert mutated.num_mutations > 0
     # with a reference sequence in place, the mutations are the next problem
     tables = mutated.dump_tables()
     tables.reference_sequence.data = 'A' * int(mutated.sequence_length)
     with_refseq = tables.tree_sequence()
     with pytest.raises(ValueError, match="must be nucleotide mutations"):
         _ = pyslim.convert_alleles(with_refseq)
예제 #18
0
 def test_sim_mutations(self):
     """
     Every argument given to ``sim_mutations`` appears in its provenance
     record, along with the model class used when none is specified.
     """
     base_ts = msprime.simulate(5, random_seed=1)
     call_args = {
         "rate": 2,
         "random_seed": 1,
         "start_time": 0,
         "end_time": 100,
         "keep": False,
     }
     mutated = msprime.sim_mutations(base_ts, **call_args)
     record = self.decode(mutated.provenance(1).record)
     params = record.parameters
     assert record.schema_version == "1.0.0"
     assert params.command == "sim_mutations"
     assert params.random_seed == 1
     assert params.rate == 2
     assert params.start_time == 0
     assert params.end_time == 100
     assert not params.keep
     # no model was passed, so this is the class recorded by default
     expected_model = "msprime.mutations.JC69MutationModel"
     assert params.model["__class__"] == expected_model
예제 #19
0
    def _mutate(self):
        """Add mutations to the recapitated tree sequence.

        A SLiM mutation model of type 0 is applied over ``self.tree_sequence``
        at rate ``self.mut``, keeping any mutations already present, and the
        result is wrapped again as a ``pyslim.SlimTreeSequence``.

        Open question carried over from the original author: does this know
        which regions should be mutated?  All recapitated edges should be,
        and so should the neutral genomic regions of the SLiM time frame.
        """
        # log the mutation state before anything is added
        self._report_mutations(allow_m0=False)

        seed = self.rng.integers(2**31)
        neutral_model = msprime.SLiMMutationModel(type=0)
        mutated = msprime.sim_mutations(
            self.tree_sequence,
            rate=self.mut,
            random_seed=seed,
            keep=True,  # existing mutations are retained
            model=neutral_model,
        )
        self.tree_sequence = pyslim.SlimTreeSequence(mutated)

        # log again now that type-0 mutations are present
        self._report_mutations(allow_m0=True)
예제 #20
0
    def __next__(self):
        """
        Return the next simulated example as ``(mutations, d_times, prior_dist)``.

        * ``mutations`` -- list of length ``self.len`` with a 1 at every
          position that carries a mutation and 0 elsewhere.
        * ``d_times`` -- list of length ``self.len`` holding, for every
          position, the bin index that ``self.splitter`` assigns to half the
          total branch length of the tree covering that position.
        * ``prior_dist`` -- list of length ``self.number_intervals`` holding
          the fraction of the sequence that falls into each bin.

        Raises ``StopIteration`` once ``self._data`` is exhausted.
        """
        if self._data is None:
            # presumably populates self._data -- confirm in run_simulation
            self.run_simulation()

        # Exhaustion of the underlying iterator propagates as StopIteration.
        tree = next(self._data)

        mutated_ts = msprime.sim_mutations(
            tree, rate=self.mutation_rate)  # random_seed

        d_times = [0] * self.len
        mutations = [0] * self.len
        prior_dist = [0.0] * self.number_intervals

        for m in mutated_ts.mutations():
            mutations[int(m.position)] = 1

        for t in mutated_ts.aslist():
            interval = t.get_interval()
            left = interval.left
            right = interval.right
            span = int(right - left)
            half_length = t.get_total_branch_length() / 2
            # bin index for this tree; computed once and used for both outputs
            time_bin = self.splitter(half_length, self.number_intervals)
            d_times[int(left):int(right)] = [time_bin] * span
            prior_dist[time_bin] += span / self.len

        return mutations, d_times, prior_dist
def benchmark_single_tree(sample_size):
    """
    Time mutation simulation over one large tree.

    The ancestry is simulated and written to ``tmp/`` first; the timed part
    covers loading it back, simulating BLOSUM62 mutations, and writing the
    result.
    """
    ancestry_path = "tmp/big_tree.trees"
    mutated_path = "tmp/big_tree_mutations.trees"

    print("Generating ancestry")
    ancestry = msprime.sim_ancestry(
        sample_size, ploidy=1, sequence_length=10**4, random_seed=1234
    )
    print("Done")

    ancestry.dump(ancestry_path)

    started = time.perf_counter()
    # File loading is deliberately inside the timed section.
    loaded = tskit.load(ancestry_path)
    mutated = msprime.sim_mutations(
        loaded, model="BLOSUM62", rate=1, random_seed=42
    )
    mutated.dump(mutated_path)
    elapsed = time.perf_counter() - started
    print(mutated)

    print("simulated ", mutated.num_mutations, "mutations at ",
          mutated.num_sites, "sites")
    print("msprime = ", elapsed)
예제 #22
0
    time_units="myr",
    initial_size=pop_size,
    generation_time=generation_length)

# Simulate the ancestry of three diploid samples, one from each of the
# "true", "query" and "false" populations, each sampled at its own time.
ts = msprime.sim_ancestry(samples=[
    msprime.SampleSet(1, population="true", time=true_age),
    msprime.SampleSet(1, population="query", time=query_age),
    msprime.SampleSet(1, population="false", time=false_age)
],
                          demography=demography,
                          recombination_rate=recomb_rate,
                          ploidy=2,
                          sequence_length=seqlength,
                          random_seed=123456)

mts = msprime.sim_mutations(ts, rate=mutation_rate, random_seed=5678)
# No model is passed, so the default mutation model (msprime.JC69) is used.

# Create an arbitrary background sequence of the same length, because msprime
# only reports the variable sites.
bases = ["A", "C", "T", "G"]
bgseq = random.choices(bases, k=seqlength)
# The draw is uniform, i.e. each nucleotide is assumed to occur at 25% frequency.

# Now use the msprime output to create the true and false reference sequences
# by replacing the variable sites in copies of bgseq.
# NOTE(review): the original comment here said "python is 0-based but msprime
# is not"; tskit site positions look like 0-based coordinates, so confirm
# before applying any offset to `pos`.
trueseq = list(bgseq)
falseseq = list(bgseq)
queryseq1 = list(bgseq)
queryseq2 = list(bgseq)
for var in mts.variants():
    pos = var.site.position
    # Here we arbitrarily pick one strand of the diploid true and false
    # sequences to be our "reference", but sample query reads from both
    # strands of the query sequence.
예제 #23
0
# Simulate the ancestry under the demographic model with a recombination map.
ots = msprime.sim_ancestry(
    samples=1000,  # number of individuals sampled? (NOTE(review): confirm)
    demography=demog_model,
    # random_seed=5,
    recombination_rate=recomb_map)

ots = pyslim.annotate_defaults(ots, model_type="nonWF", slim_generation=1)
# The call above adds SLiM annotation (metadata) to all of the individuals.
mut_map = msprime.RateMap(position=breaks,
                          rate=[1e-10, 1e-10,
                                1e-10])  # open question: which rate(s) belong here?
mut_model = msprime.SLiMMutationModel(type=2)  # mutation "m2"
# Overlay SLiM-style "m2" mutations, keeping any mutations already present.
ots = msprime.sim_mutations(
    ots,
    rate=mut_map,
    model=mut_model,
    keep=True,
    # random_seed=9
)
print(f"The tree sequence now has {ots.num_mutations} mutations, at "
      f"{ots.num_sites} distinct sites.")

tables = ots.tables
tables.mutations.clear()
mut_map = {}
for m in ots.mutations():
    md_list = m.metadata["mutation_list"]
    slim_ids = m.derived_state.split(",")
    assert len(slim_ids) == len(md_list)
    for sid, md in zip(slim_ids, md_list):
        if sid not in mut_map:
예제 #24
0
    def simulate(
        self,
        demographic_model,
        contig,
        samples,
        *,
        seed=None,
        msprime_model=None,
        msprime_change_model=None,
        dry_run=False,
        **kwargs,
    ):
        """
        Run the demographic model through msprime: ancestry first, then
        mutations, then any inclusion/exclusion masks defined on the contig.
        See :meth:`.Engine.simulate()` for definitions of the parameters
        shared by all engines.

        :param msprime_model: The msprime simulation model to be used.
            One of ``hudson``, ``dtwf``, ``smc``, or ``smc_prime``.
            See msprime API documentation for details.
        :type msprime_model: str
        :param msprime_change_model: A list of (time, model) tuples, which
            changes the simulation model to the new model at the time specified.
        :type msprime_change_model: list of (float, str) tuples
        :param dry_run: If True, ``end_time=0`` is passed to
            :meth:`msprime.sim_ancestry()` and :meth:`msprime.sim_mutations()`
            so the simulation is initialised and returns immediately; ``None``
            is returned in place of a tree sequence.
        :type dry_run: bool
        :param \\**kwargs: Further arguments passed to :meth:`msprime.sim_ancestry()`
        :raises ValueError: if both ``seed`` and ``random_seed`` are given.
        """
        model, citations = self._convert_model_spec(
            msprime_model, msprime_change_model
        )
        self.citations.extend(citations)

        # ``random_seed`` is accepted as an alias for ``seed``, but not both.
        if "random_seed" in kwargs:
            if seed is not None:
                raise ValueError("Cannot set both seed and random_seed")
            seed = kwargs.pop("random_seed")

        # TODO: remove this after a release or two. See #745.
        self._warn_zigzag(demographic_model)
        self._warn_mutation_rate_mismatch(contig, demographic_model)

        # Derive one seed for the ancestry and one for the mutations.
        ancestry_seed, mutation_seed = np.random.default_rng(seed).integers(
            1, 2**31 - 1, size=2
        )
        stop_at = 0 if dry_run else None

        ts = msprime.sim_ancestry(
            samples=samples,
            recombination_rate=contig.recombination_map,
            demography=demographic_model.model,
            ploidy=2,
            random_seed=ancestry_seed,
            model=model,
            end_time=stop_at,
            **kwargs,
        )
        ts = msprime.sim_mutations(
            ts,
            end_time=stop_at,
            random_seed=mutation_seed,
            rate=contig.mutation_rate,
        )

        # Inclusion mask first, then exclusion mask; the final flag tells the
        # masking helper which of the two it has been given.
        masks = (
            (contig.inclusion_mask, False),
            (contig.exclusion_mask, True),
        )
        for mask, is_exclusion in masks:
            if mask is not None:
                ts = stdpopsim.utils.mask_tree_sequence(ts, mask, is_exclusion)

        return None if dry_run else ts
예제 #25
0
def mutated_tree():
    """
    Write ``illustrations/mutated_tree.svg``, a two-panel figure: (A) a
    simulated tree sequence and, below it, (B) the same tree sequence with
    mutations added.
    """
    # Layout constants.
    panel_height = 210  # height of the plotting box for each TS
    panel_width = 370
    header_space = 50
    label_size = 15

    ancestry = msprime.sim_ancestry(
        3,
        population_size=1e4,
        recombination_rate=1e-8,
        sequence_length=1000,
        random_seed=96,
    )
    mutated = msprime.sim_mutations(
        ancestry, rate=1e-7, model=msprime.F84(kappa=2), random_seed=45
    )

    palette = plt.rcParams['axes.prop_cycle'].by_key()['color']

    def render_panel(ts, **kwargs):
        """Draw ``ts`` as SVG, one colour per individual and one for mutations."""
        # The page style here is just for Chromium. We shouldn't
        # need it for other output options.
        style = """\
            @media print {
              @page { margin: 0; size: 3.5in 5in}
              body { margin: 0cm; }
            }
            text {
                font-family: "Dejavu Sans", sans-serif;
            }
            """
        style += "".join(
            f"\n.node.i{j} > .sym {{fill: {palette[j]}}}"
            for j in range(ts.num_individuals)
        )

        # mutations use the first palette colour not taken by an individual
        mut_colour = palette[ts.num_individuals]
        style += f".mut text {{fill: {mut_colour}; font-style: italic}}"
        style += f".mut .sym {{fill: none; stroke: {mut_colour}}}"

        labels = {}
        for m in ts.mutations():
            labels[m.id] = m.derived_state
        return ts.draw_svg(
            size=(panel_width, panel_height),
            node_labels={},
            mutation_labels=labels,
            symbol_size=5,
            style=style,
            **kwargs,
        )

    # I think serif is the default, and matches what we're using for labels?
    def caption(text, y, font_family=None):
        """Return a horizontally centred SVG ``<text>`` element at height ``y``."""
        if font_family is None:
            family_attr = ""
        else:
            family_attr = f' style="font-family: {font_family}"'
        return (
            f'<text x="{panel_width/2}" y="{y}" text-anchor="middle" font-size="{label_size}"'
            f"{family_attr}>{text}</text>"
        )

    # No extra drawing options are passed at present.
    params = {}
    svg_plain = render_panel(ancestry, **params)
    svg_mutated = render_panel(mutated, **params)

    pieces = [
        f'<svg baseProfile="full" height="{(panel_height+header_space)*2}" version="1.1" width="{panel_width}" ',
        'xmlns="http://www.w3.org/2000/svg" xmlns:ev="http://www.w3.org/2001/xml-events" ',
        'xmlns:xlink="http://www.w3.org/1999/xlink">',
        # panel (A): the bare tree sequence
        f'<g transform="translate(0 {header_space})">',
        caption("(A)", y=-25),
        caption("ts = sim_ancestry(3, ...)", y=-8, font_family="monospace"),
        svg_plain,
        "</g>",
        # panel (B): the mutated tree sequence, one panel further down
        f'<g transform="translate(0 {(panel_height+header_space) + header_space})">',
        caption("(B)", y=-25),
        caption("mts = sim_mutations(ts, ...)", y=-8, font_family="monospace"),
        svg_mutated,
        "</g>",
        "</svg>",
    ]
    with open("illustrations/mutated_tree.svg", "w") as out:
        out.write("".join(pieces))
예제 #26
0
def run_simulation(param_df):
    """Run one msprime simulation for a single parameter draw.

    The population layout, sample sizes, migration matrix and run mode are
    read from module-level settings (``model_dt``, ``stats_dt``, ``ploidy``,
    ``hybrid_switch_over``, ``dry_run``, ``vcf`` and ``header_len``).

    Parameters
    ----------
    param_df : mapping-like
        Parameter draw.  ``param_df["rec_t"]`` is used as the recombination
        rate and ``param_df["mu_t"]`` as the mutation rate; the whole object
        is also handed to ``demo_config`` to build the demographic events.

    Returns
    -------
    None
        When ``dry_run`` is set; the demography is only checked.
    tree sequence
        When ``vcf`` is set; a single tree sequence with mutations.
    numpy.ndarray
        Otherwise, the ``nanmean`` over all simulated loci of each summary
        statistic.

    """
    demo_events = msp.Demography()

    # set samples sizes, here in diploids. so nsam/2
    sample_sizes = model_dt["sampleSize"]
    samples = {
        f'pop_{i}': sample_size / 2
        for i, sample_size in enumerate(sample_sizes)
    }

    # set population sizes
    init_sizes = [size * ploidy for size in model_dt["initialSize"]]
    for i, init in enumerate(init_sizes):
        demo_events.add_population(name=f"pop_{i}", initial_size=init)

    # set migration rates from migration matrix if > 0
    mig_mat = model_dt["migmat"]
    if np.sum(mig_mat) > 0:
        # all off-diagonal entries of the migration matrix
        off_diagonal = ~np.eye(mig_mat.shape[0], dtype=bool)
        sym_rates = [mig_mat[i, j] for i, j in zip(*np.where(off_diagonal))]
        if sym_rates.count(sym_rates[0]) == len(sym_rates):
            # every pair shares one rate: set it for all pairs at once
            demo_events.set_migration_rate(source=None,
                                           dest=None,
                                           rate=sym_rates[0])
        else:
            # NOTE(review): zip(*mig_mat) walks the matrix column by column,
            # so ``m`` is mig_mat[i][p] -- confirm this is the intended
            # source/dest orientation.
            for p, pop_m in enumerate(zip(*mig_mat)):
                # enumerate() supplies the index of the other population;
                # without it each scalar rate was unpacked as a pair, which
                # raised TypeError.
                for i, m in enumerate(pop_m):
                    if p != i and m > 0:
                        demo_events.set_migration_rate(source=p,
                                                       dest=i,
                                                       rate=m)

    # build demographic command line
    demo_events = demo_config(param_df, demo_events)

    # set hybrid models
    if hybrid_switch_over:
        model_list = [
            msp.DiscreteTimeWrightFisher(duration=hybrid_switch_over),
            msp.StandardCoalescent(),
        ]
    else:
        model_list = msp.StandardCoalescent()

    # check demo
    if dry_run:
        checkDemo(demo_events)
        return None

    elif vcf:
        tree = msp.sim_ancestry(samples,
                                recombination_rate=param_df["rec_t"],
                                demography=demo_events,
                                sequence_length=model_dt["contig_length"],
                                model=model_list)
        tree = msp.sim_mutations(tree, rate=param_df["mu_t"])
        return tree

    else:
        trees = msp.sim_ancestry(samples,
                                 recombination_rate=param_df["rec_t"],
                                 demography=demo_events,
                                 num_replicates=model_dt["loci"],
                                 sequence_length=model_dt["contig_length"],
                                 model=model_list)
        # calc stats: one row per locus, one column per summary statistic
        stat_mat = np.zeros([model_dt["loci"], header_len])
        length_bp = stats_dt["length_bp"]
        pfe = stats_dt["perfixder"]
        for i, tree in enumerate(trees):
            tree = msp.sim_mutations(tree,
                                     rate=param_df["mu_t"],
                                     model="binary")
            stats_ls = []
            pos, haps, counts, bp = read_trees(tree,
                                               length_bp,
                                               pfe,
                                               seq_error=True)
            stats_dt["breakpoints"] = bp
            popsumstats = PopSumStats(pos, haps, counts, stats_dt)
            for stat in stats_dt["calc_stats"]:
                stat_fx = getattr(popsumstats, stat)
                try:
                    ss = stat_fx()
                except IndexError:
                    # a statistic that fails with IndexError is recorded as
                    # NaN placeholders, which the nanmean below ignores
                    ss = [np.nan] * len(stats_dt["pw_quants"])
                stats_ls.extend(ss)
            stat_mat[i, :] = stats_ls

        return np.nanmean(stat_mat, axis=0)
def mutated_tree():
    """
    Write ``illustrations/mutated_tree.svg``, a two-panel figure: (A) a
    simulated tree sequence and, to its right, (B) the same tree sequence
    with mutations added.
    """
    # Layout constants.
    height = 280
    width = 700
    top = 50
    font_size = 15

    ancestry = msprime.sim_ancestry(
        3,
        population_size=1e4,
        recombination_rate=1e-8,
        sequence_length=1000,
        random_seed=96,
    )
    mutated = msprime.sim_mutations(
        ancestry, rate=1e-7, model=msprime.F84(kappa=2), random_seed=4
    )

    palette = plt.rcParams['axes.prop_cycle'].by_key()['color']
    print(palette)

    def render_panel(ts, **kwargs):
        """Draw ``ts`` as SVG, one colour per individual and one for mutations."""
        # The page style here is just for Chromium. We shouldn't
        # need it for other conversion options.
        style = """\
            @media print {
              @page { margin: 0; size: 6in 2.5in}
              body { margin: 1.6cm; }
            }
            text {
                font-family:DejaVuSans;
            }
            """
        style += "".join(
            f"\n.node.i{j} > .sym {{fill: {palette[j]}}}"
            for j in range(ts.num_individuals)
        )

        # mutations use the first palette colour not taken by an individual
        mut_colour = palette[ts.num_individuals]
        style += f".mut text {{fill: {mut_colour}; font-style: italic}}"
        style += f".mut .sym {{fill: none; stroke: {mut_colour}}}"

        labels = {}
        for m in ts.mutations():
            labels[m.id] = m.derived_state
        # each panel takes half the figure width
        return ts.draw_svg(
            size=(width / 2, height - top),
            node_labels={},
            mutation_labels=labels,
            symbol_size=5,
            style=style,
            **kwargs,
        )

    # I think serif is the default, and matches what we're using for labels?
    def caption(text, y, font_family="sans"):
        """Return an SVG ``<text>`` element centred over one panel at height ``y``."""
        opening = (
            f'<text x="{width / 4}" y="{y}" font-size="{font_size}" '
            f'font-family="{font_family}" text-anchor="middle">'
        )
        return opening + f"{text}</text>"

    svg_plain = render_panel(ancestry)
    svg_mutated = render_panel(mutated)

    pieces = [
        f'<svg baseProfile="full" height="{height+top}" version="1.1" width="{width}" ',
        'xmlns="http://www.w3.org/2000/svg" xmlns:ev="http://www.w3.org/2001/xml-events" ',
        'xmlns:xlink="http://www.w3.org/1999/xlink">',
        # panel (A): the bare tree sequence, on the left
        f'<g transform="translate(0 {top})">',
        caption("(A)", y=-20),
        caption("ts = sim_ancestry(3, ...)", y=-5, font_family="monospace"),
        svg_plain,
        "</g>",
        # panel (B): the mutated tree sequence, on the right
        f'<g transform="translate({width/2} {top})">',
        caption("(B)", y=-20),
        caption("mts = sim_mutations(ts, ...)", y=-5, font_family="monospace"),
        svg_mutated,
        "</g>",
        "</svg>",
    ]
    with open("illustrations/mutated_tree.svg", "w") as out:
        out.write("".join(pieces))