def test_recombination_n100(self):
    # Full-ARG run with 100 samples and recombination switched on; all of the
    # actual checking is delegated to self.verify.
    sim = msprime.simulator_factory(
        100,
        recombination_rate=0.2,
        record_full_arg=True,
        random_generator=_msprime.RandomGenerator(100),
    )
    self.verify(sim)
def verify_simulation(self, n, m, r):
    """
    Runs a simulation built from the specified parameters and sanity-checks
    the simulator's counters afterwards.

    ``n`` is forwarded as the first argument of ``msprime.simulator_factory``;
    ``m`` and ``r`` are forwarded to ``msprime.RecombinationMap.uniform_map``.
    """
    generator = _msprime.RandomGenerator(1)
    sim = msprime.simulator_factory(
        n,
        recombination_map=msprime.RecombinationMap.uniform_map(m, r),
        random_generator=generator,
        discrete_genome=True,
    )
    # The factory must hand back the very generator we supplied.
    self.assertEqual(sim.random_generator, generator)

    sim.run()
    self.assertEqual(sim.num_breakpoints, len(sim.breakpoints))
    # Each of these must be strictly positive once the simulation has run.
    for attribute in (
        "time",
        "num_avl_node_blocks",
        "num_segment_blocks",
        "num_node_mapping_blocks",
    ):
        self.assertGreater(getattr(sim, attribute), 0)

    # The simulator's final time must equal the largest node time in the
    # replicate (floored at 0.0).
    replicate = next(sim.run_replicates(1))
    oldest_node_time = max([0.0, *(node.time for node in replicate.nodes())])
    self.assertEqual(sim.time, oldest_node_time)

    self.assertGreater(sim.num_common_ancestor_events, 0)
    self.assertGreaterEqual(sim.num_recombination_events, 0)
    self.assertGreaterEqual(np.sum(sim.num_migration_events), 0)
    self.assertGreaterEqual(sim.num_multiple_recombination_events, 0)
def test_random_seed(self):
    # A seed handed to the factory must be the seed reported by the
    # simulator's generator.
    expected_seed = 12345
    simulator = msprime.simulator_factory(10, random_seed=expected_seed)
    self.assertEqual(simulator.random_generator.get_seed(), expected_seed)

    # Passing a seed together with a ready-made generator is rejected.
    with self.assertRaises(ValueError):
        msprime.simulator_factory(
            10,
            random_seed=1234,
            random_generator=_msprime.RandomGenerator(1234),
        )
def test_no_recombination(self):
    # With the default (zero) recombination, the full-ARG output is expected
    # to have the same node and edge tables before and after simplification.
    sim = msprime.simulator_factory(
        10,
        random_generator=_msprime.RandomGenerator(1),
        record_full_arg=True,
    )
    full_arg_ts = self.verify(sim)
    simplified_ts = full_arg_ts.simplify()

    full_tables = full_arg_ts.tables
    simplified_tables = simplified_ts.tables
    for table_name in ("nodes", "edges"):
        self.assertEqual(
            getattr(full_tables, table_name),
            getattr(simplified_tables, table_name),
        )
def test_hudson(self):
    # Naming the "hudson" model explicitly must reproduce the event counts of
    # the default model when both runs start from the same seed.
    min_events = 20

    def run_with(**extra):
        # Build and run a simulator that differs from the baseline only in
        # the extra keyword arguments supplied.
        simulator = ancestry._parse_simulate(
            sample_size=10,
            recombination_rate=10,
            random_generator=_msprime.RandomGenerator(2),
            **extra,
        )
        simulator.run()
        return simulator

    default_sim = run_with()
    assert default_sim.num_common_ancestor_events > min_events
    assert default_sim.num_recombination_events > min_events
    assert default_sim.num_rejected_common_ancestor_events == 0

    hudson_sim = run_with(model="hudson")
    assert (
        hudson_sim.num_common_ancestor_events
        == default_sim.num_common_ancestor_events
    )
    assert (
        hudson_sim.num_recombination_events
        == default_sim.num_recombination_events
    )
    assert hudson_sim.num_rejected_common_ancestor_events == 0
def test_smc_variants(self):
    # Both SMC approximations must reject at least one common-ancestor event
    # while still producing plenty of accepted events.
    min_events = 20
    for model_name in ("smc", "smc_prime"):
        simulator = msprime.simulator_factory(
            sample_size=10,
            recombination_rate=5,
            model=model_name,
            random_generator=_msprime.RandomGenerator(3),
        )
        simulator.run()
        self.assertGreater(simulator.num_rejected_common_ancestor_events, 0)
        self.assertGreater(simulator.num_common_ancestor_events, min_events)
        self.assertGreater(simulator.num_recombination_events, min_events)
def test_multimerger(self):
    # An instantaneous bottleneck is added so that self.verify can be run
    # with multiple_mergers=True on a full-ARG simulation.
    bottleneck = msprime.InstantaneousBottleneck(
        time=0.1, population=0, strength=5
    )
    sim = msprime.simulator_factory(
        100,
        recombination_rate=0.1,
        record_full_arg=True,
        random_generator=_msprime.RandomGenerator(1234),
        demographic_events=[bottleneck],
    )
    self.verify(sim, multiple_mergers=True)
def test_hudson(self):
    # The default model and an explicit model="hudson" must give identical
    # event counts for identical seeds.
    min_events = 20
    common_args = {"sample_size": 10, "recombination_rate": 10}

    baseline = ancestry._parse_simulate(
        random_generator=_msprime.RandomGenerator(2), **common_args
    )
    baseline.run()
    self.assertGreater(baseline.num_common_ancestor_events, min_events)
    self.assertGreater(baseline.num_recombination_events, min_events)
    self.assertEqual(baseline.num_rejected_common_ancestor_events, 0)

    explicit = ancestry._parse_simulate(
        model="hudson",
        random_generator=_msprime.RandomGenerator(2),
        **common_args,
    )
    explicit.run()
    self.assertEqual(
        explicit.num_common_ancestor_events,
        baseline.num_common_ancestor_events,
    )
    self.assertEqual(
        explicit.num_recombination_events,
        baseline.num_recombination_events,
    )
    self.assertEqual(explicit.num_rejected_common_ancestor_events, 0)
def test_random_generator(self):
    # A generator passed to the factory must be exposed unchanged by the
    # simulator and must still report the seed it was created with.
    expected_seed = 12345
    generator = _msprime.RandomGenerator(expected_seed)
    simulator = msprime.simulator_factory(10, random_generator=generator)
    self.assertEqual(generator, simulator.random_generator)
    self.assertEqual(generator.get_seed(), expected_seed)
def sim_mutations(
    tree_sequence,
    rate=None,
    *,
    random_seed=None,
    model=None,
    keep=None,
    start_time=None,
    end_time=None,
    discrete_genome=None,
    kept_mutations_before_end_time=None,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    per unit of sequence length, per generation. By default, mutations are
    generated at discrete sites along the genome and multiple mutations
    can occur at any given site. A continuous sequence, infinite-sites model
    can also be specified by setting the ``discrete_genome`` parameter to
    False.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. The default mutation model is
    :class:`msprime.BinaryMutationModel`, a simple binary model with alleles
    0 and 1. See :ref:`sec_api_mutation_models` for details of available
    models.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    By default, sites and mutations in the input tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling). Furthermore, if sites
    are discrete, trying to simulate mutations at time periods that are older
    than mutations kept from the original tree sequence is an error, because
    this would create an extra transition (from the new allele to the old one
    below it) that may be incorrect according to the model of mutation. Under
    a state-independent mutation model, however (e.g., Jukes-Cantor), there is
    no problem, and ``kept_mutations_before_end_time=True`` may be set to allow
    adding new mutations around or above existing ones.

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation, as either a
        single number (for a uniform rate) or as a
        :class:`.RateMap`. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. This can either be a string (e.g., ``"jc69"``) or
        an instance of a simulation model class
        e.g, ``msprime.F84MutationModel(kappa=0.5)``.
        If not specified or None, the :class:`.BinaryMutationModel`
        mutation model is used. Please see the
        :ref:`sec_api_simulation_models` section for more details
        on specifying simulations models.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time ago at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time ago at which a mutation can occur
        (Default: no restriction).
    :param bool discrete_genome: Whether to generate mutations at only integer
        positions along the genome (Default=True).
    :param bool kept_mutations_before_end_time: Whether to allow mutations to
        be added ancestrally to existing (kept) mutations. This flag has no
        effect if either ``keep`` or ``discrete_genome`` is False.
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``rate`` is neither convertible to a float nor a
        :class:`.RateMap`.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError as err:
        # Chain explicitly so the traceback shows the missing attribute as the
        # cause rather than as an unrelated error raised during handling.
        raise ValueError("First argument must be a TreeSequence instance.") from err

    seed = core.get_random_seed() if random_seed is None else int(random_seed)

    if rate is None:
        rate = 0
    # Only the float conversion is guarded: a TypeError here means "not a
    # number", in which case the value must already be a RateMap. Errors from
    # building the uniform map are left to propagate unchanged.
    try:
        rate = float(rate)
    except TypeError:
        rate_map = rate
    else:
        rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length, rate)
    if not isinstance(rate_map, intervals.RateMap):
        raise TypeError("rate must be a float or a RateMap")

    # "No restriction" is represented by the extreme finite float values.
    start_time = -sys.float_info.max if start_time is None else float(start_time)
    end_time = sys.float_info.max if end_time is None else float(end_time)
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")

    discrete_genome = core._parse_flag(discrete_genome, default=True)
    keep = core._parse_flag(keep, default=False)
    kept_mutations_before_end_time = core._parse_flag(
        kept_mutations_before_end_time, default=False
    )
    model = mutation_model_factory(model)

    # Snapshot this call's arguments for the provenance record. This must come
    # after the normalisation above so the recorded values are the ones
    # actually used, and must stay in this frame (not a helper).
    argspec = inspect.getargvalues(inspect.currentframe())
    parameters = {
        "command": "sim_mutations",
        **{arg: argspec.locals[arg] for arg in argspec.args},
    }
    # Record the seed actually used, including an automatically generated one.
    parameters["random_seed"] = seed
    encoded_provenance = provenance.json_encode_provenance(
        provenance.get_provenance_dict(parameters)
    )

    rng = _msprime.RandomGenerator(seed)
    # The low-level routine works on a LightweightTableCollection, so the
    # tables are copied in via their dict form and copied back out afterwards.
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    _msprime.sim_mutations(
        tables=lwt,
        random_generator=rng,
        rate_map=rate_map.asdict(),
        model=model,
        discrete_genome=discrete_genome,
        keep=keep,
        kept_mutations_before_end_time=kept_mutations_before_end_time,
        start_time=start_time,
        end_time=end_time,
    )

    tables = tskit.TableCollection.fromdict(lwt.asdict())
    tables.provenances.add_row(encoded_provenance)
    return tables.tree_sequence()
def __init__(
    self,
    sample_size=1,
    num_loci=1,
    scaled_recombination_rate=0,
    num_replicates=1,
    migration_matrix=None,
    population_configurations=None,
    demographic_events=None,
    scaled_mutation_rate=0,
    print_trees=False,
    precision=3,
    random_seeds=None,
    scaled_gene_conversion_rate=0,
    gene_conversion_track_length=1,
    hotspots=None,
):
    """
    Stores the run settings and builds the underlying msprime simulator,
    its recombination map and its random generator.
    """
    self._sample_size = sample_size
    self._num_loci = num_loci
    self._num_replicates = num_replicates
    self._recombination_rate = scaled_recombination_rate
    self._mutation_rate = scaled_mutation_rate

    # Strict ms compatibility requires m non-recombining loci. Without
    # hotspots a uniform map over the loci is used; otherwise the hotspot
    # description is converted into a map.
    if hotspots is not None:
        self._recomb_map = hotspots_to_recomb_map(
            hotspots, self._recombination_rate, num_loci
        )
    else:
        self._recomb_map = msprime.RecombinationMap.uniform_map(
            num_loci, self._recombination_rate
        )

    # Use the caller's ms-style seeds when given, otherwise generate a fresh
    # set; the set is reduced to the single seed for msprime's generator.
    ms_seeds = generate_seeds() if random_seeds is None else random_seeds
    self._random_generator = _msprime.RandomGenerator(get_single_seed(ms_seeds))
    self._ms_random_seeds = ms_seeds

    # When population configurations are supplied, the overall sample size
    # must not be passed to the factory as well.
    overall_sample_size = (
        None if population_configurations is not None else self._sample_size
    )

    # msprime measures time in generations for a given Ne, whereas ms uses
    # coalescent time units; Ne = 1/4 makes the two scales agree.
    self._simulator = msprime.simulator_factory(
        Ne=0.25,
        sample_size=overall_sample_size,
        recombination_map=self._recomb_map,
        population_configurations=population_configurations,
        migration_matrix=migration_matrix,
        demographic_events=demographic_events,
        gene_conversion_rate=scaled_gene_conversion_rate,
        gene_conversion_track_length=gene_conversion_track_length,
        random_generator=self._random_generator,
        discrete_genome=True,
    )
    self._precision = precision
    self._print_trees = print_trees
def sim_mutations(
    tree_sequence,
    rate=None,
    *,
    random_seed=None,
    model=None,
    start_time=None,
    end_time=None,
    discrete_genome=None,
    keep=None,
    add_ancestral=None,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    per unit of sequence length, per generation. By default, mutations are
    generated at discrete sites along the genome and multiple mutations
    can occur at any given site. A continuous sequence, infinite-sites model
    can also be specified by setting the ``discrete_genome`` parameter to
    False.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. The default mutation model is
    :class:`msprime.JC69MutationModel`, a symmetrical mutation model among
    the ACGT alleles. See :ref:`sec_mutations_models` for details of available
    models.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    If the tree sequence already has mutations, these are by default retained,
    but can be discarded by passing ``keep=False``. However, adding new
    mutations to a tree sequence with existing mutations must be done with
    caution, since it can lead to incorrect or nonsensical results if mutation
    probabilities differ by ancestral state. (As an extreme example, suppose
    that X->Y and X->Z are allowable transitions, but Y->Z is not. If a branch
    already has an X->Y mutation on it, then calling
    ``sim_mutations(..., keep=True)`` might insert an X->Z mutation above the
    existing mutation, thus implying the impossible chain X->Y->Z.) For this
    reason, if this method attempts to add a new mutation ancestral to any
    existing mutation, an error will occur, unless ``add_ancestral=True``.
    The ``add_ancestral`` parameter has no effect if ``keep=False``.

    In summary, to add more mutations to a tree sequence with existing
    mutations, you need to either ensure that no new mutations are ancestral
    to existing ones (e.g., using the ``end_time`` parameter), or set
    ``add_ancestral=True`` and ensure that the mutational processes involved
    are compatible.

    .. note:: when ``add_ancestral=True`` there is the possibility of
        mutations that result in a silent transition (e.g., placing a
        mutation to A above an existing mutation to A). Such mutations are
        harmless and are required for us to guarantee the statistical
        properties of the process of sequentially adding mutations to a
        tree sequence.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation, as either a
        single number (for a uniform rate) or as a
        :class:`.RateMap`. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. This can either be a string (e.g., ``"jc69"``) or
        an instance of a simulation model class
        e.g, ``msprime.F84MutationModel(kappa=0.5)``.
        If not specified or None, the default model described above is used.
        Please see the :ref:`sec_mutations_models` section for more details
        on specifying mutation models.
    :param float start_time: The minimum time ago at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time ago at which a mutation can occur
        (Default: no restriction).
    :param bool discrete_genome: Whether to generate mutations at only integer
        positions along the genome (Default=True).
    :param bool keep: Whether to keep existing mutations. (default: True)
    :param bool add_ancestral: Whether to allow the addition of new mutations
        ancestral to existing ones. (default: False)
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``rate`` is neither convertible to a float nor a
        :class:`.RateMap`.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError as err:
        # Chain explicitly so the traceback shows the missing attribute as the
        # cause rather than as an unrelated error raised during handling.
        raise ValueError("First argument must be a TreeSequence instance.") from err

    seed = core.get_random_seed() if random_seed is None else int(random_seed)

    if rate is None:
        rate = 0
    # Only the float conversion is guarded: a TypeError here means "not a
    # number", in which case the value must already be a RateMap. Errors from
    # building the uniform map are left to propagate unchanged.
    try:
        rate = float(rate)
    except TypeError:
        rate_map = rate
    else:
        rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length, rate)
    if not isinstance(rate_map, intervals.RateMap):
        raise TypeError("rate must be a float or a RateMap")

    # "No restriction" is represented by the extreme finite float values.
    start_time = -sys.float_info.max if start_time is None else float(start_time)
    end_time = sys.float_info.max if end_time is None else float(end_time)
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")

    discrete_genome = core._parse_flag(discrete_genome, default=True)
    keep = core._parse_flag(keep, default=True)
    add_ancestral = core._parse_flag(add_ancestral, default=False)
    # NOTE(review): the default model for model=None is decided inside
    # mutation_model_factory, which is not visible here; the docstring states
    # JC69MutationModel -- confirm against the factory.
    model = mutation_model_factory(model)

    # Snapshot this call's arguments for the provenance record. This must come
    # after the normalisation above so the recorded values are the ones
    # actually used, and must stay in this frame (not a helper).
    argspec = inspect.getargvalues(inspect.currentframe())
    parameters = {
        "command": "sim_mutations",
        **{arg: argspec.locals[arg] for arg in argspec.args},
    }
    # Record the seed actually used, including an automatically generated one.
    parameters["random_seed"] = seed
    encoded_provenance = provenance.json_encode_provenance(
        provenance.get_provenance_dict(parameters)
    )

    rng = _msprime.RandomGenerator(seed)
    # The low-level routine works on a LightweightTableCollection, so the
    # tables are copied in via their dict form and copied back out afterwards.
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    _msprime.sim_mutations(
        tables=lwt,
        random_generator=rng,
        rate_map=rate_map.asdict(),
        model=model,
        discrete_genome=discrete_genome,
        keep=keep,
        # The low-level API takes ``add_ancestral`` under its older keyword.
        kept_mutations_before_end_time=add_ancestral,
        start_time=start_time,
        end_time=end_time,
    )

    tables = tskit.TableCollection.fromdict(lwt.asdict())
    tables.provenances.add_row(encoded_provenance)
    return tables.tree_sequence()
def mutate(
    tree_sequence,
    rate=None,
    random_seed=None,
    model=None,
    keep=False,
    start_time=None,
    end_time=None,
    discrete=False,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    in measured generations. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. Currently only the :class:`.InfiniteSites`
    mutation model is supported. This parameter is useful if you wish to obtain
    sequences with letters from the nucleotide alphabet rather than the default
    0/1 states. By default mutations from the infinite sites model with a binary
    alphabet are generated.

    By default, sites and mutations in the input tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation, as either a
        single number (for a uniform rate) or as a
        :class:`.RateMap`. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.BinaryMutations`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time ago at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time ago at which a mutation can occur
        (Default: no restriction).
    :param bool discrete: Whether to generate mutations at only integer
        positions along the genome. Default is False, which produces
        infinite-sites mutations at floating-point positions.
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``rate`` is neither convertible to a float nor a
        :class:`.RateMap`, or if ``model`` is not a mutation model instance.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError as err:
        # Chain explicitly so the traceback shows the missing attribute as the
        # cause rather than as an unrelated error raised during handling.
        raise ValueError("First argument must be a TreeSequence instance.") from err

    seed = core.get_random_seed() if random_seed is None else int(random_seed)

    if rate is None:
        rate = 0
    # Only the float conversion is guarded: a TypeError here means "not a
    # number", in which case the value must already be a RateMap. Errors from
    # building the uniform map are left to propagate unchanged.
    try:
        rate = float(rate)
    except TypeError:
        rate_map = rate
    else:
        rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length, rate)
    if not isinstance(rate_map, intervals.RateMap):
        raise TypeError("rate must be a float or a RateMap")

    # "No restriction" is represented by the extreme finite float values.
    start_time = -sys.float_info.max if start_time is None else float(start_time)
    end_time = sys.float_info.max if end_time is None else float(end_time)
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")

    keep = bool(keep)
    discrete = bool(discrete)
    if model is None:
        model = BinaryMutations()
    if not isinstance(model, BaseMutationModel):
        raise TypeError("model must be a MutationModel")

    # Snapshot this call's arguments for the provenance record. This must come
    # after the normalisation above so the recorded values are the ones
    # actually used, and must stay in this frame (not a helper).
    argspec = inspect.getargvalues(inspect.currentframe())
    parameters = {
        "command": "mutate",
        **{arg: argspec.locals[arg] for arg in argspec.args},
    }
    # Record the seed actually used, including an automatically generated one.
    parameters["random_seed"] = seed
    encoded_provenance = provenance.json_encode_provenance(
        provenance.get_provenance_dict(parameters)
    )

    rng = _msprime.RandomGenerator(seed)
    # The low-level routine works on a LightweightTableCollection, so the
    # tables are copied in via their dict form and copied back out afterwards.
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    _msprime.sim_mutations(
        tables=lwt,
        random_generator=rng,
        rate_map=rate_map.asdict(),
        model=model,
        discrete_sites=discrete,
        keep=keep,
        start_time=start_time,
        end_time=end_time,
    )

    tables = tskit.TableCollection.fromdict(lwt.asdict())
    tables.provenances.add_row(encoded_provenance)
    return tables.tree_sequence()