Exemplo n.º 1
0
    def verify_required_columns(self, tables, table_name, required_cols):
        """
        Check that ``required_cols`` are sufficient and necessary to load
        ``table_name`` through ``LightweightTableCollection.fromdict``.

        With every other column of the table set to None the collection must
        load and hand back the required columns unchanged. Setting any single
        required column to None, or removing it, must raise ``TypeError``.
        """
        source = tables.asdict()
        # Start from an all-None table and fill in only the required columns.
        minimal = dict.fromkeys(source[table_name].keys())
        for name in required_cols:
            minimal[name] = source[table_name][name]
        source[table_name] = minimal
        collection = c_module.LightweightTableCollection()
        collection.fromdict(source)
        round_tripped = collection.asdict()
        for name in required_cols:
            assert np.array_equal(round_tripped[table_name][name],
                                  minimal[name])

        # Any one of these required columns as None gives an error.
        for name in required_cols:
            payload = tables.asdict()
            payload[table_name] = {**minimal, name: None}
            collection = c_module.LightweightTableCollection()
            with pytest.raises(TypeError):
                collection.fromdict(payload)

        # Removing any one of these required columns gives an error.
        for name in required_cols:
            payload = tables.asdict()
            payload[table_name] = {
                key: value
                for key, value in minimal.items() if key != name
            }
            collection = c_module.LightweightTableCollection()
            with pytest.raises(TypeError):
                collection.fromdict(payload)
Exemplo n.º 2
0
 def test_top_keys_match(self):
     """
     The dict exported by a LightweightTableCollection has the same
     top-level keys as the dict it was loaded from.
     """
     source = get_example_tables().asdict()
     collection = c_module.LightweightTableCollection()
     collection.fromdict(source)
     exported = collection.asdict()
     self.assertEqual(source.keys(), exported.keys())
Exemplo n.º 3
0
 def test_missing_sequence_length(self):
     """Loading a dict without ``sequence_length`` raises TypeError."""
     payload = get_example_tables().asdict()
     payload.pop("sequence_length")
     collection = c_module.LightweightTableCollection()
     with pytest.raises(TypeError):
         collection.fromdict(payload)
Exemplo n.º 4
0
 def verify_metadata_schema(self, tables, table_name):
     """
     Check that a ``metadata_schema`` of None on ``table_name`` loads, is
     left out of the exported dict, and reads back through tskit as an
     empty schema string.
     """
     payload = tables.asdict()
     payload[table_name]["metadata_schema"] = None
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     exported = collection.asdict()
     assert "metadata_schema" not in exported[table_name]
     reloaded = tskit.TableCollection.fromdict(exported)
     table = getattr(reloaded, table_name)
     assert str(table.metadata_schema) == ""
Exemplo n.º 5
0
 def test_missing_metadata_schema(self):
     """
     A dict with no top-level ``metadata_schema`` key still loads, and the
     schema reads back through tskit as an empty string.
     """
     example = get_example_tables()
     # The example must start with a schema for the check to mean anything.
     assert str(example.metadata_schema) != ""
     payload = example.asdict()
     payload.pop("metadata_schema")
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     reloaded = tskit.TableCollection.fromdict(collection.asdict())
     assert str(reloaded.metadata_schema) == ""
Exemplo n.º 6
0
 def verify_metadata_schema(self, tables, table_name):
     """
     Check that a ``metadata_schema`` of None on ``table_name`` loads, is
     left out of the exported dict, and reads back through tskit as an
     empty schema string.
     """
     payload = tables.asdict()
     payload[table_name]["metadata_schema"] = None
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     exported = collection.asdict()
     self.assertNotIn("metadata_schema", exported[table_name])
     reloaded = tskit.TableCollection.fromdict(exported)
     table = getattr(reloaded, table_name)
     self.assertEqual(str(table.metadata_schema), "")
Exemplo n.º 7
0
 def verify_optional_column(self, tables, table_len, table_name, col_name):
     """
     Check that setting ``col_name`` of ``table_name`` to None loads and
     exports as ``table_len`` int32 values that are all -1.
     """
     payload = tables.asdict()
     payload[table_name][col_name] = None
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     actual = collection.asdict()[table_name][col_name]
     expected = np.full(table_len, -1, dtype=np.int32)
     assert np.array_equal(actual, expected)
Exemplo n.º 8
0
 def test_missing_metadata(self):
     """
     A dict with no top-level ``metadata`` key still loads; the reloaded
     tables decode their metadata as ``{"top-level": []}``.
     """
     example = get_example_tables()
     # The example must start with metadata for the check to mean anything.
     assert example.metadata != b""
     payload = example.asdict()
     payload.pop("metadata")
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     reloaded = tskit.TableCollection.fromdict(collection.asdict())
     # Empty byte field still gets interpreted by schema
     assert reloaded.metadata == {"top-level": []}
Exemplo n.º 9
0
 def test_bad_top_level_types(self):
     """
     Replacing any top-level entry other than ``encoding_version`` with a
     list makes ``fromdict`` raise TypeError.
     """
     tables = get_example_tables()
     keys_to_corrupt = set(tables.asdict()) - {"encoding_version"}
     for key in keys_to_corrupt:
         payload = tables.asdict()
         # A list is the wrong type for every entry checked here, and
         # fromdict is expected to reject it with a TypeError.
         payload[key] = ["12345"]
         collection = c_module.LightweightTableCollection()
         with pytest.raises(TypeError):
             collection.fromdict(payload)
Exemplo n.º 10
0
 def test_missing_tables(self):
     """
     Removing any top-level entry other than ``sequence_length`` makes
     ``fromdict`` raise ValueError.
     """
     tables = get_example_tables()
     for table_name in set(tables.asdict()) - {"sequence_length"}:
         payload = tables.asdict()
         payload.pop(table_name)
         collection = c_module.LightweightTableCollection()
         with self.assertRaises(ValueError):
             collection.fromdict(payload)
Exemplo n.º 11
0
 def test_top_level_metadata_schema(self):
     """
     A top-level ``metadata_schema`` of None loads, is left out of the
     exported dict, and reads back through tskit as an empty string.
     """
     payload = get_example_tables().asdict()
     # None should give default value
     payload["metadata_schema"] = None
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     exported = collection.asdict()
     assert "metadata_schema" not in exported
     reloaded = tskit.TableCollection.fromdict(exported)
     assert str(reloaded.metadata_schema) == ""
Exemplo n.º 12
0
 def test_top_level_metadata_schema(self):
     """
     A top-level ``metadata_schema`` of None loads, is left out of the
     exported dict, and reads back through tskit as an empty string.
     """
     payload = get_example_tables().asdict()
     # None should give default value
     payload["metadata_schema"] = None
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     exported = collection.asdict()
     self.assertNotIn("metadata_schema", exported)
     reloaded = tskit.TableCollection.fromdict(exported)
     self.assertEqual(str(reloaded.metadata_schema), "")
Exemplo n.º 13
0
 def test_top_level_metadata(self):
     """
     A top-level ``metadata`` of None loads and is left out of the exported
     dict; the reloaded tables decode metadata as ``{"top-level": []}``.
     """
     payload = get_example_tables().asdict()
     # None should give default value
     payload["metadata"] = None
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     exported = collection.asdict()
     assert "metadata" not in exported
     reloaded = tskit.TableCollection.fromdict(exported)
     # We only removed the metadata, not the schema. So empty bytefield
     # still gets interpreted
     assert reloaded.metadata == {"top-level": []}
Exemplo n.º 14
0
    def verify_offset_pair(self, tables, table_len, table_name, col_name):
        """
        Check how ``fromdict`` treats the ragged column ``col_name`` and its
        companion ``<col_name>_offset`` column in ``table_name``.

        Both None: loads, exporting an empty column and ``table_len + 1``
        zero offsets. Exactly one of them None: TypeError. An offset column
        whose leading entries are reversed: ``c_module.LibraryError``.
        """
        offset_col = col_name + "_offset"

        # Both halves of the pair absent is fine and yields an empty column.
        payload = tables.asdict()
        payload[table_name][col_name] = None
        payload[table_name][offset_col] = None
        collection = c_module.LightweightTableCollection()
        collection.fromdict(payload)
        exported = collection.asdict()[table_name]
        self.assertEqual(exported[col_name].shape, (0, ))
        expected_offsets = np.zeros(table_len + 1, dtype=np.uint32)
        self.assertTrue(np.array_equal(exported[offset_col], expected_offsets))

        # Setting one or the other (but not both) to None raises a TypeError
        for nulled in (col_name, offset_col):
            payload = tables.asdict()
            payload[table_name][nulled] = None
            collection = c_module.LightweightTableCollection()
            with self.assertRaises(TypeError):
                collection.fromdict(payload)

        # Reverse all offsets but the last, keeping the final value in place.
        payload = tables.asdict()
        table_dict = payload[table_name]
        offsets = table_dict[offset_col]
        bad_offset = np.zeros_like(offsets)
        bad_offset[:-1] = offsets[:-1][::-1]
        bad_offset[-1] = offsets[-1]
        table_dict[offset_col] = bad_offset
        collection = c_module.LightweightTableCollection()
        with self.assertRaises(c_module.LibraryError):
            collection.fromdict(payload)
Exemplo n.º 15
0
 def verify_columns(self, value):
     """
     Check that replacing any single column of any table in the example
     tables with ``value`` makes ``fromdict`` raise ValueError.
     """
     tables = get_example_tables()
     current = tables.asdict()
     for table_name in set(current) - {"sequence_length"}:
         columns = current[table_name]
         for colname in columns.keys():
             collection = c_module.LightweightTableCollection()
             # Fresh top-level dict each time; only this table is altered.
             current = tables.asdict()
             current[table_name] = {**columns, colname: value}
             with self.assertRaises(ValueError):
                 collection.fromdict(current)
Exemplo n.º 16
0
 def test_missing_tables(self):
     """
     Removing any table entry from the top-level dict makes ``fromdict``
     raise TypeError. The non-table keys listed below are not exercised.
     """
     tables = get_example_tables()
     non_table_keys = {
         "sequence_length",
         "metadata",
         "metadata_schema",
         "encoding_version",
     }
     for table_name in tables.asdict().keys() - non_table_keys:
         payload = tables.asdict()
         payload.pop(table_name)
         collection = c_module.LightweightTableCollection()
         with pytest.raises(TypeError):
             collection.fromdict(payload)
Exemplo n.º 17
0
    def verify(self, num_rows):
        """
        Check that cutting any single column of any table down to its first
        ``num_rows`` entries makes ``fromdict`` raise ValueError.
        """
        tables = get_example_tables()
        current = tables.asdict()
        for table_name in sorted(set(current) - {"sequence_length"}):
            columns = current[table_name]
            for colname in sorted(columns):
                truncated = columns[colname][:num_rows].copy()
                collection = c_module.LightweightTableCollection()
                # Fresh top-level dict each time; only this table is altered.
                current = tables.asdict()
                current[table_name] = {**columns, colname: truncated}
                with self.assertRaises(ValueError):
                    collection.fromdict(current)
Exemplo n.º 18
0
 def test_mutations(self):
     """
     Run the shared column checks against the mutations table, then check
     that a ``time`` column of None is exported as all-NaN.
     """
     tables = get_example_tables()
     required = ["site", "node", "derived_state", "derived_state_offset"]
     self.verify_required_columns(tables, "mutations", required)
     num_mutations = len(tables.mutations)
     self.verify_offset_pair(tables, num_mutations, "mutations", "metadata")
     self.verify_metadata_schema(tables, "mutations")
     # Verify optional time column
     payload = tables.asdict()
     payload["mutations"]["time"] = None
     collection = c_module.LightweightTableCollection()
     collection.fromdict(payload)
     times = collection.asdict()["mutations"]["time"]
     assert np.all(np.isnan(times))
Exemplo n.º 19
0
 def verify_columns(self, value):
     """
     Check that replacing any single column (other than
     ``metadata_schema``) of any table in the example tables with ``value``
     makes ``fromdict`` raise ValueError.
     """
     tables = get_example_tables()
     current = tables.asdict()
     non_table_keys = {
         "sequence_length",
         "metadata",
         "metadata_schema",
         "encoding_version",
     }
     for table_name in set(current) - non_table_keys:
         columns = current[table_name]
         for colname in set(columns) - {"metadata_schema"}:
             collection = c_module.LightweightTableCollection()
             # Fresh top-level dict each time; only this table is altered.
             current = tables.asdict()
             current[table_name] = {**columns, colname: value}
             with pytest.raises(ValueError):
                 collection.fromdict(current)
Exemplo n.º 20
0
 def test_table_columns_match(self):
     """
     For each table named below, the exported table has the same column
     keys as the table that was loaded.
     """
     source = get_example_tables().asdict()
     collection = c_module.LightweightTableCollection()
     collection.fromdict(source)
     exported = collection.asdict()
     table_names = (
         "individuals",
         "nodes",
         "edges",
         "migrations",
         "sites",
         "mutations",
         "populations",
         "provenances",
     )
     for table_name in table_names:
         self.assertEqual(source[table_name].keys(),
                          exported[table_name].keys())
Exemplo n.º 21
0
def log_arg_likelihood(ts, recombination_rate, Ne=1):
    """
    Returns the log probability of the stored tree sequence under the Hudson ARG.
    An exact expression for this probability is given in equation (1) of
    `Kuhner et al. (2000) <https://www.genetics.org/content/156/3/1393>`_.

    We assume branch lengths stored in generations, resulting in a coalescence
    rate of :math:`1 / (2 N_e)` per pair of lineages.

    .. warning::
        The stored tree sequence must store the full realisation of the ARG,
        including all recombination events and all common ancestor events,
        regardless of whether the recombinations cause a change in the ancestral
        tree or whether the common ancestor events cause coalescence of ancestral
        material. See :ref:`sec_ancestry_full_arg` for details of this
        data structure, and how to generate them using ``msprime``.

    .. warning::
        This method only supports continuous genomes.
        See :ref:`sec_ancestry_discrete_genome` for how these can be specified
        when simulating tree sequences using ``msprime``.

    :param tskit.TreeSequence ts: The tree sequence object.
    :param float recombination_rate: The per-link, per-generation recombination
        probability. Must be non-negative.
    :param float Ne: The diploid effective population size.
    :return: The log probability of the tree sequence under the Hudson ancestral
        recombination graph model. If the recombination rate is zero and the tree
        sequence contains at least one recombination event, then returns
        `-DBL_MAX`.
    """
    # The low-level routine takes a LightweightTableCollection, so copy the
    # tree sequence's tables into one via their dict form.
    collection = _msprime.LightweightTableCollection()
    table_dict = ts.tables.asdict()
    collection.fromdict(table_dict)
    log_probability = _msprime.log_likelihood_arg(
        collection,
        recombination_rate=recombination_rate,
        Ne=Ne,
    )
    return log_probability
Exemplo n.º 22
0
 def test_version(self):
     """A freshly created collection exports encoding version (1, 1)."""
     exported = c_module.LightweightTableCollection().asdict()
     assert exported["encoding_version"] == (1, 1)
Exemplo n.º 23
0
    def verify_offset_pair(self,
                           tables,
                           table_len,
                           table_name,
                           col_name,
                           required=False):
        """
        Check how ``fromdict`` treats the ragged column ``col_name`` and its
        companion ``<col_name>_offset`` column in ``table_name``.

        Unless ``required`` is true, both columns being None, or both being
        absent, must load and export an empty column with ``table_len + 1``
        zero offsets. Having exactly one of the two None or absent must raise
        TypeError, and an offset column whose leading entries are reversed
        must raise ``c_module.LibraryError``.
        """
        offset_col = col_name + "_offset"

        def set_none(table_dict, key):
            table_dict[key] = None

        def remove(table_dict, key):
            del table_dict[key]

        def assert_empty_pair(payload):
            # Load the payload and expect an empty column with zero offsets.
            collection = c_module.LightweightTableCollection()
            collection.fromdict(payload)
            exported = collection.asdict()
            assert exported[table_name][col_name].shape == (0, )
            assert np.array_equal(
                exported[table_name][offset_col],
                np.zeros(table_len + 1, dtype=np.uint32),
            )

        if not required:
            for drop in (set_none, remove):
                payload = tables.asdict()
                drop(payload[table_name], col_name)
                drop(payload[table_name], offset_col)
                assert_empty_pair(payload)

        # Setting one or the other raises a TypeError
        for key in (col_name, offset_col):
            for drop in (set_none, remove):
                payload = tables.asdict()
                drop(payload[table_name], key)
                collection = c_module.LightweightTableCollection()
                with pytest.raises(TypeError):
                    collection.fromdict(payload)

        # Reverse all offsets but the last, keeping the final value in place.
        payload = tables.asdict()
        table_dict = payload[table_name]
        offsets = table_dict[offset_col]
        bad_offset = np.zeros_like(offsets)
        bad_offset[:-1] = offsets[:-1][::-1]
        bad_offset[-1] = offsets[-1]
        table_dict[offset_col] = bad_offset
        collection = c_module.LightweightTableCollection()
        with pytest.raises(c_module.LibraryError):
            collection.fromdict(payload)
Exemplo n.º 24
0
def sim_mutations(
    tree_sequence,
    rate=None,
    *,
    random_seed=None,
    model=None,
    start_time=None,
    end_time=None,
    discrete_genome=None,
    keep=None,
    add_ancestral=None,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    per unit of sequence length, per generation. By default, mutations are
    generated at discrete sites along the genome and multiple mutations
    can occur at any given site. A continuous sequence, infinite-sites model
    can also be specified by setting the ``discrete_genome`` parameter to
    False.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. The default mutation model is
    :class:`msprime.JC69MutationModel` a symmetrical mutation model among
    the ACGT alleles. See :ref:`sec_mutations_models` for details of available models.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    If the tree sequence already has mutations, these are by default retained,
    but can be discarded by passing ``keep=False``. However, adding new
    mutations to a tree sequence with existing mutations must be done with
    caution, since it can lead to incorrect or nonsensical results if mutation
    probabilities differ by ancestral state. (As an extreme example, suppose
    that X->Y and X->Z are allowable transitions, but Y->Z is not. If a branch
    already has an X->Y mutation on it, then calling `sim_mutations(...,
    keep=True)` might insert an X->Z mutation above the existing mutation, thus
    implying the impossible chain X->Y->Z.)  For this reason, if this method
    attempts to add a new mutation ancestral to any existing mutation, an error
    will occur, unless ``add_ancestral=True``.
    The ``add_ancestral`` parameter has no effect if ``keep=False``.

    In summary, to add more mutations to a tree sequence with existing
    mutations, you need to either ensure that no new mutations are ancestral to
    existing ones (e.g., using the ``end_time`` parameter), or set
    ``add_ancestral=True`` and ensure that the mutational processes involved
    are compatible.

    .. note:: when ``add_ancestral=True`` there is the possibility of
        mutations that result in a silent transition (e.g., placing a mutation
        to A above an existing mutation to A). Such mutations are harmless and
        are required for us to guarantee the statistical properties of the
        process of sequentially adding mutations to a tree sequence.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation, as either a
        single number (for a uniform rate) or as a
        :class:`.RateMap`. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. This can either be a string (e.g., ``"jc69"``) or
        an instance of a simulation model class
        e.g, ``msprime.F84MutationModel(kappa=0.5)``.
        If not specified or None, the :class:`.BinaryMutationModel`
        mutation model is used. Please see the
        :ref:`sec_mutations_models` section for more details
        on specifying mutation models.
    :param float start_time: The minimum time ago at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time ago at which a mutation can occur
        (Default: no restriction).
    :param bool discrete_genome: Whether to generate mutations at only integer positions
        along the genome (Default=True).
    :param bool keep: Whether to keep existing mutations. (default: True)
    :param bool add_ancestral: Whether to allow the addition of new mutations
        ancestral to existing ones. (default: False)
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``rate`` is neither convertible to float nor a
        :class:`.RateMap`.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError:
        raise ValueError("First argument must be a TreeSequence instance.")
    seed = core.get_random_seed() if random_seed is None else int(random_seed)

    # A missing rate means no mutations.
    rate = 0 if rate is None else rate
    try:
        rate = float(rate)
        rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length,
                                             rate)
    except TypeError:
        # Not a number: pass it through and let the type check below decide.
        rate_map = rate
    if not isinstance(rate_map, intervals.RateMap):
        raise TypeError("rate must be a float or a RateMap")

    # Unbounded ends of the time interval become +/- the largest float.
    if start_time is None:
        start_time = -sys.float_info.max
    else:
        start_time = float(start_time)
    if end_time is None:
        end_time = sys.float_info.max
    else:
        end_time = float(end_time)
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")
    discrete_genome = core._parse_flag(discrete_genome, default=True)
    keep = core._parse_flag(keep, default=True)
    add_ancestral = core._parse_flag(add_ancestral, default=False)

    # NOTE(review): the docstring names both JC69MutationModel and
    # BinaryMutationModel as the default; the factory decides which one a
    # None model becomes -- confirm and reconcile the docstring.
    model = mutation_model_factory(model)

    # Record the arguments as they stand now, i.e. after the normalisation
    # above, by reading them back from this frame's locals.
    frame_args = inspect.getargvalues(inspect.currentframe())
    parameters = {"command": "sim_mutations"}
    for arg_name in frame_args.args:
        parameters[arg_name] = frame_args.locals[arg_name]
    parameters["random_seed"] = seed
    provenance_dict = provenance.get_provenance_dict(parameters)
    encoded_provenance = provenance.json_encode_provenance(provenance_dict)

    rng = _msprime.RandomGenerator(seed)
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    # The low-level call works on the lightweight tables passed to it; the
    # result is read back from ``lwt`` below.
    _msprime.sim_mutations(
        tables=lwt,
        random_generator=rng,
        rate_map=rate_map.asdict(),
        model=model,
        discrete_genome=discrete_genome,
        keep=keep,
        kept_mutations_before_end_time=add_ancestral,
        start_time=start_time,
        end_time=end_time,
    )

    mutated_tables = tskit.TableCollection.fromdict(lwt.asdict())
    mutated_tables.provenances.add_row(encoded_provenance)
    return mutated_tables.tree_sequence()
Exemplo n.º 25
0
 def test_version(self):
     """A freshly created collection exports encoding version (1, 1)."""
     exported = c_module.LightweightTableCollection().asdict()
     self.assertEqual(exported["encoding_version"], (1, 1))
Exemplo n.º 26
0
def sim_mutations(
    tree_sequence,
    rate=None,
    *,
    random_seed=None,
    model=None,
    keep=None,
    start_time=None,
    end_time=None,
    discrete_genome=None,
    kept_mutations_before_end_time=None,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    per unit of sequence_length, per generation. By default, mutations are
    generated at discrete sites along the genome and multiple mutations
    can occur at any given site. A continuous sequence, infinite-sites model
    can also be specified by setting the ``discrete_genome`` parameter to
    False.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. The default mutation model is
    :class:`msprime.BinaryMutationModel` a simple binary model with alleles
    0 and 1. See :ref:`sec_api_mutation_models` for details of available models.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    By default, sites and mutations in the input tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling). Furthermore, if sites
    are discrete, trying to simulate mutations at time periods that are older
    than mutations kept from the original tree sequence is an error, because
    this would create an extra transition (from the new allele to the old
    one below it) that may be incorrect according to the model of mutation.
    Under a state-independent mutation model, however (e.g., Jukes-Cantor),
    there is no problem, and ``kept_mutations_before_end_time=True`` may be
    set to allow adding new mutations around or above existing ones.

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation, as either a
        single number (for a uniform rate) or as a
        :class:`.RateMap`. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. This can either be a string (e.g., ``"jc69"``) or
        an instance of a simulation model class
        e.g, ``msprime.F84MutationModel(kappa=0.5)``.
        If not specified or None, the :class:`.BinaryMutationModel`
        mutation model is used. Please see the
        :ref:`sec_api_simulation_models` section for more details
        on specifying simulation models.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time ago at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time ago at which a mutation can occur
        (Default: no restriction).
    :param bool discrete_genome: Whether to generate mutations at only integer positions
        along the genome (Default=True).
    :param bool kept_mutations_before_end_time: Whether to allow mutations to be added
        ancestrally to existing (kept) mutations. This flag has no effect
        if either keep or discrete_genome are False.
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``rate`` is neither convertible to float nor a
        :class:`.RateMap`.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError:
        raise ValueError("First argument must be a TreeSequence instance.")
    seed = core.get_random_seed() if random_seed is None else int(random_seed)

    # A missing rate means no mutations.
    rate = 0 if rate is None else rate
    try:
        rate = float(rate)
        rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length,
                                             rate)
    except TypeError:
        # Not a number: pass it through and let the type check below decide.
        rate_map = rate
    if not isinstance(rate_map, intervals.RateMap):
        raise TypeError("rate must be a float or a RateMap")

    # Unbounded ends of the time interval become +/- the largest float.
    if start_time is None:
        start_time = -sys.float_info.max
    else:
        start_time = float(start_time)
    if end_time is None:
        end_time = sys.float_info.max
    else:
        end_time = float(end_time)
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")
    discrete_genome = core._parse_flag(discrete_genome, default=True)
    keep = core._parse_flag(keep, default=False)
    kept_mutations_before_end_time = core._parse_flag(
        kept_mutations_before_end_time, default=False)

    model = mutation_model_factory(model)

    # Record the arguments as they stand now, i.e. after the normalisation
    # above, by reading them back from this frame's locals.
    frame_args = inspect.getargvalues(inspect.currentframe())
    parameters = {"command": "sim_mutations"}
    for arg_name in frame_args.args:
        parameters[arg_name] = frame_args.locals[arg_name]
    parameters["random_seed"] = seed
    provenance_dict = provenance.get_provenance_dict(parameters)
    encoded_provenance = provenance.json_encode_provenance(provenance_dict)

    rng = _msprime.RandomGenerator(seed)
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    # The low-level call works on the lightweight tables passed to it; the
    # result is read back from ``lwt`` below.
    _msprime.sim_mutations(
        tables=lwt,
        random_generator=rng,
        rate_map=rate_map.asdict(),
        model=model,
        discrete_genome=discrete_genome,
        keep=keep,
        kept_mutations_before_end_time=kept_mutations_before_end_time,
        start_time=start_time,
        end_time=end_time,
    )

    mutated_tables = tskit.TableCollection.fromdict(lwt.asdict())
    mutated_tables.provenances.add_row(encoded_provenance)
    return mutated_tables.tree_sequence()
Exemplo n.º 27
0
 def verify(self, tables):
     """
     Check that ``tables`` survive a round trip through a
     LightweightTableCollection and back into a tskit TableCollection.
     """
     collection = c_module.LightweightTableCollection()
     collection.fromdict(tables.asdict())
     round_tripped = tskit.TableCollection.fromdict(collection.asdict())
     assert tables == round_tripped
Exemplo n.º 28
0
def mutate(
    tree_sequence,
    rate=None,
    random_seed=None,
    model=None,
    keep=False,
    start_time=None,
    end_time=None,
    discrete=False,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate,
    measured in generations. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. Currently only the :class:`.InfiniteSites`
    mutation model is supported. This parameter is useful if you wish to obtain
    sequences with letters from the nucleotide alphabet rather than the default
    0/1 states. By default mutations from the infinite sites model with a binary
    alphabet are generated.

    By default, sites and mutations in the parameter tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation, as either a
        single number (for a uniform rate) or as a
        :class:`.RateMap`. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.BinaryMutations`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time ago at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time ago at which a mutation can occur
        (Default: no restriction).
    :param bool discrete: Whether to generate mutations at only integer positions
        along the genome.  Default is False, which produces infinite-sites
        mutations at floating-point positions.
    :return: The :class:`tskit.TreeSequence` object  resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``rate`` is neither convertible to float nor a
        :class:`.RateMap`, or if ``model`` is not a ``BaseMutationModel``.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError:
        raise ValueError("First argument must be a TreeSequence instance.")
    seed = core.get_random_seed() if random_seed is None else int(random_seed)

    # A missing rate means no mutations.
    rate = 0 if rate is None else rate
    try:
        rate = float(rate)
        rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length,
                                             rate)
    except TypeError:
        # Not a number: pass it through and let the type check below decide.
        rate_map = rate
    if not isinstance(rate_map, intervals.RateMap):
        raise TypeError("rate must be a float or a RateMap")

    # Unbounded ends of the time interval become +/- the largest float.
    start_time = (-sys.float_info.max
                  if start_time is None else float(start_time))
    end_time = sys.float_info.max if end_time is None else float(end_time)
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")
    keep = bool(keep)
    discrete = bool(discrete)

    model = BinaryMutations() if model is None else model
    if not isinstance(model, BaseMutationModel):
        raise TypeError("model must be a MutationModel")

    # Record the arguments as they stand now, i.e. after the normalisation
    # above, by reading them back from this frame's locals.
    frame_args = inspect.getargvalues(inspect.currentframe())
    parameters = {"command": "mutate"}
    for arg_name in frame_args.args:
        parameters[arg_name] = frame_args.locals[arg_name]
    parameters["random_seed"] = seed
    provenance_dict = provenance.get_provenance_dict(parameters)
    encoded_provenance = provenance.json_encode_provenance(provenance_dict)

    rng = _msprime.RandomGenerator(seed)
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    # The low-level call works on the lightweight tables passed to it; the
    # result is read back from ``lwt`` below.
    _msprime.sim_mutations(
        tables=lwt,
        random_generator=rng,
        rate_map=rate_map.asdict(),
        model=model,
        discrete_sites=discrete,
        keep=keep,
        start_time=start_time,
        end_time=end_time,
    )

    mutated_tables = tskit.TableCollection.fromdict(lwt.asdict())
    mutated_tables.provenances.add_row(encoded_provenance)
    return mutated_tables.tree_sequence()