Exemplo n.º 1
0
def _get_upgrade_provenance(root):
    """
    Build the provenance record for an upgrade of the specified HDF5 file.

    The ``format_version`` attribute of ``root`` is converted to a list of
    ints and stored as the ``source_version`` parameter of an "upgrade"
    provenance dict. The dict is returned as an encoded JSON string.
    """
    # TODO add more parameters here like filename, etc.
    source_version = [int(part) for part in root.attrs["format_version"]]
    record = provenance.get_provenance_dict(
        "upgrade", {"source_version": source_version})
    return json.dumps(record).encode()
Exemplo n.º 2
0
def _get_v2_provenance(command, attrs):
    """
    Returns the V2 tree provenance attributes reformatted as a provenance record.

    :param command: The command name handed to
        ``provenance.get_provenance_dict``.
    :param attrs: Mapping holding the JSON-encoded ``environment`` and
        ``parameters`` provenance values. A missing key raises ``KeyError``.
    :return: The provenance record as an encoded JSON string. Its ``version``
        is the ``msprime_version`` entry of the decoded environment
        ("Unknown_version" if absent) and its ``environment`` is the decoded
        environment itself.
    """
    def load_json_attr(key):
        # Malformed JSON should not prevent us from finishing the conversion,
        # so undecodable values are logged and replaced by an empty dict.
        try:
            raw = attrs[key]
            if isinstance(raw, bytes):
                # str() of a bytes value is "b'...'" on Python 3, which is
                # never valid JSON; decode explicitly instead.
                text = raw.decode("utf-8")
            else:
                text = str(raw)
            return json.loads(text)
        except ValueError:
            # logging.warn is a deprecated alias (removed in Python 3.13).
            logging.warning("Failed to convert %s provenance", key)
            return {}

    environment = load_json_attr("environment")
    parameters = load_json_attr("parameters")
    provenance_dict = provenance.get_provenance_dict(command, parameters)
    provenance_dict["version"] = environment.get("msprime_version",
                                                 "Unknown_version")
    provenance_dict["environment"] = environment
    return json.dumps(provenance_dict).encode()
Exemplo n.º 3
0
def mutate(tree_sequence,
           rate=None,
           random_seed=None,
           model=None,
           keep=False,
           start_time=None,
           end_time=None):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    per generation. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. Currently only the :class:`.InfiniteSites`
    mutation model is supported. This parameter is useful if you wish to obtain
    sequences with letters from the nucleotide alphabet rather than the default
    0/1 states. By default mutations from the infinite sites model with a binary
    alphabet are generated.

    By default, sites and mutations in the parameter tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.InfiniteSites`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time at which a mutation can occur
        (Default: no restriction).
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``model`` has no ``alphabet`` attribute.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError:
        raise ValueError("First argument must be a TreeSequence instance.")

    # Validate and normalise the plain arguments first, so that bad input is
    # rejected before a seed is drawn or any generator state is allocated.
    if model is None:
        model = InfiniteSites()
    try:
        alphabet = model.alphabet
    except AttributeError:
        raise TypeError("model must be an InfiniteSites instance")
    rate = 0.0 if rate is None else float(rate)
    keep = bool(keep)

    # Only explicitly supplied bounds are recorded in the provenance; the
    # +/- float max sentinels stand for "no restriction".
    time_parameters = {}
    if start_time is None:
        start_time = -sys.float_info.max
    else:
        start_time = float(start_time)
        time_parameters["start_time"] = start_time
    if end_time is None:
        end_time = sys.float_info.max
    else:
        end_time = float(end_time)
        time_parameters["end_time"] = end_time
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")

    if random_seed is None:
        random_seed = simulations._get_random_seed()
    random_seed = int(random_seed)
    rng = _msprime.RandomGenerator(random_seed)

    parameters = {
        "command": "mutate",
        "rate": rate,
        "random_seed": random_seed,
        "keep": keep
    }
    parameters.update(time_parameters)
    # TODO Add a JSON representation of the model to the provenance.
    provenance_dict = provenance.get_provenance_dict(parameters)

    mutation_generator = _msprime.MutationGenerator(rng,
                                                    rate,
                                                    alphabet=alphabet,
                                                    start_time=start_time,
                                                    end_time=end_time)
    # Round-trip through the low-level table collection, which is what the
    # generator operates on.
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    mutation_generator.generate(lwt, keep=keep)

    tables = tskit.TableCollection.fromdict(lwt.asdict())
    tables.provenances.add_row(json.dumps(provenance_dict))
    return tables.tree_sequence()
Exemplo n.º 4
0
def add_provenance(provenance_table, method_name):
    """
    Append a provenance row for the ``tsutil`` method ``method_name``.

    The record comes from ``provenance.get_provenance_dict`` for the command
    ``tsutil.<method_name>`` and is added to ``provenance_table`` as JSON.
    """
    command = "tsutil.{}".format(method_name)
    record = json.dumps(provenance.get_provenance_dict(command))
    provenance_table.add_row(record)
Exemplo n.º 5
0
def simulate(sample_size=None,
             Ne=1,
             length=None,
             recombination_rate=None,
             recombination_map=None,
             mutation_rate=None,
             population_configurations=None,
             migration_matrix=None,
             demographic_events=None,
             samples=None,
             model=None,
             record_migrations=False,
             random_seed=None,
             mutation_generator=None,
             num_replicates=None):
    """
    Simulates the coalescent with recombination under the specified model
    parameters and returns the resulting :class:`.TreeSequence`.

    :param int sample_size: The number of individuals in our sample.
        If not specified or None, this defaults to the sum of the
        subpopulation sample sizes. Either ``sample_size``,
        ``population_configurations`` or ``samples`` must be specified.
    :param float Ne: The effective (diploid) population size for the reference
        population. This determines the factor by which the per-generation
        recombination and mutation rates are scaled in the simulation.
        This defaults to 1 if not specified.
    :param float length: The length of the simulated region in bases.
        This parameter cannot be used along with ``recombination_map``.
        Defaults to 1 if not specified.
    :param float recombination_rate: The rate of recombination per base
        per generation. This parameter cannot be used along with
        ``recombination_map``. Defaults to 0 if not specified.
    :param recombination_map: The map
        describing the changing rates of recombination along the simulated
        chromosome. This parameter cannot be used along with the
        ``recombination_rate`` or ``length`` parameters, as these
        values are encoded within the map. Defaults to a uniform rate as
        described in the ``recombination_rate`` parameter if not specified.
    :type recombination_map: :class:`.RecombinationMap`
    :param float mutation_rate: The rate of mutation per base per
        generation. If not specified, no mutations are generated.
        Cannot be combined with ``mutation_generator``.
    :param list population_configurations: The list of
        :class:`.PopulationConfiguration` instances describing the
        sampling configuration, relative sizes and growth rates of
        the populations to be simulated. If this is not specified,
        a single population with a sample of size ``sample_size``
        is assumed.
    :type population_configurations: list or None.
    :param list migration_matrix: The matrix describing the rates
        of migration between all pairs of populations. If :math:`N`
        populations are defined in the ``population_configurations``
        parameter, then the migration matrix must be an
        :math:`N\\times N` matrix consisting of :math:`N` lists of
        length :math:`N` or an :math:`N\\times N` numpy array.
    :param list demographic_events: The list of demographic events to
        simulate. Demographic events describe changes to the populations
        in the past. Events should be supplied in non-decreasing
        order of time. Events with the same time value will be applied
        sequentially in the order that they were supplied before the
        simulation algorithm continues with the next time step.
        If not specified or None, no demographic events are simulated.
    :param list samples: The list specifying the location and time of
        all samples. This parameter may be used to specify historical
        samples, and cannot be used in conjunction with the ``sample_size``
        parameter. Each sample is a (``population``, ``time``) pair
        such that the sample in position ``j`` in the list of samples
        is drawn in the specified population at the specified time. Time
        is measured in generations, as elsewhere.
    :param model: The simulation model; passed through unchanged to the
        simulator factory.
    :param bool record_migrations: Passed through unchanged to the
        simulator factory (presumably enables recording of migration
        events -- confirm against the factory). Defaults to False.
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param mutation_generator: A ready-made mutation generator to use
        instead of one built from ``mutation_rate``. Cannot be combined
        with ``mutation_rate``.
    :param int num_replicates: The number of replicates of the specified
        parameters to simulate. If this is not specified or None,
        no replication is performed and a :class:`.TreeSequence` object
        returned. If :obj:`num_replicates` is provided, the specified
        number of replicates is performed, and an iterator over the
        resulting :class:`.TreeSequence` objects returned.
    :return: The :class:`.TreeSequence` object representing the results
        of the simulation if no replication is performed, or an
        iterator over the independent replicates simulated if the
        :obj:`num_replicates` parameter has been used.
    :rtype: :class:`.TreeSequence` or an iterator over
        :class:`.TreeSequence` replicates.
    :raises ValueError: If both ``mutation_rate`` and
        ``mutation_generator`` are specified.
    :warning: If using replication, do not store the results of the
        iterator in a list! For performance reasons, the same
        underlying object may be used for every TreeSequence
        returned which will most likely lead to unexpected behaviour.
    """
    # Reject contradictory arguments before any simulator state is built.
    if mutation_generator is not None and mutation_rate is not None:
        raise ValueError(
            "Cannot specify both mutation_rate and mutation_generator")
    # A fresh list per call: a shared ``[]`` default would leak any mutation
    # made downstream into every later call.
    if demographic_events is None:
        demographic_events = []

    seed = random_seed
    if random_seed is None:
        seed = _get_random_seed()
    # To support numpy integer inputs here too we convert to integer.
    rng = RandomGenerator(int(seed))
    sim = simulator_factory(
        sample_size=sample_size,
        random_generator=rng,
        Ne=Ne,
        length=length,
        recombination_rate=recombination_rate,
        recombination_map=recombination_map,
        population_configurations=population_configurations,
        migration_matrix=migration_matrix,
        demographic_events=demographic_events,
        samples=samples,
        model=model,
        record_migrations=record_migrations)
    # The provenance API is very tentative, and only included now as a
    # pre-alpha feature.
    parameters = {"TODO": "encode simulation parameters"}
    provenance_dict = provenance.get_provenance_dict("simulate", parameters)
    if mutation_generator is None:
        # No mutation rate means a generator with rate 0.
        mu = 0 if mutation_rate is None else mutation_rate
        mutation_generator = MutationGenerator(rng, mu)
    if num_replicates is None:
        # Single simulation: take the only item of a one-replicate iterator.
        return next(
            _replicate_generator(sim, mutation_generator, 1, provenance_dict))
    return _replicate_generator(sim, mutation_generator, num_replicates,
                                provenance_dict)
Exemplo n.º 6
0
def mutate(
        tree_sequence, rate=None, random_seed=None, model=None, keep=False,
        start_time=None, end_time=None):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    per generation. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. Currently only the :class:`.InfiniteSites`
    mutation model is supported. This parameter is useful if you wish to obtain
    sequences with letters from the nucleotide alphabet rather than the default
    0/1 states. By default mutations from the infinite sites model with a binary
    alphabet are generated.

    By default, sites and mutations in the parameter tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.InfiniteSites`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time at which a mutation can occur
        (Default: no restriction).
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``model`` has no ``alphabet`` attribute.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError:
        raise ValueError("First argument must be a TreeSequence instance.")

    # Check and normalise the plain arguments up front: invalid input is then
    # rejected before a seed is drawn or the random generator is allocated.
    if model is None:
        model = InfiniteSites()
    try:
        alphabet = model.alphabet
    except AttributeError:
        raise TypeError("model must be an InfiniteSites instance")
    rate = 0.0 if rate is None else float(rate)
    keep = bool(keep)

    # Bounds left as None mean "no restriction" and are represented by the
    # +/- float max sentinels; only explicit bounds go into the provenance.
    explicit_bounds = {}
    if start_time is None:
        start_time = -sys.float_info.max
    else:
        start_time = float(start_time)
        explicit_bounds["start_time"] = start_time
    if end_time is None:
        end_time = sys.float_info.max
    else:
        end_time = float(end_time)
        explicit_bounds["end_time"] = end_time
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")

    if random_seed is None:
        random_seed = simulations._get_random_seed()
    random_seed = int(random_seed)
    rng = _msprime.RandomGenerator(random_seed)

    parameters = {
        "command": "mutate", "rate": rate, "random_seed": random_seed, "keep": keep}
    parameters.update(explicit_bounds)
    # TODO Add a JSON representation of the model to the provenance.
    provenance_dict = provenance.get_provenance_dict(parameters)

    mutation_generator = _msprime.MutationGenerator(
        rng, rate, alphabet=alphabet, start_time=start_time, end_time=end_time)
    # The generator works on the low-level table collection, so convert the
    # tables there and back again.
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    mutation_generator.generate(lwt, keep=keep)

    tables = tskit.TableCollection.fromdict(lwt.asdict())
    tables.provenances.add_row(json.dumps(provenance_dict))
    return tables.tree_sequence()