Exemplo n.º 1
0
def irradiate(contig, x=20):
    """
    Return a new contig whose mutation rate is ``x`` times that of ``contig``.

    The recombination map and the genetic map are taken from ``contig``
    unchanged; only these three fields are passed to the new
    ``stdpopsim.Contig``.
    """
    fields = {
        "recombination_map": contig.recombination_map,
        "mutation_rate": x * contig.mutation_rate,
        "genetic_map": contig.genetic_map,
    }
    return stdpopsim.Contig(**fields)
Exemplo n.º 2
0
 def test_simulation_runs(self):
     # A Contig built without arguments has a recombination_map of None, so
     # msprime simulates a coalescent without recombination and without
     # mutation.
     bare_contig = stdpopsim.Contig()
     model = self.model
     two_per_population = [2] * model.num_populations
     samples = model.get_samples(*two_per_population)
     ts = model.simulate(bare_contig, samples)
     # The output must report one population per population in the model.
     self.assertEqual(ts.num_populations, model.num_populations)
Exemplo n.º 3
0
 def test_model(self):
     contig = stdpopsim.Contig()
     model = stdpopsim.get_species("homsap").get_model("ooa_3")
     captured = capture_output(cli.write_citations, contig, model)
     stdout, stderr = captured
     # Nothing may be written to stderr.
     self.assertEqual(len(stderr), 0)
     # TODO Parse out the output for the model and check that the text is
     # in there.
     self.assertGreater(len(stdout), 0)
Exemplo n.º 4
0
 def test_model_citations(self):
     contig = stdpopsim.Contig()
     species = stdpopsim.get_species("HomSap")
     model = species.get_demographic_model("OutOfAfrica_3G09")
     engine = stdpopsim.get_default_engine()
     citation_args = (engine, model, contig, species)
     stdout, stderr = capture_output(cli.write_citations, *citation_args)
     # stdout must stay empty; stderr is what gets handed to check_citations.
     self.assertEqual(len(stdout), 0)
     # No genetic map is involved in this test, hence the explicit None.
     self.check_citations(engine, species, None, model, stderr)
Exemplo n.º 5
0
    def test_exclusion_of_drawn_mutation(self):
        # Draw a single mutation at the midpoint of the fixture contig and
        # condition on its allele frequency being > 0 over the window
        # start_time=0 .. end_time=0.
        midpoint = round(self.contig.recombination_map.get_length() / 2)
        draw_event = stdpopsim.ext.DrawMutation(
            time=self.T_mut,
            mutation_type_id=self.mut_id,
            population_id=0,
            coordinate=midpoint,
            save=True,
        )
        frequency_condition = stdpopsim.ext.ConditionOnAlleleFrequency(
            start_time=0,
            end_time=0,
            mutation_type_id=self.mut_id,
            population_id=0,
            op=">",
            allele_frequency=0,
        )
        # Same maps as the fixture contig, but with the mutation rate set
        # to zero.
        mutation_free_contig = stdpopsim.Contig(
            mutation_rate=0,
            recombination_map=self.contig.recombination_map,
            genetic_map=self.contig.genetic_map,
        )
        engine = stdpopsim.get_engine("slim")
        simulate_kwargs = dict(
            demographic_model=self.model,
            contig=mutation_free_contig,
            samples=self.samples,
            mutation_types=self.mutation_types,
            extended_events=[draw_event, frequency_condition],
            slim_scaling_factor=10,
            slim_burn_in=0.1,
            seed=1,
        )
        # warnings.warn is patched out for the duration of the simulation.
        with mock.patch("warnings.warn", autospec=True):
            ts = engine.simulate(**simulate_kwargs)
        self.assertEqual(ts.num_mutations, 1)
        drawn_af = self.allele_frequency(ts)
        self.assertGreaterEqual(drawn_af, 0)

        # Both conversions below draw from the same generator, in this order.
        rng = np.random.default_rng(seed=31415)

        # Without exclusion the drawn mutation contributes to the matrix.
        kept_matrix, kept_af = convert.ts2mat(
            ts, 32, 0, rng, exclude_mut_with_metadata=False)
        self.assertGreater(kept_matrix.sum(), 0)
        self.assertEqual(len(kept_af), 1)
        self.assertEqual(drawn_af, kept_af[0])

        # With exclusion the matrix is empty, but the allele frequency of
        # the drawn mutation is still reported.
        excluded_matrix, excluded_af = convert.ts2mat(
            ts, 32, 0, rng, exclude_mut_with_metadata=True)
        self.assertEqual(excluded_matrix.sum(), 0)
        self.assertEqual(len(excluded_af), 1)
        self.assertEqual(drawn_af, excluded_af[0])
Exemplo n.º 6
0
    def recap_and_rescale(self,
                          ts,
                          demographic_model,
                          contig,
                          samples,
                          mutation_types=None,
                          extended_events=None,
                          slim_scaling_factor=1.0,
                          seed=None,
                          **kwargs):
        """
        Post-process a tree sequence produced by SLiM: rescale node times,
        recapitate, simplify, and add neutral mutations.

        This is intended for the case where the SLiM engine was only used to
        emit a SLiM script and that script was then run outside of stdpopsim.
        Calling this function on the resulting tree sequence follows the same
        procedure stdpopsim would have applied had it run SLiM itself.
        All parameters after ``ts`` have the same meaning as for
        :func:`simulate`. The values of ``demographic_model``, ``contig``,
        ``samples``, and ``slim_scaling_factor`` should be the same ones that
        were passed to :func:`simulate` when the SLiM script was generated.

        :param ts: The tree sequence output by SLiM.
        :type ts: :class:`pyslim.SlimTreeSequence`

        .. warning::
            The :func:`recap_and_rescale` function is provided in the hope that
            it will be useful. But as we can't anticipate what changes you'll
            make to the SLiM code before using it, the stdpopsim source code
            should be consulted to determine if its behaviour is appropriate
            for your case.
        """
        # Only "weighted" mutations are introduced by SLiM, so the contig
        # handed to the script generator carries just that fraction of the
        # full mutation rate.
        full_mutation_rate = contig.mutation_rate
        slim_frac = stdpopsim.ext.slim_mutation_frac(mutation_types)
        slim_contig = stdpopsim.Contig(
            recombination_map=contig.recombination_map,
            mutation_rate=slim_frac * full_mutation_rate,
            genetic_map=contig.genetic_map,
        )

        # The script text is written to the null device and discarded; only
        # the value returned by slim_makescript is used afterwards.
        with open(os.devnull, "w") as discarded_script:
            recap_epoch = slim_makescript(
                discarded_script, "unused.trees",
                demographic_model, slim_contig, samples,
                mutation_types, extended_events,
                slim_scaling_factor, 1)

        return self._recap_and_rescale(
            ts, seed, recap_epoch, slim_contig,
            full_mutation_rate, slim_frac, slim_scaling_factor)
Exemplo n.º 7
0
 def test_simulation_runs(self):
     # With a recombination_map of None, we simulate a coalescent without
     # recombination in msprime, with no mutation.
     contig = stdpopsim.Contig()
     # Two samples for each population that allows sampling, none for the
     # others.
     sample_count = [
         2 if pop.allow_samples else 0 for pop in self.model.populations
     ]
     samples = self.model.get_samples(*sample_count)
     default_engine = stdpopsim.get_default_engine()
     ts = default_engine.simulate(self.model, contig, samples)
     self.assertEqual(ts.num_populations, self.model.num_populations)
Exemplo n.º 8
0
    def get_contig(self, chromosome, genetic_map=None, length_multiplier=1):
        """
        Build a :class:`.Contig` for the given chromosome of this species,
        using the empirical information recorded for that chromosome.

        :param str chromosome: The ID of the chromosome to simulate.
        :param str genetic_map: ID of the genetic map from which recombination
            rates are taken. If None, a uniform recombination rate is used on
            a region with the length of the specified chromosome. The default
            rates are species- and chromosome-specific, and can be found in
            the :ref:`sec_catalog`. (Default: None)
        :param float length_multiplier: Simulate a region whose length is
            `length_multiplier` times the length of the specified chromosome,
            with the same chromosome-specific mutation and recombination
            rates. This option cannot currently be used in conjunction with
            the ``genetic_map`` argument.
        :rtype: :class:`.Contig`
        :return: A :class:`.Contig` describing a simulation of the section of
            genome.
        :raises ValueError: If ``genetic_map`` is given together with a
            ``length_multiplier`` other than 1.
        """
        # TODO: add non-autosomal support
        non_autosomal_ids = ("x", "y", "m", "mt", "chrx", "chry", "chrm")
        if chromosome is not None and chromosome.lower() in non_autosomal_ids:
            message = (
                "Non-autosomal simulations are not yet supported. See "
                "https://github.com/popsim-consortium/stdpopsim/issues/383 and "
                "https://github.com/popsim-consortium/stdpopsim/issues/406"
            )
            warnings.warn(stdpopsim.NonAutosomalWarning(message))

        chrom = self.genome.get_chromosome(chromosome)
        if genetic_map is not None:
            # Empirical map: the chromosome length is fixed by the map.
            if length_multiplier != 1:
                raise ValueError(
                    "Cannot use length multiplier with empirical maps")
            logger.debug(f"Getting map for {chrom.id} from {genetic_map}")
            gm = self.get_genetic_map(genetic_map)
            recomb_map = gm.get_chromosome_map(chrom.id)
        else:
            # Flat map at the chromosome's own recombination rate.
            logger.debug(
                f"Making flat chromosome {length_multiplier} * {chrom.id}")
            gm = None
            recomb_map = msprime.RecombinationMap.uniform_map(
                chrom.length * length_multiplier, chrom.recombination_rate)

        return stdpopsim.Contig(
            recombination_map=recomb_map,
            mutation_rate=chrom.mutation_rate,
            genetic_map=gm,
        )
Exemplo n.º 9
0
    def simulate(self,
                 demographic_model=None,
                 contig=None,
                 samples=None,
                 seed=None,
                 slim_path=None,
                 slim_script=False,
                 slim_scaling_factor=1.0,
                 slim_burn_in=10.0,
                 dry_run=False,
                 **kwargs):
        """
        Simulate the demographic model using SLiM.
        See :meth:`.Engine.simulate()` for definitions of the
        ``demographic_model``, ``contig``, and ``samples`` parameters.

        :param seed: The seed for the random number generator.
        :type seed: int
        :param slim_path: The full path to the slim executable, or the name of
            a command in the current PATH.
        :type slim_path: str
        :param slim_script: If true, the simulation will not be executed.
            Instead the generated SLiM script will be printed to stdout.
        :type slim_script: bool
        :param slim_scaling_factor: Rescale model parameters by the given value,
            to speed up simulation. Population sizes and generation times are
            divided by this factor, whereas the mutation rate, recombination
            rate, and growth rates are multiplied by the factor.
            See SLiM manual: `5.5 Rescaling population sizes to improve
            simulation performance.`
        :type slim_scaling_factor: float
        :param slim_burn_in: Length of the burn-in phase, in units of N
            generations.
        :type slim_burn_in: float
        :param dry_run: If True, run the first generation setup and then end the
            simulation.
        :type dry_run: bool
        :return: ``None`` when ``slim_script`` or ``dry_run`` is true;
            otherwise the tree sequence returned by ``_recap_and_rescale``.
        :raises ValueError: If ``slim_scaling_factor`` is not positive or
            ``slim_burn_in`` is negative.
        """

        if slim_scaling_factor <= 0:
            raise ValueError("slim_scaling_factor must be positive")
        if slim_burn_in < 0:
            raise ValueError("slim_burn_in must be non-negative")

        run_slim = not slim_script

        # Remember the requested rate; it is passed to _recap_and_rescale
        # after SLiM has run.
        mutation_rate = contig.mutation_rate
        # Ensure no mutations are introduced by SLiM.
        contig = stdpopsim.Contig(recombination_map=contig.recombination_map,
                                  mutation_rate=0,
                                  genetic_map=contig.genetic_map)

        mktemp = functools.partial(tempfile.NamedTemporaryFile, mode="w")

        @contextlib.contextmanager
        def script_file_f():
            # The script goes to a temporary file when SLiM is to be run, or
            # to stdout when the caller only asked for the script text.
            f = mktemp(suffix=".slim") if not slim_script else sys.stdout
            try:
                yield f
            finally:
                # Close the temporary file even if the body raised.
                # Don't close sys.stdout.
                if not slim_script:
                    f.close()

        with script_file_f() as script_file, mktemp(suffix=".ts") as ts_file:

            recap_epoch = slim_makescript(script_file, ts_file.name,
                                          demographic_model, contig, samples,
                                          slim_scaling_factor, slim_burn_in)

            # Make sure the whole script is on disk before SLiM reads it.
            script_file.flush()

            if not run_slim:
                return None

            self._run_slim(script_file.name,
                           slim_path=slim_path,
                           seed=seed,
                           dry_run=dry_run)

            if dry_run:
                return None

            # Load the result while the temporary file still exists.
            ts = pyslim.load(ts_file.name)

        ts = self._recap_and_rescale(ts, seed, recap_epoch, contig,
                                     mutation_rate, slim_scaling_factor)
        return ts
Exemplo n.º 10
0
    def simulate(
            self, demographic_model=None, contig=None, samples=None, seed=None,
            verbosity=0, slim_path=None, slim_script=False, slim_scaling_factor=10,
            slim_no_recapitation=False, slim_no_burnin=False, **kwargs):
        """
        Simulate the demographic model using SLiM.
        See :meth:`.Engine.simulate()` for definitions of the
        ``demographic_model``, ``contig``, and ``samples`` parameters.

        :param seed: The seed for the random number generator.
        :type seed: int
        :param verbosity: Passed on to the script generator. When 0, the
            standard output of the SLiM process is discarded.
        :type verbosity: int
        :param slim_path: The full path to the slim executable, or the name of
            a command in the current PATH.
        :type slim_path: str
        :param slim_script: If true, the simulation will not be executed.
            Instead the generated SLiM script will be printed to stdout.
        :type slim_script: bool
        :param slim_scaling_factor: Rescale model parameters by the given value,
            to speed up simulation. Population sizes and generation times are
            divided by this factor, whereas the mutation rate, recombination
            rate, and growth rates are multiplied by the factor.
            See SLiM manual: `5.5 Rescaling population sizes to improve
            simulation performance.`
        :type slim_scaling_factor: float
        :param slim_no_recapitation: Do an explicit burn in, and add
            mutations, within the SLiM simulation. This may be much slower than
            the defaults (recapitation and neutral mutation overlay with
            msprime). The burn in behaviour is to wait until all individuals in
            the ancestral populations have a common ancestor within their
            respective population, and then wait another 10*N generations.
        :type slim_no_recapitation: bool
        :param slim_no_burnin: Do not perform a burn in at the start of the
            simulation.  This option is only relevant when
            ``slim_no_recapitation=True``.
        :type slim_no_burnin: bool
        :return: ``None`` when ``slim_script`` is true; otherwise the
            simulated tree sequence.
        """

        run_slim = not slim_script
        do_recap = not slim_no_recapitation
        check_coalescence = slim_no_recapitation and not slim_no_burnin

        if slim_path is None:
            slim_path = self.slim_path()

        if do_recap:
            # Remember the requested rate; neutral mutations are added with
            # msprime at the end.
            mutation_rate = contig.mutation_rate
            # Ensure no mutations are introduced by SLiM.
            contig = stdpopsim.Contig(
                    recombination_map=contig.recombination_map,
                    mutation_rate=0,
                    genetic_map=contig.genetic_map)

        slim_cmd = [slim_path]
        if seed is not None:
            slim_cmd.extend(["-s", f"{seed}"])

        mktemp = functools.partial(tempfile.NamedTemporaryFile, mode="w")

        @contextlib.contextmanager
        def script_file_f():
            # The script goes to a temporary file when SLiM is to be run, or
            # to stdout when the caller only asked for the script text.
            f = mktemp(suffix=".slim") if not slim_script else sys.stdout
            try:
                yield f
            finally:
                # Close the temporary file even if the body raised.
                # Don't close sys.stdout.
                if not slim_script:
                    f.close()

        with script_file_f() as script_file, mktemp(suffix=".ts") as ts_file:

            recap_epoch = slim_makescript(
                    script_file, ts_file.name,
                    demographic_model, contig, samples,
                    slim_scaling_factor, check_coalescence, verbosity)

            # Make sure the whole script is on disk before SLiM reads it.
            script_file.flush()

            if not run_slim:
                return None

            slim_cmd.append(script_file.name)
            stdout = subprocess.DEVNULL if verbosity == 0 else None
            subprocess.check_call(slim_cmd, stdout=stdout)

            # Load the result while the temporary file still exists.
            ts = pyslim.load(ts_file.name)

        # Node times come from SLiM generation numbers, which may have been
        # divided by a scaling factor for computational tractability.
        tables = ts.dump_tables()
        for table in (tables.nodes, tables.migrations):
            table.time *= slim_scaling_factor
        ts = pyslim.SlimTreeSequence.load_tables(tables)
        ts.slim_generation *= slim_scaling_factor

        if do_recap:
            # Derive two independent seeds from the user's seed: one for
            # recapitation and one for the mutation overlay.
            rng = random.Random(seed)
            s1, s2 = rng.randrange(1, 2**32), rng.randrange(1, 2**32)

            population_configurations = [
                    msprime.PopulationConfiguration(
                        initial_size=pop.start_size,
                        growth_rate=pop.growth_rate)
                    for pop in recap_epoch.populations]
            ts = ts.recapitate(
                    recombination_rate=contig.recombination_map.mean_recombination_rate,
                    population_configurations=population_configurations,
                    migration_matrix=recap_epoch.migration_matrix,
                    random_seed=s1)

        ts = simplify_remembered(ts)

        if do_recap:
            # Add neutral mutations.
            ts = pyslim.SlimTreeSequence(msprime.mutate(
                ts, rate=mutation_rate, keep=True, random_seed=s2))

        return ts
Exemplo n.º 11
0
    def get_contig(
        self,
        chromosome=None,
        genetic_map=None,
        length_multiplier=1,
        length=None,
        inclusion_mask=None,
        exclusion_mask=None,
    ):
        """
        Returns a :class:`.Contig` instance describing a section of genome that
        is to be simulated based on empirical information for a given species
        and chromosome.

        :param str chromosome: The ID of the chromosome to simulate.
            A complete list of chromosome IDs for each species can be found in the
            "Genome" subsection for the species in the :ref:`sec_catalog`.
            If the chromosome is not given, we specify a "generic" contig with given
            ``length``.
        :param str genetic_map: If specified, obtain recombination rate information
            from the genetic map with the specified ID. If None, simulate
            using a default uniform recombination rate on a region with the length of
            the specified chromosome. The default rates are species- and chromosome-
            specific, and can be found in the :ref:`sec_catalog`. (Default: None)
        :param float length_multiplier: If specified, simulate a region of length
            `length_multiplier` times the length of the specified chromosome with the
            same chromosome-specific mutation and recombination rates.
            This option cannot currently be used in conjunction with the
            ``genetic_map`` argument.
        :param inclusion_mask: If specified, simulated genomes are subset to only
            include regions given by the mask. The mask can be specified by the
            path and file name of a bed file or as a list or array of intervals
            given by the left and right end points of the intervals.
        :param exclusion_mask: If specified, simulated genomes are subset to exclude
            regions given by the mask. The mask can be specified by the
            path and file name of a bed file or as a list or array of intervals
            given by the left and right end points of the intervals.
        :param float length: Used with a "generic" contig, specifies the
            length of genome sequence for this contig. For a generic contig, mutation
            and recombination rates are equal to the genome-wide average across all
            autosomal chromosomes.
        :rtype: :class:`.Contig`
        :return: A :class:`.Contig` describing the section of the genome.
        :raises ValueError: For a generic contig, if ``genetic_map``, a
            ``length_multiplier`` other than 1, or a mask is given, or if
            ``length`` is missing. For a named contig, if ``length`` is given,
            if both masks are given, or if a ``length_multiplier`` other than
            1 is combined with a genetic map or a mask.
        """
        # TODO: add non-autosomal support
        non_autosomal_lower = ["x", "y", "m", "mt", "chrx", "chry", "chrm"]
        if chromosome is not None and chromosome.lower(
        ) in non_autosomal_lower:
            warnings.warn(
                stdpopsim.NonAutosomalWarning(
                    "Non-autosomal simulations are not yet supported. See "
                    "https://github.com/popsim-consortium/stdpopsim/issues/383 and "
                    "https://github.com/popsim-consortium/stdpopsim/issues/406"
                ))
        if chromosome is None:
            # Generic contig: only ``length`` may be customised.
            if genetic_map is not None:
                raise ValueError("Cannot use genetic map with generic contig")
            if length_multiplier != 1:
                raise ValueError(
                    "Cannot use length multiplier for generic contig")
            if inclusion_mask is not None or exclusion_mask is not None:
                raise ValueError("Cannot use mask with generic contig")
            if length is None:
                raise ValueError(
                    "Must specify sequence length of generic contig")
            # Length-weighted average of the per-chromosome rates, skipping
            # chromosomes whose ID is in the non-autosomal list.
            L_tot = 0
            r_tot = 0
            u_tot = 0
            for chrom_data in self.genome.chromosomes:
                if chrom_data.id.lower() not in non_autosomal_lower:
                    L_tot += chrom_data.length
                    r_tot += chrom_data.length * chrom_data.recombination_rate
                    u_tot += chrom_data.length * chrom_data.mutation_rate
            u = u_tot / L_tot
            r = r_tot / L_tot
            recomb_map = msprime.RateMap.uniform(length, r)
            ret = stdpopsim.Contig(recombination_map=recomb_map,
                                   mutation_rate=u)
        else:
            if length is not None:
                raise ValueError(
                    "Cannot specify sequence length for named contig")
            if inclusion_mask is not None and exclusion_mask is not None:
                raise ValueError(
                    "Cannot specify both inclusion and exclusion masks")
            chrom = self.genome.get_chromosome(chromosome)
            if genetic_map is None:
                logger.debug(
                    f"Making flat chromosome {length_multiplier} * {chrom.id}")
                gm = None
                # The scaled length is rounded before building the map.
                recomb_map = msprime.RateMap.uniform(
                    round(chrom.length * length_multiplier),
                    chrom.recombination_rate)
            else:
                if length_multiplier != 1:
                    raise ValueError(
                        "Cannot use length multiplier with empirical maps")
                logger.debug(f"Getting map for {chrom.id} from {genetic_map}")
                gm = self.get_genetic_map(genetic_map)
                recomb_map = gm.get_chromosome_map(chrom.id)

            # A mask given as a string is treated as the path of a bed file
            # and read for this chromosome; anything else is used as-is.
            inclusion_intervals = None
            exclusion_intervals = None
            if inclusion_mask is not None:
                if length_multiplier != 1:
                    raise ValueError("Cannot use length multiplier with mask")
                if isinstance(inclusion_mask, str):
                    inclusion_intervals = stdpopsim.utils.read_bed(
                        inclusion_mask, chromosome)
                else:
                    inclusion_intervals = inclusion_mask
            if exclusion_mask is not None:
                if length_multiplier != 1:
                    raise ValueError("Cannot use length multiplier with mask")
                if isinstance(exclusion_mask, str):
                    exclusion_intervals = stdpopsim.utils.read_bed(
                        exclusion_mask, chromosome)
                else:
                    exclusion_intervals = exclusion_mask

            ret = stdpopsim.Contig(
                recombination_map=recomb_map,
                mutation_rate=chrom.mutation_rate,
                genetic_map=gm,
                inclusion_mask=inclusion_intervals,
                exclusion_mask=exclusion_intervals,
            )

        return ret
Exemplo n.º 12
0
    def species_contig(
        *,
        species,
        chromosome=None,
        genetic_map=None,
        length_multiplier=1,
        length=None,
        mutation_rate=None,
        inclusion_mask=None,
        exclusion_mask=None,
    ):
        """
        Build a Contig for a species.

        With ``chromosome=None`` a "generic" contig of the given ``length`` is
        built, using length-weighted average rates over the chromosomes whose
        ID is not in the non-autosomal list. Otherwise the contig is built
        for the named chromosome, optionally with a genetic map and with one
        inclusion or exclusion mask. An explicit ``mutation_rate`` overrides
        the rate that would otherwise be used in either case.

        :raises ValueError: If the arguments are combined in a way that is
            rejected by the checks below (for example a genetic map, length
            multiplier, or mask with a generic contig).
        """
        # TODO: add non-autosomal support
        non_autosomal_lower = ["x", "y", "m", "mt", "chrx", "chry", "chrm"]
        is_generic = chromosome is None
        if not is_generic and chromosome.lower() in non_autosomal_lower:
            message = (
                "Non-autosomal simulations are not yet supported. See "
                "https://github.com/popsim-consortium/stdpopsim/issues/383 and "
                "https://github.com/popsim-consortium/stdpopsim/issues/406"
            )
            warnings.warn(stdpopsim.NonAutosomalWarning(message))

        if is_generic:
            if genetic_map is not None:
                raise ValueError("Cannot use genetic map with generic contig")
            if length_multiplier != 1:
                raise ValueError(
                    "Cannot use length multiplier for generic contig")
            if not (inclusion_mask is None and exclusion_mask is None):
                raise ValueError("Cannot use mask with generic contig")
            if length is None:
                raise ValueError(
                    "Must specify sequence length of generic contig")
            # Accumulate total length and length-weighted rates.
            total_length = 0
            weighted_recombination = 0
            weighted_mutation = 0
            for chrom_data in species.genome.chromosomes:
                if chrom_data.id.lower() in non_autosomal_lower:
                    continue
                total_length += chrom_data.length
                weighted_recombination += (
                    chrom_data.length * chrom_data.recombination_rate)
                weighted_mutation += (
                    chrom_data.length * chrom_data.mutation_rate)
            if mutation_rate is None:
                mutation_rate = weighted_mutation / total_length
            return Contig.basic_contig(
                length=length,
                mutation_rate=mutation_rate,
                recombination_rate=weighted_recombination / total_length,
            )

        # Named chromosome from here on.
        if length is not None:
            raise ValueError(
                "Cannot specify sequence length for named contig")
        if inclusion_mask is not None and exclusion_mask is not None:
            raise ValueError(
                "Cannot specify both inclusion and exclusion masks")
        chrom = species.genome.get_chromosome(chromosome)
        if genetic_map is not None:
            if length_multiplier != 1:
                raise ValueError(
                    "Cannot use length multiplier with empirical maps")
            logger.debug(f"Getting map for {chrom.id} from {genetic_map}")
            gm = species.get_genetic_map(genetic_map)
            recomb_map = gm.get_chromosome_map(chrom.id)
        else:
            logger.debug(
                f"Making flat chromosome {length_multiplier} * {chrom.id}")
            gm = None
            scaled_length = round(chrom.length * length_multiplier)
            recomb_map = msprime.RateMap.uniform(
                scaled_length, chrom.recombination_rate)

        def resolve_mask(mask):
            # None stays None; a string is read as a bed file for this
            # chromosome; anything else is used as the intervals directly.
            if mask is None:
                return None
            if length_multiplier != 1:
                raise ValueError("Cannot use length multiplier with mask")
            if isinstance(mask, str):
                return stdpopsim.utils.read_bed(mask, chromosome)
            return mask

        inclusion_intervals = resolve_mask(inclusion_mask)
        exclusion_intervals = resolve_mask(exclusion_mask)

        if mutation_rate is None:
            mutation_rate = chrom.mutation_rate

        return stdpopsim.Contig(
            recombination_map=recomb_map,
            mutation_rate=mutation_rate,
            genetic_map=gm,
            inclusion_mask=inclusion_intervals,
            exclusion_mask=exclusion_intervals,
        )
Exemplo n.º 13
0
 def test_simulation_runs(self):
     # Smoke test: simulate the LockeEtAlPongoIM model on a default Contig
     # and check how many populations the output reports.
     im_model = pongo.LockeEtAlPongoIM()
     default_contig = stdpopsim.Contig()
     tree_seq = im_model.simulate(default_contig, im_model.get_samples(2))
     expected_populations = 2
     self.assertEqual(tree_seq.num_populations, expected_populations)
Exemplo n.º 14
0
    def simulate(
        self,
        demographic_model=None,
        contig=None,
        samples=None,
        seed=None,
        mutation_types=None,
        extended_events=None,
        slim_path=None,
        slim_script=False,
        slim_scaling_factor=1.0,
        slim_burn_in=10.0,
        dry_run=False,
    ):
        """
        Simulate the demographic model using SLiM.
        See :meth:`.Engine.simulate()` for definitions of the
        ``demographic_model``, ``contig``, and ``samples`` parameters.

        :param seed: The seed for the random number generator.
        :type seed: int
        :param mutation_types: Passed to ``stdpopsim.ext.slim_mutation_frac``
            to determine the fraction of the mutation rate simulated by SLiM,
            and forwarded to the script generator.
        :param extended_events: Forwarded to the script generator.
        :param slim_path: The full path to the slim executable, or the name of
            a command in the current PATH.
        :type slim_path: str
        :param slim_script: If true, the simulation will not be executed.
            Instead the generated SLiM script will be printed to stdout.
        :type slim_script: bool
        :param slim_scaling_factor: Rescale model parameters by the given value,
            to speed up simulation. Population sizes and generation times are
            divided by this factor, whereas the mutation rate, recombination
            rate, and growth rates are multiplied by the factor.
            See SLiM manual: `5.5 Rescaling population sizes to improve
            simulation performance.`
        :type slim_scaling_factor: float
        :param slim_burn_in: Length of the burn-in phase, in units of N
            generations.
        :type slim_burn_in: float
        :param dry_run: If True, run the first generation setup and then end the
            simulation.
        :type dry_run: bool
        :return: ``None`` when ``slim_script`` or ``dry_run`` is true;
            otherwise the tree sequence returned by ``_recap_and_rescale``,
            with the contig's inclusion/exclusion mask applied if one is set.
        :raises ValueError: If ``slim_scaling_factor`` is not positive or
            ``slim_burn_in`` is negative.
        """

        if slim_scaling_factor <= 0:
            raise ValueError("slim_scaling_factor must be positive")
        if slim_burn_in < 0:
            raise ValueError("slim_burn_in must be non-negative")

        if slim_scaling_factor != 1:
            warnings.warn(
                stdpopsim.SLiMScalingFactorWarning(
                    f"You're using a scaling factor ({slim_scaling_factor}). "
                    "This should give similar results for many situations, "
                    "but is not equivalent, especially in the presence of selection. "
                    "When using rescaling, you should be careful---do checks and "
                    "compare results across different values of the scaling factor."
                ))

        run_slim = not slim_script

        # Ensure only "weighted" mutations are introduced by SLiM.
        mutation_rate = contig.mutation_rate
        slim_frac = stdpopsim.ext.slim_mutation_frac(mutation_types)
        contig = stdpopsim.Contig(
            recombination_map=contig.recombination_map,
            mutation_rate=slim_frac * mutation_rate,
            genetic_map=contig.genetic_map,
            inclusion_mask=contig.inclusion_mask,
            exclusion_mask=contig.exclusion_mask,
        )

        mktemp = functools.partial(tempfile.NamedTemporaryFile, mode="w")

        @contextlib.contextmanager
        def script_file_f():
            # The script goes to a temporary file when SLiM is to be run, or
            # to stdout when the caller only asked for the script text.
            f = mktemp(suffix=".slim") if not slim_script else sys.stdout
            try:
                yield f
            finally:
                # Close the temporary file even if the body raised.
                # Don't close sys.stdout.
                if not slim_script:
                    f.close()

        with script_file_f() as script_file, mktemp(suffix=".ts") as ts_file:

            recap_epoch = slim_makescript(
                script_file,
                ts_file.name,
                demographic_model,
                contig,
                samples,
                mutation_types,
                extended_events,
                slim_scaling_factor,
                slim_burn_in,
            )

            # Make sure the whole script is on disk before SLiM reads it.
            script_file.flush()

            if not run_slim:
                return None

            self._run_slim(script_file.name,
                           slim_path=slim_path,
                           seed=seed,
                           dry_run=dry_run)

            if dry_run:
                return None

            # Load the result while the temporary file still exists.
            ts = pyslim.load(ts_file.name)

        ts = self._recap_and_rescale(ts, seed, recap_epoch, contig,
                                     mutation_rate, slim_frac,
                                     slim_scaling_factor)

        # Apply whichever mask the contig carries to the final tree sequence.
        if contig.inclusion_mask is not None:
            ts = stdpopsim.utils.mask_tree_sequence(ts, contig.inclusion_mask,
                                                    False)
        if contig.exclusion_mask is not None:
            ts = stdpopsim.utils.mask_tree_sequence(ts, contig.exclusion_mask,
                                                    True)

        return ts
def simulate_stdpopsim(
    species,
    model,
    contig,
    num_samples,
    mutation_file=None,
    seed=123,
    skip_existing=False,
    num_procs=1,
):
    """
    Simulate a tree sequence with stdpopsim's msprime engine and save a
    trimmed 10Mb window of it to ``<tree_fn>.trees``.

    The saved file carries JSON top-level metadata with a ``user_data`` entry
    holding two reference KC distances for the trimmed tree sequence:
    ``kc_max`` (against a star tree) and ``kc_max_split`` (mean of the values
    returned by ``rnd_kc`` over up to 100 seeded replicates).

    :param species: Species identifier passed to ``stdpopsim.get_species``.
    :param model: Demographic model identifier passed to
        ``species.get_demographic_model``.
    :param contig: Contig identifier passed to ``species.get_contig``.
    :param num_samples: Total number of samples; must be a positive integer
        multiple of the model's number of sampling populations.
    :param mutation_file: Optional path loaded with ``tsinfer.load``. If
        given, the simulation is run with a mutation rate of 0 and one
        mutation is placed at each site position from this file, on a branch
        chosen proportionally to branch length.
    :param seed: Seed for the simulation, for the mutation placement and (as
        ``seed .. seed + 99``) for the ``rnd_kc`` replicates.
    :param skip_existing: If True and ``<tree_fn>.trees`` already exists,
        return immediately without simulating.
    :param num_procs: Number of worker processes used for the ``rnd_kc``
        replicates.
    :return: A tuple ``(base_fn, tree_fn)`` of file name prefixes, without
        and with the ``_seed<seed>`` suffix respectively.
    :raises ValueError: If ``num_samples`` is not a valid multiple of the
        number of sampling populations, or if the sequence length of
        ``mutation_file`` differs from that of the contig.
    :raises RuntimeError: If the tables already carry non-JSON metadata.
    """
    # The prefixes are built from the identifiers *before* the arguments are
    # rebound to stdpopsim objects below.
    base_fn = f"{model}_{contig}_n{num_samples}"
    tree_fn = f"{base_fn}_seed{seed}"
    logger.info(
        f"Using {species}:{contig} from stdpopsim using the {model} model")
    if skip_existing and os.path.exists(tree_fn + ".trees"):
        logger.info(
            f"Simulation file {tree_fn}.trees already exists, returning that.")
        return base_fn, tree_fn

    sample_data = None
    species = stdpopsim.get_species(species)
    model = species.get_demographic_model(model)
    num_pops = model.num_sampling_populations
    if num_samples < num_pops or num_samples % num_pops != 0:
        raise ValueError(
            f"num_samples must be an integer multiple of {num_pops} "
            f"(or 2 x {num_pops} if diploid sequencing error is injected)")
    pop_n = num_samples // num_pops
    logger.info(
        f"Simulating {num_pops}x{pop_n} samples, seed {seed}, file prefix '{tree_fn}'."
    )
    contig = species.get_contig(contig)
    seq_len = contig.recombination_map.get_sequence_length()
    if mutation_file is not None:
        logger.debug(f"Loading {mutation_file}")
        sample_data = tsinfer.load(mutation_file)
        if sample_data.sequence_length != seq_len:
            raise ValueError(
                f"Mismatching sequence_length between simulation and {mutation_file}"
            )
        # Reduce mutation rate to 0, as we will insert mutations later
        contig = stdpopsim.Contig(
            mutation_rate=0,
            recombination_map=contig.recombination_map,
            genetic_map=contig.genetic_map,
        )
    r_map = contig.recombination_map
    assert len(r_map.get_rates()) == 2  # Ensure a single rate over chr
    samples = model.get_samples(*([pop_n] * num_pops))
    engine = stdpopsim.get_engine('msprime')
    ts = engine.simulate(model, contig, samples, seed=seed)
    tables = ts.dump_tables()
    if sample_data is not None:
        pos = sample_data.sites_position[:]
        logger.info(
            f"Inserting {len(pos)} mutations at variable sites from {mutation_file}"
        )
        # Use a generator derived from `seed` so that the placement of
        # mutations is reproducible, like the rest of the simulation.
        rng = np.random.default_rng(seed)
        for tree in ts.trees():
            positions = pos[np.logical_and(pos >= tree.interval[0],
                                           pos < tree.interval[1])]
            if len(positions) == 0:
                continue
            # One uniform draw along the total branch length per site; sorted
            # so that a single pass over the nodes can place all of them.
            muts = sorted(
                zip(
                    rng.uniform(0,
                                tree.total_branch_length,
                                size=len(positions)), positions))
            num_muts = len(muts)
            placed = 0
            cumulative_length = 0
            # place a mutation on a random branch, proportional to branch length
            for node in tree.nodes():
                cumulative_length += tree.branch_length(node)
                while placed < num_muts and muts[placed][0] < cumulative_length:
                    position = muts[placed][1]
                    site = tables.sites.add_row(position=position,
                                                ancestral_state="0")
                    tables.mutations.add_row(node=node,
                                             site=site,
                                             derived_state="1")
                    placed += 1
                if placed == num_muts:
                    # No more mutations - go to next tree
                    break
        # Sites were added tree by tree in draw order, not position order.
        tables.sort()
        # Report from `tables`: `ts` was simulated with a mutation rate of 0
        # and so never contains the mutations inserted above.
        logger.debug(
            f"Inserted mutations at density {tables.mutations.num_rows/tables.sequence_length}"
        )
    window_start = int(seq_len * 2 / 20)
    interval = [window_start,
                window_start + 1e7]  # 10Mb near the start, not centromeric
    tables.keep_intervals([interval])
    tables.trim()
    logger.debug(
        f"Cut down tree seq to  {interval} ({tables.sites.num_rows} sites) for speed"
    )

    # Add info to the top-level metadata
    user_data = {}

    # Build the trimmed tree sequence once and reuse it for every KC
    # calculation below.
    trimmed_ts = tables.tree_sequence()

    logger.info(
        "Calculating the kc distance of the simulation against a flat tree")
    star_tree = tskit.Tree.generate_star(ts.num_samples,
                                         span=tables.sequence_length,
                                         record_provenance=False)
    user_data['kc_max'] = trimmed_ts.kc_distance(star_tree.tree_sequence)
    kc_array = []
    max_reps = 100
    logger.info(
        f"Calculating KC distance of the sim against at most {max_reps} * {trimmed_ts.num_trees}"
        f" random trees using {num_procs} parallel threads. This could take a while."
    )
    seeds = range(seed, seed + max_reps)
    with multiprocessing.Pool(num_procs) as pool:
        replicates = pool.imap_unordered(
            rnd_kc, zip(itertools.repeat(trimmed_ts), seeds))
        for num_reps, kc in enumerate(replicates, start=1):
            kc_array.append(kc)
            if num_reps > 11:
                # Standard error of the mean uses the number of replicates
                # collected so far, not the zero-based loop index.
                se_mean = np.std(kc_array, ddof=1) / np.sqrt(num_reps)
                # break if SEM < 1/100th of mean KC. This can take a long time
                if se_mean / np.average(kc_array) < 0.01:
                    logger.info(
                        f"Stopped after {num_reps} replicates as kc_max_split deemed accurate."
                    )
                    break
        user_data['kc_max_split'] = np.average(kc_array)

    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError("Metadata already exists, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    # Keys already present in the metadata take precedence over "user_data".
    tables.metadata = {"user_data": user_data, **tables.metadata}
    tables.tree_sequence().dump(tree_fn + ".trees")
    return base_fn, tree_fn
Exemplo n.º 16
0
    out = np.copy(pgen)
    for i in range(out.shape[0]):
        out[i, :] = np.random.binomial(2, out[i, :])
    return out


# Draw genotypes from `pgen` with the helper defined above
# (binomial(2, p) per row).
# NOTE(review): `pgen` is defined earlier in the script - confirm it holds
# per-site probabilities.
bingen = binomialBinGenotypes(pgen)

############################ run OOA simulation and prep allele counts #####################
# Simulate human chr22 with stdpopsim's msprime engine under an out-of-Africa
# demographic model, then keep only the simulated sites whose (integer)
# position appears in `keep` (defined elsewhere in this script).
import msprime as msp
import stdpopsim
print("simulating")
species = stdpopsim.get_species("HomSap")
contig = species.get_contig("chr22", genetic_map="HapMapII_GRCh37")
# Rebuild the contig with the same recombination/genetic map but an explicit
# mutation rate of 2.35e-8 in place of the one supplied by stdpopsim.
new_contig = stdpopsim.Contig(recombination_map=contig.recombination_map,
                              mutation_rate=2.35e-8,
                              genetic_map=contig.genetic_map)
model = species.get_demographic_model(
    'OutOfAfrica_3G09'
)  # Similar results with OutOfAfrica_3G09 and OutOfAfricaArchaicAdmixture_5R19.
# One sample count per population: 100 from each of three populations.
simsamples = model.get_samples(100, 100, 100)
engine = stdpopsim.get_engine('msprime')
# Fixed seed so the simulation is repeatable.
sim = engine.simulate(model, new_contig, simsamples, seed=12345)
# Pair up the simulated haplotypes into diploid genotypes (scikit-allel).
sim_gen = allel.HaplotypeArray(sim.genotype_matrix()).to_genotypes(ploidy=2)
# Site positions are cast to int32, which truncates any fractional part.
sim_pos = np.array([s.position for s in sim.sites()], dtype="int32")
# Boolean mask over simulated sites whose position is listed in `keep`.
m2 = np.isin(sim_pos, keep)
sim_gen = sim_gen[m2, :, :]
sim_pos = sim_pos[m2]
# Disabled: optional restriction to positions below 3.8e7.
# sim_gen=sim_gen[sim_pos<3.8e7,:,:]
# sim_pos=sim_pos[sim_pos<3.8e7]