def irradiate(contig, x=20):
    """
    Return a new contig whose mutation rate is `x` times that of ``contig``.

    The recombination map and genetic map of ``contig`` are passed through
    unchanged to the new :class:`stdpopsim.Contig`.
    """
    scaled_rate = x * contig.mutation_rate
    return stdpopsim.Contig(
        recombination_map=contig.recombination_map,
        mutation_rate=scaled_rate,
        genetic_map=contig.genetic_map,
    )
def test_simulation_runs(self):
    # With a recombination_map of None, we simulate a coalescent without
    # recombination in msprime, with no mutation.
    default_contig = stdpopsim.Contig()
    # Two samples from every population of the model.
    per_population = [2] * self.model.num_populations
    samples = self.model.get_samples(*per_population)
    tree_seq = self.model.simulate(default_contig, samples)
    self.assertEqual(tree_seq.num_populations, self.model.num_populations)
def test_model(self):
    default_contig = stdpopsim.Contig()
    homsap = stdpopsim.get_species("homsap")
    ooa_model = homsap.get_model("ooa_3")
    captured = capture_output(cli.write_citations, default_contig, ooa_model)
    out_text, err_text = captured
    # Nothing should be written to stderr.
    self.assertEqual(len(err_text), 0)
    # TODO Parse out the output for the model and check that the text is
    # in there.
    self.assertGreater(len(out_text), 0)
def test_model_citations(self):
    default_contig = stdpopsim.Contig()
    homsap = stdpopsim.get_species("HomSap")
    ooa_model = homsap.get_demographic_model("OutOfAfrica_3G09")
    default_engine = stdpopsim.get_default_engine()
    out_text, err_text = capture_output(
        cli.write_citations, default_engine, ooa_model, default_contig, homsap
    )
    # Nothing is expected on stdout; the stderr text is handed to
    # check_citations below.
    self.assertEqual(len(out_text), 0)
    # No genetic map is involved in this test.
    no_genetic_map = None
    self.check_citations(
        default_engine, homsap, no_genetic_map, ooa_model, err_text
    )
def test_exclusion_of_drawn_mutation(self):
    # Place the drawn mutation at the midpoint of the contig.
    midpoint = round(self.contig.recombination_map.get_length() / 2)
    draw_event = stdpopsim.ext.DrawMutation(
        time=self.T_mut,
        mutation_type_id=self.mut_id,
        population_id=0,
        coordinate=midpoint,
        save=True,
    )
    frequency_condition = stdpopsim.ext.ConditionOnAlleleFrequency(
        start_time=0,
        end_time=0,
        mutation_type_id=self.mut_id,
        population_id=0,
        op=">",
        allele_frequency=0,
    )
    # Same maps as the fixture contig, but with a mutation rate of zero.
    zero_rate_contig = stdpopsim.Contig(
        mutation_rate=0,
        recombination_map=self.contig.recombination_map,
        genetic_map=self.contig.genetic_map,
    )
    engine = stdpopsim.get_engine("slim")
    with mock.patch("warnings.warn", autospec=True):
        ts = engine.simulate(
            demographic_model=self.model,
            contig=zero_rate_contig,
            samples=self.samples,
            mutation_types=self.mutation_types,
            extended_events=[draw_event, frequency_condition],
            slim_scaling_factor=10,
            slim_burn_in=0.1,
            seed=1,
        )
    # Exactly one mutation is expected in the output.
    self.assertEqual(ts.num_mutations, 1)
    expected_af = self.allele_frequency(ts)
    self.assertGreaterEqual(expected_af, 0)

    rng = np.random.default_rng(seed=31415)

    # Without exclusion, the genotype matrix must have a non-zero sum.
    matrix, freqs = convert.ts2mat(
        ts, 32, 0, rng, exclude_mut_with_metadata=False
    )
    self.assertGreater(matrix.sum(), 0)
    self.assertEqual(len(freqs), 1)
    self.assertEqual(expected_af, freqs[0])

    # With exclusion, the matrix must sum to zero while the allele
    # frequency is still reported.
    matrix, freqs = convert.ts2mat(
        ts, 32, 0, rng, exclude_mut_with_metadata=True
    )
    self.assertEqual(matrix.sum(), 0)
    self.assertEqual(len(freqs), 1)
    self.assertEqual(expected_af, freqs[0])
def recap_and_rescale(self, ts, demographic_model, contig, samples,
                      mutation_types=None, extended_events=None,
                      slim_scaling_factor=1.0, seed=None, **kwargs):
    """
    Apply post-SLiM transformations to ``ts``. This rescales node times,
    does recapitation, simplification, and adds neutral mutations.

    This is intended for the case where the SLiM engine was only used to
    write out a SLiM script and that script was then run outside of
    stdpopsim: calling this function transforms the resulting tree sequence
    following the procedure that stdpopsim would have used had it run SLiM
    itself.

    The parameters after ``ts`` have the same meaning as for
    :func:`simulate`. The values given for ``demographic_model``,
    ``contig``, ``samples``, and ``slim_scaling_factor`` should match those
    used when the SLiM script was generated with :func:`simulate`.

    :param ts: The tree sequence output by SLiM.
    :type ts: :class:`pyslim.SlimTreeSequence`

    .. warning::
        The :func:`recap_and_rescale` function is provided in the hope that
        it will be useful. But as we can't anticipate what changes you'll
        make to the SLiM code before using it, the stdpopsim source code
        should be consulted to determine if its behaviour is appropriate
        for your case.
    """
    # Only "weighted" mutations are introduced by SLiM.
    full_mutation_rate = contig.mutation_rate
    slim_frac = stdpopsim.ext.slim_mutation_frac(mutation_types)
    slim_contig = stdpopsim.Contig(
        recombination_map=contig.recombination_map,
        mutation_rate=slim_frac * full_mutation_rate,
        genetic_map=contig.genetic_map,
    )

    # The generated script text goes to os.devnull; slim_makescript is
    # called here for its return value only.
    with open(os.devnull, "w") as discard:
        recap_epoch = slim_makescript(
            discard,
            "unused.trees",
            demographic_model,
            slim_contig,
            samples,
            mutation_types,
            extended_events,
            slim_scaling_factor,
            1,
        )

    return self._recap_and_rescale(
        ts,
        seed,
        recap_epoch,
        slim_contig,
        full_mutation_rate,
        slim_frac,
        slim_scaling_factor,
    )
def test_simulation_runs(self):
    # With a recombination_map of None, we simulate a coalescent without
    # recombination in msprime, with no mutation.
    default_contig = stdpopsim.Contig()
    # Generate vector with 2 samples for each pop with sampling enabled
    # and 0 for every other population.
    counts = [2 if pop.allow_samples else 0 for pop in self.model.populations]
    samples = self.model.get_samples(*counts)
    default_engine = stdpopsim.get_default_engine()
    tree_seq = default_engine.simulate(self.model, default_contig, samples)
    self.assertEqual(tree_seq.num_populations, self.model.num_populations)
def get_contig(self, chromosome, genetic_map=None, length_multiplier=1):
    """
    Build a :class:`.Contig` describing a section of genome to simulate,
    based on empirical information for a given species and chromosome.

    :param str chromosome: The ID of the chromosome to simulate.
    :param str genetic_map: If specified, obtain recombination rate
        information from the genetic map with the specified ID. If None,
        simulate using a default uniform recombination rate on a region with
        the length of the specified chromosome. The default rates are
        species- and chromosome- specific, and can be found in the
        :ref:`sec_catalog`. (Default: None)
    :param float length_multiplier: If specified, simulate a region of
        length `length_multiplier` times the length of the specified
        chromosome with the same chromosome-specific mutation and
        recombination rates. This option cannot currently be used in
        conjunction with the ``genetic_map`` argument.
    :rtype: :class:`.Contig`
    :return: A :class:`.Contig` describing a simulation of the section of
        genome.
    """
    # TODO: add non-autosomal support
    non_autosomal_ids = ("x", "y", "m", "mt", "chrx", "chry", "chrm")
    is_non_autosomal = (
        chromosome is not None and chromosome.lower() in non_autosomal_ids
    )
    if is_non_autosomal:
        warning = stdpopsim.NonAutosomalWarning(
            "Non-autosomal simulations are not yet supported. See "
            "https://github.com/popsim-consortium/stdpopsim/issues/383 and "
            "https://github.com/popsim-consortium/stdpopsim/issues/406"
        )
        warnings.warn(warning)

    chrom = self.genome.get_chromosome(chromosome)

    use_empirical_map = genetic_map is not None
    # An empirical map has a fixed length, so it cannot be rescaled.
    if use_empirical_map and length_multiplier != 1:
        raise ValueError("Cannot use length multiplier with empirical maps")

    if use_empirical_map:
        logger.debug(f"Getting map for {chrom.id} from {genetic_map}")
        gm = self.get_genetic_map(genetic_map)
        recomb_map = gm.get_chromosome_map(chrom.id)
    else:
        logger.debug(
            f"Making flat chromosome {length_multiplier} * {chrom.id}")
        gm = None
        scaled_length = chrom.length * length_multiplier
        recomb_map = msprime.RecombinationMap.uniform_map(
            scaled_length, chrom.recombination_rate)

    return stdpopsim.Contig(
        recombination_map=recomb_map,
        mutation_rate=chrom.mutation_rate,
        genetic_map=gm,
    )
def simulate(self, demographic_model=None, contig=None, samples=None,
             seed=None, slim_path=None, slim_script=False,
             slim_scaling_factor=1.0, slim_burn_in=10.0, dry_run=False,
             **kwargs):
    """
    Simulate the demographic model using SLiM.
    See :meth:`.Engine.simulate()` for definitions of the
    ``demographic_model``, ``contig``, and ``samples`` parameters.

    :param seed: The seed for the random number generator.
    :type seed: int
    :param slim_path: The full path to the slim executable, or the name of
        a command in the current PATH.
    :type slim_path: str
    :param slim_script: If true, the simulation will not be executed.
        Instead the generated SLiM script will be printed to stdout.
    :type slim_script: bool
    :param slim_scaling_factor: Rescale model parameters by the given value,
        to speed up simulation. Population sizes and generation times are
        divided by this factor, whereas the mutation rate, recombination
        rate, and growth rates are multiplied by the factor.
        See SLiM manual: `5.5 Rescaling population sizes to improve
        simulation performance.`
    :type slim_scaling_factor: float
    :param slim_burn_in: Length of the burn-in phase, in units of N
        generations.
    :type slim_burn_in: float
    :param dry_run: If True, run the first generation setup and then end the
        simulation.
    :type dry_run: bool
    :param kwargs: Accepted but not used by this method.
    :return: The resulting tree sequence, or None when ``slim_script`` or
        ``dry_run`` is set.
    :raises ValueError: If ``slim_scaling_factor`` is not positive or
        ``slim_burn_in`` is negative.
    """
    if slim_scaling_factor <= 0:
        raise ValueError("slim_scaling_factor must be positive")
    if slim_burn_in < 0:
        raise ValueError("slim_burn_in must be non-negative")

    run_slim = not slim_script

    # Remember the requested rate; it is handed to _recap_and_rescale below.
    mutation_rate = contig.mutation_rate
    # Ensure no mutations are introduced by SLiM.
    contig = stdpopsim.Contig(
        recombination_map=contig.recombination_map,
        mutation_rate=0,
        genetic_map=contig.genetic_map,
    )

    mktemp = functools.partial(tempfile.NamedTemporaryFile, mode="w")

    @contextlib.contextmanager
    def script_file_f():
        f = mktemp(suffix=".slim") if not slim_script else sys.stdout
        try:
            yield f
        finally:
            # Close the temporary script file even if the body of the
            # ``with`` block raised. Don't close sys.stdout.
            if not slim_script:
                f.close()

    with script_file_f() as script_file, mktemp(suffix=".ts") as ts_file:
        recap_epoch = slim_makescript(
            script_file, ts_file.name, demographic_model, contig, samples,
            slim_scaling_factor, slim_burn_in)

        # Make sure the whole script is written out before SLiM reads it.
        script_file.flush()

        if not run_slim:
            return None

        self._run_slim(
            script_file.name, slim_path=slim_path, seed=seed,
            dry_run=dry_run)

        if dry_run:
            return None

        # Load while the temporary tree sequence file still exists.
        ts = pyslim.load(ts_file.name)

    ts = self._recap_and_rescale(
        ts, seed, recap_epoch, contig, mutation_rate, slim_scaling_factor)
    return ts
def simulate(
        self, demographic_model=None, contig=None, samples=None, seed=None,
        verbosity=0, slim_path=None, slim_script=False,
        slim_scaling_factor=10, slim_no_recapitation=False,
        slim_no_burnin=False, **kwargs):
    """
    Simulate the demographic model using SLiM.
    See :meth:`.Engine.simulate()` for definitions of the
    ``demographic_model``, ``contig``, and ``samples`` parameters.

    :param seed: The seed for the random number generator.
    :type seed: int
    :param verbosity: Passed on to the script generator. When 0, the
        standard output of the SLiM process is discarded.
    :type verbosity: int
    :param slim_path: The full path to the slim executable, or the name of
        a command in the current PATH.
    :type slim_path: str
    :param slim_script: If true, the simulation will not be executed.
        Instead the generated SLiM script will be printed to stdout.
    :type slim_script: bool
    :param slim_scaling_factor: Rescale model parameters by the given value,
        to speed up simulation. Population sizes and generation times are
        divided by this factor, whereas the mutation rate, recombination
        rate, and growth rates are multiplied by the factor.
        See SLiM manual: `5.5 Rescaling population sizes to improve
        simulation performance.`
    :type slim_scaling_factor: float
    :param slim_no_recapitation: Do an explicit burn in, and add
        mutations, within the SLiM simulation. This may be much slower than
        the defaults (recapitation and neutral mutation overlay with
        msprime). The burn in behaviour is to wait until all individuals in
        the ancestral populations have a common ancestor within their
        respective population, and then wait another 10*N generations.
    :type slim_no_recapitation: bool
    :param slim_no_burnin: Do not perform a burn in at the start of the
        simulation. This option is only relevant when
        ``slim_no_recapitation=True``.
    :type slim_no_burnin: bool
    :param kwargs: Accepted but not used by this method.
    :return: The resulting tree sequence, or None when ``slim_script`` is
        set.
    """
    run_slim = not slim_script
    do_recap = not slim_no_recapitation
    check_coalescence = slim_no_recapitation and not slim_no_burnin

    if slim_path is None:
        slim_path = self.slim_path()

    if do_recap:
        # Keep the requested rate for the msprime mutation overlay below.
        mutation_rate = contig.mutation_rate
        # Ensure no mutations are introduced by SLiM.
        contig = stdpopsim.Contig(
            recombination_map=contig.recombination_map,
            mutation_rate=0,
            genetic_map=contig.genetic_map)

    slim_cmd = [slim_path]
    if seed is not None:
        slim_cmd.extend(["-s", f"{seed}"])

    mktemp = functools.partial(tempfile.NamedTemporaryFile, mode="w")

    @contextlib.contextmanager
    def script_file_f():
        f = mktemp(suffix=".slim") if not slim_script else sys.stdout
        try:
            yield f
        finally:
            # Close the temporary script file even if the body of the
            # ``with`` block raised. Don't close sys.stdout.
            if not slim_script:
                f.close()

    with script_file_f() as script_file, mktemp(suffix=".ts") as ts_file:
        recap_epoch = slim_makescript(
            script_file, ts_file.name, demographic_model, contig, samples,
            slim_scaling_factor, check_coalescence, verbosity)

        # Make sure the whole script is written out before SLiM reads it.
        script_file.flush()

        if not run_slim:
            return None

        slim_cmd.append(script_file.name)
        stdout = subprocess.DEVNULL if verbosity == 0 else None
        subprocess.check_call(slim_cmd, stdout=stdout)

        # Load while the temporary tree sequence file still exists.
        ts = pyslim.load(ts_file.name)

    # Node times come from SLiM generation numbers, which may have been
    # divided by a scaling factor for computational tractability.
    tables = ts.dump_tables()
    for table in (tables.nodes, tables.migrations):
        table.time *= slim_scaling_factor
    ts = pyslim.SlimTreeSequence.load_tables(tables)
    ts.slim_generation *= slim_scaling_factor

    if do_recap:
        # Derive two seeds from ``seed``: one for recapitation and one for
        # the mutation overlay.
        rng = random.Random(seed)
        s1, s2 = rng.randrange(1, 2**32), rng.randrange(1, 2**32)

        population_configurations = [
            msprime.PopulationConfiguration(
                initial_size=pop.start_size,
                growth_rate=pop.growth_rate)
            for pop in recap_epoch.populations]
        ts = ts.recapitate(
            recombination_rate=contig.recombination_map.mean_recombination_rate,
            population_configurations=population_configurations,
            migration_matrix=recap_epoch.migration_matrix,
            random_seed=s1)

    ts = simplify_remembered(ts)

    if do_recap:
        # Add neutral mutations.
        ts = pyslim.SlimTreeSequence(msprime.mutate(
            ts, rate=mutation_rate, keep=True, random_seed=s2))

    return ts
def get_contig(
    self,
    chromosome=None,
    genetic_map=None,
    length_multiplier=1,
    length=None,
    inclusion_mask=None,
    exclusion_mask=None,
):
    """
    Returns a :class:`.Contig` instance describing a section of genome that
    is to be simulated based on empirical information for a given species
    and chromosome.

    :param str chromosome: The ID of the chromosome to simulate.
        A complete list of chromosome IDs for each species can be found in
        the "Genome" subsection for the species in the :ref:`sec_catalog`.
        If the chromosome is not given, we specify a "generic" contig with
        given ``length``.
    :param str genetic_map: If specified, obtain recombination rate
        information from the genetic map with the specified ID. If None,
        simulate using a default uniform recombination rate on a region with
        the length of the specified chromosome. The default rates are
        species- and chromosome- specific, and can be found in the
        :ref:`sec_catalog`. (Default: None)
    :param float length_multiplier: If specified, simulate a region of
        length `length_multiplier` times the length of the specified
        chromosome with the same chromosome-specific mutation and
        recombination rates. This option cannot currently be used in
        conjunction with the ``genetic_map`` argument.
    :param inclusion_mask: If specified, simulated genomes are subset to
        only include regions given by the mask. The mask can be specified by
        the path and file name of a bed file or as a list or array of
        intervals given by the left and right end points of the intervals.
    :param exclusion_mask: If specified, simulated genomes are subset to
        exclude regions given by the mask. The mask can be specified by the
        path and file name of a bed file or as a list or array of intervals
        given by the left and right end points of the intervals.
    :param float length: Used with a "generic" contig, specifies the
        length of genome sequence for this contig. For a generic contig,
        mutation and recombination rates are equal to the genome-wide
        average across all autosomal chromosomes.
    :rtype: :class:`.Contig`
    :return: A :class:`.Contig` describing the section of the genome.
    :raises ValueError: If the combination of arguments is not supported:
        a genetic map, length multiplier or mask with a generic contig; a
        missing ``length`` for a generic contig; a ``length`` for a named
        contig; both masks at once; or a length multiplier together with an
        empirical map or a mask.
    """
    # TODO: add non-autosomal support
    non_autosomal_lower = ["x", "y", "m", "mt", "chrx", "chry", "chrm"]
    if chromosome is not None and chromosome.lower() in non_autosomal_lower:
        warnings.warn(
            stdpopsim.NonAutosomalWarning(
                "Non-autosomal simulations are not yet supported. See "
                "https://github.com/popsim-consortium/stdpopsim/issues/383 and "
                "https://github.com/popsim-consortium/stdpopsim/issues/406"
            ))

    if chromosome is None:
        # Generic contig: only ``length`` may be given.
        if genetic_map is not None:
            raise ValueError("Cannot use genetic map with generic contig")
        if length_multiplier != 1:
            raise ValueError(
                "Cannot use length multiplier for generic contig")
        if inclusion_mask is not None or exclusion_mask is not None:
            raise ValueError("Cannot use mask with generic contig")
        if length is None:
            raise ValueError(
                "Must specify sequence length of generic contig")
        # Length-weighted average rates over the autosomal chromosomes.
        L_tot = 0
        r_tot = 0
        u_tot = 0
        for chrom_data in self.genome.chromosomes:
            if chrom_data.id.lower() not in non_autosomal_lower:
                L_tot += chrom_data.length
                r_tot += chrom_data.length * chrom_data.recombination_rate
                u_tot += chrom_data.length * chrom_data.mutation_rate
        u = u_tot / L_tot
        r = r_tot / L_tot
        recomb_map = msprime.RateMap.uniform(length, r)
        return stdpopsim.Contig(recombination_map=recomb_map, mutation_rate=u)

    # Named chromosome.
    if length is not None:
        raise ValueError(
            "Cannot specify sequence length for named contig")
    if inclusion_mask is not None and exclusion_mask is not None:
        raise ValueError(
            "Cannot specify both inclusion and exclusion masks")
    chrom = self.genome.get_chromosome(chromosome)
    if genetic_map is None:
        logger.debug(
            f"Making flat chromosome {length_multiplier} * {chrom.id}")
        gm = None
        recomb_map = msprime.RateMap.uniform(
            round(chrom.length * length_multiplier),
            chrom.recombination_rate)
    else:
        if length_multiplier != 1:
            raise ValueError(
                "Cannot use length multiplier with empirical maps")
        logger.debug(f"Getting map for {chrom.id} from {genetic_map}")
        gm = self.get_genetic_map(genetic_map)
        recomb_map = gm.get_chromosome_map(chrom.id)

    def _mask_intervals(mask):
        # Turn a mask argument (None, bed file path, or intervals) into
        # the intervals handed to the Contig.
        if mask is None:
            return None
        if length_multiplier != 1:
            raise ValueError("Cannot use length multiplier with mask")
        if isinstance(mask, str):
            return stdpopsim.utils.read_bed(mask, chromosome)
        return mask

    inclusion_intervals = _mask_intervals(inclusion_mask)
    exclusion_intervals = _mask_intervals(exclusion_mask)

    return stdpopsim.Contig(
        recombination_map=recomb_map,
        mutation_rate=chrom.mutation_rate,
        genetic_map=gm,
        inclusion_mask=inclusion_intervals,
        exclusion_mask=exclusion_intervals,
    )
def species_contig(
    *,
    species,
    chromosome=None,
    genetic_map=None,
    length_multiplier=1,
    length=None,
    mutation_rate=None,
    inclusion_mask=None,
    exclusion_mask=None,
):
    """
    Build a Contig for a species.

    With ``chromosome=None`` a "generic" contig of the given ``length`` is
    built, using length-weighted average rates over the chromosomes whose
    IDs are not in the non-autosomal list. Otherwise the contig is built
    for the named chromosome, optionally with a genetic map or a mask.
    An explicit ``mutation_rate`` overrides the default rate in both cases.
    """
    # TODO: add non-autosomal support
    non_autosomal_lower = ["x", "y", "m", "mt", "chrx", "chry", "chrm"]
    generic = chromosome is None
    if not generic and chromosome.lower() in non_autosomal_lower:
        warnings.warn(
            stdpopsim.NonAutosomalWarning(
                "Non-autosomal simulations are not yet supported. See "
                "https://github.com/popsim-consortium/stdpopsim/issues/383 and "
                "https://github.com/popsim-consortium/stdpopsim/issues/406"
            )
        )

    if generic:
        if genetic_map is not None:
            raise ValueError("Cannot use genetic map with generic contig")
        if length_multiplier != 1:
            raise ValueError("Cannot use length multiplier for generic contig")
        if not (inclusion_mask is None and exclusion_mask is None):
            raise ValueError("Cannot use mask with generic contig")
        if length is None:
            raise ValueError("Must specify sequence length of generic contig")

        autosomes = [
            c for c in species.genome.chromosomes
            if c.id.lower() not in non_autosomal_lower
        ]
        total_length = sum(c.length for c in autosomes)
        weighted_recomb = sum(c.length * c.recombination_rate for c in autosomes)
        weighted_mut = sum(c.length * c.mutation_rate for c in autosomes)
        if mutation_rate is None:
            mutation_rate = weighted_mut / total_length
        mean_recomb = weighted_recomb / total_length
        return Contig.basic_contig(
            length=length,
            mutation_rate=mutation_rate,
            recombination_rate=mean_recomb,
        )

    # Named chromosome from here on.
    if length is not None:
        raise ValueError("Cannot specify sequence length for named contig")
    if inclusion_mask is not None and exclusion_mask is not None:
        raise ValueError("Cannot specify both inclusion and exclusion masks")

    chrom = species.genome.get_chromosome(chromosome)
    scaled = length_multiplier != 1
    if genetic_map is None:
        logger.debug(
            f"Making flat chromosome {length_multiplier} * {chrom.id}")
        gm = None
        flat_length = round(chrom.length * length_multiplier)
        recomb_map = msprime.RateMap.uniform(
            flat_length, chrom.recombination_rate)
    else:
        if scaled:
            raise ValueError("Cannot use length multiplier with empirical maps")
        logger.debug(f"Getting map for {chrom.id} from {genetic_map}")
        gm = species.get_genetic_map(genetic_map)
        recomb_map = gm.get_chromosome_map(chrom.id)

    # A mask given as a string is read as a bed file; anything else is
    # passed through as the intervals themselves.
    inclusion_intervals = None
    if inclusion_mask is not None:
        if scaled:
            raise ValueError("Cannot use length multiplier with mask")
        inclusion_intervals = (
            stdpopsim.utils.read_bed(inclusion_mask, chromosome)
            if isinstance(inclusion_mask, str)
            else inclusion_mask
        )

    exclusion_intervals = None
    if exclusion_mask is not None:
        if scaled:
            raise ValueError("Cannot use length multiplier with mask")
        exclusion_intervals = (
            stdpopsim.utils.read_bed(exclusion_mask, chromosome)
            if isinstance(exclusion_mask, str)
            else exclusion_mask
        )

    if mutation_rate is None:
        mutation_rate = chrom.mutation_rate

    return stdpopsim.Contig(
        recombination_map=recomb_map,
        mutation_rate=mutation_rate,
        genetic_map=gm,
        inclusion_mask=inclusion_intervals,
        exclusion_mask=exclusion_intervals,
    )
def test_simulation_runs(self):
    # Smoke test: the model simulates with a default Contig and the result
    # reports two populations.
    pongo_model = pongo.LockeEtAlPongoIM()
    default_contig = stdpopsim.Contig()
    tree_seq = pongo_model.simulate(
        default_contig, pongo_model.get_samples(2))
    self.assertEqual(tree_seq.num_populations, 2)
def simulate(
    self,
    demographic_model=None,
    contig=None,
    samples=None,
    seed=None,
    mutation_types=None,
    extended_events=None,
    slim_path=None,
    slim_script=False,
    slim_scaling_factor=1.0,
    slim_burn_in=10.0,
    dry_run=False,
):
    """
    Simulate the demographic model using SLiM.
    See :meth:`.Engine.simulate()` for definitions of the
    ``demographic_model``, ``contig``, and ``samples`` parameters.

    :param seed: The seed for the random number generator.
    :type seed: int
    :param mutation_types: Used to work out which fraction of the contig's
        mutation rate is simulated by SLiM, and passed on to the script
        generator.
    :param extended_events: Passed on to the script generator.
    :param slim_path: The full path to the slim executable, or the name of
        a command in the current PATH.
    :type slim_path: str
    :param slim_script: If true, the simulation will not be executed.
        Instead the generated SLiM script will be printed to stdout.
    :type slim_script: bool
    :param slim_scaling_factor: Rescale model parameters by the given value,
        to speed up simulation. Population sizes and generation times are
        divided by this factor, whereas the mutation rate, recombination
        rate, and growth rates are multiplied by the factor.
        See SLiM manual: `5.5 Rescaling population sizes to improve
        simulation performance.`
    :type slim_scaling_factor: float
    :param slim_burn_in: Length of the burn-in phase, in units of N
        generations.
    :type slim_burn_in: float
    :param dry_run: If True, run the first generation setup and then end the
        simulation.
    :type dry_run: bool
    :return: The resulting tree sequence, or None when ``slim_script`` or
        ``dry_run`` is set.
    :raises ValueError: If ``slim_scaling_factor`` is not positive or
        ``slim_burn_in`` is negative.
    """
    if slim_scaling_factor <= 0:
        raise ValueError("slim_scaling_factor must be positive")
    if slim_burn_in < 0:
        raise ValueError("slim_burn_in must be non-negative")
    if slim_scaling_factor != 1:
        warnings.warn(
            stdpopsim.SLiMScalingFactorWarning(
                f"You're using a scaling factor ({slim_scaling_factor}). "
                "This should give similar results for many situations, "
                "but is not equivalent, especially in the presence of selection. "
                "When using rescaling, you should be careful---do checks and "
                "compare results across different values of the scaling factor."
            ))

    run_slim = not slim_script

    # Ensure only "weighted" mutations are introduced by SLiM.
    mutation_rate = contig.mutation_rate
    slim_frac = stdpopsim.ext.slim_mutation_frac(mutation_types)
    contig = stdpopsim.Contig(
        recombination_map=contig.recombination_map,
        mutation_rate=slim_frac * mutation_rate,
        genetic_map=contig.genetic_map,
        inclusion_mask=contig.inclusion_mask,
        exclusion_mask=contig.exclusion_mask,
    )

    mktemp = functools.partial(tempfile.NamedTemporaryFile, mode="w")

    @contextlib.contextmanager
    def script_file_f():
        f = mktemp(suffix=".slim") if not slim_script else sys.stdout
        try:
            yield f
        finally:
            # Close the temporary script file even if the body of the
            # ``with`` block raised. Don't close sys.stdout.
            if not slim_script:
                f.close()

    with script_file_f() as script_file, mktemp(suffix=".ts") as ts_file:
        recap_epoch = slim_makescript(
            script_file,
            ts_file.name,
            demographic_model,
            contig,
            samples,
            mutation_types,
            extended_events,
            slim_scaling_factor,
            slim_burn_in,
        )

        # Make sure the whole script is written out before SLiM reads it.
        script_file.flush()

        if not run_slim:
            return None

        self._run_slim(
            script_file.name, slim_path=slim_path, seed=seed,
            dry_run=dry_run)

        if dry_run:
            return None

        # Load while the temporary tree sequence file still exists.
        ts = pyslim.load(ts_file.name)

    ts = self._recap_and_rescale(
        ts, seed, recap_epoch, contig, mutation_rate, slim_frac,
        slim_scaling_factor)

    # Apply the contig's masks, if any, to the finished tree sequence.
    if contig.inclusion_mask is not None:
        ts = stdpopsim.utils.mask_tree_sequence(
            ts, contig.inclusion_mask, False)
    if contig.exclusion_mask is not None:
        ts = stdpopsim.utils.mask_tree_sequence(
            ts, contig.exclusion_mask, True)

    return ts
def simulate_stdpopsim(
    species,
    model,
    contig,
    num_samples,
    mutation_file=None,
    seed=123,
    skip_existing=False,
    num_procs=1,
):
    """
    Simulate a stdpopsim model with msprime, cut the result down to a single
    interval, attach KC-distance summaries as top-level metadata and write
    the tree sequence to ``<tree_fn>.trees``.

    :param species: Species ID passed to ``stdpopsim.get_species``.
    :param model: Demographic model ID passed to
        ``species.get_demographic_model``.
    :param contig: Contig/chromosome ID passed to ``species.get_contig``.
    :param num_samples: Total number of samples; must be a multiple of the
        model's ``num_sampling_populations`` and is split evenly across them.
    :param mutation_file: Optional file loaded with ``tsinfer.load``. When
        given, the contig's mutation rate is set to 0 and mutations are
        inserted at the file's site positions instead.
    :param seed: Seed for the msprime simulation and start of the seed range
        handed to the KC replicates. NOTE(review): the branch placement of
        inserted mutations uses ``np.random.uniform`` and is not seeded in
        this function.
    :param skip_existing: If true and ``<tree_fn>.trees`` already exists,
        return immediately without simulating.
    :param num_procs: Size of the ``multiprocessing.Pool`` used for the KC
        replicates.
    :return: The tuple ``(base_fn, tree_fn)`` of file name prefixes.
    :raises ValueError: If ``num_samples`` is not a suitable multiple of the
        number of sampling populations, or the sequence length of
        ``mutation_file`` differs from that of the contig.
    :raises RuntimeError: If the tables already carry metadata under a
        schema that is not the JSON schema.
    """
    base_fn = f"{model}_{contig}_n{num_samples}"
    tree_fn = f"{base_fn}_seed{seed}"
    logger.info(
        f"Using {species}:{contig} from stdpopsim using the {model} model")
    if skip_existing and os.path.exists(tree_fn + ".trees"):
        logger.info(
            f"Simulation file {tree_fn}.trees already exists, returning that.")
        return base_fn, tree_fn

    sample_data = None
    # From here on, species/model/contig are rebound from IDs to objects.
    species = stdpopsim.get_species(species)
    model = species.get_demographic_model(model)
    num_pops = model.num_sampling_populations
    if num_samples < num_pops or num_samples % num_pops != 0:
        raise ValueError(
            f"num_samples must be an integer multiple of {num_pops} "
            f"(or 2 x {num_pops} if diploid sequencing error is injected)")
    pop_n = num_samples // num_pops
    logger.info(
        f"Simulating {num_pops}x{pop_n} samples, seed {seed}, file prefix '{tree_fn}'."
    )
    contig = species.get_contig(contig)
    l = contig.recombination_map.get_sequence_length()
    if mutation_file is not None:
        logger.debug(f"Loading {mutation_file}")
        sample_data = tsinfer.load(mutation_file)
        if sample_data.sequence_length != l:
            raise ValueError(
                f"Mismatching sequence_length between simulation and {mutation_file}"
            )
        # Reduce mutation rate to 0, as we will insert mutations later
        contig = stdpopsim.Contig(
            mutation_rate=0,
            recombination_map=contig.recombination_map,
            genetic_map=contig.genetic_map,
        )
    r_map = contig.recombination_map
    assert len(r_map.get_rates()) == 2  # Ensure a single rate over chr
    # The same number of samples from each sampling population.
    samples = model.get_samples(*([pop_n] * num_pops))
    engine = stdpopsim.get_engine('msprime')
    ts = engine.simulate(model, contig, samples, seed=seed)
    tables = ts.dump_tables()
    if sample_data is not None:
        pos = sample_data.sites_position[:]
        logger.info(
            f"Inserting {len(pos)} mutations at variable sites from {mutation_file}"
        )
        for tree in ts.trees():
            # Site positions that fall inside this tree's interval.
            positions = pos[np.logical_and(pos >= tree.interval[0],
                                           pos < tree.interval[1])]
            if len(positions) == 0:
                continue
            # Pair each position with a uniform draw along the total branch
            # length, sorted so they can be consumed in increasing order.
            muts = list(
                zip(
                    np.random.uniform(0,
                                      tree.total_branch_length,
                                      size=len(positions)), positions))
            muts.sort()
            tot = 0
            # place a mutation on a random branch, proportional to branch length
            try:
                for n in tree.nodes():
                    tot += tree.branch_length(n)
                    # muts[0] raises IndexError once the list is exhausted;
                    # that is what ends the loop for this tree.
                    while muts[0][0] < tot:
                        _, position = muts.pop(0)
                        s = tables.sites.add_row(position=position,
                                                 ancestral_state="0")
                        tables.mutations.add_row(node=n,
                                                 site=s,
                                                 derived_state="1")
            except IndexError:
                # No more mutations - go to next tree
                continue
        tables.sort()
        logger.debug(
            f"Inserted mutations at density {ts.num_mutations/ts.sequence_length}"
        )
    interval = [int(l * 2 / 20),
                int(l * 2 / 20) + 1e7]  # 10Mb near the start, not centromeric
    tables.keep_intervals([interval])
    tables.trim()
    logger.debug(
        f"Cut down tree seq to {interval} ({tables.sites.num_rows} sites) for speed"
    )

    # Add info to the top-level metadata
    user_data = {}

    logger.info(
        "Calculating the kc distance of the simulation against a flat tree")
    star_tree = tskit.Tree.generate_star(ts.num_samples,
                                         span=tables.sequence_length,
                                         record_provenance=False)
    user_data['kc_max'] = tables.tree_sequence().kc_distance(
        star_tree.tree_sequence)
    kc_array = []
    max_reps = 100
    # Rebind ts to the trimmed tree sequence for the replicate comparisons.
    ts = tables.tree_sequence()
    logger.info(
        f"Calculating KC distance of the sim against at most {max_reps} * {ts.num_trees}"
        f" random trees using {num_procs} parallel threads. This could take a while."
    )
    seeds = range(seed, seed + max_reps)
    with multiprocessing.Pool(num_procs) as pool:
        # NOTE(review): rnd_kc is defined elsewhere; presumably it returns
        # one KC distance per (ts, seed) pair - confirm against its source.
        for i, kc in enumerate(
                pool.imap_unordered(rnd_kc, zip(itertools.repeat(ts), seeds))):
            kc_array.append(kc)
            if i > 10:
                se_mean = np.std(kc_array, ddof=1) / np.sqrt(i)
                # break if SEM < 1/100th of mean KC. This can take a long time
                if se_mean / np.average(kc_array) < 0.01:
                    logger.info(
                        f"Stopped after {i} replicates as kc_max_split deemed accurate."
                    )
                    break
        user_data['kc_max_split'] = np.average(kc_array)

    # Switch the top-level metadata to JSON, refusing to overwrite existing
    # metadata stored under a different schema.
    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError("Metadata already exists, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": user_data, **tables.metadata}
    tables.tree_sequence().dump(tree_fn + ".trees")
    return base_fn, tree_fn
out = np.copy(pgen) for i in range(out.shape[0]): out[i, :] = np.random.binomial(2, out[i, :]) return out bingen = binomialBinGenotypes(pgen) ############################ run OOA simulation and prep allele counts ##################### import msprime as msp import stdpopsim print("simulating") species = stdpopsim.get_species("HomSap") contig = species.get_contig("chr22", genetic_map="HapMapII_GRCh37") new_contig = stdpopsim.Contig(recombination_map=contig.recombination_map, mutation_rate=2.35e-8, genetic_map=contig.genetic_map) model = species.get_demographic_model( 'OutOfAfrica_3G09' ) #similar results with OutOfAfrica_3G09 and OutOfAfricaArchaicAdmixture_5R19 simsamples = model.get_samples(100, 100, 100) engine = stdpopsim.get_engine('msprime') sim = engine.simulate(model, new_contig, simsamples, seed=12345) sim_gen = allel.HaplotypeArray(sim.genotype_matrix()).to_genotypes(ploidy=2) sim_pos = np.array([s.position for s in sim.sites()], dtype="int32") m2 = np.isin(sim_pos, keep) sim_gen = sim_gen[m2, :, :] sim_pos = sim_pos[m2] # sim_gen=sim_gen[sim_pos<3.8e7,:,:] # sim_pos=sim_pos[sim_pos<3.8e7]