def _execute_model_iim(optimiser): """ Execute an experiment for the isolation-with-migration model using the specified optimiser. :param optimiser: The optimiser to use while executing the model. :return: None """ from IMCoalHMM.hmm import Forwarder from IMCoalHMM.isolation_with_migration_model import IsolationMigrationModel from IMCoalHMM.likelihood import Likelihood no_ancestral_states = _config.try_int('model.ancestral_states', 10) no_migration_states = _config.try_int('model.migration_states', 10) transformer = _Transformer() transformer.add('isolation_time', _config.try_float('model.isolation_time.min', 0.0), _config.try_float('model.isolation_time.max', 0.002)) transformer.add('mig_time', _config.try_float('model.mig_time.min', 0.0), _config.try_float('model.mig_time.max', 0.016)) transformer.add('coal_rate', _config.try_float('model.coal_rate.min', 0.0), _config.try_float('model.coal_rate.max', 2000.0)) transformer.add('recomb_rate', _config.try_float('model.recomb_rate.min', 0.0), _config.try_float('model.recomb_rate.max', 0.8)) transformer.add('mig_rate', _config.try_float('model.mig_rate.min', 0.0), _config.try_float('model.mig_rate.max', 500.0)) _log_header() forwarders = [Forwarder(arg, NSYM=3) for arg in _alignments] model = IsolationMigrationModel(no_migration_states, no_ancestral_states) log_likelihood = Likelihood(model, forwarders) def fitness_function(parameters): transformed_parameters = transformer.transform(parameters) return log_likelihood(numpy.array(transformed_parameters)) def log_function(context): _process_logged_context(context, transformer) optimiser.log = log_function mle_context = optimiser.maximise(fitness_function, len(transformer.names)) mle_parameters, mle_log_likelihood = _process_final_context( mle_context, transformer) mle_isolation_time, mle_mig_time, mle_coal_rate, mle_recomb_rate, mle_mig_rate = mle_parameters mle_theta = 2.0 / mle_coal_rate _log_comment('') _log_comment('mle_isolation_time = {0}'.format(mle_isolation_time)) _log_comment('mle_mig_time = {0}'.format(mle_mig_time)) _log_comment('mle_coal_rate = {0}'.format(mle_coal_rate)) _log_comment('mle_recomb_rate = {0}'.format(mle_recomb_rate)) _log_comment('mle_mig_rate = {0}'.format(mle_mig_rate)) _log_comment('mle_theta = {0}'.format(mle_theta)) _log_comment('mle_log_likelihood = {0}'.format(mle_log_likelihood))
def _execute_model_iim_epochs(optimiser): """ Execute an experiment for the isolation-with-migration-with-epochs model using the specified optimiser. :param optimiser: The optimiser to use while executing the model. :return: None """ from IMCoalHMM.hmm import Forwarder from IMCoalHMM.isolation_with_migration_model_epochs import IsolationMigrationEpochsModel from IMCoalHMM.likelihood import Likelihood epoch_factor = _config.try_int('model.epoch_factor', 1) coal_count = 1 + (2 * epoch_factor) no_ancestral_states = _config.try_int('model.ancestral_states', 10) no_migration_states = _config.try_int('model.migration_states', 10) transformer = _Transformer() transformer.add('isolation_time', _config.try_float('model.isolation_time.min', 0.0), _config.try_float('model.isolation_time.max', 0.002)) transformer.add('mig_time', _config.try_float('model.mig_time.min', 0.0), _config.try_float('model.mig_time.max', 0.016)) transformer.add('recomb_rate', _config.try_float('model.recomb_rate.min', 0.0), _config.try_float('model.recomb_rate.max', 0.8)) for i in xrange(coal_count): key = 'coal_rate_{0}'.format(i + 1) transformer.add(key, _config.try_float('model.{0}.min'.format(key), 0.0), _config.try_float('model.{0}.max'.format(key), 2000.0)) for i in xrange(epoch_factor): key = 'mig_rate_{0}'.format(i + 1) transformer.add(key, _config.try_float('model.{0}.min'.format(key), 0.0), _config.try_float('model.{0}.max'.format(key), 500.0)) _log_header() forwarders = [Forwarder(arg, NSYM=3) for arg in _alignments] model = IsolationMigrationEpochsModel(epoch_factor, no_migration_states, no_ancestral_states) log_likelihood = Likelihood(model, forwarders) def fitness_function(parameters): transformed_parameters = transformer.transform(parameters) return log_likelihood(numpy.array(transformed_parameters)) def log_function(context): _process_logged_context(context, transformer) optimiser.log = log_function mle_context = optimiser.maximise(fitness_function, len(transformer.names)) mle_parameters, mle_log_likelihood = _process_final_context( mle_context, transformer) mle_isolation_time = mle_parameters[0] mle_mig_time = mle_parameters[1] mle_recomb_rate = mle_parameters[2] mle_coal_rates = mle_parameters[3:coal_count + 3] mle_mig_rates = mle_parameters[coal_count + 3:] _log_comment('') _log_comment('mle_isolation_time = {0}'.format(mle_isolation_time)) _log_comment('mle_mig_time = {0}'.format(mle_mig_time)) _log_comment('mle_recomb_rate = {0}'.format(mle_recomb_rate)) for i, coal_rate in enumerate(mle_coal_rates): _log_comment('mle_coal_rate_{0} = {1}'.format(i + 1, coal_rate)) for i, mig_rate in enumerate(mle_mig_rates): _log_comment('mle_mig_rate_{0} = {1}'.format(i + 1, mig_rate)) _log_comment('mle_log_likelihood = {0}'.format(mle_log_likelihood))
def main(): """ Run the main script. """ usage = """%(prog)s [options] <forwarder dirs> This program estimates the parameters of an isolation model with an initial migration period with two species and uniform coalescence and recombination rates.""" parser = ArgumentParser(usage=usage, version="%(prog)s 1.2") parser.add_argument("--header", action="store_true", default=False, help="Include a header on the output") parser.add_argument("-o", "--outfile", type=str, default="/dev/stdout", help="Output file for the estimate (/dev/stdout)") parser.add_argument( "--logfile", type=str, default=None, help="Log for all points estimated in the optimization") parser.add_argument( "--ancestral-states", type=int, default=10, help= "Number of intervals used to discretize the time in the ancestral population (10)" ) parser.add_argument( "--migration-states", type=int, default=10, help= "Number of intervals used to discretize the time in the migration period (10)" ) parser.add_argument( "--optimizer", type=str, default="Nelder-Mead", help= "Optimization algorithm to use for maximizing the likelihood (Nealder-Mead)", choices=['Nelder-Mead', 'Powell', 'L-BFGS-B', 'TNC']) optimized_params = [ ('isolation-period', 'time where the populations have been isolated', 1e6 / 1e9), ('migration-period', 'time period where the populations exchanged genes', 1e6 / 1e9), ('theta', 'effective population size in 4Ne substitutions', 1e6 / 1e9), ('rho', 'recombination rate in substitutions', 0.4), ('migration-rate', 'migration rate in number of migrations per substitution', 200.0) ] for parameter_name, description, default in optimized_params: parser.add_argument("--%s" % parameter_name, type=float, default=default, help="Initial guess at the %s (%g)" % (description, default)) parser.add_argument('alignments', nargs='+', help='Alignments in ZipHMM format') options = parser.parse_args() if len(options.alignments) < 1: parser.error("Input alignment not provided!") # get options no_migration_states = options.migration_states no_ancestral_states = options.ancestral_states theta = options.theta rho = options.rho forwarders = [Forwarder(arg, NSYM=3) for arg in options.alignments] init_isolation_time = options.isolation_period init_migration_time = options.migration_period init_coal = 1 / (theta / 2) init_recomb = rho init_migration = options.migration_rate log_likelihood = Likelihood( IsolationMigrationModel(no_migration_states, no_ancestral_states), forwarders) initial_parameters = (init_isolation_time, init_migration_time, init_coal, init_recomb, init_migration) if options.logfile: with open(options.logfile, 'w') as logfile: if options.header: print >> logfile, '\t'.join([ 'isolation.period', 'migration.period', 'theta', 'rho', 'migration' ]) mle_parameters = \ maximum_likelihood_estimate(log_likelihood, initial_parameters, log_file=logfile, optimizer_method=options.optimizer, log_param_transform=transform) else: mle_parameters = \ maximum_likelihood_estimate(log_likelihood, initial_parameters, optimizer_method=options.optimizer) max_log_likelihood = log_likelihood(mle_parameters) with open(options.outfile, 'w') as outfile: if options.header: print >> outfile, '\t'.join([ 'isolation.period', 'migration.period', 'theta', 'rho', 'migration', 'log.likelihood' ]) print >> outfile, '\t'.join( map(str, transform(mle_parameters) + (max_log_likelihood, )))
def _set_chain(self): forwarders = [Forwarder(arg, NSYM=3) for arg in self.input_files] log_likelihood = Likelihood(self.model, forwarders) self.chain = MCMC(priors=self.priors, log_likelihood=log_likelihood, thinning=self.thinning)
def main(): """ Run the main script. """ usage = """%(prog)s [options] alignments... This program estimates the parameters of an isolation model with an initial migration period with two species and uniform coalescence and recombination rates.""" parser = ArgumentParser(usage=usage, version="%(prog)s 1.7") parser.add_argument("-o", "--outfile", type=str, default="/dev/stdout", help="Output file for the estimate (/dev/stdout)") parser.add_argument("--logfile", type=str, default=None, help="Log for sampled points in all chains for the MCMCMC during the run." \ "This parameter is only valid when running --mc3.") parser.add_argument( "--ancestral-states", type=int, default=10, help= "Number of intervals used to discretize the time in the ancestral population (10)" ) parser.add_argument( "--migration-states", type=int, default=10, help= "Number of intervals used to discretize the time in the migration period (10)" ) parser.add_argument("-n", "--samples", type=int, default=500, help="Number of samples to draw (500)") parser.add_argument("--mc3", help="Run a Metropolis-Coupled MCMC", action="store_true") parser.add_argument("--mc3-chains", type=int, default=3, help="Number of MCMCMC chains") parser.add_argument("--temperature-scale", type=float, default=10.0, help="The scale by which higher chains will have added temperature." \ "Chain i will have temperature scale*i.") parser.add_argument("-k", "--thinning", type=int, default=100, help="Number of MCMC steps between samples (100)") parser.add_argument("--sample-priors", help="Sample independently from the priors", action="store_true") parser.add_argument("--mcmc-priors", help="Run the MCMC but use the prior as the posterior", action="store_true") meta_params = [ ('isolation-period', 'time where the populations have been isolated', 1e6 / 1e9), ('migration-period', 'time period where the populations exchanged genes', 1e6 / 1e9), ('theta', 'effective population size in 4Ne substitutions', 1e6 / 1e9), ('rho', 'recombination rate in substitutions', 0.4), ('migration-rate', 'migration rate in number of migrations per substitution', 200.0) ] for parameter_name, description, default in meta_params: parser.add_argument("--%s" % parameter_name, type=float, default=default, help="Meta-parameter mean of the %s (%g)" % (description, default)) parser.add_argument('alignments', nargs='*', help='Alignments in ZipHMM format') options = parser.parse_args() if len(options.alignments) < 1 and not (options.sample_priors or options.mcmc_priors): parser.error("Input alignment not provided!") if len(options.alignments) > 0 and options.mcmc_priors: parser.error( "You should not provide alignments to the command line when sampling from the prior" ) if options.logfile and not options.mc3: parser.error( "the --logfile option is only valid together with the --mc3 option." ) # Specify priors and proposal distributions... # I am sampling in log-space to make it easier to make a random walk isolation_period_prior = LogNormPrior(log(options.isolation_period)) migration_period_prior = LogNormPrior(log(options.migration_period)) coal_prior = LogNormPrior(log(1 / (options.theta / 2))) rho_prior = LogNormPrior(log(options.rho)) migration_rate_prior = ExpLogNormPrior(options.migration_rate) priors = [ isolation_period_prior, migration_period_prior, coal_prior, rho_prior, migration_rate_prior ] # If we only want to sample from the priors we simply collect random points from these if options.sample_priors: with open(options.outfile, 'w') as outfile: print >> outfile, '\t'.join([ 'isolation.period', 'migration.period', 'theta', 'rho', 'migration', 'log.prior', 'log.likelihood', 'log.posterior' ]) for _ in xrange(options.samples): params = [prior.sample() for prior in priors] prior = sum( log(prior.pdf(p)) for prior, p in zip(priors, params)) likelihood = 0.0 posterior = prior + likelihood print >> outfile, '\t'.join( map(str, transform(params) + (prior, likelihood, posterior))) outfile.flush() sys.exit(0) # Successful termination if options.mc3: mcmc = MC3(priors, input_files=options.alignments, model=IsolationMigrationModel(options.migration_states, options.ancestral_states), thinning=options.thinning, no_chains=options.mc3_chains, switching=options.thinning / 10, temperature_scale=options.temperature_scale) else: forwarders = [Forwarder(arg, NSYM=3) for arg in options.alignments] log_likelihood = Likelihood( IsolationMigrationModel(options.migration_states, options.ancestral_states), forwarders) mcmc = MCMC(priors, log_likelihood, thinning=options.thinning) with open(options.outfile, 'w') as outfile: print >> outfile, '\t'.join([ 'isolation.period', 'migration.period', 'theta', 'rho', 'migration', 'log.prior', 'log.likelihood', 'log.posterior' ]) if options.logfile: with open(options.logfile, 'w') as logfile: print >> logfile, '\t'.join([ 'chain', 'isolation.period', 'migration.period', 'theta', 'rho', 'migration', 'log.prior', 'log.likelihood', 'log.posterior' ]) for _ in xrange(options.samples): # Write main chain to output params, prior, likelihood, posterior = mcmc.sample() print >> outfile, '\t'.join( map(str, transform(params) + (prior, likelihood, posterior))) outfile.flush() # All chains written to the log for chain_no, chain in enumerate(mcmc.chains): params = chain.current_theta prior = chain.current_prior likelihood = chain.current_likelihood posterior = chain.current_posterior print >> logfile, '\t'.join( map(str, (chain_no, ) + transform(params) + (prior, likelihood, posterior))) logfile.flush() else: for _ in xrange(options.samples): params, prior, likelihood, posterior = mcmc.sample() print >> outfile, '\t'.join( map(str, transform(params) + (prior, likelihood, posterior))) outfile.flush() if options.mc3: mcmc.terminate()
def main(): usage = """%(prog)s [options] <input> <input format> <output dir> This program reads in an input sequence in any format supported by BioPython and writes out a preprocessed file ready for use with CoalHMM (python module ziphmm). Also supports gzipped input files, if the name ends with `.gz`. Assumption #1: Either the file is a pairwise alignment, or you have provided exactly two names to the `--names` option. Assumption #2: The file uses a simple ACGT format (and N/-). Anything else will be interpreted as N and a warning will be given with all unknown symbols. Warning: This program uses SeqIO.to_dict to read in the entire alignment, you may want to split the alignment first if it's very large. """ parser = ArgumentParser(usage=usage, version="%(prog)s 1.1") parser.add_argument("--names", type=str, default=None, help="A comma-separated list of names to use from the source file") parser.add_argument("--verbose", action="store_true", default=False, help="Print status information during processing") # positional arguments parser.add_argument("in_filename", type=str, help="Input file") parser.add_argument("in_format", type=str, help="The file format for the input") parser.add_argument("output_filename", type=str, help="Where to write the processed alignment") options = parser.parse_args() if not os.path.exists(options.in_filename): print 'The input file', options.in_filename, 'does not exists.' sys.exit(1) if os.path.exists(options.output_filename): print 'The output file', options.output_filename, 'already exists.' print 'If you want to replace it, please explicitly remove the current' print 'version first.' sys.exit(1) if options.in_filename.endswith('.gz'): if options.verbose: print "Assuming '%s' is a gzipped file." % options.in_filename inf = gzip.open(options.in_filename) else: inf = open(options.in_filename) if options.verbose: print "Loading data...", sys.stdout.flush() alignments = SeqIO.to_dict(SeqIO.parse(inf, options.in_format)) if options.verbose: print "done" clean = set('ACGT') if options.names: names = options.names.split(',') else: names = list(alignments.keys()) if len(names) == 2: # PAIRWISE ALIGNMENT ########################################################################### if options.verbose: print "Assuming pairwise alignment between '%s' and '%s'" % (names[0], names[1]) srcs = [alignments[name].seq for name in names] sequence1 = srcs[0] sequence2 = srcs[1] assert len(sequence1) == len(sequence2) sequence_length = len(sequence1) if options.verbose: print "Writing file readable by ziphmm to '%s'..." % options.output_filename, sys.stdout.flush() seen = set() with open(options.output_filename, 'w', 64 * 1024) as f: for i in xrange(sequence_length): s1, s2 = sequence1[i].upper(), sequence2[i].upper() seen.add(s1) seen.add(s2) if s1 not in clean or s2 not in clean: print >> f, 2, elif s1 == s2: print >> f, 0, else: print >> f, 1, if options.verbose: print "done" if len(seen - set('ACGTN-')) > 1: print >> sys.stderr, "I didn't understand the following symbols form the input sequence: %s" % ( ''.join(list(seen - set('ACGTN-')))) elif len(names) == 3: # TRIPLET ALIGNMENT ########################################################################### if options.verbose: print "Assuming triplet alignment between '%s', '%s', and '%s'" % (names[0], names[1], names[2]) srcs = [alignments[name].seq for name in names] sequence1 = srcs[0] sequence2 = srcs[1] sequence3 = srcs[2] assert len(sequence1) == len(sequence2) assert len(sequence1) == len(sequence3) sequence_length = len(sequence1) outname = options.output_filename if options.verbose: print "Writing file readable by ziphmm to '%s'..." % outname, sys.stdout.flush() seen = set() nuc_map = {'A': 0, 'C': 1, 'G': 2, 'T': 3} with open(outname, 'w', 64 * 1024) as f: for i in xrange(sequence_length): s1, s2, s3 = sequence1[i].upper(), sequence2[i].upper(), sequence3[i].upper() seen.add(s1) seen.add(s2) seen.add(s3) if s1 in clean and s2 in clean and s3 in clean: i1, i2, i3 = nuc_map[s1], nuc_map[s2], nuc_map[s3] print >> f, i1 + 4*i2 + 16*i3, else: print >> f, 64, if options.verbose: print "done" if len(seen - set('ACGTN-')) > 1: print >> sys.stderr, "I didn't understand the following symbols form the input sequence: %s" % ( ''.join(list(seen - set('ACGTN-')))) elif len(names) == 4: # QUARTET ALIGNMENT ########################################################################### if options.verbose: print "Assuming quartet alignment between '%s', '%s', '%s', and '%s'" % (names[0], names[1], names[2], names[3]) srcs = [alignments[name].seq for name in names] sequence1 = srcs[0] sequence2 = srcs[1] sequence3 = srcs[2] sequence4 = srcs[3] assert len(sequence1) == len(sequence2) assert len(sequence1) == len(sequence3) assert len(sequence1) == len(sequence4) sequence_length = len(sequence1) outname = options.output_filename if options.verbose: print "Writing file readable by ziphmm to '%s'..." % outname, sys.stdout.flush() seen = set() nuc_map = {'A': 0, 'C': 1, 'G': 2, 'T': 3} with open(outname, 'w', 64 * 1024) as f: for i in xrange(sequence_length): s1, s2, s3, s4 = sequence1[i].upper(), sequence2[i].upper(), sequence3[i].upper(), sequence4[i].upper() seen.add(s1) seen.add(s2) seen.add(s3) seen.add(s4) if s1 in clean and s2 in clean and s3 in clean and s4 in clean: i1, i2, i3, i4 = nuc_map[s1], nuc_map[s2], nuc_map[s3], nuc_map[s4] print >> f, i1 + 4*i2 + 16*i3 + 32*i4, else: print >> f, 128, if options.verbose: print "done" if len(seen - set('ACGTN-')) > 1: print >> sys.stderr, "I didn't understand the following symbols form the input sequence: %s" % ( ''.join(list(seen - set('ACGTN-')))) if options.verbose: print "ZipHMM is pre-processing...", sys.stdout.flush() f = Forwarder.fromSequence(seqFilename=outname, alphabetSize=9, minNoEvals=500) if options.verbose: print "done" else: print 'There are', len(names), 'species identified. We do not know how to convert that into something' print 'that CoalHMM can handle, sorry.' sys.exit(1)