def __init__(self, options):
    """Set up the three forwarder lists used by this model.

    Each option key is resolved through ``_prepare_alignments(options, key)``
    and every entry of the result is turned into a forwarder with
    ``Forwarder.fromDirectory``.

    Attributes set:
        forwarders_src1_admix: from the ``'ziphmm_src1_admix'`` option.
        forwarders_src1_src1: from the ``'ziphmm_scr1_scr1'`` option.
        forwarders_admix_admix: from the ``'ziphmm_admix_admix'`` option.
    """
    super(_IsolationInitialMigration3HMMModel, self).__init__()

    def forwarders_from(directories):
        return [Forwarder.fromDirectory(directory) for directory in directories]

    # Resolve every alignment option first, then load the forwarders.
    src1_admix_dirs = _prepare_alignments(options, 'ziphmm_src1_admix')
    # NOTE(review): the key below is spelled 'scr1', not 'src1'. It looks like
    # a typo, but it has to match however the option is declared elsewhere --
    # confirm before changing it.
    src1_src1_dirs = _prepare_alignments(options, 'ziphmm_scr1_scr1')
    admix_admix_dirs = _prepare_alignments(options, 'ziphmm_admix_admix')

    self.forwarders_src1_admix = forwarders_from(src1_admix_dirs)
    self.forwarders_src1_src1 = forwarders_from(src1_src1_dirs)
    self.forwarders_admix_admix = forwarders_from(admix_admix_dirs)
def __init__(self, options):
    """Set up one forwarder list per pairwise comparison.

    Each option key is resolved through ``_prepare_alignments(options, key)``
    and every entry of the result is turned into a forwarder with
    ``Forwarder.fromDirectory``.

    Attributes set:
        forwarders_src1_admix: from the ``'ziphmm_src1_admix'`` option.
        forwarders_src2_admix: from the ``'ziphmm_src2_admix'`` option.
        forwarders_src1_src2: from the ``'ziphmm_src1_src2'`` option.
    """
    super(_ThreePopAdmix23Model, self).__init__()

    def forwarders_from(directories):
        return [Forwarder.fromDirectory(directory) for directory in directories]

    # Resolve every alignment option first, then load the forwarders.
    src1_admix_dirs = _prepare_alignments(options, 'ziphmm_src1_admix')
    src2_admix_dirs = _prepare_alignments(options, 'ziphmm_src2_admix')
    src1_src2_dirs = _prepare_alignments(options, 'ziphmm_src1_src2')

    self.forwarders_src1_admix = forwarders_from(src1_admix_dirs)
    self.forwarders_src2_admix = forwarders_from(src2_admix_dirs)
    self.forwarders_src1_src2 = forwarders_from(src1_src2_dirs)
def __init__(self, options):
    """Set up the forwarders for all six pairwise comparisons.

    The three between-population comparisons (src1/admix, src2/admix and
    src1/src2) each read four option keys, so their attributes are lists of
    four forwarder lists. The three within-population comparisons read a
    single option key each and are stored as flat forwarder lists.

    Every option key is resolved through ``_prepare_alignments(options, key)``
    and every resulting entry is loaded with ``Forwarder.fromDirectory``.
    """
    super(_ThreePopAdmix23Model15HMM, self).__init__()

    def prepare_each(option_keys):
        return [_prepare_alignments(options, key) for key in option_keys]

    def forwarders_from(directories):
        return [Forwarder.fromDirectory(directory) for directory in directories]

    # Resolve every alignment option first, then load the forwarders.
    src1_admix_groups = prepare_each((
        'ziphmm_src1_admix',
        'ziphmm_src1_admix_2',
        'ziphmm_src1_admix_3',
        'ziphmm_src1_admix_4',
    ))
    src2_admix_groups = prepare_each((
        'ziphmm_src2_admix',
        'ziphmm_src2_admix_2',
        'ziphmm_src2_admix_3',
        'ziphmm_src2_admix_4',
    ))
    src1_src2_groups = prepare_each((
        'ziphmm_src1_src2',
        'ziphmm_src1_src2_2',
        'ziphmm_src1_src2_3',
        'ziphmm_src1_src2_4',
    ))
    # NOTE(review): the key below is spelled 'scr1', not 'src1'. It looks like
    # a typo, but it has to match however the option is declared elsewhere --
    # confirm before changing it.
    src1_src1_dirs = _prepare_alignments(options, 'ziphmm_scr1_scr1')
    src2_src2_dirs = _prepare_alignments(options, 'ziphmm_src2_src2')
    admix_admix_dirs = _prepare_alignments(options, 'ziphmm_admix_admix')

    self.forwarders_src1_admix = [forwarders_from(group) for group in src1_admix_groups]
    self.forwarders_src2_admix = [forwarders_from(group) for group in src2_admix_groups]
    self.forwarders_src1_src2 = [forwarders_from(group) for group in src1_src2_groups]
    self.forwarders_src1_src1 = forwarders_from(src1_src1_dirs)
    self.forwarders_src2_src2 = forwarders_from(src2_src2_dirs)
    self.forwarders_admix_admix = forwarders_from(admix_admix_dirs)
def main():
    """Convert a pairwise alignment into a zipHMM forwarder directory.

    Command line: ``<input> <input format> <output dir>`` plus the options
    ``--names`` (comma-separated pair of sequence names) and ``-v/--verbose``.

    Steps:
      1. Read every record with ``SeqIO.to_dict`` (through ``gzip.open`` when
         the input name ends with ``.gz``).
      2. Select the two sequences and encode each alignment column as one
         symbol: ``0`` when both symbols are equal, ``1`` when they differ and
         ``2`` when either symbol is not one of A, C, G, T.
      3. Write the symbols, space separated, to a temporary file, hand it to
         ``Forwarder.fromSequence``, write the forwarder to the output
         directory and move the temporary file there as ``original_sequence``.

    Invalid input (missing file, wrong number of sequences, unknown names,
    sequences of different length) is reported with ``parser.error`` instead
    of ``assert`` so the checks survive ``python -O``.

    Symbols other than A, C, G, T, N and ``-`` are listed in a warning on
    stderr. The warning is emitted as soon as there is at least one such
    symbol (the previous ``> 1`` test silently skipped a single one).
    """
    # Local import: only needed for the final move, which must also work when
    # the temp dir and the output dir live on different filesystems.
    import shutil

    usage = (
        "%prog [options] <input> <input format> <output dir> "
        "This program reads in an input sequence in any format supported by BioPython "
        "and writes out a preprocessed file ready for use with zipHMM. "
        "Also supports gzipped input files, if the name ends with `.gz`. "
        "Assumption #1: Either the file is a pairwise alignment, or you have provided "
        "exactly two names to the `--names` option. "
        "Assumption #2: The file uses a simple ACGT format (and N/-). Anything else will "
        "be interpreted as N and a warning will be given with all unknown symbols. "
        "Warning: This program uses SeqIO.to_dict to read in the entire alignment, you "
        "may want to split the alignment first if it's very large. "
    )

    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option(
        "--names",
        dest="names",
        type="string",
        default=None,
        help="A comma-separated list of names to use from the source file",
    )
    parser.add_option(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        default=False,
        help="Print some stuff",
    )

    (options, args) = parser.parse_args()
    if len(args) != 3:
        parser.error("Needs input file, input format and output file")
    in_filename, in_format, output_dirname = args

    if not os.path.exists(in_filename):
        parser.error("Must use an existing input file")

    def say(text, finished=True):
        """Print progress when --verbose is set; unfinished lines end with a space."""
        if options.verbose:
            sys.stdout.write(text + ("\n" if finished else " "))
            sys.stdout.flush()

    if in_filename.endswith(".gz"):
        say("Assuming '%s' is a gzipped file." % in_filename)
        inf = gzip.open(in_filename)
    else:
        inf = open(in_filename)

    say("Loading data...", finished=False)
    try:
        # SeqIO.to_dict materialises every record, so the handle can be
        # closed as soon as it returns.
        alignments = SeqIO.to_dict(SeqIO.parse(inf, in_format))
    finally:
        inf.close()
    say("done")

    if options.names:
        names = options.names.split(",")
    else:
        names = list(alignments.keys())
    if len(names) != 2:
        parser.error("Must be a pairwise alignment.")
    missing = [name for name in names if name not in alignments]
    if missing:
        parser.error("Sequence name(s) not found in the input: %s" % ", ".join(missing))

    say("Assuming pairwise alignment between '%s' and '%s'" % (names[0], names[1]))

    A, B = [alignments[name].seq for name in names]
    if len(A) != len(B):
        parser.error("The two sequences must have the same length.")

    clean = set("ACGT")
    known = set("ACGTN-")

    fd, foutname = tempfile.mkstemp()
    try:
        say("Writing temp file readable by zipHMM to '%s'..." % foutname, finished=False)

        seen = set()
        with os.fdopen(fd, "w", 64 * 1024) as f:
            # Symbols are separated by single spaces with no trailing newline,
            # which is what the original `print >> f, x,` statements produced.
            separator = ""
            for i, raw1 in enumerate(A):
                s1, s2 = raw1.upper(), B[i].upper()
                seen.add(s1)
                seen.add(s2)
                if s1 not in clean or s2 not in clean:
                    symbol = "2"
                elif s1 == s2:
                    symbol = "0"
                else:
                    symbol = "1"
                f.write(separator + symbol)
                separator = " "
        say("done")

        unknown = seen - known
        if unknown:
            sys.stderr.write(
                "I didn't understand the following symbols form the input sequence: %s\n"
                % "".join(sorted(unknown))
            )

        say("zipHMM is preprocessing...", finished=False)
        forwarder = Forwarder.fromSequence(seqFilename=foutname, alphabetSize=3, minNoEvals=500)
        say("done")

        say("Writing zipHMM data to '%s'..." % output_dirname, finished=False)
        if not os.path.exists(output_dirname):
            os.makedirs(output_dirname)
        forwarder.writeToDirectory(output_dirname)
        # shutil.move falls back to copy+delete when os.rename cannot cross
        # filesystem boundaries (temp dir vs. output dir).
        shutil.move(foutname, os.path.join(output_dirname, "original_sequence"))
        say("done")
    finally:
        # Only still present when something failed before the move.
        if os.path.exists(foutname):
            os.remove(foutname)
def main():
    """
    Run the main script.

    Parses the command line, loads one zipHMM forwarder per alignment
    directory, maximises the likelihood of an ``ILSModel`` starting from the
    initial guesses given on the command line and writes the estimate (tab
    separated, log likelihood in the last column) to ``--outfile``.

    Notes on the command line:
      * ``--outgroup`` is a flag saying that an outgroup is included. The
        initial guess for the outgroup tree height is ``--outgroup-height``.
        Both used to be registered as ``--outgroup``, which makes argparse
        raise a conflict error before any argument is parsed.
      * ``-v/--version`` is registered through the ``version`` action; the
        ``version=`` constructor argument is deprecated in Python 2.7 and no
        longer accepted by Python 3.
      * The theta options are converted with ``1 / (theta / 2)`` and must
        therefore be positive.
    """
    usage = (
        "%(prog)s [options] <forwarder dirs> "
        "This program estimates the parameters of an isolation model with two "
        "species and uniform coalescence and recombination rates."
    )

    parser = ArgumentParser(usage=usage)
    parser.add_argument("-v", "--version", action="version", version="%(prog)s 1.1")

    parser.add_argument("--header",
                        action="store_true",
                        default=False,
                        help="Include a header on the output")
    parser.add_argument("-o", "--outfile",
                        type=str,
                        default="/dev/stdout",
                        help="Output file for the estimate (/dev/stdout)")
    parser.add_argument("--logfile",
                        type=str,
                        default=None,
                        help="Log for all points estimated in the optimization")
    parser.add_argument("--states-12",
                        type=int,
                        default=10,
                        help="Number of intervals used to discretize the time between the first and second speciation (10)")
    parser.add_argument("--states-123",
                        type=int,
                        default=10,
                        help="Number of intervals used to discretize the time after the second speciation (10)")
    parser.add_argument("--optimizer",
                        type=str,
                        default="Nelder-Mead",
                        help="Optimization algorithm to use for maximizing the likelihood (Nealder-Mead)",
                        choices=['Nelder-Mead', 'Powell', 'L-BFGS-B', 'TNC'])
    parser.add_argument("--outgroup",
                        action="store_true",
                        default=None,
                        help="Outgroup is included as fourth sequence in alignment.")

    optimized_params = [
        ('split-12', 'First split time in substitutions', 1e6 / 1e9),
        ('split-123', 'Second split time in substitutions', 1e6 / 1e9),
        ('theta-1', 'effective population size in 4Ne substitutions for species 1', 1e6 / 1e9),
        ('theta-2', 'effective population size in 4Ne substitutions for species 2', 1e6 / 1e9),
        ('theta-3', 'effective population size in 4Ne substitutions for species 3', 1e6 / 1e9),
        ('theta-12', 'effective population size in 4Ne substitutions for species 12 (first ancestral)', 1e6 / 1e9),
        ('theta-123', 'effective population size in 4Ne substitutions for species 123 (ancestral to all)', 1e6 / 1e9),
        ('rho', 'recombination rate in substitutions', 0.4),
        # Must not be named 'outgroup': that option string is already taken
        # by the store_true flag above.
        ('outgroup-height', 'total height of tree with outgroup', 1e6 / 1e9),
    ]

    for parameter_name, description, default in optimized_params:
        parser.add_argument("--%s" % parameter_name,
                            type=float,
                            default=default,
                            help="Initial guess at the %s (%g)" % (description, default))

    # nargs='+' makes argparse itself reject a command line without alignments.
    parser.add_argument('alignments', nargs='+', help='Alignments in ZipHMM format')

    options = parser.parse_args()

    thetas = (options.theta_1, options.theta_2, options.theta_3,
              options.theta_12, options.theta_123)
    if any(theta <= 0 for theta in thetas):
        parser.error("All theta values must be positive.")

    init_parameters = (
        (options.split_12, options.split_123)
        + tuple(1 / (theta / 2) for theta in thetas)
        + (options.rho, )
    )
    output_header = [
        'split.time.12', 'split.time.123',
        'theta.1', 'theta.2', 'theta.3', 'theta.12', 'theta.123',
        'rho',
    ]
    if options.outgroup:
        init_parameters += (options.outgroup_height, )
        output_header.append("outgroup")

    forwarders = [Forwarder.fromDirectory(arg) for arg in options.alignments]
    log_likelihood = Likelihood(ILSModel(options.states_12, options.states_123), forwarders)

    if options.logfile:
        with open(options.logfile, 'w') as logfile:
            if options.header:
                logfile.write('\t'.join(output_header) + '\n')
            mle_parameters = maximum_likelihood_estimate(
                log_likelihood,
                init_parameters,
                optimizer_method=options.optimizer,
                log_file=logfile,
                log_param_transform=transform)
    else:
        mle_parameters = maximum_likelihood_estimate(
            log_likelihood,
            init_parameters,
            optimizer_method=options.optimizer)

    max_log_likelihood = log_likelihood(mle_parameters)

    with open(options.outfile, 'w') as outfile:
        # NOTE(review): the data row carries one more column (the log
        # likelihood) than the header names -- confirm whether the header
        # should get a matching column.
        if options.header:
            outfile.write('\t'.join(output_header) + '\n')
        row = transform(mle_parameters) + (max_log_likelihood, )
        outfile.write('\t'.join(map(str, row)) + '\n')
def main():
    """Estimate isolation-model parameters from preprocessed zipHMM data.

    Every positional argument is loaded with ``Forwarder.fromDirectory``.
    The starting point handed to ``estimate_I`` is derived from the command
    line: ``T = splittime * mu``, ``C = 1 / (g * mu * 2 * Ne)`` and
    ``R = recomb``. ``--tmpfile`` is passed to ``estimate_I`` as its
    ``outfile``; the resulting log likelihood and estimate are written, tab
    separated, to ``--out`` (with a header line when ``--header`` is given).
    """
    usage = (
        "%prog [options] <forwarder dirs> "
        "This program estimates the parameters of an isolation model with two "
        "species and uniform coalescence/recombination rate."
    )

    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-o", "--out",
                      dest="outfile",
                      type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option("--tmpfile",
                      dest="tmpfile",
                      type="string",
                      default="/dev/null",
                      help="Log for all points estimated in the optimization (/dev/null)")

    # Float options: a help template followed by (flag, label, default)
    # triples. Optimized parameters come first, fixed parameters second.
    float_option_groups = [
        ("Initial guess at the %s (%g)", [
            ('splittime', 'split time', 1e6),
            ('Ne', 'effective population size', 20e3),
            ('recomb', 'recombination rate', 0.1),
        ]),
        ("Value of the %s (%g)", [
            ('mu', 'mutation rate', 1e-9),
            ('g', 'generation time', 20),
        ]),
    ]
    for help_template, specs in float_option_groups:
        for flag, label, initial in specs:
            parser.add_option("--%s" % flag,
                              dest=flag,
                              type="float",
                              default=initial,
                              help=help_template % (label, initial))

    parser.add_option("--intervals",
                      dest="intervals",
                      type="int",
                      default=10,
                      help="Number of sub intervals used to discretize the time (10)")
    parser.add_option("--header",
                      dest="include_header",
                      action="store_true",
                      default=False,
                      help="Include a header on the output")
    parser.add_option("-v", "--verbose",
                      dest="verbose",
                      action="store_true",
                      default=False,
                      help="Print help")

    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error("Needs at least one preprocessed sequence to work on")

    if options.verbose:
        logu = log_unfinished_line
        log = log_finished_line
    else:
        def logu(_message):
            return None
        log = logu

    logu("Loading forwarders...")
    forwarders = [Forwarder.fromDirectory(path) for path in args]
    log("done")

    logu("Constructing model...")
    modelI = build_epoch_seperated_model(2, [[0, 0]], [1, options.intervals])
    log("done")

    mutation_rate = options.mu
    generation_time = options.g
    T = options.splittime * mutation_rate
    C = 1.0 / (generation_time * mutation_rate * 2 * options.Ne)
    R = options.recomb

    with open(options.tmpfile, 'w') as tmpfile:
        L, est = estimate_I(modelI, forwarders, T, C, R, outfile=tmpfile)

    vals = "\t".join(map(str, est))
    with open(options.outfile, 'w') as outfile:
        if options.include_header:
            outfile.write('logL\tT\tC\tR' + "\n")
        outfile.write(("%f\t%s" % (L, vals)) + "\n")
def main():
    """Convert a pairwise alignment into a zipHMM forwarder directory.

    Command line: ``<input> <input format> <output dir>`` plus the options
    ``--names`` (comma-separated pair of sequence names) and ``-v/--verbose``.

    Steps:
      1. Read every record with ``SeqIO.to_dict`` (through ``gzip.open`` when
         the input name ends with ``.gz``).
      2. Select the two sequences and encode each alignment column as one
         symbol: ``0`` when both symbols are equal, ``1`` when they differ and
         ``2`` when either symbol is not one of A, C, G, T.
      3. Write the symbols, space separated, to a temporary file, hand it to
         ``Forwarder.fromSequence``, write the forwarder to the output
         directory and move the temporary file there as ``original_sequence``.

    Invalid input (missing file, wrong number of sequences, unknown names,
    sequences of different length) is reported with ``parser.error`` instead
    of ``assert`` so the checks survive ``python -O``.

    Symbols other than A, C, G, T, N and ``-`` are listed in a warning on
    stderr. The warning is emitted as soon as there is at least one such
    symbol (the previous ``> 1`` test silently skipped a single one).
    """
    # Local import: only needed for the final move, which must also work when
    # the temp dir and the output dir live on different filesystems.
    import shutil

    usage = (
        "%prog [options] <input> <input format> <output dir> "
        "This program reads in an input sequence in any format supported by BioPython "
        "and writes out a preprocessed file ready for use with zipHMM. "
        "Also supports gzipped input files, if the name ends with `.gz`. "
        "Assumption #1: Either the file is a pairwise alignment, or you have provided "
        "exactly two names to the `--names` option. "
        "Assumption #2: The file uses a simple ACGT format (and N/-). Anything else will "
        "be interpreted as N and a warning will be given with all unknown symbols. "
        "Warning: This program uses SeqIO.to_dict to read in the entire alignment, you "
        "may want to split the alignment first if it's very large. "
    )

    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option(
        "--names",
        dest="names",
        type="string",
        default=None,
        help="A comma-separated list of names to use from the source file")
    parser.add_option(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        default=False,
        help="Print some stuff")

    (options, args) = parser.parse_args()
    if len(args) != 3:
        parser.error("Needs input file, input format and output file")
    in_filename, in_format, output_dirname = args

    if not os.path.exists(in_filename):
        parser.error("Must use an existing input file")

    def say(text, finished=True):
        """Print progress when --verbose is set; unfinished lines end with a space."""
        if options.verbose:
            sys.stdout.write(text + ("\n" if finished else " "))
            sys.stdout.flush()

    if in_filename.endswith('.gz'):
        say("Assuming '%s' is a gzipped file." % in_filename)
        inf = gzip.open(in_filename)
    else:
        inf = open(in_filename)

    say("Loading data...", finished=False)
    try:
        # SeqIO.to_dict materialises every record, so the handle can be
        # closed as soon as it returns.
        alignments = SeqIO.to_dict(SeqIO.parse(inf, in_format))
    finally:
        inf.close()
    say("done")

    if options.names:
        names = options.names.split(',')
    else:
        names = list(alignments.keys())
    if len(names) != 2:
        parser.error("Must be a pairwise alignment.")
    missing = [name for name in names if name not in alignments]
    if missing:
        parser.error("Sequence name(s) not found in the input: %s" % ", ".join(missing))

    say("Assuming pairwise alignment between '%s' and '%s'" % (names[0], names[1]))

    A, B = [alignments[name].seq for name in names]
    if len(A) != len(B):
        parser.error("The two sequences must have the same length.")

    clean = set('ACGT')
    known = set('ACGTN-')

    fd, foutname = tempfile.mkstemp()
    try:
        say("Writing temp file readable by zipHMM to '%s'..." % foutname, finished=False)

        seen = set()
        with os.fdopen(fd, 'w', 64 * 1024) as f:
            # Symbols are separated by single spaces with no trailing newline,
            # which is what the original `print >> f, x,` statements produced.
            separator = ""
            for i, raw1 in enumerate(A):
                s1, s2 = raw1.upper(), B[i].upper()
                seen.add(s1)
                seen.add(s2)
                if s1 not in clean or s2 not in clean:
                    symbol = "2"
                elif s1 == s2:
                    symbol = "0"
                else:
                    symbol = "1"
                f.write(separator + symbol)
                separator = " "
        say("done")

        unknown = seen - known
        if unknown:
            sys.stderr.write(
                "I didn't understand the following symbols form the input sequence: %s\n"
                % ''.join(sorted(unknown)))

        say("zipHMM is preprocessing...", finished=False)
        forwarder = Forwarder.fromSequence(seqFilename=foutname, alphabetSize=3, minNoEvals=500)
        say("done")

        say("Writing zipHMM data to '%s'..." % output_dirname, finished=False)
        if not os.path.exists(output_dirname):
            os.makedirs(output_dirname)
        forwarder.writeToDirectory(output_dirname)
        # shutil.move falls back to copy+delete when os.rename cannot cross
        # filesystem boundaries (temp dir vs. output dir).
        shutil.move(foutname, os.path.join(output_dirname, 'original_sequence'))
        say("done")
    finally:
        # Only still present when something failed before the move.
        if os.path.exists(foutname):
            os.remove(foutname)
def __init__(self, options):
    """Set up the single forwarder list used by this model.

    The ``'ziphmm_src1_admix'`` option is resolved through
    ``_prepare_alignments`` and every entry of the result is loaded with
    ``Forwarder.fromDirectory`` into ``self.forwarders``.
    """
    super(_IsolationModel, self).__init__()
    self.forwarders = [
        Forwarder.fromDirectory(directory)
        for directory in _prepare_alignments(options, 'ziphmm_src1_admix')
    ]
def __init__(self, options):
    """Set up the src1/admix forwarder list used by this model.

    The ``'ziphmm_src1_admix'`` option is resolved through
    ``_prepare_alignments`` and every entry of the result is loaded with
    ``Forwarder.fromDirectory`` into ``self.forwarders_src1_admix``.
    """
    super(_TwoPopAdmix23OneSampleModel, self).__init__()
    self.forwarders_src1_admix = [
        Forwarder.fromDirectory(directory)
        for directory in _prepare_alignments(options, 'ziphmm_src1_admix')
    ]
def main():
    """Estimate isolation-model parameters from preprocessed zipHMM data.

    Every positional argument is loaded with ``Forwarder.fromDirectory``.
    The starting point handed to ``estimate_I`` is derived from the command
    line: ``T = splittime * mu``, ``C = 1 / (g * mu * 2 * Ne)`` and
    ``R = recomb``. ``--tmpfile`` is passed to ``estimate_I`` as its
    ``outfile``; the resulting log likelihood and estimate are written, tab
    separated, to ``--out`` (with a header line when ``--header`` is given).
    """
    usage = (
        "%prog [options] <forwarder dirs> "
        "This program estimates the parameters of an isolation model with two "
        "species and uniform coalescence/recombination rate."
    )

    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-o", "--out",
                      dest="outfile",
                      type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option("--tmpfile",
                      dest="tmpfile",
                      type="string",
                      default="/dev/null",
                      help="Log for all points estimated in the optimization (/dev/null)")

    def add_float_options(help_template, specs):
        # One float option per (flag, label, default) triple.
        for flag, label, initial in specs:
            parser.add_option("--%s" % flag,
                              dest=flag,
                              type="float",
                              default=initial,
                              help=help_template % (label, initial))

    # Parameters the optimizer adjusts, then parameters that stay fixed.
    add_float_options("Initial guess at the %s (%g)", [
        ('splittime', 'split time', 1e6),
        ('Ne', 'effective population size', 20e3),
        ('recomb', 'recombination rate', 0.1),
    ])
    add_float_options("Value of the %s (%g)", [
        ('mu', 'mutation rate', 1e-9),
        ('g', 'generation time', 20),
    ])

    parser.add_option("--intervals",
                      dest="intervals",
                      type="int",
                      default=10,
                      help="Number of sub intervals used to discretize the time (10)")
    parser.add_option("--header",
                      dest="include_header",
                      action="store_true",
                      default=False,
                      help="Include a header on the output")
    parser.add_option("-v", "--verbose",
                      dest="verbose",
                      action="store_true",
                      default=False,
                      help="Print help")

    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error("Needs at least one preprocessed sequence to work on")

    if options.verbose:
        logu, log = log_unfinished_line, log_finished_line
    else:
        def log(_message):
            return None
        logu = log

    logu("Loading forwarders...")
    forwarders = [Forwarder.fromDirectory(directory) for directory in args]
    log("done")

    logu("Constructing model...")
    modelI = build_epoch_seperated_model(2, [[0, 0]], [1, options.intervals])
    log("done")

    mu, g = options.mu, options.g
    T = options.splittime * mu
    C = 1.0 / (g * mu * 2 * options.Ne)
    R = options.recomb

    with open(options.tmpfile, 'w') as tmpfile:
        L, est = estimate_I(modelI, forwarders, T, C, R, outfile=tmpfile)

    vals = "\t".join(map(str, est))
    output_lines = []
    if options.include_header:
        output_lines.append('logL\tT\tC\tR' + "\n")
    output_lines.append(("%f\t%s" % (L, vals)) + "\n")

    with open(options.outfile, 'w') as outfile:
        for line in output_lines:
            outfile.write(line)