예제 #1
0
    def __init__(self, options):
        """Load the forwarders for the three alignment sets of this model.

        ``options`` is passed unchanged to ``_prepare_alignments`` together
        with the key of each alignment set; every entry that call returns is
        then handed to ``Forwarder.fromDirectory``.
        """
        super(_IsolationInitialMigration3HMMModel, self).__init__()

        def load_all(prepared):
            # One forwarder per prepared entry, in the order given.
            return [Forwarder.fromDirectory(entry) for entry in prepared]

        # Every alignment set is prepared before the first forwarder is loaded.
        prepared_src1_admix = _prepare_alignments(options, 'ziphmm_src1_admix')
        # NOTE(review): this key is spelled 'scr1', not 'src1' -- confirm
        # against the option definitions before changing it.
        prepared_src1_src1 = _prepare_alignments(options, 'ziphmm_scr1_scr1')
        prepared_admix_admix = _prepare_alignments(options, 'ziphmm_admix_admix')

        self.forwarders_src1_admix = load_all(prepared_src1_admix)
        self.forwarders_src1_src1 = load_all(prepared_src1_src1)
        self.forwarders_admix_admix = load_all(prepared_admix_admix)
예제 #2
0
    def __init__(self, options):
        """Load the forwarders for the three pairwise alignment sets.

        Each key is resolved through ``_prepare_alignments(options, key)``
        and every entry of the result is given to ``Forwarder.fromDirectory``.
        """
        super(_ThreePopAdmix23Model, self).__init__()

        # Prepare all three sets first, in this order, then load forwarders.
        prepared_src1_admix, prepared_src2_admix, prepared_src1_src2 = [
            _prepare_alignments(options, key)
            for key in ('ziphmm_src1_admix',
                        'ziphmm_src2_admix',
                        'ziphmm_src1_src2')
        ]

        def load_all(prepared):
            return [Forwarder.fromDirectory(entry) for entry in prepared]

        self.forwarders_src1_admix = load_all(prepared_src1_admix)
        self.forwarders_src2_admix = load_all(prepared_src2_admix)
        self.forwarders_src1_src2 = load_all(prepared_src1_src2)
예제 #3
0
    def __init__(self, options):
        """Load the forwarders for all alignment sets used by this model.

        The three between-population attributes (``src1_admix``,
        ``src2_admix``, ``src1_src2``) each become a list of four forwarder
        lists, one per key variant; the three within-population attributes
        each become a flat list of forwarders.
        """
        super(_ThreePopAdmix23Model15HMM, self).__init__()

        def prepare(key):
            return _prepare_alignments(options, key)

        def load_all(prepared):
            # One forwarder per prepared entry, in the order given.
            return [Forwarder.fromDirectory(entry) for entry in prepared]

        # Every alignment set is prepared before the first forwarder is loaded.
        grouped_src1_admix = [prepare(key) for key in (
            'ziphmm_src1_admix',
            'ziphmm_src1_admix_2',
            'ziphmm_src1_admix_3',
            'ziphmm_src1_admix_4',
        )]
        grouped_src2_admix = [prepare(key) for key in (
            'ziphmm_src2_admix',
            'ziphmm_src2_admix_2',
            'ziphmm_src2_admix_3',
            'ziphmm_src2_admix_4',
        )]
        grouped_src1_src2 = [prepare(key) for key in (
            'ziphmm_src1_src2',
            'ziphmm_src1_src2_2',
            'ziphmm_src1_src2_3',
            'ziphmm_src1_src2_4',
        )]
        # NOTE(review): this key is spelled 'scr1', not 'src1' -- confirm
        # against the option definitions before changing it.
        prepared_src1_src1 = prepare('ziphmm_scr1_scr1')
        prepared_src2_src2 = prepare('ziphmm_src2_src2')
        prepared_admix_admix = prepare('ziphmm_admix_admix')

        self.forwarders_src1_admix = [load_all(group) for group in grouped_src1_admix]
        self.forwarders_src2_admix = [load_all(group) for group in grouped_src2_admix]
        self.forwarders_src1_src2 = [load_all(group) for group in grouped_src1_src2]
        self.forwarders_src1_src1 = load_all(prepared_src1_src1)
        self.forwarders_src2_src2 = load_all(prepared_src2_src2)
        self.forwarders_admix_admix = load_all(prepared_admix_admix)
예제 #4
0
def main():
    usage = """%prog [options] <input> <input format> <output dir>

This program reads in an input sequence in any format supported by BioPython
and writes out a preprocessed file ready for use with zipHMM.
Also supports gzipped input files, if the name ends with `.gz`.

Assumption #1: Either the file is a pairwise alignment, or you have provided
exactly two names to the `--names` option.

Assumption #2: The file uses a simple ACGT format (and N/-). Anything else will
be interpreted as N and a warning will be given with all unknown symbols.

Warning: This program uses SeqIO.to_dict to read in the entire alignment, you
may want to split the alignment first if it's very large.
"""

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option(
        "--names",
        dest="names",
        type="string",
        default=None,
        help="A comma-separated list of names to use from the source file",
    )
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Print some stuff")

    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Needs input file, input format and output file")
    in_filename = args.pop(0)
    in_format = args.pop(0)
    output_dirname = args.pop(0)

    assert os.path.exists(in_filename), "Must use an existing input file"
    if in_filename.endswith(".gz"):
        if options.verbose:
            print "Assuming '%s' is a gzipped file." % in_filename
        inf = gzip.open(in_filename)
    else:
        inf = open(in_filename)

    if options.verbose:
        print "Loading data...",
        sys.stdout.flush()
    alignments = SeqIO.to_dict(SeqIO.parse(inf, in_format))
    if options.verbose:
        print "done"

    if options.names:
        names = options.names.split(",")
    else:
        names = list(alignments.keys())
    assert len(names) == 2, "Must be a pairwise alignment."
    if options.verbose:
        print "Assuming pairwise alignment between '%s' and '%s'" % (names[0], names[1])
    srcs = [alignments[name].seq for name in names]

    clean = set("ACGT")
    A = srcs[0]
    B = srcs[1]
    assert len(A) == len(B)
    L = len(A)
    fd, foutname = tempfile.mkstemp()
    if options.verbose:
        print "Writing temp file readable by zipHMM to '%s'..." % (foutname),
        sys.stdout.flush()
    seen = set()
    with os.fdopen(fd, "w", 64 * 1024) as f:
        for i in xrange(L):
            s1, s2 = A[i].upper(), B[i].upper()
            seen.add(s1)
            seen.add(s2)
            if s1 not in clean or s2 not in clean:
                print >> f, 2,
            elif s1 == s2:
                print >> f, 0,
            else:
                print >> f, 1,
    if options.verbose:
        print "done"
    if len(seen - set("ACGTN-")) > 1:
        print >> sys.stderr, "I didn't understand the following symbols form the input sequence: %s" % (
            "".join(list(seen - set("ACGTN-")))
        )
    if options.verbose:
        print "zipHMM  is preprocessing...",
        sys.stdout.flush()
    f = Forwarder.fromSequence(seqFilename=foutname, alphabetSize=3, minNoEvals=500)
    if options.verbose:
        print "done"

    if options.verbose:
        print "Writing zipHMM data to '%s'..." % (output_dirname),
        sys.stdout.flush()
    if not os.path.exists(output_dirname):
        os.makedirs(output_dirname)
    f.writeToDirectory(output_dirname)
    os.rename(foutname, os.path.join(output_dirname, "original_sequence"))
    if options.verbose:
        print "done"
예제 #5
0
def main():
    """
    Run the main script.

    Parses the command line, loads one forwarder per alignment argument,
    maximises the likelihood of an ``ILSModel`` starting from the initial
    guesses given as options, and writes the transformed estimate followed
    by the maximum log likelihood as one tab-separated line to the output
    file.  With ``--logfile`` the optimizer is also given an open log file.

    The ``--outgroup`` flag adds one more parameter, whose initial guess is
    taken from ``--outgroup-height``.
    """
    usage = """%(prog)s [options] <forwarder dirs>

This program estimates the parameters of an isolation model with two species
and uniform coalescence and recombination rates."""

    parser = ArgumentParser(usage=usage, version="%(prog)s 1.1")

    parser.add_argument("--header",
                        action="store_true",
                        default=False,
                        help="Include a header on the output")
    parser.add_argument("-o",
                        "--outfile",
                        type=str,
                        default="/dev/stdout",
                        help="Output file for the estimate (/dev/stdout)")

    parser.add_argument(
        "--logfile",
        type=str,
        default=None,
        help="Log for all points estimated in the optimization")

    parser.add_argument(
        "--states-12",
        type=int,
        default=10,
        help=
        "Number of intervals used to discretize the time between the first and second speciation (10)"
    )

    parser.add_argument(
        "--states-123",
        type=int,
        default=10,
        help=
        "Number of intervals used to discretize the time after the second speciation (10)"
    )

    parser.add_argument(
        "--optimizer",
        type=str,
        default="Nelder-Mead",
        help=
        "Optimization algorithm to use for maximizing the likelihood (Nealder-Mead)",
        choices=['Nelder-Mead', 'Powell', 'L-BFGS-B', 'TNC'])

    parser.add_argument(
        "--outgroup",
        action="store_true",
        default=None,
        help="Outgroup is included as fourth sequence in alignment.")

    # The initial guess for the outgroup parameter used to be registered as a
    # second "--outgroup" option.  argparse rejects a duplicate option string
    # with an ArgumentError, so the script could not even start.  The guess
    # now has its own option name and "--outgroup" stays a plain flag.
    # NOTE(review): "--outgroup-height" is a newly chosen name -- confirm.
    optimized_params = [
        ('split-12', 'First split time in substitutions', 1e6 / 1e9),
        ('split-123', 'Second split time in substitutions', 1e6 / 1e9),
        ('theta-1',
         'effective population size in 4Ne substitutions for species 1',
         1e6 / 1e9),
        ('theta-2',
         'effective population size in 4Ne substitutions for species 2',
         1e6 / 1e9),
        ('theta-3',
         'effective population size in 4Ne substitutions for species 3',
         1e6 / 1e9),
        ('theta-12',
         'effective population size in 4Ne substitutions for species 12 (first ancestral)',
         1e6 / 1e9),
        ('theta-123',
         'effective population size in 4Ne substitutions for species 123 (ancestral to all)',
         1e6 / 1e9),
        ('rho', 'recombination rate in substitutions', 0.4),
        ('outgroup-height', 'total height of tree with outgroup', 1e6 / 1e9),
    ]

    for parameter_name, description, default in optimized_params:
        parser.add_argument("--%s" % parameter_name,
                            type=float,
                            default=default,
                            help="Initial guess at the %s (%g)" %
                            (description, default))

    parser.add_argument('alignments',
                        nargs='+',
                        help='Alignments in ZipHMM format')

    options = parser.parse_args()
    if len(options.alignments) < 1:
        parser.error("Input alignment not provided!")

    # The theta options enter the optimizer as 1 / (theta / 2).
    init_parameters = (options.split_12, options.split_123,
                       1 / (options.theta_1 / 2), 1 / (options.theta_2 / 2),
                       1 / (options.theta_3 / 2), 1 / (options.theta_12 / 2),
                       1 / (options.theta_123 / 2), options.rho)
    if options.outgroup:
        init_parameters += (options.outgroup_height, )

    output_header = [
        'split.time.12', 'split.time.123', 'theta.1', 'theta.2', 'theta.3',
        'theta.12', 'theta.123', 'rho'
    ]
    if options.outgroup:
        output_header.append("outgroup")

    forwarders = [Forwarder.fromDirectory(arg) for arg in options.alignments]
    log_likelihood = Likelihood(
        ILSModel(options.states_12, options.states_123), forwarders)

    if options.logfile:
        with open(options.logfile, 'w') as logfile:

            if options.header:
                print >> logfile, '\t'.join(output_header)

            mle_parameters = maximum_likelihood_estimate(
                log_likelihood,
                init_parameters,
                optimizer_method=options.optimizer,
                log_file=logfile,
                log_param_transform=transform)
    else:
        mle_parameters = maximum_likelihood_estimate(
            log_likelihood,
            init_parameters,
            optimizer_method=options.optimizer)

    max_log_likelihood = log_likelihood(mle_parameters)

    with open(options.outfile, 'w') as outfile:
        if options.header:
            print >> outfile, '\t'.join(output_header)
        print >> outfile, '\t'.join(
            map(str,
                transform(mle_parameters) + (max_log_likelihood, )))
예제 #6
0
def main():
    """Estimate isolation-model parameters from preprocessed forwarders.

    Parses the command line, loads one forwarder per positional argument,
    builds the model, runs ``estimate_I`` from the initial values derived
    from the options, and writes the log likelihood followed by the
    estimates as one tab-separated line to the output file.
    """
    usage = """%prog [options] <forwarder dirs>

This program estimates the parameters of an isolation model with two species
and uniform coalescence/recombination rate."""

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option(
        "-o", "--out",
        dest="outfile", type="string", default="/dev/stdout",
        help="Output file for the estimate (/dev/stdout)")
    parser.add_option(
        "--tmpfile",
        dest="tmpfile", type="string", default="/dev/null",
        help="Log for all points estimated in the optimization (/dev/null)")

    # Float-valued options as (help template, name, description, default).
    # Starting values for the optimization come first, then fixed constants.
    float_options = [
        ("Initial guess at the %s (%g)", 'splittime', 'split time', 1e6),
        ("Initial guess at the %s (%g)", 'Ne', 'effective population size', 20e3),
        ("Initial guess at the %s (%g)", 'recomb', 'recombination rate', 0.1),
        ("Value of the %s (%g)", 'mu', 'mutation rate', 1e-9),
        ("Value of the %s (%g)", 'g', 'generation time', 20),
    ]
    for help_template, name, description, value in float_options:
        parser.add_option(
            "--%s" % name,
            dest=name, type="float", default=value,
            help=help_template % (description, value))

    parser.add_option(
        "--intervals",
        dest="intervals", type="int", default=10,
        help="Number of sub intervals used to discretize the time (10)")
    parser.add_option(
        "--header",
        dest="include_header", action="store_true", default=False,
        help="Include a header on the output")
    parser.add_option(
        "-v", "--verbose",
        dest="verbose", action="store_true", default=False,
        help="Print help")

    options, args = parser.parse_args()
    if len(args) < 1:
        parser.error("Needs at least one preprocessed sequence to work on")

    if options.verbose:
        logu = log_unfinished_line
        log = log_finished_line
    else:
        # Progress messages are dropped unless --verbose is given.
        def log(s):
            return None

        def logu(s):
            return None

    logu("Loading forwarders...")
    forwarders = [Forwarder.fromDirectory(path) for path in args]
    log("done")

    logu("Constructing model...")
    modelI = build_epoch_seperated_model(2, [[0, 0]], [1, options.intervals])
    log("done")

    # Starting point of the optimization, derived from the options.
    init_T = options.splittime * options.mu
    init_C = 1.0 / (options.g * options.mu * 2 * options.Ne)
    init_R = options.recomb

    with open(options.tmpfile, 'w') as tmpfile:
        L, est = estimate_I(modelI, forwarders, init_T, init_C, init_R,
                            outfile=tmpfile)

    columns = "\t".join(map(str, est))
    with open(options.outfile, 'w') as outfile:
        if options.include_header:
            outfile.write('logL\tT\tC\tR' + "\n")
        outfile.write(("%f\t%s" % (L, columns)) + "\n")
예제 #7
0
def main():
    usage = """%prog [options] <input> <input format> <output dir>

This program reads in an input sequence in any format supported by BioPython
and writes out a preprocessed file ready for use with zipHMM.
Also supports gzipped input files, if the name ends with `.gz`.

Assumption #1: Either the file is a pairwise alignment, or you have provided
exactly two names to the `--names` option.

Assumption #2: The file uses a simple ACGT format (and N/-). Anything else will
be interpreted as N and a warning will be given with all unknown symbols.

Warning: This program uses SeqIO.to_dict to read in the entire alignment, you
may want to split the alignment first if it's very large.
"""

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option(
        "--names",
        dest="names",
        type="string",
        default=None,
        help="A comma-separated list of names to use from the source file")
    parser.add_option("-v",
                      "--verbose",
                      dest="verbose",
                      action="store_true",
                      default=False,
                      help="Print some stuff")

    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Needs input file, input format and output file")
    in_filename = args.pop(0)
    in_format = args.pop(0)
    output_dirname = args.pop(0)

    assert os.path.exists(in_filename), "Must use an existing input file"
    if in_filename.endswith('.gz'):
        if options.verbose:
            print "Assuming '%s' is a gzipped file." % in_filename
        inf = gzip.open(in_filename)
    else:
        inf = open(in_filename)

    if options.verbose:
        print "Loading data...",
        sys.stdout.flush()
    alignments = SeqIO.to_dict(SeqIO.parse(inf, in_format))
    if options.verbose:
        print "done"

    if options.names:
        names = options.names.split(',')
    else:
        names = list(alignments.keys())
    assert len(names) == 2, "Must be a pairwise alignment."
    if options.verbose:
        print "Assuming pairwise alignment between '%s' and '%s'" % (names[0],
                                                                     names[1])
    srcs = [alignments[name].seq for name in names]

    clean = set('ACGT')
    A = srcs[0]
    B = srcs[1]
    assert len(A) == len(B)
    L = len(A)
    fd, foutname = tempfile.mkstemp()
    if options.verbose:
        print "Writing temp file readable by zipHMM to '%s'..." % (foutname),
        sys.stdout.flush()
    seen = set()
    with os.fdopen(fd, 'w', 64 * 1024) as f:
        for i in xrange(L):
            s1, s2 = A[i].upper(), B[i].upper()
            seen.add(s1)
            seen.add(s2)
            if s1 not in clean or s2 not in clean:
                print >> f, 2,
            elif s1 == s2:
                print >> f, 0,
            else:
                print >> f, 1,
    if options.verbose:
        print "done"
    if len(seen - set('ACGTN-')) > 1:
        print >> sys.stderr, "I didn't understand the following symbols form the input sequence: %s" % (
            ''.join(list(seen - set('ACGTN-'))))
    if options.verbose:
        print "zipHMM  is preprocessing...",
        sys.stdout.flush()
    f = Forwarder.fromSequence(seqFilename=foutname,
                               alphabetSize=3,
                               minNoEvals=500)
    if options.verbose:
        print "done"

    if options.verbose:
        print "Writing zipHMM data to '%s'..." % (output_dirname),
        sys.stdout.flush()
    if not os.path.exists(output_dirname):
        os.makedirs(output_dirname)
    f.writeToDirectory(output_dirname)
    os.rename(foutname, os.path.join(output_dirname, 'original_sequence'))
    if options.verbose:
        print "done"
예제 #8
0
 def __init__(self, options):
     """Load one forwarder per entry of the 'ziphmm_src1_admix' alignments."""
     super(_IsolationModel, self).__init__()
     loaded = []
     for entry in _prepare_alignments(options, 'ziphmm_src1_admix'):
         loaded.append(Forwarder.fromDirectory(entry))
     self.forwarders = loaded
예제 #9
0
    def __init__(self, options):
        """Load the forwarders for the 'ziphmm_src1_admix' alignment set.

        Every entry returned by ``_prepare_alignments`` for that key is
        handed to ``Forwarder.fromDirectory``.
        """
        super(_TwoPopAdmix23OneSampleModel, self).__init__()

        self.forwarders_src1_admix = [
            Forwarder.fromDirectory(entry)
            for entry in _prepare_alignments(options, 'ziphmm_src1_admix')
        ]
예제 #10
0
def main():
    """Estimate isolation-model parameters from preprocessed forwarders.

    Parses the command line, loads one forwarder per positional argument,
    builds the model, runs ``estimate_I`` from the initial values derived
    from the options, and writes the log likelihood followed by the
    estimates as one tab-separated line to the output file.
    """
    usage = """%prog [options] <forwarder dirs>

This program estimates the parameters of an isolation model with two species
and uniform coalescence/recombination rate."""

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option(
        "-o", "--out",
        dest="outfile",
        type="string",
        default="/dev/stdout",
        help="Output file for the estimate (/dev/stdout)",
    )
    parser.add_option(
        "--tmpfile",
        dest="tmpfile",
        type="string",
        default="/dev/null",
        help="Log for all points estimated in the optimization (/dev/null)",
    )

    def add_float_options(help_template, specs):
        # Register one float option per (name, description, default) triple.
        for name, description, value in specs:
            parser.add_option(
                "--%s" % name,
                dest=name,
                type="float",
                default=value,
                help=help_template % (description, value),
            )

    # Starting values for the optimization.
    add_float_options("Initial guess at the %s (%g)", [
        ('splittime', 'split time', 1e6),
        ('Ne', 'effective population size', 20e3),
        ('recomb', 'recombination rate', 0.1),
    ])
    # Constants used to derive the starting values below.
    add_float_options("Value of the %s (%g)", [
        ('mu', 'mutation rate', 1e-9),
        ('g', 'generation time', 20),
    ])

    parser.add_option(
        "--intervals",
        dest="intervals",
        type="int",
        default=10,
        help="Number of sub intervals used to discretize the time (10)",
    )
    parser.add_option(
        "--header",
        dest="include_header",
        action="store_true",
        default=False,
        help="Include a header on the output",
    )
    parser.add_option(
        "-v", "--verbose",
        dest="verbose",
        action="store_true",
        default=False,
        help="Print help",
    )

    options, args = parser.parse_args()
    if len(args) < 1:
        parser.error("Needs at least one preprocessed sequence to work on")

    def discard(s):
        # Stand-in logger used when --verbose is not given.
        return None

    logu = log_unfinished_line if options.verbose else discard
    log = log_finished_line if options.verbose else discard

    logu("Loading forwarders...")
    forwarders = [Forwarder.fromDirectory(path) for path in args]
    log("done")

    logu("Constructing model...")
    modelI = build_epoch_seperated_model(2, [[0, 0]], [1, options.intervals])
    log("done")

    # Starting point of the optimization, derived from the options.
    start_T = options.splittime * options.mu
    start_C = 1.0 / (options.g * options.mu * 2 * options.Ne)
    start_R = options.recomb

    with open(options.tmpfile, 'w') as tmpfile:
        L, est = estimate_I(modelI, forwarders, start_T, start_C, start_R,
                            outfile=tmpfile)

    columns = "\t".join(map(str, est))
    with open(options.outfile, 'w') as outfile:
        if options.include_header:
            outfile.write('logL\tT\tC\tR' + "\n")
        outfile.write(("%f\t%s" % (L, columns)) + "\n")