示例#1
0
def outputRates( result, options ):
    """Write the GTR distance of every terminal of a trained grammar.

    The model is taken from ``result.getModel()``.  Its terminal
    frequencies and rate matrices are evaluated once, and for each terminal
    key the pair is passed to ``RateEstimation.getDistanceGTR``.  The
    distance returned by that call is formatted with
    ``options.value_format`` and written to ``options.stdout`` preceded by a
    tab.  No trailing newline is written.
    """
    model = result.getModel()

    frequencies = model.evaluateTerminalFrequencies()
    rate_matrices = model.evaluateRateMatrix()

    for key in frequencies.keys():
        # getDistanceGTR returns a pair; only the second item (the
        # distance) is reported here.
        _rate_matrix, dist = RateEstimation.getDistanceGTR(
            frequencies[key], rate_matrices[key])
        formatted = options.value_format % dist
        options.stdout.write("\t%s" % (formatted,))
示例#2
0
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write them to ``options.stdout``.

    For every ``(x, y)`` in *pairs* the sequences ``ids[x]`` and ``ids[y]``
    are fetched from *mali*, written as a two-sequence Stockholm alignment
    (with a ``#=GF NH`` tree line using a branch length of 1.0) to a scratch
    file, and passed to xrate:

    * ``K80`` and ``REV`` train the model with ``xgram.train``;
    * ``JC69`` calls ``xgram.buildTree`` and parses the branch length out of
      the returned tree.

    One tab-separated row ``seq1, seq2, distance, lnL, alpha, kappa, msg`` is
    written per pair.  The ``alpha`` column is always ``"na"`` in this
    function.  If ``options.output_format == "list"`` a header line is
    written first.  If ``options.with_counts`` is set, the result of
    ``Genomics.CalculatePairIndices`` is appended to each row.

    If ``options.test_xrate`` is set, each pair is instead trained from a
    grid of starting values for ``alpha`` and ``beta`` and one row is
    written per starting point; the regular row is skipped.

    NOTE(review): ``ids`` is not defined in this function; it is resolved as
    a module-level name -- presumably the identifiers of *mali*.  Verify
    against the caller.

    Args:
        mali: alignment object providing ``getSequence`` and item access.
        pairs: iterable of ``(x, y)`` index pairs into ``ids``.
        options: option object; reads ``distance``, ``xrate_min_increment``,
            ``output_format``, ``with_counts``, ``test_xrate``, ``format``,
            ``is_codons`` and ``stdout``.

    Raises:
        ValueError: if ``options.distance`` is not one of ``K80``, ``JC69``
            or ``REV``, or if no branch length can be parsed from the JC69
            tree.
    """
    # Validate the requested distance first, so that an unsupported value
    # fails before XGram is imported or a scratch directory is created.
    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}
    if options.distance not in substitution_models:
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    from XGram.Generator.Prebuilt import DNA
    # Not referenced below; retained in case the import has side effects.
    from XGram.Model import Annotation  # noqa: F401
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    tempdir = tempfile.mkdtemp()
    # try/finally guarantees the scratch directory is removed even when
    # xrate or the output stream raises.
    try:
        data = tempdir + "/data"

        model = DNA.buildModel(
            substitution_model=substitution_models[options.distance])

        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write(
                "\t".join(("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:

            m1 = mali.getSequence(ids[x])
            temp_mali = Mali.Mali()
            m2 = mali.getSequence(ids[y])

            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a minimum-overlap filter (options.min_overlap) used to
            # be applied at this point; it is disabled.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(outfile, format="stockholm",
                                      write_ranges=False,
                                      options=("#=GF NH (%s:1.0)%s;" % tuple(temp_mali.getIdentifiers()),))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # Diagnostic mode: retrain from a grid of starting values
                # and report each outcome instead of the regular row.
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha, xbeta = \
                            (trained_model.mGrammar.getParameter('alpha'),
                             trained_model.mGrammar.getParameter('beta'))
                        # this assumes that the branch length in the input is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(("%f" % alpha,
                                                        "%f" % beta,
                                                        o_distance,
                                                        options.format % result.getLogLikelihood(),
                                                        o_alpha,
                                                        o_kappa,
                                                        msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'))
                # this assumes that the branch length in the input is normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)

                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'),
                     trained_model.mGrammar.getParameter('gamma'),
                     trained_model.mGrammar.getParameter('delta'),
                     trained_model.mGrammar.getParameter('epsilon'),
                     trained_model.mGrammar.getParameter('theta'))

                pi = trained_model.evaluateTerminalFrequencies(('A0',))[('A0',)]
                matrix = trained_model.evaluateRateMatrix(('A0',))[('A0',)]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                branch = re.search(r"\(\S+:([0-9.]+)\)", tree)
                if branch is None:
                    raise ValueError(
                        "could not parse a branch length from tree: %s" % tree)
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (3.0 * float(branch.group(1)))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join((o_distance,
                                            options.format % result.getLogLikelihood(),
                                            o_alpha,
                                            o_kappa,
                                            msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]], with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)
示例#3
0
def runXrate(mali, pairs, options):
    """Compute xrate-based pairwise distances and write them to ``options.stdout``.

    Each ``(x, y)`` in *pairs* selects the sequences ``ids[x]`` and
    ``ids[y]`` from *mali*.  They are written as a two-sequence Stockholm
    alignment (including a ``#=GF NH`` tree line with branch length 1.0) to
    a scratch file that xrate then reads:

    * ``K80`` and ``REV``: the model is trained with ``xgram.train``;
    * ``JC69``: ``xgram.buildTree`` is called and the branch length is
      parsed from the resulting tree.

    Per pair, one tab-separated row ``seq1, seq2, distance, lnL, alpha,
    kappa, msg`` is written; the ``alpha`` column is always ``"na"`` here.
    A header line is written first when ``options.output_format == "list"``.
    When ``options.with_counts`` is set, the output of
    ``Genomics.CalculatePairIndices`` is appended to each row.

    When ``options.test_xrate`` is set, every pair is trained repeatedly
    from a grid of starting ``alpha``/``beta`` values, one row is written
    per starting point, and the regular row is not written.

    NOTE(review): ``ids`` is not bound inside this function and is looked up
    as a module-level name -- presumably the identifiers of *mali*.  Confirm
    with the caller.

    Args:
        mali: alignment object providing ``getSequence`` and item access.
        pairs: iterable of ``(x, y)`` index pairs into ``ids``.
        options: option object; reads ``distance``, ``xrate_min_increment``,
            ``output_format``, ``with_counts``, ``test_xrate``, ``format``,
            ``is_codons`` and ``stdout``.

    Raises:
        ValueError: if ``options.distance`` is not ``K80``, ``JC69`` or
            ``REV``, or if the JC69 tree contains no parsable branch length.
    """
    # Reject unsupported distances up front: nothing is imported and no
    # scratch directory is created for a request that cannot be served.
    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}
    if options.distance not in substitution_models:
        raise ValueError("distance %s not implemented for xrate" %
                         (options.distance))

    from XGram.Generator.Prebuilt import DNA
    # Not referenced below; retained in case the import has side effects.
    from XGram.Model import Annotation  # noqa: F401
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    tempdir = tempfile.mkdtemp()
    # The scratch directory must be removed on every exit path, including
    # failures inside xrate or while writing output.
    try:
        data = tempdir + "/data"

        model = DNA.buildModel(
            substitution_model=substitution_models[options.distance])

        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write("\t%s" %
                                     Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:

            m1 = mali.getSequence(ids[x])
            temp_mali = Mali.Mali()
            m2 = mali.getSequence(ids[y])

            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a minimum-overlap filter (options.min_overlap) used to
            # be applied at this point; it is disabled.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(outfile,
                                      format="stockholm",
                                      write_ranges=False,
                                      options=("#=GF NH (%s:1.0)%s;" %
                                               tuple(temp_mali.getIdentifiers()), ))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # Diagnostic mode: explore how the optimum depends on the
                # starting point; emits one row per (alpha, beta) start.
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha, xbeta = \
                            (trained_model.mGrammar.getParameter('alpha'),
                             trained_model.mGrammar.getParameter('beta'))
                        # this assumes that the branch length in the input is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha, "%f" % beta, o_distance,
                             options.format % result.getLogLikelihood(), o_alpha,
                             o_kappa, msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'))
                # this assumes that the branch length in the input is normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)

                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'),
                     trained_model.mGrammar.getParameter('gamma'),
                     trained_model.mGrammar.getParameter('delta'),
                     trained_model.mGrammar.getParameter('epsilon'),
                     trained_model.mGrammar.getParameter('theta'))

                pi = trained_model.evaluateTerminalFrequencies(('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                branch = re.search(r"\(\S+:([0-9.]+)\)", tree)
                if branch is None:
                    raise ValueError(
                        "could not parse a branch length from tree: %s" % tree)
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (3.0 * float(branch.group(1)))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance, options.format % result.getLogLikelihood(), o_alpha,
                 o_kappa, msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(mali[ids[x]],
                                                     mali[ids[y]],
                                                     with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)