Python RateEstimation примеры использования

Язык программирования: Python

Пространство имен/Пакет: CGAT

Класс/Тип: RateEstimation

Примеров на hotexamples.com: 10

Python RateEstimation — найдено 10 примеров. Это лучшие примеры Python-кода для CGAT.RateEstimation, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

countSubstitutions(2)

getDistanceGTR(2)

getQMatrix(2)

getRateMatrix(2)

evaluateCodonPair(1)

setFrequencies(1)

Пример #1

Показать файл

Файл: xrate_blocks.py Проект: logust79/cgat-apps

def outputRates(result, options):
    """Write the GTR distance of every terminal in the trained grammar.

    The trained model is taken from ``result.getModel()``. For each key of
    its terminal frequencies, ``RateEstimation.getDistanceGTR`` is called
    with the matching frequencies and rate matrix; the distance it returns
    is formatted with ``options.value_format`` and written to
    ``options.stdout`` preceded by a tab. No newline is written.
    """
    model = result.getModel()

    frequencies = model.evaluateTerminalFrequencies()
    rate_matrices = model.evaluateRateMatrix()

    for key in frequencies.keys():
        # getDistanceGTR returns a pair; only the distance is reported.
        _, distance = RateEstimation.getDistanceGTR(frequencies[key],
                                                    rate_matrices[key])
        formatted = options.value_format % distance
        options.stdout.write("\t%s" % formatted)

Пример #2

Показать файл

Файл: mali2kaks.py Проект: santayana/cgat

    def getQMatrix(pi, k, s, n):
        """Build a codon rate matrix from codon frequencies and three rates.

        Only codon pairs that ``RateEstimation.evaluateCodonPair`` reports as
        a single change receive a rate: the base rate is *s* for synonymous
        and *n* for non-synonymous pairs, it is multiplied by *k* when the
        change is not a transition, and finally by the frequency ``pi`` of
        the target codon. Each diagonal element is the negative sum of its
        row. All entries are then divided by the frequency-weighted sum of
        the row sums, which is returned alongside the matrix.

        Returns a tuple ``(Q, trace)``.
        """
        codons = Bio.Data.CodonTable.standard_dna_table.forward_table.keys()
        matrix = initializeQMatrix(codons)

        trace = 0.0
        for source in codons:
            off_diagonal = 0.0
            for target in codons:
                if source == target:
                    continue

                is_single, is_synonymous, is_transition = \
                    RateEstimation.evaluateCodonPair(source, target)
                if not is_single:
                    continue

                # pick the base rate by substitution class, then scale
                # non-transitions by k
                rate = s if is_synonymous else n
                if not is_transition:
                    rate = rate * k

                rate *= pi[target]
                matrix[source][target] = rate
                off_diagonal += rate

            matrix[source][source] = -off_diagonal
            trace += pi[source] * off_diagonal

        # normalise every entry by the weighted sum of row sums
        for source in codons:
            for target in codons:
                matrix[source][target] /= trace

        return matrix, trace

Пример #3

Показать файл

def processMali(mali, options):
    """Train the block-wise codon grammar on *mali* and write one row per block.

    Steps, in order:

    1. If ``options.block_size`` is set, annotate the alignment with a
       ``STATE`` row of ``N`` (first block) followed by ``C`` (rest). A
       block size below 1 is treated as a fraction of the codons, otherwise
       as a number of codons.
    2. Rename sequences containing ``options.separator`` to the part before
       the separator.
    3. Optionally remove gapped/masked columns and read a tree from
       ``options.input_filename_tree``.
    4. Call ``prepareGrammar`` to train the grammar and, for every block,
       derive dN, dS, omega and kappa from the trained rate parameters.

    One tab-separated line per block is written to ``options.stdout``. If a
    ``ZeroDivisionError`` occurs while computing the rates, all rate columns
    are reported as ``"na"``.

    Raises:
        ValueError: if the alignment has no columns.
    """

    ncols = mali.getNumColumns()

    if ncols == 0:
        # string exceptions are not valid; raise a real exception instead
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()

    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # parse inside the context manager so the file is always closed
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c
    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set cannot be indexed; take its only element
        blocks = (("B0_", next(iter(chars))), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, _ = prepareGrammar(xgram, mali, tree, map_old2new, blocks,
                                     options)

    trained_model = result.getModel()

    pis, _ = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # For each sharing mode: which of (Rs, Rn, Ri, Rv) carry the block prefix.
    # Any mode not listed here uses block-specific names for all four.
    block_specific_rates = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }.get(options.shared_rates, (True, True, True, True))

    for block, code in blocks:

        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)

        pi = pis[terminals]

        rate_prefix_rs, rate_prefix_rn, rate_prefix_ri, rate_prefix_rv = [
            block if is_specific else ""
            for is_specific in block_specific_rates]

        rs = trained_model.mGrammar.getParameter('%sRs' % rate_prefix_rs)
        rn = trained_model.mGrammar.getParameter('%sRn' % rate_prefix_rn)
        ri = trained_model.mGrammar.getParameter('%sRi' % rate_prefix_ri)
        rv = trained_model.mGrammar.getParameter('%sRv' % rate_prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)

            # rate matrix with Rs and Rn replaced by their average
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # rate matrix with Ri and Rv replaced by their average
            avg_kappa = (ri + rv) / 2.0
            Q1, _ = RateEstimation.getQMatrix(pi,
                                              Rsi=rs * avg_kappa,
                                              Rsv=rs * avg_kappa,
                                              Rni=rn * avg_kappa,
                                              Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            # result unused, but the call is kept so that any error it
            # raises is still handled as before
            RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            # NOTE(review): kappa is normalised with rI0/rV0 (from Q0) here,
            # whereas outputXRateResult in mali2kaks.py uses rI1/rV1 (from
            # Q1) -- confirm which is intended.
            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:

            o_kappa = "na"
            o_omega = "na"
            o_dn = "na"
            o_ds = "na"
            o_rn = "na"
            o_rs = "na"
            o_rn0 = "na"
            o_rs0 = "na"
            o_t = "na"
            o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(
            map(str, (code, block, o_dn, o_ds, o_omega, "na", "na", "na", "na",
                      o_kappa, result.getLogLikelihood(), "na", nchars))))

        if options.with_rho:
            options.stdout.write(
                "\t" +
                "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)

Пример #4

Показать файл

def prepareGrammar(xgram, mali, tree, map_old2new, blocks, options):
    """Build the block-wise codon grammar, train it and return the result.

    Args:
        xgram: object whose ``train(model, filename)`` runs the training.
        mali: the alignment; it is clipped to the ``STATE`` labels found in
            *blocks* and written in stockholm format to a temporary file.
        tree: a tree object or a false value. Without a tree the alignment
            must contain exactly two sequences.
        map_old2new: mapping whose values name the two sequences when no
            tree is supplied.
        blocks: sequence of ``(prefix, code)`` pairs, one per block.
        options: supplies ``shared_rates``, ``shared_frequencies``,
            ``insert_frequencies``, ``fix_frequencies``, ``dump`` and
            ``stdlog``.

    Returns:
        The tuple ``(result, mali, ids)`` where *ids* are the identifiers of
        *mali* read before clipping.

    Raises:
        ValueError: if no tree is given and *mali* does not hold exactly two
            sequences.
    """

    # a real list: it is measured, indexed and joined below
    labels = [block[1] for block in blocks]
    nblocks = len(blocks)

    annotate_terminals = {}
    for x, label in enumerate(labels):
        key = tuple("B%i_COD%i" % (x, c) for c in range(0, 3))
        annotate_terminals[key] = [
            Annotation(row="STATE", column=t, label=label) for t in key]

    input_model = Codons.buildCodonML(
        codon_model="f3x4-fourproducts",
        num_blocks=nblocks,
        grammar_type="linear-blocks",
        annotate_terminals=annotate_terminals,
        shared_frequencies=options.shared_frequencies,
        shared_rates=False,
    )

    # manually share rates between blocks: for each sharing mode, the
    # parameters whose block prefix is dropped (order of renaming preserved)
    shared_parameters = {
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Ri", "Rv", "Rs"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rv", "Rs", "Rn"),
        "ds": ("Rs", ),
        "all": ("Rv", "Rs", "Rn", "Ri"),
    }.get(options.shared_rates, ())

    for c in range(0, nblocks):
        for parameter in shared_parameters:
            input_model.renameParameter("B%i_%s" % (c, parameter), parameter)

    writeModel(input_model, "input", options)

    ids = mali.getIdentifiers()

    # clip mali by supplied blocks
    mali.clipByAnnotation("STATE", "".join(labels))

    # Work out the tree line before creating the scratch file, so that the
    # error below does not leave an open handle or a stray file behind.
    if tree:
        tree.rescaleBranchLengths(1.0)
        tree_options = "#=GF NH %s" % tree.to_string(branchlengths_only=True,
                                                     format="nh")
    elif mali.getNumSequences() == 2:
        tree_options = "#=GF NH (%s:1.0)%s;" % tuple(map_old2new.values())
    else:
        raise ValueError("Please supply a tree.")

    # NOTE(review): the temporary file is never deleted in this function --
    # confirm whether it can be removed once training has finished.
    fh, filename = tempfile.mkstemp()
    os.close(fh)

    with open(filename, "w") as outfile:
        mali.writeToFile(outfile,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    # prefix, code
    if options.shared_frequencies:
        frequency_codes = (("", ""), )
    else:
        frequency_codes = blocks

    if options.insert_frequencies:
        for prefix, code in frequency_codes:
            temp_mali = mali.getClone()
            temp_mali.clipByAnnotation("STATE", code)
            RateEstimation.setFrequencies(input_model, temp_mali, prefix)

    if options.fix_frequencies:
        for prefix, code in frequency_codes:
            for char in ('a', 'c', 'g', 't'):
                for x in (0, 1, 2):
                    param = "%sp%s%i" % (prefix, char, x)
                    input_model.mGrammar.moveVariableToConst(param)

    # written a second time so the output reflects the frequency changes
    writeModel(input_model, "input", options)

    result = xgram.train(input_model, filename)

    if options.dump:
        options.stdlog.write("".join(result.mData))
        options.stdlog.write("".join(result.mLog))
        mali.writeToFile(options.stdlog,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    writeModel(result.getModel(), "trained", options)

    return result, mali, ids

Пример #5

Показать файл

def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Write one result row for the xrate four-parameter grammar.

    Args:
        mali: alignment holding the pair; its first two identifiers name
            the row.
        result: training result providing ``getModel()`` and
            ``getLogLikelihood()``.
        rsi, rsv, rni, rnv: the four estimated rates passed to
            ``RateEstimation.getQMatrix`` as ``Rsi``, ``Rsv``, ``Rni`` and
            ``Rnv``. If *rsi* is ``None`` every derived column is ``"na"``.
        msg: free-text message appended as the last column.

    The row is written to ``options.stdout``; ``options`` is not a
    parameter and is looked up in the enclosing scope.
    """
    ids = mali.getIdentifiers()

    pi, _ = RateEstimation.getRateMatrix(result.getModel(),
                                         terminals=('COD0', 'COD1', 'COD2'))

    if rsi is None:
        o_dn, o_ds, o_omega = "na", "na", "na"
        o_rn, o_rn0, o_rs, o_rs0 = "na", "na", "na", "na"
        o_t, o_t0 = "na", "na"
        o_N, o_S = "na", "na"
        # a plain string: a trailing comma here would create a tuple that
        # is printed as "('na',)"
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(pi,
                                         Rsi=rsi,
                                         Rsv=rsv,
                                         Rni=rni,
                                         Rnv=rnv)

        # get rate matrix as if omega was set to 1
        Q0, t0 = RateEstimation.getQMatrix(pi,
                                           Rsi=(rsi + rni) / 2.0,
                                           Rsv=(rsv + rnv) / 2.0,
                                           Rni=(rsi + rni) / 2.0,
                                           Rnv=(rsv + rnv) / 2.0)

        # get rate matrix as if kappa was set to 1
        Q1, _ = RateEstimation.getQMatrix(pi,
                                          Rsi=(rsi + rsv) / 2.0,
                                          Rsv=(rsi + rsv) / 2.0,
                                          Rni=(rni + rnv) / 2.0,
                                          Rnv=(rni + rnv) / 2.0)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1, _, _ = RateEstimation.countSubstitutions(pi, Q1)

        # NOTE(review): an earlier comment here mentioned a 64.0/61.0
        # factor because xrate does not normalize the terminals; no such
        # factor is applied below -- confirm whether it is still needed.
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (dN / dS)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_S = options.value_format % (mali.getNumColumns() * rS0)
        o_N = options.value_format % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = options.value_format % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by the
        # sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV,
            rI0 / rV0,
            options.value_format % ((rsi + rni) / (rsv + rnv)))

    options.stdout.write("\t".join(
        map(str, (mali.getEntry(ids[0]).mId,
                  mali.getEntry(ids[1]).mId,
                  o_dn, o_ds, o_omega,
                  o_N, o_S, "na", "na",
                  o_kappa, result.getLogLikelihood(), "na"))))

    if options.with_rho:
        options.stdout.write(
            "\t" + "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()

Пример #6

Показать файл

Файл: mali2kaks.py Проект: Charlie-George/cgat

def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Write one result row for the xrate four-parameter grammar.

    Args:
        mali: alignment holding the pair; its first two identifiers name
            the row.
        result: training result providing ``getModel()`` and
            ``getLogLikelihood()``.
        rsi, rsv, rni, rnv: the four estimated rates passed to
            ``RateEstimation.getQMatrix`` as ``Rsi``, ``Rsv``, ``Rni`` and
            ``Rnv``. If *rsi* is ``None`` every derived column is ``"na"``.
        msg: free-text message appended as the last column.

    The row is written to ``options.stdout``; ``options`` is not a
    parameter and is looked up in the enclosing scope.
    """
    ids = mali.getIdentifiers()

    pi, _ = RateEstimation.getRateMatrix(result.getModel(),
                                         terminals=('COD0', 'COD1', 'COD2'))

    if rsi is None:
        o_dn, o_ds, o_omega = "na", "na", "na"
        o_rn, o_rn0, o_rs, o_rs0 = "na", "na", "na", "na"
        o_t, o_t0 = "na", "na"
        o_N, o_S = "na", "na"
        # a plain string: a trailing comma here would create a tuple that
        # is printed as "('na',)"
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(pi,
                                         Rsi=rsi,
                                         Rsv=rsv,
                                         Rni=rni,
                                         Rnv=rnv)

        # get rate matrix as if omega was set to 1
        Q0, t0 = RateEstimation.getQMatrix(pi,
                                           Rsi=(rsi + rni) / 2.0,
                                           Rsv=(rsv + rnv) / 2.0,
                                           Rni=(rsi + rni) / 2.0,
                                           Rnv=(rsv + rnv) / 2.0)

        # get rate matrix as if kappa was set to 1
        Q1, _ = RateEstimation.getQMatrix(pi,
                                          Rsi=(rsi + rsv) / 2.0,
                                          Rsv=(rsi + rsv) / 2.0,
                                          Rni=(rni + rnv) / 2.0,
                                          Rnv=(rni + rnv) / 2.0)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1, _, _ = RateEstimation.countSubstitutions(pi, Q1)

        # NOTE(review): an earlier comment here mentioned a 64.0/61.0
        # factor because xrate does not normalize the terminals; no such
        # factor is applied below -- confirm whether it is still needed.
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (dN / dS)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_S = options.value_format % (mali.getNumColumns() * rS0)
        o_N = options.value_format % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = options.value_format % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by the
        # sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV,
            rI0 / rV0,
            options.value_format % ((rsi + rni) / (rsv + rnv)))

    options.stdout.write("\t".join(
        map(str, (mali.getEntry(ids[0]).mId,
                  mali.getEntry(ids[1]).mId,
                  o_dn, o_ds, o_omega,
                  o_N, o_S, "na", "na",
                  o_kappa, result.getLogLikelihood(), "na"))))

    if options.with_rho:
        options.stdout.write(
            "\t" + "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()

Пример #7

Показать файл

Файл: mali2rates.py Проект: CGATOxford/Optic

def runXrate(mali, pairs, options):
    """Estimate a distance for each sequence pair with xrate and report it.

    For every ``(x, y)`` in *pairs*, the sequences ``ids[x]`` and ``ids[y]``
    of *mali* are copied into a two-sequence alignment, written in
    stockholm format to a scratch file and passed to xrate together with
    the DNA model selected by ``options.distance`` (``"K80"``, ``"JC69"``
    or ``"REV"``). One tab-separated line per pair is written to
    ``options.stdout``; with ``options.output_format == "list"`` a header
    line is written first.

    If ``options.test_xrate`` is set, each pair is instead trained from a
    grid of starting values for ``alpha`` and ``beta`` and one line per
    starting point is written.

    Raises:
        ValueError: if ``options.distance`` names an unsupported model.
    """

    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    if options.distance == "K80":
        model = DNA.buildModel(substitution_model="k80")
    elif options.distance == "JC69":
        model = DNA.buildModel(substitution_model="jc69")
    elif options.distance == "REV":
        model = DNA.buildModel(substitution_model="gtr")
    else:
        # string exceptions are not valid; raise a real exception instead
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    writeModel(model, "input", options)

    if options.output_format == "list":
        options.stdout.write(
            "\t".join(("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

        if options.with_counts:
            options.stdout.write(
                "\t%s" % Genomics.SequencePairInfo().getHeader())
        options.stdout.write("\n")

    # The scratch directory is created last and removed in ``finally`` so
    # that it does not outlive a failure while processing a pair.
    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"

    try:
        for x, y in pairs:

            # NOTE(review): ``ids`` is not defined in this function and is
            # looked up in the enclosing scope -- confirm it matches
            # ``mali.getIdentifiers()``.
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a minimum-overlap filter (options.min_overlap) used to
            # sit here as commented-out code; it is not active.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            # o_alpha is never estimated and is always reported as "na"
            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1; this is the normalization
                        # constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha,
                             "%f" % beta,
                             o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha,
                             o_kappa,
                             msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1; this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)

                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'),
                     trained_model.mGrammar.getParameter('gamma'),
                     trained_model.mGrammar.getParameter('delta'),
                     trained_model.mGrammar.getParameter('epsilon'),
                     trained_model.mGrammar.getParameter('theta'))

                pi = trained_model.evaluateTerminalFrequencies(('A0',))[('A0',)]
                matrix = trained_model.evaluateRateMatrix(('A0',))[('A0',)]
                _, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(
                        re.search(r"\(\S+:([0-9.]+)\)", tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance,
                 options.format % result.getLogLikelihood(),
                 o_alpha,
                 o_kappa,
                 msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]], with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)

Пример #8

Показать файл

Файл: mali2rates.py Проект: santayana/cgat

def runXrate(mali, pairs, options):
    """Estimate a distance for each sequence pair with xrate and report it.

    For every ``(x, y)`` in *pairs*, the sequences ``ids[x]`` and ``ids[y]``
    of *mali* are copied into a two-sequence alignment, written in
    stockholm format to a scratch file and passed to xrate together with
    the DNA model selected by ``options.distance`` (``"K80"``, ``"JC69"``
    or ``"REV"``). One tab-separated line per pair is written to
    ``options.stdout``; with ``options.output_format == "list"`` a header
    line is written first.

    If ``options.test_xrate`` is set, each pair is instead trained from a
    grid of starting values for ``alpha`` and ``beta`` and one line per
    starting point is written.

    Raises:
        ValueError: if ``options.distance`` names an unsupported model.
    """

    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    if options.distance == "K80":
        model = DNA.buildModel(substitution_model="k80")
    elif options.distance == "JC69":
        model = DNA.buildModel(substitution_model="jc69")
    elif options.distance == "REV":
        model = DNA.buildModel(substitution_model="gtr")
    else:
        # string exceptions are not valid; raise a real exception instead
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    writeModel(model, "input", options)

    if options.output_format == "list":
        options.stdout.write("\t".join(
            ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

        if options.with_counts:
            options.stdout.write("\t%s" %
                                 Genomics.SequencePairInfo().getHeader())
        options.stdout.write("\n")

    # The scratch directory is created last and removed in ``finally`` so
    # that it does not outlive a failure while processing a pair.
    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"

    try:
        for x, y in pairs:

            # NOTE(review): ``ids`` is not defined in this function and is
            # looked up in the enclosing scope -- confirm it matches
            # ``mali.getIdentifiers()``.
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a minimum-overlap filter (options.min_overlap) used to
            # sit here as commented-out code; it is not active.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            # o_alpha is never estimated and is always reported as "na"
            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1; this is the normalization
                        # constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha,
                             "%f" % beta,
                             o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha,
                             o_kappa,
                             msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1; this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)

                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'),
                     trained_model.mGrammar.getParameter('gamma'),
                     trained_model.mGrammar.getParameter('delta'),
                     trained_model.mGrammar.getParameter('epsilon'),
                     trained_model.mGrammar.getParameter('theta'))

                pi = trained_model.evaluateTerminalFrequencies(('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(('A0', ))[('A0', )]
                _, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(
                        re.search(r"\(\S+:([0-9.]+)\)", tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance, options.format % result.getLogLikelihood(),
                 o_alpha, o_kappa, msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]],
                    mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)

Пример #9

Показать файл

Файл: xrate_tms.py Проект: BioinformaticsArchive/cgat

def processMali(mali, options):
    """Train the blocked codon grammar on an alignment and report per block.

    The alignment carries a ``STATE`` annotation made of ``N`` and ``C``
    characters (it is created here when ``options.block_size`` is set).
    The grammar is trained through prepareGrammar() and, for every block,
    one tab-separated row is written to ``options.stdout``: code, block
    prefix, dN, dS, omega, four ``na`` placeholders, kappa, the
    log-likelihood, another ``na`` and the number of ``STATE`` characters
    equal to the block's code.  If ``options.with_rho`` is set, the rate
    and branch length columns are appended; every row ends with a message.

    Arguments
    ---------
    mali
        Alignment object.  It is modified in place (annotation added,
        identifiers renamed, gaps optionally removed).
    options
        Option container.  Attributes read here: ``block_size``,
        ``separator``, ``xrate_min_increment``, ``clean_mali``,
        ``input_filename_tree``, ``loglevel``, ``stdlog``,
        ``shared_rates``, ``value_format``, ``with_rho`` and ``stdout``.

    Raises
    ------
    ValueError
        If the alignment has no columns.
    """

    ncols = mali.getNumColumns()

    if ncols == 0:
        # raising a plain string is not a valid exception
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # fractional block size: a proportion of the codons
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # absolute block size, given in codons
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                make_unique=True)

    xgram = XGram.XGram()

    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # close the tree file once it has been parsed
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c
    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block")
        # a set cannot be indexed; take its only member via a list
        blocks = (("B0_", list(chars)[0]),)
    else:
        blocks = (("B0_", "N"),
                  ("B1_", "C"))

    result, mali, ids = prepareGrammar(
        xgram, mali, tree, map_old2new, blocks, options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # For each sharing mode: does (Rs, Rn, Ri, Rv) keep the block prefix?
    # Parameters that prepareGrammar() renamed to a shared name have none.
    block_specific_rates = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }
    # any other value: all four rates are block specific
    keeps_prefix = block_specific_rates.get(
        options.shared_rates, (True, True, True, True))

    for block, code in blocks:

        terminals = ("%sCOD0" % block,
                     "%sCOD1" % block,
                     "%sCOD2" % block)

        pi = pis[terminals]

        rate_prefix_rs, rate_prefix_rn, rate_prefix_ri, rate_prefix_rv = [
            block if flag else "" for flag in keeps_prefix]

        grammar = trained_model.mGrammar
        rs = grammar.getParameter('%sRs' % rate_prefix_rs)
        rn = grammar.getParameter('%sRn' % rate_prefix_rn)
        ri = grammar.getParameter('%sRi' % rate_prefix_ri)
        rv = grammar.getParameter('%sRv' % rate_prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)

            # reference matrix: same rate for Rs and Rn
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # reference matrix: same rate for Ri and Rv.  Its values do
            # not appear in the output; the calls are kept so that a
            # ZeroDivisionError raised by them still selects the "na" row.
            avg_kappa = (ri + rv) / 2.0
            Q1, _ = RateEstimation.getQMatrix(pi,
                                              Rsi=rs * avg_kappa,
                                              Rsv=rs * avg_kappa,
                                              Rni=rn * avg_kappa,
                                              Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            # report every derived value as missing
            o_kappa = o_omega = "na"
            o_dn = o_ds = "na"
            o_rn = o_rs = "na"
            o_rn0 = o_rs0 = "na"
            o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write("\t" + "\t".join(map(str, (
                o_rn, o_rs, o_t,
                o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)

Пример #10

Показать файл

Файл: xrate_tms.py Проект: BioinformaticsArchive/cgat

def prepareGrammar(xgram, mali, tree, map_old2new, blocks, options):
    """prepare grammar for custom grammars.

    Builds a linear-blocks ``f3x4-fourproducts`` codon model with one
    block per entry of *blocks*, renames rate parameters so that they are
    shared between blocks according to ``options.shared_rates``, writes
    the alignment in stockholm format to a temporary file and trains the
    model on it with ``xgram.train``.

    Arguments
    ---------
    xgram
        Object providing ``train(model, filename)``.
    mali
        Alignment; clipped in place to the columns annotated with the
        block labels.
    tree
        Tree object, or a false value.  Without a tree the alignment must
        contain exactly two sequences.
    map_old2new
        Identifier mapping; its values name the two sequences when no
        tree is given.
    blocks
        Sequence of ``(prefix, label)`` pairs; only the label is used.
    options
        Option container.  Attributes read here: ``shared_frequencies``,
        ``shared_rates``, ``insert_frequencies``, ``fix_frequencies``,
        ``dump`` and ``stdlog``.

    Returns
    -------
    tuple
        ``(result, mali, ids)``: the training result, the clipped
        alignment and the identifiers collected before clipping.

    Raises
    ------
    ValueError
        If no tree is given and the alignment does not hold exactly two
        sequences.
    """

    # a real list, so that len() and indexing work on Python 2 and 3
    labels = [block[1] for block in blocks]
    nblocks = len(blocks)

    # one group of three codon-position terminals per block
    annotate_terminals = {}
    for x, label in enumerate(labels):
        key = tuple("B%i_COD%i" % (x, c) for c in range(0, 3))
        annotate_terminals[key] = [
            Annotation(row="STATE", column=t, label=label) for t in key]

    input_model = Codons.buildCodonML(codon_model="f3x4-fourproducts",
                                      num_blocks=nblocks,
                                      grammar_type="linear-blocks",
                                      annotate_terminals=annotate_terminals,
                                      shared_frequencies=options.shared_frequencies,
                                      shared_rates=False,
                                      )

    # manually share rates between blocks: the listed parameters lose
    # their block prefix, so all blocks refer to the same parameter
    shared_parameters = {
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Ri", "Rv", "Rs"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rv", "Rs", "Rn"),
        "ds": ("Rs",),
        "all": ("Rv", "Rs", "Rn", "Ri"),
    }
    to_share = shared_parameters.get(options.shared_rates, ())
    for c in range(0, nblocks):
        for name in to_share:
            input_model.renameParameter("B%i_%s" % (c, name), name)

    writeModel(input_model, "input", options)

    ids = mali.getIdentifiers()

    # clip mali by supplied blocks
    mali.clipByAnnotation("STATE", "".join(labels))

    # Build the tree line before creating the temporary file, so that a
    # missing tree does not leave a file and an open handle behind.
    if tree:
        tree.rescaleBranchLengths(1.0)
        tree_options = "#=GF NH %s" % tree.to_string(
            branchlengths_only=True, format="nh")
    elif mali.getNumSequences() == 2:
        tree_options = "#=GF NH (%s:1.0)%s;" % tuple(map_old2new.values())
    else:
        # raising a plain string is not a valid exception
        raise ValueError("Please supply a tree.")

    # NOTE(review): the temporary file is never removed; confirm whether
    # the training result still needs it before adding a cleanup.
    fh, filename = tempfile.mkstemp()
    os.close(fh)

    with open(filename, "w") as outfile:
        mali.writeToFile(outfile,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options,))

    # prefix, code
    if options.shared_frequencies:
        frequency_codes = (("", ""),)
    else:
        frequency_codes = blocks

    if options.insert_frequencies:
        for prefix, code in frequency_codes:
            # work on a copy restricted to the columns with this code
            temp_mali = mali.getClone()
            temp_mali.clipByAnnotation("STATE", code)
            RateEstimation.setFrequencies(input_model, temp_mali, prefix)

    if options.fix_frequencies:
        for prefix, code in frequency_codes:
            for char in ('a', 'c', 'g', 't'):
                for x in (0, 1, 2):
                    param = "%sp%s%i" % (prefix, char, x)
                    input_model.mGrammar.moveVariableToConst(param)

    # written a second time, now after the frequency changes above
    writeModel(input_model, "input", options)

    result = xgram.train(input_model, filename)

    if options.dump:
        options.stdlog.write("".join(result.mData))
        options.stdlog.write("".join(result.mLog))
        mali.writeToFile(options.stdlog,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options,))

    writeModel(result.getModel(), "trained", options)

    return result, mali, ids