Exemplo n.º 1
0
def sim_tree_hpip(tree,
                  iRateSeg,
                  dRateSeg,
                  piProbSeg,
                  qMatSeg,
                  ratesList,
                  piProb,
                  qMat,
                  cList,
                  fixSegNumber=False):
    """
    Simulate sequences along a tree using the hPIP model.

    Nodes are visited in preorder, so a parent's value is always set
    before any of its children are processed.
    input:
        tree: object providing preorder_node_iter(); each node exposes
              parent_node and edge.length
        iRateSeg, dRateSeg, piProbSeg, qMatSeg, ratesList, cList,
        fixSegNumber: forwarded to sim_segs_initial / segs_change
        piProb: NOT used as given; it is replaced by pi_from_qmat(qMat)
        qMat: rate matrix
    output:
        tree: the same tree, with node.value assigned on every node
    """
    # The caller-supplied piProb is discarded in favour of the
    # distribution derived from qMat.
    piProb = pi_from_qmat(qMat)
    for node in tree.preorder_node_iter():
        parent = node.parent_node
        if parent is not None:
            # Non-root node: evolve the parent's segments along this edge.
            node.value = segs_change(parent.value, piProbSeg, iRateSeg,
                                     dRateSeg, qMatSeg, node.edge.length,
                                     ratesList, qMat, piProb, cList)
            continue
        # Root node: draw the initial segments.
        node.value = sim_segs_initial(iRateSeg, dRateSeg, piProbSeg,
                                      ratesList, piProb, cList, fixSegNumber)
    return tree
Exemplo n.º 2
0
def sim_tree(tree,
             p,
             ratesList,
             piProbSeg,
             piProb,
             qMat,
             cList,
             fixSegNumber=False):
    """
    Simulate sequences along a tree using the GeoPIP model.

    Nodes are visited in preorder, so a parent's value is always set
    before any of its children are processed.
    input:
        tree: a tree, of dendropy.Tree class
        p: parameter of geometric distribution
        ratesList: a list of all rates
        piProbSeg, cList, fixSegNumber: forwarded to sim_segs_initial
        piProb: NOT used as given; it is replaced by pi_from_qmat(qMat)
        qMat: rate matrix
    output:
        tree: updated tree, with value at each node
    """
    # The caller-supplied piProb is discarded in favour of the
    # distribution derived from qMat.
    piProb = pi_from_qmat(qMat)
    for node in tree.preorder_node_iter():
        parent = node.parent_node
        if parent is not None:
            # Non-root node: evolve the parent's segments along this edge.
            node.value = segs_change(parent.value, node.edge.length, qMat,
                                     piProb, cList)
            continue
        # Root node: draw the initial segments.
        node.value = sim_segs_initial(p, ratesList, piProbSeg, piProb, cList,
                                      fixSegNumber)
    return tree
Exemplo n.º 3
0
def sim_tree_hpip(tree, iRateSeg, dRateSeg, piProbSeg, qMatSeg, ratesList, piProb, qMat, cList, fixSegNumber=False):
    """
    Generate sequences on every node of a tree using the hPIP model.

    The preorder traversal guarantees that a node's parent already has
    its value when the node itself is handled.  The piProb argument is
    ignored: it is recomputed from qMat with pi_from_qmat.
    Returns the same tree with node.value assigned on every node.
    """
    piProb = pi_from_qmat(qMat)
    for current in tree.preorder_node_iter():
        is_root = current.parent_node is None
        if is_root:
            # root: sample the starting segments
            segs = sim_segs_initial(iRateSeg, dRateSeg, piProbSeg, ratesList, piProb, cList, fixSegNumber)
        else:
            # child: evolve the parent's segments along the incoming edge
            segs = segs_change(current.parent_node.value, piProbSeg, iRateSeg, dRateSeg, qMatSeg, current.edge.length, ratesList, qMat, piProb, cList)
        current.value = segs
    return tree
Exemplo n.º 4
0
def param_for_estep(qMat, rate=1.0, categoryPriors=1.0):
    """
    Build the parameter dictionary (content of input.json) required for
    the E step: the rate matrix and the categories of rates.

    NOTE: THIS FUNCTION CURRENTLY ONLY WORKS FOR ONE RATE
          (DIFFERENT FROM THE R CODE INPUT).  The rate and categoryPriors
          arguments are accepted but not used; a single category with
          prior 1 is always emitted.
    input:
        qMat: a rate matrix
    output:
        dict with keys 'categoryPriors', 'observationErrorProbability',
        'stationaryDistributions' and 'rateMatrices'
    """
    stationary = pi_from_qmat(qMat)
    return {
        'categoryPriors': [1],
        'observationErrorProbability': 0,
        'stationaryDistributions': [stationary],
        'rateMatrices': [qMat],
    }
Exemplo n.º 5
0
def qmat_paml_format(qMat):
    """
    Convert a rate matrix to PaML format, i.e., frequencies and relative
    rate parameters.
    input:
        qMat: a rate matrix
    output:
        piProb: stationary distribution of the rate matrix
        rate: relative rate parameters, in the order of
              A <-> C, A <-> G, A <-> T, C <-> G, C <-> T, G <-> T,
              scaled so that the last entry (G <-> T) equals 1
    """
    piProb = pi_from_qmat(qMat)
    scaled = qMat / piProb
    # Strict upper triangle of the leading 4x4 block, row by row.
    upperRows = [list(scaled[row, (row + 1):4]) for row in range(3)]
    flat = np.hstack(upperRows)
    return piProb, flat / flat[-1]
Exemplo n.º 6
0
def qmat_paml_format(qMat):
    """
    Express a rate matrix in PaML format: stationary frequencies plus
    relative rate parameters.
    input:
        qMat: a rate matrix
    output:
        piProb: stationary distribution of the rate matrix
        rate: relative rate parameters, in the order of
              A <-> C, A <-> G, A <-> T, C <-> G, C <-> T, G <-> T,
              normalised by the last (G <-> T) entry
    """
    piProb = pi_from_qmat(qMat)
    tMat = qMat / piProb
    # Collect the entries above the diagonal of the first four
    # rows/columns, one row at a time.
    pieces = []
    for i in range(3):
        pieces.append(list(tMat[i, (i + 1):4]))
    rate = np.hstack(pieces)
    rate = rate / rate[-1]
    return piProb, rate
Exemplo n.º 7
0
def sim_tree(tree, p, ratesList, piProbSeg, piProb, qMat, cList, fixSegNumber=False):
    """
    Generate sequences on every node of a tree using the GeoPIP model.
    input:
        tree: a tree, of dendropy.Tree class
        p: parameter of geometric distribution
        ratesList: a list of all rates
        piProbSeg, cList, fixSegNumber: forwarded to sim_segs_initial
        piProb: ignored; recomputed from qMat with pi_from_qmat
        qMat: rate matrix
    output:
        tree: updated tree, with value at each node
    """
    piProb = pi_from_qmat(qMat)
    for current in tree.preorder_node_iter():
        is_root = current.parent_node is None
        if is_root:
            # root: sample the starting segments
            segs = sim_segs_initial(p, ratesList, piProbSeg, piProb, cList, fixSegNumber)
        else:
            # child: evolve the parent's segments along the incoming edge
            segs = segs_change(current.parent_node.value, current.edge.length, qMat, piProb, cList)
        current.value = segs
    return tree
Exemplo n.º 8
0
def opt_pip_full(rate, qMat, multiAlign, javaDirectory, modelDirectory, eStepFile, parametersPath, inputLoc, outputLoc, dataLoc, execsLoc, rFileLoc, cList, qRates=[1.], suffix='', updateQ=True, updateRate=True, updateRateFixdRateTimesb=True, tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    optimization for all parameters: iRate, dRate, qMat, tree (bDict) in PIP
    updating in iRate and dRate, qMat, tree (bDict) iteratively
    estimate bDict first given other, so starting bDict (tree) is not needed

    Each iteration optionally (1) updates the rate with dRate*b held fixed,
    (2) updates the rate with opt_drate, (3) updates Q with
    opt_qmat_em_full, then always re-estimates bDict and rebuilds the tree.
    input:
        rate: indexable rates; rate[1] is used as the deletion rate
        qMat: starting rate matrix; the stationary distribution is derived
              from it with pi_from_qmat
        multiAlign: multiple alignment, passed to
                    pair_align_from_multi_align and written out with
                    dict_write_align_fasta
        javaDirectory, modelDirectory, eStepFile, parametersPath,
        outputLoc, execsLoc: forwarded unchanged to opt_qmat_em_full
        inputLoc: directory receiving 'all.align.txt' and 'all.tree.txt';
                  also forwarded to opt_qmat_em_full
        dataLoc, rFileLoc, suffix: used to set up the output file
                                   locations and R script for tree building
        cList: forwarded to the optimization helpers
        qRates: forwarded to opt_nlists_bonly and opt_drate
        updateQ, updateRate, updateRateFixdRateTimesb: switches for the
            three optional update steps
        tol, bTol, iterMax: the loop runs while the largest relative
            change exceeds tol, the bDict relative change exceeds bTol and
            iterNum is below iterMax
    output:
        rate: final rate wrapped in a one-element list
        qMat: final rate matrix
        bDict: final result of opt_nlists_bonly
        tree: final tree, rerooted at midpoint
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print 'simulation run: %s' % (inputLoc)
    # directory for running EM
    alignInSeg = pair_align_from_multi_align(multiAlign)
    pairsList = alignInSeg.keys()
    piProb = pi_from_qmat(qMat)
    print '###  initialize tree  ###'
    segRateDict = {0: rate}   # only one segment
    bDict = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb, qMat, cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)
    tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    # The initial tree is written before it is rerooted.
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()
    # Large starting values so the loop is entered and the first
    # nllk comparison (nllk - nllkNew) starts from 1.e10.
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    nllkDif = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateRateFixdRateTimesb:
            print '### updating insertion and deletion rate when dRate*b is fixed ###'
            dRate = rate[1]
            print tree.length()
            # NOTE(review): a literal qRates=[1.] is passed here instead of
            # the qRates argument -- confirm this is intended.
            rateNew, tree = opt_drate_fix_drate_times_b(multiAlign, dRate, tree, qMat, piProb, cList, qRates=[1.])
            print tree.length()
            print rate
            print rateNew
            print (np.array(rateNew) - np.array(rate)) / np.array(rate)
            dRateRelativeDifFixdRateTimesb = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            dRateRelativeDifFixdRateTimesb = 0
        if updateRate:
            print '### updating insertion and deletion rate ###\n'
            # rateNew = opt_rate(rate, multiAlign, tree, qMat, piProb, cList)
            rateNew = opt_drate(multiAlign, tree, qMat, piProb, cList, qRates)
            print rate
            print rateNew
            print (np.array(rateNew) - np.array(rate)) / np.array(rate)
            dRateRelativeDif = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            print 'rate is fixed:', rate
            dRateRelativeDif = 0
        if updateQ:
            print '### updating rate matrix Q ###\n'
            qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc, outputLoc, javaDirectory, modelDirectory, eStepFile, parametersPath, execsLoc)
            # Largest element-wise relative change of Q.
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print '### fixing rate matrix Q ###\n'
            qMatRelativeDif = 0
        print '### updating tree ###'
        segRateDict = {0: rate}   # only one segment
        bDictNew = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb, qMat, cList, qRates)
        # Largest relative change over the keys of the previous bDict.
        bDictRelativeDifVec = [abs(bDictNew[key] - bDict[key]) / bDict[key] for key in bDict.keys()]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        print (np.array(bDictNew.values()) - np.array(bDict.values())) / np.array(bDict.values())
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
        # print 'iter=%s: iRate diff = %s, Q diff = %s, bDict diff = %s' % (iterNum, iRateRelativeDif, qMatRelativeDif, bDictRelativeDif)
        # dif = max(iRateRelativeDif, qMatRelativeDif, bDictRelativeDif)
        print 'iter=%s: dRate diff = %s, dRate diff (fix dRate*b) = %s, Q diff = %s, bDict diff = %s' % (iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesb, qMatRelativeDif, bDictRelativeDif)
        dif = max(dRateRelativeDif, qMatRelativeDif, bDictRelativeDif, dRateRelativeDifFixdRateTimesb)
        tree.reroot_at_midpoint()
        # nllkDif > 0 means the value returned by nllk_rate went down
        # (presumably a negative log-likelihood -- confirm against nllk_rate).
        nllkNew = nllk_rate(rate, multiAlign, tree, qMat, piProb, cList)
        nllkDif = nllk - nllkNew
        print 'llk increase =', nllkDif
        nllk = nllkNew
        write_tree(tree, outTreeFile)
        iterNum += 1
        # if dRateRelativeDifFixdRateTimesb is small, then skip that update
        # in all later iterations (the flag is never switched back on)
        if dRateRelativeDifFixdRateTimesb < 0.5:
            updateRateFixdRateTimesb = False
        if nllkDif <= 0:
            print 'Log-Lilkelihood is decreasing! BREAK'
            break
    # NOTE(review): the else branch is also taken when the loop was left
    # through the break above, so the success message is printed then too.
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
    # The rate is returned wrapped in a one-element list.
    rate = [rate]
    return rate, qMat, bDict, tree
Exemplo n.º 9
0
def opt_pip_full(rate,
                 qMat,
                 multiAlign,
                 javaDirectory,
                 modelDirectory,
                 eStepFile,
                 parametersPath,
                 inputLoc,
                 outputLoc,
                 dataLoc,
                 execsLoc,
                 rFileLoc,
                 cList,
                 qRates=[1.],
                 suffix='',
                 updateQ=True,
                 updateRate=True,
                 updateRateFixdRateTimesb=True,
                 tol=1.e-2,
                 bTol=1.e-3,
                 iterMax=100):
    """
    optimization for all parameters: iRate, dRate, qMat, tree (bDict) in PIP
    updating in iRate and dRate, qMat, tree (bDict) iteratively
    estimate bDict first given other, so starting bDict (tree) is not needed

    Each iteration optionally (1) updates the rate with dRate*b held fixed,
    (2) updates the rate with opt_drate, (3) updates Q with
    opt_qmat_em_full, then always re-estimates bDict and rebuilds the tree.
    input:
        rate: indexable rates; rate[1] is used as the deletion rate
        qMat: starting rate matrix; the stationary distribution is derived
              from it with pi_from_qmat
        multiAlign: multiple alignment, passed to
                    pair_align_from_multi_align and written out with
                    dict_write_align_fasta
        javaDirectory, modelDirectory, eStepFile, parametersPath,
        outputLoc, execsLoc: forwarded unchanged to opt_qmat_em_full
        inputLoc: directory receiving 'all.align.txt' and 'all.tree.txt';
                  also forwarded to opt_qmat_em_full
        dataLoc, rFileLoc, suffix: used to set up the output file
                                   locations and R script for tree building
        cList: forwarded to the optimization helpers
        qRates: forwarded to opt_nlists_bonly and opt_drate
        updateQ, updateRate, updateRateFixdRateTimesb: switches for the
            three optional update steps
        tol, bTol, iterMax: the loop runs while the largest relative
            change exceeds tol, the bDict relative change exceeds bTol and
            iterNum is below iterMax
    output:
        rate: final rate wrapped in a one-element list
        qMat: final rate matrix
        bDict: final result of opt_nlists_bonly
        tree: final tree, rerooted at midpoint
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print 'simulation run: %s' % (inputLoc)
    # directory for running EM
    alignInSeg = pair_align_from_multi_align(multiAlign)
    pairsList = alignInSeg.keys()
    piProb = pi_from_qmat(qMat)
    print '###  initialize tree  ###'
    segRateDict = {0: rate}  # only one segment
    bDict = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb, qMat,
                             cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)
    tree = tree_use_r_for_unknown_number_of_leaves(bDict,
                                                   pairsList,
                                                   rCodeNj,
                                                   dataLoc,
                                                   outTreeLoc,
                                                   suffix,
                                                   rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    # The initial tree is written before it is rerooted.
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()
    # Large starting values so the loop is entered and the first
    # nllk comparison (nllk - nllkNew) starts from 1.e10.
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    nllkDif = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateRateFixdRateTimesb:
            print '### updating insertion and deletion rate when dRate*b is fixed ###'
            dRate = rate[1]
            print tree.length()
            # NOTE(review): a literal qRates=[1.] is passed here instead of
            # the qRates argument -- confirm this is intended.
            rateNew, tree = opt_drate_fix_drate_times_b(multiAlign,
                                                        dRate,
                                                        tree,
                                                        qMat,
                                                        piProb,
                                                        cList,
                                                        qRates=[1.])
            print tree.length()
            print rate
            print rateNew
            print(np.array(rateNew) - np.array(rate)) / np.array(rate)
            dRateRelativeDifFixdRateTimesb = abs(rate[1] -
                                                 rateNew[1]) / rate[1]
            rate = rateNew
        else:
            dRateRelativeDifFixdRateTimesb = 0
        if updateRate:
            print '### updating insertion and deletion rate ###\n'
            # rateNew = opt_rate(rate, multiAlign, tree, qMat, piProb, cList)
            rateNew = opt_drate(multiAlign, tree, qMat, piProb, cList, qRates)
            print rate
            print rateNew
            print(np.array(rateNew) - np.array(rate)) / np.array(rate)
            dRateRelativeDif = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            print 'rate is fixed:', rate
            dRateRelativeDif = 0
        if updateQ:
            print '### updating rate matrix Q ###\n'
            qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc,
                                                  outputLoc, javaDirectory,
                                                  modelDirectory, eStepFile,
                                                  parametersPath, execsLoc)
            # Largest element-wise relative change of Q.
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print '### fixing rate matrix Q ###\n'
            qMatRelativeDif = 0
        print '### updating tree ###'
        segRateDict = {0: rate}  # only one segment
        bDictNew = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb,
                                    qMat, cList, qRates)
        # Largest relative change over the keys of the previous bDict.
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        print(np.array(bDictNew.values()) -
              np.array(bDict.values())) / np.array(bDict.values())
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict,
                                                       pairsList,
                                                       rCodeNj,
                                                       dataLoc,
                                                       outTreeLoc,
                                                       suffix,
                                                       rooted=True)
        # print 'iter=%s: iRate diff = %s, Q diff = %s, bDict diff = %s' % (iterNum, iRateRelativeDif, qMatRelativeDif, bDictRelativeDif)
        # dif = max(iRateRelativeDif, qMatRelativeDif, bDictRelativeDif)
        print 'iter=%s: dRate diff = %s, dRate diff (fix dRate*b) = %s, Q diff = %s, bDict diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesb,
            qMatRelativeDif, bDictRelativeDif)
        dif = max(dRateRelativeDif, qMatRelativeDif, bDictRelativeDif,
                  dRateRelativeDifFixdRateTimesb)
        tree.reroot_at_midpoint()
        # nllkDif > 0 means the value returned by nllk_rate went down
        # (presumably a negative log-likelihood -- confirm against nllk_rate).
        nllkNew = nllk_rate(rate, multiAlign, tree, qMat, piProb, cList)
        nllkDif = nllk - nllkNew
        print 'llk increase =', nllkDif
        nllk = nllkNew
        write_tree(tree, outTreeFile)
        iterNum += 1
        # if dRateRelativeDifFixdRateTimesb is small, then skip that update
        # in all later iterations (the flag is never switched back on)
        if dRateRelativeDifFixdRateTimesb < 0.5:
            updateRateFixdRateTimesb = False
        if nllkDif <= 0:
            print 'Log-Lilkelihood is decreasing! BREAK'
            break
    # NOTE(review): the else branch is also taken when the loop was left
    # through the break above, so the success message is printed then too.
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
    # The rate is returned wrapped in a one-element list.
    rate = [rate]
    return rate, qMat, bDict, tree