Exemplo n.º 1
0
def opt_ctmc_full(qMat, multiAlign, javaDirectory, modelDirectory, eStepFile, parametersPath, inputLoc, outputLoc, dataLoc, execsLoc, rFileLoc, cList, qRates=[1], suffix='', updateQ=True, tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    optimization for all parameters in the CTMC model: qMat, tree (bDict)
    update qMat and tree (bDict) iteratively
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print 'CTMC estimate for data in: %s' % (inputLoc)
    pairAlignSubsOnly = pair_align_from_multi_align_subs_only(multiAlign)
    piProb = pi_from_qmat(qMat)
    pairsList = pairAlignSubsOnly.keys()
    print '###  initialize tree  ###'
    bDict, nllkAll = opt_nstr_ctmc_bonly(pairAlignSubsOnly, piProb, qMat, cList, qRates)
    tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateQ:
            print '### updating rate matrix Q ###\n'
            # write_align(pairsList, pairAlignSubsOnly, bDict, inputLoc)
            qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc, outputLoc, javaDirectory, modelDirectory, eStepFile, parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print '### fixing rate matrix Q ###\n'
            qMatRelativeDif = 0
        print '### updating tree ###'
        bDictNew, nllkAll = opt_nstr_ctmc_bonly(pairAlignSubsOnly, piProb, qMat, cList, qRates)
        bDictRelativeDifVec = [abs(bDictNew[key] - bDict[key]) / bDict[key] for key in bDict.keys()]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        bDict = bDictNew
        print 'iter=%s: Q diff = %s, bDict diff = %s' % (iterNum, qMatRelativeDif, bDictRelativeDif)
        dif = max(qMatRelativeDif, bDictRelativeDif)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix='', rooted=True)
        write_tree(tree, outTreeFile)
        iterNum += 1
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
        print 'nllk = %s' % (nllkAll)
    tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    write_tree(tree, outTreeFile)
    return qMat, bDict, tree, nllkAll
Exemplo n.º 2
0
def opt_geopip_full(m, p, qMat, segRateDict, piProbRates, ratesList, multiAlign, lenSegs, javaDirectory, modelDirectory, eStepFile, parametersPath, inputLoc, outputLoc, dataLoc, execsLoc, rFileLoc, cList, qRates=[1.], suffix='', updateQ=True, updateSeg=True, updateRate=True, updateRateFixdRateTimesTau=True, rooted=True, tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    main function for optimization
    updating in segment rates, out segment rates, qMat, bDict iteratively
    we estimate bDict first, so bDict is not needed as input
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    seqNames = multiAlign.keys()
    msaList = zip(*multiAlign.values())
    print 'simulation run: %s' % (inputLoc)
    multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(multiAlign, lenSegs)
    alignsInSeg = pair_align_from_multi_align(multiAlign, lenSegs)
    pairsList = alignsInSeg.keys()
    piProb = pi_from_qmat(qMat)
    print '###  initialize branch lengths  ###'
    bDict = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat, cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)    # this function can be improved, by using outNameLoc, outDistLoc, outTreeLoc instead
    tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    # nllkDif = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateSeg:
            print '### update segmentation ###'
            lenSegs, rateSegs = mle_seg_len_rate(p, msaList, ratesList, seqNames, tree, qMat, piProb, piProbRates, cList)
            nSeg = len(rateSegs)
            alignsInSeg, segRateDict = update_segmentation_in_align_in_seg(lenSegs, rateSegs, ratesList, multiAlign)
            multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(multiAlign, lenSegs)
            piProbRates = update_piprobrates(rateSegs, m, True)
            p = update_p(nSeg)
        print lenSegs
        if updateRateFixdRateTimesTau:
            print '### updating insertion and deletion rate when dRate*tau is fixed ###'
            ratesListNew, segRateDictNew, treeNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate_fix_drate_times_b(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList)
            print 'rate:', ratesList
            print 'new rate:', ratesListNew
            dRateRelativeDifFixdRateTimesTauList = [abs(ratesListNew[index][1] - ratesList[index][1]) / ratesList[index][1] for index in xrange(m) if ratesListNew[index][1] > 1.e-3]
            dRateRelativeDifFixdRateTimesTau = np.array(dRateRelativeDifFixdRateTimesTauList).max()
            ratesList = ratesListNew
            segRateDict = segRateDictNew
            tree = treeNew
        else:
            dRateRelativeDifFixdRateTimesTau = 0
        if updateRate:
            print '### updating in segment rates ###'
            # segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList)
            segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList)
            print 'rate:', ratesList
            print 'new rate:', ratesListNew
            dRateRelativeDifList = [abs(ratesListNew[index][1] - ratesList[index][1]) / ratesList[index][1] for index in xrange(m) if ratesListNew[index][1] > 1.e-3]
            dRateRelativeDif = np.array(dRateRelativeDifList).max()
            segRateDict = segRateDictNew
            ratesList = ratesListNew
        else:
            dRateRelativeDif = 0
            print 'rate is Fixed:', ratesList
        #####################
        # another approach based on clustering
        # segRateDictNew = opt_nlists_rate_all_inseg_only(segRateDict, alignsInSeg, bDict, pairsList, piProb, qMat, cList)
        # # IMPROVE THIS LATER, scale each rate separately first maybe?
        # inSegRateDif = max([abs(segRateDictNew[segId] - np.array(segRateDict[segId])).max() for segId in segRateDictNew.keys()])
        # # update ratesList
        # ratesList, segRateDictNew = ratesList_kmean(segRateDictNew, m)
        # print segRateDict
        # print segRateDictNew
        # segRateDict = segRateDictNew
        # display estiamteed in segment rates
        # for segId, segRate in segRateDict.iteritems():
        #     print 'seg id = %s, \t segRate = %s' %(segId, segRate)
        ######################
        if updateQ:
            print '### updating rate matrix Q ###'
            # if alignUpdate:
            #     aligns = align_mle(lans, pairsList, bDict, iRate, dRate, qMat, piProb, cList)
            # if cogUpdate:
            #     alignsCogOnlyEst = est_align_cog(lans, pairsList, iRate, dRate, cRate, cRateVec, bDict, piProb, qMat, cList, aligns, criticalValue)
            # else:
            #     alignsCogOnlyEst = alignsCogOnly
            # write_align(pairsList, alignsInSeg, bDict, inputLoc)
            qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc, outputLoc, javaDirectory, modelDirectory, eStepFile, parametersPath, execsLoc, lEstepTol=1.e-2, iterMax=100)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            # qMatDif = abs(qMatNew - qMat).max()
            qMat = qMatNew
            piProb = piProbNew
            # qMatDif = 0
        else:
            print '### fixing rate matrix Q ###'
            qMatRelativeDif = 0
        print '### updating branch lengths ###'
        bDictNew = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat, cList, qRates)
        # bDictDifVec = [abs(bDictNew[key] - bDict[key]) for key in bDict.keys()]
        bDictRelativeDifVec = [abs(bDictNew[key] - bDict[key]) / bDict[key] for key in bDict.keys()]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        # bDictDif = np.array(bDictDifVec).max()
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted)
        print 'iter=%s: rates in seg diff = %s, rates in seg diff (fix dRate*tau) = %s, Q diff = %s, b diff = %s' % (iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesTau, qMatRelativeDif, bDictRelativeDif)
        # print 'iter=%s: rates in seg diff = %s, Q diff = %s, b diff = %s' % (iterNum, segRateDif, qMatDif, bDictDif)
        dif = max(dRateRelativeDif, dRateRelativeDifFixdRateTimesTau, qMatRelativeDif, bDictRelativeDif)
        tree.reroot_at_midpoint()
        nllkNew = nllk_msa_geopip_final(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList, piProbRates, p, rateSegs)
        nllkDif = -nllkNew + nllk
        nllkOld = nllk
        nllk = nllkNew
        print 'llk increase =', nllkDif
        if nllkDif <= 0:
            print 'Log-Likelihood is decreasing! BREAK!'
            bDict = bDictOld
            qMat = qMatOld
            segRateDict = segRateDictOld
            alignsInSeg = alignsInSegOld
            p = pOld
            piProbRates = piProbRatesOld
            ratesList = ratesListOld
            lenSegs = lenSegsOld
            tree = treeOld
            nllk = nllkOld
            break
        # do not update fixing dRate*tau if changes is small
        if dRateRelativeDifFixdRateTimesTau < 0.05:
            updateRateFixdRateTimesTau = False
        write_tree(tree, outTreeFile)
        iterNum += 1
        bDictOld = bDict
        qMatOld = qMat
        segRateDictOld = segRateDict
        alignsInSegOld = alignsInSeg
        pOld = p
        piProbRatesOld = piProbRates
        ratesListOld = ratesList
        lenSegsOld = lenSegs
        treeOld = tree
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
    return bDict, qMat, segRateDict, alignsInSeg, p, piProbRates, ratesList, lenSegs, tree, nllk
Exemplo n.º 3
0
def opt_ctmc_full(qMat,
                  multiAlign,
                  javaDirectory,
                  modelDirectory,
                  eStepFile,
                  parametersPath,
                  inputLoc,
                  outputLoc,
                  dataLoc,
                  execsLoc,
                  rFileLoc,
                  cList,
                  qRates=[1],
                  suffix='',
                  updateQ=True,
                  tol=1.e-2,
                  bTol=1.e-3,
                  iterMax=100):
    """
    optimization for all parameters in the CTMC model: qMat, tree (bDict)
    update qMat and tree (bDict) iteratively
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print 'CTMC estimate for data in: %s' % (inputLoc)
    pairAlignSubsOnly = pair_align_from_multi_align_subs_only(multiAlign)
    piProb = pi_from_qmat(qMat)
    pairsList = pairAlignSubsOnly.keys()
    print '###  initialize tree  ###'
    bDict, nllkAll = opt_nstr_ctmc_bonly(pairAlignSubsOnly, piProb, qMat,
                                         cList, qRates)
    tree = tree_use_r_for_unknown_number_of_leaves(bDict,
                                                   pairsList,
                                                   rCodeNj,
                                                   dataLoc,
                                                   outTreeLoc,
                                                   suffix,
                                                   rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateQ:
            print '### updating rate matrix Q ###\n'
            # write_align(pairsList, pairAlignSubsOnly, bDict, inputLoc)
            qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc,
                                                  outputLoc, javaDirectory,
                                                  modelDirectory, eStepFile,
                                                  parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print '### fixing rate matrix Q ###\n'
            qMatRelativeDif = 0
        print '### updating tree ###'
        bDictNew, nllkAll = opt_nstr_ctmc_bonly(pairAlignSubsOnly, piProb,
                                                qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        bDict = bDictNew
        print 'iter=%s: Q diff = %s, bDict diff = %s' % (
            iterNum, qMatRelativeDif, bDictRelativeDif)
        dif = max(qMatRelativeDif, bDictRelativeDif)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict,
                                                       pairsList,
                                                       rCodeNj,
                                                       dataLoc,
                                                       outTreeLoc,
                                                       suffix='',
                                                       rooted=True)
        write_tree(tree, outTreeFile)
        iterNum += 1
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
        print 'nllk = %s' % (nllkAll)
    tree = tree_use_r_for_unknown_number_of_leaves(bDict,
                                                   pairsList,
                                                   rCodeNj,
                                                   dataLoc,
                                                   outTreeLoc,
                                                   suffix,
                                                   rooted=True)
    write_tree(tree, outTreeFile)
    return qMat, bDict, tree, nllkAll
Exemplo n.º 4
0
def opt_pip_full(rate, qMat, multiAlign, javaDirectory, modelDirectory, eStepFile, parametersPath, inputLoc, outputLoc, dataLoc, execsLoc, rFileLoc, cList, qRates=[1.], suffix='', updateQ=True, updateRate=True, updateRateFixdRateTimesb=True, tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    optimization for all parameters: iRate, dRate, qMat, tree (bDict) in PIP
    updating in iRate and dRate, qMat, tree (bDict) iteratively
    estimate bDict first given other, so starting bDict (tree) is not needed
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print 'simulation run: %s' % (inputLoc)
    # directory for runing EM
    alignInSeg = pair_align_from_multi_align(multiAlign)
    pairsList = alignInSeg.keys()
    piProb = pi_from_qmat(qMat)
    print '###  initialize tree  ###'
    segRateDict = {0: rate}   # only one segment
    bDict = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb, qMat, cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)
    tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    nllkDif = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateRateFixdRateTimesb:
            print '### updating insertion and deletion rate when dRate*b is fixed ###'
            dRate = rate[1]
            print tree.length()
            rateNew, tree = opt_drate_fix_drate_times_b(multiAlign, dRate, tree, qMat, piProb, cList, qRates=[1.])
            print tree.length()
            print rate
            print rateNew
            print (np.array(rateNew) - np.array(rate)) / np.array(rate)
            dRateRelativeDifFixdRateTimesb = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            dRateRelativeDifFixdRateTimesb = 0
        if updateRate:
            print '### updating insertion and deletion rate ###\n'
            # rateNew = opt_rate(rate, multiAlign, tree, qMat, piProb, cList)
            rateNew = opt_drate(multiAlign, tree, qMat, piProb, cList, qRates)
            print rate
            print rateNew
            print (np.array(rateNew) - np.array(rate)) / np.array(rate)
            dRateRelativeDif = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            print 'rate is fixed:', rate
            dRateRelativeDif = 0
        if updateQ:
            print '### updating rate matrix Q ###\n'
            qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc, outputLoc, javaDirectory, modelDirectory, eStepFile, parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print '### fixing rate matrix Q ###\n'
            qMatRelativeDif = 0
        print '### updating tree ###'
        segRateDict = {0: rate}   # only one segment
        bDictNew = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb, qMat, cList, qRates)
        bDictRelativeDifVec = [abs(bDictNew[key] - bDict[key]) / bDict[key] for key in bDict.keys()]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        print (np.array(bDictNew.values()) - np.array(bDict.values())) / np.array(bDict.values())
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
        # print 'iter=%s: iRate diff = %s, Q diff = %s, bDict diff = %s' % (iterNum, iRateRelativeDif, qMatRelativeDif, bDictRelativeDif)
        # dif = max(iRateRelativeDif, qMatRelativeDif, bDictRelativeDif)
        print 'iter=%s: dRate diff = %s, dRate diff (fix dRate*b) = %s, Q diff = %s, bDict diff = %s' % (iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesb, qMatRelativeDif, bDictRelativeDif)
        dif = max(dRateRelativeDif, qMatRelativeDif, bDictRelativeDif, dRateRelativeDifFixdRateTimesb)
        tree.reroot_at_midpoint()
        nllkNew = nllk_rate(rate, multiAlign, tree, qMat, piProb, cList)
        nllkDif = nllk - nllkNew
        print 'llk increase =', nllkDif
        nllk = nllkNew
        write_tree(tree, outTreeFile)
        iterNum += 1
        # if dRateRelativeDifFixdRateTimesb is small, then skip that update
        if dRateRelativeDifFixdRateTimesb < 0.5:
            updateRateFixdRateTimesb = False
        if nllkDif <= 0:
            print 'Log-Lilkelihood is decreasing! BREAK'
            break
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
    rate = [rate]
    return rate, qMat, bDict, tree
Exemplo n.º 5
0
def opt_geopip_full(m,
                    p,
                    qMat,
                    segRateDict,
                    piProbRates,
                    ratesList,
                    multiAlign,
                    lenSegs,
                    javaDirectory,
                    modelDirectory,
                    eStepFile,
                    parametersPath,
                    inputLoc,
                    outputLoc,
                    dataLoc,
                    execsLoc,
                    rFileLoc,
                    cList,
                    qRates=[1.],
                    suffix='',
                    updateQ=True,
                    updateSeg=True,
                    updateRate=True,
                    updateRateFixdRateTimesTau=True,
                    rooted=True,
                    tol=1.e-2,
                    bTol=1.e-3,
                    iterMax=100):
    """
    main function for optimization
    updating in segment rates, out segment rates, qMat, bDict iteratively
    we estimate bDict first, so bDict is not needed as input
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    seqNames = multiAlign.keys()
    msaList = zip(*multiAlign.values())
    print 'simulation run: %s' % (inputLoc)
    multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(
        multiAlign, lenSegs)
    alignsInSeg = pair_align_from_multi_align(multiAlign, lenSegs)
    pairsList = alignsInSeg.keys()
    piProb = pi_from_qmat(qMat)
    print '###  initialize branch lengths  ###'
    bDict = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat,
                             cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)    # this function can be improved, by using outNameLoc, outDistLoc, outTreeLoc instead
    tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj,
                                                   dataLoc, outTreeLoc, suffix,
                                                   rooted)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    # nllkDif = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateSeg:
            print '### update segmentation ###'
            lenSegs, rateSegs = mle_seg_len_rate(p, msaList, ratesList,
                                                 seqNames, tree, qMat, piProb,
                                                 piProbRates, cList)
            nSeg = len(rateSegs)
            alignsInSeg, segRateDict = update_segmentation_in_align_in_seg(
                lenSegs, rateSegs, ratesList, multiAlign)
            multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(
                multiAlign, lenSegs)
            piProbRates = update_piprobrates(rateSegs, m, True)
            p = update_p(nSeg)
        print lenSegs
        if updateRateFixdRateTimesTau:
            print '### updating insertion and deletion rate when dRate*tau is fixed ###'
            ratesListNew, segRateDictNew, treeNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate_fix_drate_times_b(
                ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat,
                cList)
            print 'rate:', ratesList
            print 'new rate:', ratesListNew
            dRateRelativeDifFixdRateTimesTauList = [
                abs(ratesListNew[index][1] - ratesList[index][1]) /
                ratesList[index][1] for index in xrange(m)
                if ratesListNew[index][1] > 1.e-3
            ]
            dRateRelativeDifFixdRateTimesTau = np.array(
                dRateRelativeDifFixdRateTimesTauList).max()
            ratesList = ratesListNew
            segRateDict = segRateDictNew
            tree = treeNew
        else:
            dRateRelativeDifFixdRateTimesTau = 0
        if updateRate:
            print '### updating in segment rates ###'
            # segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList)
            segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate(
                ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat,
                cList)
            print 'rate:', ratesList
            print 'new rate:', ratesListNew
            dRateRelativeDifList = [
                abs(ratesListNew[index][1] - ratesList[index][1]) /
                ratesList[index][1] for index in xrange(m)
                if ratesListNew[index][1] > 1.e-3
            ]
            dRateRelativeDif = np.array(dRateRelativeDifList).max()
            segRateDict = segRateDictNew
            ratesList = ratesListNew
        else:
            dRateRelativeDif = 0
            print 'rate is Fixed:', ratesList
        #####################
        # another approach based on clustering
        # segRateDictNew = opt_nlists_rate_all_inseg_only(segRateDict, alignsInSeg, bDict, pairsList, piProb, qMat, cList)
        # # IMPROVE THIS LATER, scale each rate separately first maybe?
        # inSegRateDif = max([abs(segRateDictNew[segId] - np.array(segRateDict[segId])).max() for segId in segRateDictNew.keys()])
        # # update ratesList
        # ratesList, segRateDictNew = ratesList_kmean(segRateDictNew, m)
        # print segRateDict
        # print segRateDictNew
        # segRateDict = segRateDictNew
        # display estiamteed in segment rates
        # for segId, segRate in segRateDict.iteritems():
        #     print 'seg id = %s, \t segRate = %s' %(segId, segRate)
        ######################
        if updateQ:
            print '### updating rate matrix Q ###'
            # if alignUpdate:
            #     aligns = align_mle(lans, pairsList, bDict, iRate, dRate, qMat, piProb, cList)
            # if cogUpdate:
            #     alignsCogOnlyEst = est_align_cog(lans, pairsList, iRate, dRate, cRate, cRateVec, bDict, piProb, qMat, cList, aligns, criticalValue)
            # else:
            #     alignsCogOnlyEst = alignsCogOnly
            # write_align(pairsList, alignsInSeg, bDict, inputLoc)
            qMatNew, piProbNew = opt_qmat_em_full(qMat,
                                                  cList,
                                                  inputLoc,
                                                  outputLoc,
                                                  javaDirectory,
                                                  modelDirectory,
                                                  eStepFile,
                                                  parametersPath,
                                                  execsLoc,
                                                  lEstepTol=1.e-2,
                                                  iterMax=100)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            # qMatDif = abs(qMatNew - qMat).max()
            qMat = qMatNew
            piProb = piProbNew
            # qMatDif = 0
        else:
            print '### fixing rate matrix Q ###'
            qMatRelativeDif = 0
        print '### updating branch lengths ###'
        bDictNew = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict,
                                    piProb, qMat, cList, qRates)
        # bDictDifVec = [abs(bDictNew[key] - bDict[key]) for key in bDict.keys()]
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        # bDictDif = np.array(bDictDifVec).max()
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList,
                                                       rCodeNj, dataLoc,
                                                       outTreeLoc, suffix,
                                                       rooted)
        print 'iter=%s: rates in seg diff = %s, rates in seg diff (fix dRate*tau) = %s, Q diff = %s, b diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesTau,
            qMatRelativeDif, bDictRelativeDif)
        # print 'iter=%s: rates in seg diff = %s, Q diff = %s, b diff = %s' % (iterNum, segRateDif, qMatDif, bDictDif)
        dif = max(dRateRelativeDif, dRateRelativeDifFixdRateTimesTau,
                  qMatRelativeDif, bDictRelativeDif)
        tree.reroot_at_midpoint()
        nllkNew = nllk_msa_geopip_final(ratesList, segRateDict,
                                        multiAlignAllSeg, tree, piProb, qMat,
                                        cList, piProbRates, p, rateSegs)
        nllkDif = -nllkNew + nllk
        nllkOld = nllk
        nllk = nllkNew
        print 'llk increase =', nllkDif
        if nllkDif <= 0:
            print 'Log-Likelihood is decreasing! BREAK!'
            bDict = bDictOld
            qMat = qMatOld
            segRateDict = segRateDictOld
            alignsInSeg = alignsInSegOld
            p = pOld
            piProbRates = piProbRatesOld
            ratesList = ratesListOld
            lenSegs = lenSegsOld
            tree = treeOld
            nllk = nllkOld
            break
        # do not update fixing dRate*tau if changes is small
        if dRateRelativeDifFixdRateTimesTau < 0.05:
            updateRateFixdRateTimesTau = False
        write_tree(tree, outTreeFile)
        iterNum += 1
        bDictOld = bDict
        qMatOld = qMat
        segRateDictOld = segRateDict
        alignsInSegOld = alignsInSeg
        pOld = p
        piProbRatesOld = piProbRates
        ratesListOld = ratesList
        lenSegsOld = lenSegs
        treeOld = tree
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
    return bDict, qMat, segRateDict, alignsInSeg, p, piProbRates, ratesList, lenSegs, tree, nllk
Exemplo n.º 6
0
def opt_pip_full(rate,
                 qMat,
                 multiAlign,
                 javaDirectory,
                 modelDirectory,
                 eStepFile,
                 parametersPath,
                 inputLoc,
                 outputLoc,
                 dataLoc,
                 execsLoc,
                 rFileLoc,
                 cList,
                 qRates=[1.],
                 suffix='',
                 updateQ=True,
                 updateRate=True,
                 updateRateFixdRateTimesb=True,
                 tol=1.e-2,
                 bTol=1.e-3,
                 iterMax=100):
    """
    optimization for all parameters: iRate, dRate, qMat, tree (bDict) in PIP
    updating in iRate and dRate, qMat, tree (bDict) iteratively
    estimate bDict first given other, so starting bDict (tree) is not needed
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print 'simulation run: %s' % (inputLoc)
    # directory for runing EM
    alignInSeg = pair_align_from_multi_align(multiAlign)
    pairsList = alignInSeg.keys()
    piProb = pi_from_qmat(qMat)
    print '###  initialize tree  ###'
    segRateDict = {0: rate}  # only one segment
    bDict = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb, qMat,
                             cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)
    tree = tree_use_r_for_unknown_number_of_leaves(bDict,
                                                   pairsList,
                                                   rCodeNj,
                                                   dataLoc,
                                                   outTreeLoc,
                                                   suffix,
                                                   rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    nllkDif = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateRateFixdRateTimesb:
            print '### updating insertion and deletion rate when dRate*b is fixed ###'
            dRate = rate[1]
            print tree.length()
            rateNew, tree = opt_drate_fix_drate_times_b(multiAlign,
                                                        dRate,
                                                        tree,
                                                        qMat,
                                                        piProb,
                                                        cList,
                                                        qRates=[1.])
            print tree.length()
            print rate
            print rateNew
            print(np.array(rateNew) - np.array(rate)) / np.array(rate)
            dRateRelativeDifFixdRateTimesb = abs(rate[1] -
                                                 rateNew[1]) / rate[1]
            rate = rateNew
        else:
            dRateRelativeDifFixdRateTimesb = 0
        if updateRate:
            print '### updating insertion and deletion rate ###\n'
            # rateNew = opt_rate(rate, multiAlign, tree, qMat, piProb, cList)
            rateNew = opt_drate(multiAlign, tree, qMat, piProb, cList, qRates)
            print rate
            print rateNew
            print(np.array(rateNew) - np.array(rate)) / np.array(rate)
            dRateRelativeDif = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            print 'rate is fixed:', rate
            dRateRelativeDif = 0
        if updateQ:
            print '### updating rate matrix Q ###\n'
            qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc,
                                                  outputLoc, javaDirectory,
                                                  modelDirectory, eStepFile,
                                                  parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print '### fixing rate matrix Q ###\n'
            qMatRelativeDif = 0
        print '### updating tree ###'
        segRateDict = {0: rate}  # only one segment
        bDictNew = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb,
                                    qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        print(np.array(bDictNew.values()) -
              np.array(bDict.values())) / np.array(bDict.values())
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict,
                                                       pairsList,
                                                       rCodeNj,
                                                       dataLoc,
                                                       outTreeLoc,
                                                       suffix,
                                                       rooted=True)
        # print 'iter=%s: iRate diff = %s, Q diff = %s, bDict diff = %s' % (iterNum, iRateRelativeDif, qMatRelativeDif, bDictRelativeDif)
        # dif = max(iRateRelativeDif, qMatRelativeDif, bDictRelativeDif)
        print 'iter=%s: dRate diff = %s, dRate diff (fix dRate*b) = %s, Q diff = %s, bDict diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesb,
            qMatRelativeDif, bDictRelativeDif)
        dif = max(dRateRelativeDif, qMatRelativeDif, bDictRelativeDif,
                  dRateRelativeDifFixdRateTimesb)
        tree.reroot_at_midpoint()
        nllkNew = nllk_rate(rate, multiAlign, tree, qMat, piProb, cList)
        nllkDif = nllk - nllkNew
        print 'llk increase =', nllkDif
        nllk = nllkNew
        write_tree(tree, outTreeFile)
        iterNum += 1
        # if dRateRelativeDifFixdRateTimesb is small, then skip that update
        if dRateRelativeDifFixdRateTimesb < 0.5:
            updateRateFixdRateTimesb = False
        if nllkDif <= 0:
            print 'Log-Lilkelihood is decreasing! BREAK'
            break
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
    rate = [rate]
    return rate, qMat, bDict, tree