def opt_ctmc_full(qMat, multiAlign, javaDirectory, modelDirectory, eStepFile,
                  parametersPath, inputLoc, outputLoc, dataLoc, execsLoc,
                  rFileLoc, cList, qRates=None, suffix='', updateQ=True,
                  tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    Optimize all parameters of the CTMC model: qMat and the tree (bDict).

    Branch lengths are first estimated with the supplied qMat and turned into
    a tree.  The loop then alternates (a) a rate-matrix update through
    opt_qmat_em_full (skipped when updateQ is false) and (b) a branch-length /
    tree update, until one of the stopping conditions holds:
    dif <= tol, bDictRelativeDif <= bTol, or iterNum >= iterMax.

    Side effects: writes inputLoc + '/all.align.txt' once and rewrites
    inputLoc + '/all.tree.txt' after every tree update; prints progress.

    NOTE(review): in the reviewed source a second definition of this function
    appears further down the file; the later one is the one bound at import.

    Args:
        qMat: starting rate matrix; must support elementwise '-', abs() and
            .max() (presumably a numpy array -- confirm with callers).
        multiAlign: alignment passed to pair_align_from_multi_align_subs_only
            and dict_write_align_fasta.
        javaDirectory, modelDirectory, eStepFile, parametersPath, outputLoc,
            execsLoc: forwarded unchanged to opt_qmat_em_full.
        inputLoc: string prefix of the directory receiving the alignment and
            tree files; also forwarded to opt_qmat_em_full.
        dataLoc, rFileLoc: forwarded to the helpers that prepare the R script
            and build the tree.
        cList: forwarded to opt_nstr_ctmc_bonly and opt_qmat_em_full.
        qRates: forwarded to opt_nstr_ctmc_bonly; defaults to [1].
        suffix: forwarded to get_out_name_dist_tree_files and to every tree
            build.
        updateQ: when false, qMat is kept fixed.
        tol: threshold on the largest relative change (Q and branch lengths).
        bTol: threshold on the largest relative branch-length change.
        iterMax: the loop runs only while iterNum < iterMax (iterNum starts
            at 1).

    Returns:
        Tuple (qMat, bDict, tree, nllkAll).
    """
    # A fresh list per call avoids sharing one default object across calls.
    if qRates is None:
        qRates = [1]

    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print('CTMC estimate for data in: %s' % (inputLoc))
    pairAlignSubsOnly = pair_align_from_multi_align_subs_only(multiAlign)
    piProb = pi_from_qmat(qMat)
    pairsList = pairAlignSubsOnly.keys()

    print('### initialize tree ###')
    bDict, nllkAll = opt_nstr_ctmc_bonly(pairAlignSubsOnly, piProb, qMat,
                                         cList, qRates)
    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)

    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateQ:
            print('### updating rate matrix Q ###\n')
            qMatNew, piProbNew = opt_qmat_em_full(
                qMat, cList, inputLoc, outputLoc, javaDirectory,
                modelDirectory, eStepFile, parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print('### fixing rate matrix Q ###\n')
            qMatRelativeDif = 0

        print('### updating tree ###')
        bDictNew, nllkAll = opt_nstr_ctmc_bonly(pairAlignSubsOnly, piProb,
                                                qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        bDict = bDictNew
        print('iter=%s: Q diff = %s, bDict diff = %s' % (
            iterNum, qMatRelativeDif, bDictRelativeDif))
        dif = max(qMatRelativeDif, bDictRelativeDif)

        # Use the caller's suffix here as well: rCodeNj and outTreeLoc were
        # derived from it, and the builds before and after the loop pass it.
        tree = tree_use_r_for_unknown_number_of_leaves(
            bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix,
            rooted=True)
        write_tree(tree, outTreeFile)
        iterNum += 1

    # Report the cap only when the loop stopped on it without converging;
    # converging exactly on the last permitted iteration counts as success.
    if (iterNum >= iterMax) and (dif > tol) and (bDictRelativeDif > bTol):
        print('maximum iteration %d reached' % (iterMax))
    else:
        print('optimization sucess!')
    print('nllk = %s' % (nllkAll))

    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    write_tree(tree, outTreeFile)
    return qMat, bDict, tree, nllkAll
def opt_geopip_full(m, p, qMat, segRateDict, piProbRates, ratesList,
                    multiAlign, lenSegs, javaDirectory, modelDirectory,
                    eStepFile, parametersPath, inputLoc, outputLoc, dataLoc,
                    execsLoc, rFileLoc, cList, qRates=None, suffix='',
                    updateQ=True, updateSeg=True, updateRate=True,
                    updateRateFixdRateTimesTau=True, rooted=True, tol=1.e-2,
                    bTol=1.e-3, iterMax=100):
    """
    Main optimization routine for the GeoPIP model.

    Branch lengths (bDict) are estimated first, so no starting bDict is
    needed.  Each iteration then optionally updates, in this order: the
    segmentation (updateSeg), the rates with dRate*tau held fixed
    (updateRateFixdRateTimesTau), the in-segment rates (updateRate) and the
    rate matrix Q (updateQ); branch lengths and the tree are always updated.
    After each iteration the negative log-likelihood is recomputed; if it did
    not decrease, the state accepted at the end of the previous iteration
    (or the initial state, on the first iteration) is restored and the loop
    stops.

    Side effects: writes inputLoc + '/all.align.txt' once and rewrites
    inputLoc + '/all.tree.txt' after every accepted iteration; prints
    progress.

    NOTE(review): rateSegs is only assigned inside the updateSeg branch, yet
    it is always passed to nllk_msa_geopip_final.  Calling with
    updateSeg=False therefore raises NameError on the first iteration; the
    visible source does not show how to derive rateSegs otherwise.

    NOTE(review): in the reviewed source a second definition of this function
    appears further down the file; the later one is the one bound at import.

    Args:
        m: number of entries of ratesList that are compared between
            iterations; also forwarded to update_piprobrates.
        p: forwarded to mle_seg_len_rate and nllk_msa_geopip_final; replaced
            by update_p(nSeg) after a segmentation update.
        qMat: starting rate matrix; must support elementwise '-', abs() and
            .max() (presumably a numpy array -- confirm with callers).
        segRateDict, piProbRates, ratesList, lenSegs: starting values of the
            segment-level quantities; each may be replaced during the loop.
            ratesList[i][1] is the value compared between iterations.
        multiAlign: mapping whose keys are used as sequence names and whose
            values are zipped into msaList.
        javaDirectory, modelDirectory, eStepFile, parametersPath, outputLoc,
            execsLoc: forwarded unchanged to opt_qmat_em_full.
        inputLoc: string prefix of the directory receiving the alignment and
            tree files; also forwarded to opt_qmat_em_full.
        dataLoc, rFileLoc: forwarded to the helpers that prepare the R script
            and build the tree.
        cList: forwarded to the optimization and likelihood helpers.
        qRates: forwarded to opt_nlists_bonly; defaults to [1.].
        suffix, rooted: forwarded to the tree-building helpers.
        updateQ, updateSeg, updateRate, updateRateFixdRateTimesTau: switches
            for the corresponding update steps.
        tol: threshold on the largest relative change over all updates.
        bTol: threshold on the largest relative branch-length change.
        iterMax: the loop runs only while iterNum < iterMax (iterNum starts
            at 1).

    Returns:
        Tuple (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates,
        ratesList, lenSegs, tree, nllk).
    """
    # A fresh list per call avoids sharing one default object across calls.
    if qRates is None:
        qRates = [1.]

    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    seqNames = multiAlign.keys()
    msaList = zip(*multiAlign.values())
    print('simulation run: %s' % (inputLoc))
    multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(
        multiAlign, lenSegs)
    alignsInSeg = pair_align_from_multi_align(multiAlign, lenSegs)
    pairsList = alignsInSeg.keys()
    piProb = pi_from_qmat(qMat)

    print('### initialize branch lengths ###')
    bDict = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb,
                             qMat, cList, qRates)
    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()

    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10

    # Last accepted state.  Seeded with the initial state so that a rollback
    # in the very first iteration has something to restore.
    accepted = (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates,
                ratesList, lenSegs, tree)

    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateSeg:
            print('### update segmentation ###')
            lenSegs, rateSegs = mle_seg_len_rate(
                p, msaList, ratesList, seqNames, tree, qMat, piProb,
                piProbRates, cList)
            nSeg = len(rateSegs)
            alignsInSeg, segRateDict = update_segmentation_in_align_in_seg(
                lenSegs, rateSegs, ratesList, multiAlign)
            multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(
                multiAlign, lenSegs)
            piProbRates = update_piprobrates(rateSegs, m, True)
            p = update_p(nSeg)
            print(lenSegs)

        if updateRateFixdRateTimesTau:
            print('### updating insertion and deletion rate when dRate*tau is fixed ###')
            ratesListNew, segRateDictNew, treeNew = \
                opt_nlists_rate_all_cluster_inseg_only_msa_drate_fix_drate_times_b(
                    ratesList, segRateDict, multiAlignAllSeg, tree, piProb,
                    qMat, cList)
            print('rate: %s' % (ratesList,))
            print('new rate: %s' % (ratesListNew,))
            dRateRelativeDifFixdRateTimesTauList = [
                abs(ratesListNew[index][1] - ratesList[index][1]) /
                ratesList[index][1]
                for index in xrange(m) if ratesListNew[index][1] > 1.e-3
            ]
            # Every rate may be filtered out above; .max() of an empty array
            # raises ValueError, so report "no change" in that case.
            if dRateRelativeDifFixdRateTimesTauList:
                dRateRelativeDifFixdRateTimesTau = np.array(
                    dRateRelativeDifFixdRateTimesTauList).max()
            else:
                dRateRelativeDifFixdRateTimesTau = 0
            ratesList = ratesListNew
            segRateDict = segRateDictNew
            tree = treeNew
        else:
            dRateRelativeDifFixdRateTimesTau = 0

        if updateRate:
            print('### updating in segment rates ###')
            segRateDictNew, ratesListNew = \
                opt_nlists_rate_all_cluster_inseg_only_msa_drate(
                    ratesList, segRateDict, multiAlignAllSeg, tree, piProb,
                    qMat, cList)
            print('rate: %s' % (ratesList,))
            print('new rate: %s' % (ratesListNew,))
            dRateRelativeDifList = [
                abs(ratesListNew[index][1] - ratesList[index][1]) /
                ratesList[index][1]
                for index in xrange(m) if ratesListNew[index][1] > 1.e-3
            ]
            # Same empty-list guard as for the fixed dRate*tau update.
            if dRateRelativeDifList:
                dRateRelativeDif = np.array(dRateRelativeDifList).max()
            else:
                dRateRelativeDif = 0
            segRateDict = segRateDictNew
            ratesList = ratesListNew
        else:
            dRateRelativeDif = 0
            print('rate is Fixed: %s' % (ratesList,))

        if updateQ:
            print('### updating rate matrix Q ###')
            qMatNew, piProbNew = opt_qmat_em_full(
                qMat, cList, inputLoc, outputLoc, javaDirectory,
                modelDirectory, eStepFile, parametersPath, execsLoc,
                lEstepTol=1.e-2, iterMax=100)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print('### fixing rate matrix Q ###')
            qMatRelativeDif = 0

        print('### updating branch lengths ###')
        bDictNew = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict,
                                    piProb, qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        bDict = bDictNew
        tree = tree_use_r_for_unknown_number_of_leaves(
            bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted)
        print('iter=%s: rates in seg diff = %s, rates in seg diff (fix dRate*tau) = %s, Q diff = %s, b diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesTau,
            qMatRelativeDif, bDictRelativeDif))
        dif = max(dRateRelativeDif, dRateRelativeDifFixdRateTimesTau,
                  qMatRelativeDif, bDictRelativeDif)
        tree.reroot_at_midpoint()

        nllkNew = nllk_msa_geopip_final(
            ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat,
            cList, piProbRates, p, rateSegs)
        nllkDif = -nllkNew + nllk
        print('llk increase = %s' % (nllkDif,))
        if nllkDif <= 0:
            print('Log-Likelihood is decreasing! BREAK!')
            # Discard this iteration; nllk keeps its previous value.
            (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates,
             ratesList, lenSegs, tree) = accepted
            break
        nllk = nllkNew

        # do not update fixing dRate*tau if the change is small
        if dRateRelativeDifFixdRateTimesTau < 0.05:
            updateRateFixdRateTimesTau = False
        write_tree(tree, outTreeFile)
        iterNum += 1
        accepted = (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates,
                    ratesList, lenSegs, tree)

    # Report the cap only when the loop stopped on it without converging;
    # converging exactly on the last permitted iteration counts as success.
    if (iterNum >= iterMax) and (dif > tol) and (bDictRelativeDif > bTol):
        print('maximum iteration %d reached' % (iterMax))
    else:
        print('optimization sucess!')
    return (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates, ratesList,
            lenSegs, tree, nllk)
def opt_ctmc_full(qMat, multiAlign, javaDirectory, modelDirectory, eStepFile,
                  parametersPath, inputLoc, outputLoc, dataLoc, execsLoc,
                  rFileLoc, cList, qRates=None, suffix='', updateQ=True,
                  tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    Optimize all parameters of the CTMC model: qMat and the tree (bDict).

    Branch lengths are first estimated with the supplied qMat and turned into
    a tree.  The loop then alternates (a) a rate-matrix update through
    opt_qmat_em_full (skipped when updateQ is false) and (b) a branch-length /
    tree update, until one of the stopping conditions holds:
    dif <= tol, bDictRelativeDif <= bTol, or iterNum >= iterMax.

    Side effects: writes inputLoc + '/all.align.txt' once and rewrites
    inputLoc + '/all.tree.txt' after every tree update; prints progress.

    NOTE(review): in the reviewed source an earlier definition of this
    function appears above; this later one is the one bound at import.

    Args:
        qMat: starting rate matrix; must support elementwise '-', abs() and
            .max() (presumably a numpy array -- confirm with callers).
        multiAlign: alignment passed to pair_align_from_multi_align_subs_only
            and dict_write_align_fasta.
        javaDirectory, modelDirectory, eStepFile, parametersPath, outputLoc,
            execsLoc: forwarded unchanged to opt_qmat_em_full.
        inputLoc: string prefix of the directory receiving the alignment and
            tree files; also forwarded to opt_qmat_em_full.
        dataLoc, rFileLoc: forwarded to the helpers that prepare the R script
            and build the tree.
        cList: forwarded to opt_nstr_ctmc_bonly and opt_qmat_em_full.
        qRates: forwarded to opt_nstr_ctmc_bonly; defaults to [1].
        suffix: forwarded to get_out_name_dist_tree_files and to every tree
            build.
        updateQ: when false, qMat is kept fixed.
        tol: threshold on the largest relative change (Q and branch lengths).
        bTol: threshold on the largest relative branch-length change.
        iterMax: the loop runs only while iterNum < iterMax (iterNum starts
            at 1).

    Returns:
        Tuple (qMat, bDict, tree, nllkAll).
    """
    # A fresh list per call avoids sharing one default object across calls.
    if qRates is None:
        qRates = [1]

    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print('CTMC estimate for data in: %s' % (inputLoc))
    pairAlignSubsOnly = pair_align_from_multi_align_subs_only(multiAlign)
    piProb = pi_from_qmat(qMat)
    pairsList = pairAlignSubsOnly.keys()

    print('### initialize tree ###')
    bDict, nllkAll = opt_nstr_ctmc_bonly(pairAlignSubsOnly, piProb, qMat,
                                         cList, qRates)
    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)

    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateQ:
            print('### updating rate matrix Q ###\n')
            qMatNew, piProbNew = opt_qmat_em_full(
                qMat, cList, inputLoc, outputLoc, javaDirectory,
                modelDirectory, eStepFile, parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print('### fixing rate matrix Q ###\n')
            qMatRelativeDif = 0

        print('### updating tree ###')
        bDictNew, nllkAll = opt_nstr_ctmc_bonly(pairAlignSubsOnly, piProb,
                                                qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        bDict = bDictNew
        print('iter=%s: Q diff = %s, bDict diff = %s' % (
            iterNum, qMatRelativeDif, bDictRelativeDif))
        dif = max(qMatRelativeDif, bDictRelativeDif)

        # Use the caller's suffix here as well: rCodeNj and outTreeLoc were
        # derived from it, and the builds before and after the loop pass it.
        tree = tree_use_r_for_unknown_number_of_leaves(
            bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix,
            rooted=True)
        write_tree(tree, outTreeFile)
        iterNum += 1

    # Report the cap only when the loop stopped on it without converging;
    # converging exactly on the last permitted iteration counts as success.
    if (iterNum >= iterMax) and (dif > tol) and (bDictRelativeDif > bTol):
        print('maximum iteration %d reached' % (iterMax))
    else:
        print('optimization sucess!')
    print('nllk = %s' % (nllkAll))

    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    write_tree(tree, outTreeFile)
    return qMat, bDict, tree, nllkAll
def opt_pip_full(rate, qMat, multiAlign, javaDirectory, modelDirectory,
                 eStepFile, parametersPath, inputLoc, outputLoc, dataLoc,
                 execsLoc, rFileLoc, cList, qRates=None, suffix='',
                 updateQ=True, updateRate=True, updateRateFixdRateTimesb=True,
                 tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    Optimize all PIP parameters: the rates, qMat and the tree (bDict).

    Branch lengths are estimated first from the other parameters, so no
    starting bDict / tree is needed.  Each iteration then optionally updates,
    in this order: the rate with dRate*b held fixed
    (updateRateFixdRateTimesb), the rate (updateRate) and the rate matrix Q
    (updateQ); branch lengths and the tree are always updated.  The loop
    stops when dif <= tol, bDictRelativeDif <= bTol, iterNum >= iterMax, or
    when the negative log-likelihood fails to decrease.

    Side effects: writes inputLoc + '/all.align.txt' once and rewrites
    inputLoc + '/all.tree.txt' every iteration; prints progress.

    NOTE(review): when the likelihood check fails the loop breaks without
    restoring the previous parameters (opt_geopip_full rolls back instead),
    so the values from the rejected iteration are returned -- confirm this
    is intended.

    NOTE(review): in the reviewed source a second definition of this function
    appears further down the file; the later one is the one bound at import.

    Args:
        rate: starting rate; rate[1] is read as dRate and is the component
            compared between iterations.
        qMat: starting rate matrix; must support elementwise '-', abs() and
            .max() (presumably a numpy array -- confirm with callers).
        multiAlign: alignment passed to pair_align_from_multi_align, the rate
            optimizers, nllk_rate and dict_write_align_fasta.
        javaDirectory, modelDirectory, eStepFile, parametersPath, outputLoc,
            execsLoc: forwarded unchanged to opt_qmat_em_full.
        inputLoc: string prefix of the directory receiving the alignment and
            tree files; also forwarded to opt_qmat_em_full.
        dataLoc, rFileLoc: forwarded to the helpers that prepare the R script
            and build the tree.
        cList: forwarded to the optimization and likelihood helpers.
        qRates: forwarded to opt_nlists_bonly and opt_drate; defaults to
            [1.].
        suffix: forwarded to get_out_name_dist_tree_files and to every tree
            build.
        updateQ, updateRate, updateRateFixdRateTimesb: switches for the
            corresponding update steps.
        tol: threshold on the largest relative change over all updates.
        bTol: threshold on the largest relative branch-length change.
        iterMax: the loop runs only while iterNum < iterMax (iterNum starts
            at 1).

    Returns:
        Tuple (rate, qMat, bDict, tree) where the returned rate is the final
        rate wrapped in a one-element list.
    """
    # A fresh list per call avoids sharing one default object across calls.
    if qRates is None:
        qRates = [1.]

    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print('simulation run: %s' % (inputLoc))
    alignInSeg = pair_align_from_multi_align(multiAlign)
    pairsList = alignInSeg.keys()
    piProb = pi_from_qmat(qMat)

    print('### initialize tree ###')
    segRateDict = {0: rate}  # only one segment
    bDict = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb,
                             qMat, cList, qRates)
    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()

    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateRateFixdRateTimesb:
            print('### updating insertion and deletion rate when dRate*b is fixed ###')
            dRate = rate[1]
            print(tree.length())
            # NOTE(review): this call hard-codes qRates=[1.] and ignores the
            # caller's qRates, unlike the other helper calls -- confirm.
            rateNew, tree = opt_drate_fix_drate_times_b(
                multiAlign, dRate, tree, qMat, piProb, cList, qRates=[1.])
            print(tree.length())
            print(rate)
            print(rateNew)
            print((np.array(rateNew) - np.array(rate)) / np.array(rate))
            dRateRelativeDifFixdRateTimesb = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            dRateRelativeDifFixdRateTimesb = 0

        if updateRate:
            print('### updating insertion and deletion rate ###\n')
            rateNew = opt_drate(multiAlign, tree, qMat, piProb, cList, qRates)
            print(rate)
            print(rateNew)
            print((np.array(rateNew) - np.array(rate)) / np.array(rate))
            dRateRelativeDif = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            print('rate is fixed: %s' % (rate,))
            dRateRelativeDif = 0

        if updateQ:
            print('### updating rate matrix Q ###\n')
            qMatNew, piProbNew = opt_qmat_em_full(
                qMat, cList, inputLoc, outputLoc, javaDirectory,
                modelDirectory, eStepFile, parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print('### fixing rate matrix Q ###\n')
            qMatRelativeDif = 0

        print('### updating tree ###')
        segRateDict = {0: rate}  # only one segment
        bDictNew = opt_nlists_bonly(pairsList, alignInSeg, segRateDict,
                                    piProb, qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        print((np.array(bDictNew.values()) - np.array(bDict.values())) /
              np.array(bDict.values()))
        bDict = bDictNew
        tree = tree_use_r_for_unknown_number_of_leaves(
            bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix,
            rooted=True)
        print('iter=%s: dRate diff = %s, dRate diff (fix dRate*b) = %s, Q diff = %s, bDict diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesb,
            qMatRelativeDif, bDictRelativeDif))
        dif = max(dRateRelativeDif, qMatRelativeDif, bDictRelativeDif,
                  dRateRelativeDifFixdRateTimesb)
        tree.reroot_at_midpoint()

        nllkNew = nllk_rate(rate, multiAlign, tree, qMat, piProb, cList)
        nllkDif = nllk - nllkNew
        print('llk increase = %s' % (nllkDif,))
        nllk = nllkNew
        write_tree(tree, outTreeFile)
        iterNum += 1

        # if dRateRelativeDifFixdRateTimesb is small, then skip that update
        # NOTE(review): the threshold is 0.5 here but 0.05 in the analogous
        # check of opt_geopip_full -- confirm which is intended.
        if dRateRelativeDifFixdRateTimesb < 0.5:
            updateRateFixdRateTimesb = False
        if nllkDif <= 0:
            print('Log-Lilkelihood is decreasing! BREAK')
            break

    # Report the cap only when the loop stopped on it without converging;
    # converging exactly on the last permitted iteration counts as success.
    if (iterNum >= iterMax) and (dif > tol) and (bDictRelativeDif > bTol):
        print('maximum iteration %d reached' % (iterMax))
    else:
        print('optimization sucess!')
    rate = [rate]
    return rate, qMat, bDict, tree
def opt_geopip_full(m, p, qMat, segRateDict, piProbRates, ratesList,
                    multiAlign, lenSegs, javaDirectory, modelDirectory,
                    eStepFile, parametersPath, inputLoc, outputLoc, dataLoc,
                    execsLoc, rFileLoc, cList, qRates=None, suffix='',
                    updateQ=True, updateSeg=True, updateRate=True,
                    updateRateFixdRateTimesTau=True, rooted=True, tol=1.e-2,
                    bTol=1.e-3, iterMax=100):
    """
    Main optimization routine for the GeoPIP model.

    Branch lengths (bDict) are estimated first, so no starting bDict is
    needed.  Each iteration then optionally updates, in this order: the
    segmentation (updateSeg), the rates with dRate*tau held fixed
    (updateRateFixdRateTimesTau), the in-segment rates (updateRate) and the
    rate matrix Q (updateQ); branch lengths and the tree are always updated.
    After each iteration the negative log-likelihood is recomputed; if it did
    not decrease, the state accepted at the end of the previous iteration
    (or the initial state, on the first iteration) is restored and the loop
    stops.

    Side effects: writes inputLoc + '/all.align.txt' once and rewrites
    inputLoc + '/all.tree.txt' after every accepted iteration; prints
    progress.

    NOTE(review): rateSegs is only assigned inside the updateSeg branch, yet
    it is always passed to nllk_msa_geopip_final.  Calling with
    updateSeg=False therefore raises NameError on the first iteration; the
    visible source does not show how to derive rateSegs otherwise.

    NOTE(review): in the reviewed source an earlier definition of this
    function appears above; this later one is the one bound at import.

    Args:
        m: number of entries of ratesList that are compared between
            iterations; also forwarded to update_piprobrates.
        p: forwarded to mle_seg_len_rate and nllk_msa_geopip_final; replaced
            by update_p(nSeg) after a segmentation update.
        qMat: starting rate matrix; must support elementwise '-', abs() and
            .max() (presumably a numpy array -- confirm with callers).
        segRateDict, piProbRates, ratesList, lenSegs: starting values of the
            segment-level quantities; each may be replaced during the loop.
            ratesList[i][1] is the value compared between iterations.
        multiAlign: mapping whose keys are used as sequence names and whose
            values are zipped into msaList.
        javaDirectory, modelDirectory, eStepFile, parametersPath, outputLoc,
            execsLoc: forwarded unchanged to opt_qmat_em_full.
        inputLoc: string prefix of the directory receiving the alignment and
            tree files; also forwarded to opt_qmat_em_full.
        dataLoc, rFileLoc: forwarded to the helpers that prepare the R script
            and build the tree.
        cList: forwarded to the optimization and likelihood helpers.
        qRates: forwarded to opt_nlists_bonly; defaults to [1.].
        suffix, rooted: forwarded to the tree-building helpers.
        updateQ, updateSeg, updateRate, updateRateFixdRateTimesTau: switches
            for the corresponding update steps.
        tol: threshold on the largest relative change over all updates.
        bTol: threshold on the largest relative branch-length change.
        iterMax: the loop runs only while iterNum < iterMax (iterNum starts
            at 1).

    Returns:
        Tuple (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates,
        ratesList, lenSegs, tree, nllk).
    """
    # A fresh list per call avoids sharing one default object across calls.
    if qRates is None:
        qRates = [1.]

    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    seqNames = multiAlign.keys()
    msaList = zip(*multiAlign.values())
    print('simulation run: %s' % (inputLoc))
    multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(
        multiAlign, lenSegs)
    alignsInSeg = pair_align_from_multi_align(multiAlign, lenSegs)
    pairsList = alignsInSeg.keys()
    piProb = pi_from_qmat(qMat)

    print('### initialize branch lengths ###')
    bDict = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb,
                             qMat, cList, qRates)
    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()

    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10

    # Last accepted state.  Seeded with the initial state so that a rollback
    # in the very first iteration has something to restore.
    accepted = (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates,
                ratesList, lenSegs, tree)

    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateSeg:
            print('### update segmentation ###')
            lenSegs, rateSegs = mle_seg_len_rate(
                p, msaList, ratesList, seqNames, tree, qMat, piProb,
                piProbRates, cList)
            nSeg = len(rateSegs)
            alignsInSeg, segRateDict = update_segmentation_in_align_in_seg(
                lenSegs, rateSegs, ratesList, multiAlign)
            multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(
                multiAlign, lenSegs)
            piProbRates = update_piprobrates(rateSegs, m, True)
            p = update_p(nSeg)
            print(lenSegs)

        if updateRateFixdRateTimesTau:
            print('### updating insertion and deletion rate when dRate*tau is fixed ###')
            ratesListNew, segRateDictNew, treeNew = \
                opt_nlists_rate_all_cluster_inseg_only_msa_drate_fix_drate_times_b(
                    ratesList, segRateDict, multiAlignAllSeg, tree, piProb,
                    qMat, cList)
            print('rate: %s' % (ratesList,))
            print('new rate: %s' % (ratesListNew,))
            dRateRelativeDifFixdRateTimesTauList = [
                abs(ratesListNew[index][1] - ratesList[index][1]) /
                ratesList[index][1]
                for index in xrange(m) if ratesListNew[index][1] > 1.e-3
            ]
            # Every rate may be filtered out above; .max() of an empty array
            # raises ValueError, so report "no change" in that case.
            if dRateRelativeDifFixdRateTimesTauList:
                dRateRelativeDifFixdRateTimesTau = np.array(
                    dRateRelativeDifFixdRateTimesTauList).max()
            else:
                dRateRelativeDifFixdRateTimesTau = 0
            ratesList = ratesListNew
            segRateDict = segRateDictNew
            tree = treeNew
        else:
            dRateRelativeDifFixdRateTimesTau = 0

        if updateRate:
            print('### updating in segment rates ###')
            segRateDictNew, ratesListNew = \
                opt_nlists_rate_all_cluster_inseg_only_msa_drate(
                    ratesList, segRateDict, multiAlignAllSeg, tree, piProb,
                    qMat, cList)
            print('rate: %s' % (ratesList,))
            print('new rate: %s' % (ratesListNew,))
            dRateRelativeDifList = [
                abs(ratesListNew[index][1] - ratesList[index][1]) /
                ratesList[index][1]
                for index in xrange(m) if ratesListNew[index][1] > 1.e-3
            ]
            # Same empty-list guard as for the fixed dRate*tau update.
            if dRateRelativeDifList:
                dRateRelativeDif = np.array(dRateRelativeDifList).max()
            else:
                dRateRelativeDif = 0
            segRateDict = segRateDictNew
            ratesList = ratesListNew
        else:
            dRateRelativeDif = 0
            print('rate is Fixed: %s' % (ratesList,))

        if updateQ:
            print('### updating rate matrix Q ###')
            qMatNew, piProbNew = opt_qmat_em_full(
                qMat, cList, inputLoc, outputLoc, javaDirectory,
                modelDirectory, eStepFile, parametersPath, execsLoc,
                lEstepTol=1.e-2, iterMax=100)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print('### fixing rate matrix Q ###')
            qMatRelativeDif = 0

        print('### updating branch lengths ###')
        bDictNew = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict,
                                    piProb, qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        bDict = bDictNew
        tree = tree_use_r_for_unknown_number_of_leaves(
            bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted)
        print('iter=%s: rates in seg diff = %s, rates in seg diff (fix dRate*tau) = %s, Q diff = %s, b diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesTau,
            qMatRelativeDif, bDictRelativeDif))
        dif = max(dRateRelativeDif, dRateRelativeDifFixdRateTimesTau,
                  qMatRelativeDif, bDictRelativeDif)
        tree.reroot_at_midpoint()

        nllkNew = nllk_msa_geopip_final(
            ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat,
            cList, piProbRates, p, rateSegs)
        nllkDif = -nllkNew + nllk
        print('llk increase = %s' % (nllkDif,))
        if nllkDif <= 0:
            print('Log-Likelihood is decreasing! BREAK!')
            # Discard this iteration; nllk keeps its previous value.
            (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates,
             ratesList, lenSegs, tree) = accepted
            break
        nllk = nllkNew

        # do not update fixing dRate*tau if the change is small
        if dRateRelativeDifFixdRateTimesTau < 0.05:
            updateRateFixdRateTimesTau = False
        write_tree(tree, outTreeFile)
        iterNum += 1
        accepted = (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates,
                    ratesList, lenSegs, tree)

    # Report the cap only when the loop stopped on it without converging;
    # converging exactly on the last permitted iteration counts as success.
    if (iterNum >= iterMax) and (dif > tol) and (bDictRelativeDif > bTol):
        print('maximum iteration %d reached' % (iterMax))
    else:
        print('optimization sucess!')
    return (bDict, qMat, segRateDict, alignsInSeg, p, piProbRates, ratesList,
            lenSegs, tree, nllk)
def opt_pip_full(rate, qMat, multiAlign, javaDirectory, modelDirectory,
                 eStepFile, parametersPath, inputLoc, outputLoc, dataLoc,
                 execsLoc, rFileLoc, cList, qRates=None, suffix='',
                 updateQ=True, updateRate=True, updateRateFixdRateTimesb=True,
                 tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    Optimize all PIP parameters: the rates, qMat and the tree (bDict).

    Branch lengths are estimated first from the other parameters, so no
    starting bDict / tree is needed.  Each iteration then optionally updates,
    in this order: the rate with dRate*b held fixed
    (updateRateFixdRateTimesb), the rate (updateRate) and the rate matrix Q
    (updateQ); branch lengths and the tree are always updated.  The loop
    stops when dif <= tol, bDictRelativeDif <= bTol, iterNum >= iterMax, or
    when the negative log-likelihood fails to decrease.

    Side effects: writes inputLoc + '/all.align.txt' once and rewrites
    inputLoc + '/all.tree.txt' every iteration; prints progress.

    NOTE(review): when the likelihood check fails the loop breaks without
    restoring the previous parameters (opt_geopip_full rolls back instead),
    so the values from the rejected iteration are returned -- confirm this
    is intended.

    NOTE(review): in the reviewed source an earlier definition of this
    function appears above; this later one is the one bound at import.

    Args:
        rate: starting rate; rate[1] is read as dRate and is the component
            compared between iterations.
        qMat: starting rate matrix; must support elementwise '-', abs() and
            .max() (presumably a numpy array -- confirm with callers).
        multiAlign: alignment passed to pair_align_from_multi_align, the rate
            optimizers, nllk_rate and dict_write_align_fasta.
        javaDirectory, modelDirectory, eStepFile, parametersPath, outputLoc,
            execsLoc: forwarded unchanged to opt_qmat_em_full.
        inputLoc: string prefix of the directory receiving the alignment and
            tree files; also forwarded to opt_qmat_em_full.
        dataLoc, rFileLoc: forwarded to the helpers that prepare the R script
            and build the tree.
        cList: forwarded to the optimization and likelihood helpers.
        qRates: forwarded to opt_nlists_bonly and opt_drate; defaults to
            [1.].
        suffix: forwarded to get_out_name_dist_tree_files and to every tree
            build.
        updateQ, updateRate, updateRateFixdRateTimesb: switches for the
            corresponding update steps.
        tol: threshold on the largest relative change over all updates.
        bTol: threshold on the largest relative branch-length change.
        iterMax: the loop runs only while iterNum < iterMax (iterNum starts
            at 1).

    Returns:
        Tuple (rate, qMat, bDict, tree) where the returned rate is the final
        rate wrapped in a one-element list.
    """
    # A fresh list per call avoids sharing one default object across calls.
    if qRates is None:
        qRates = [1.]

    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print('simulation run: %s' % (inputLoc))
    alignInSeg = pair_align_from_multi_align(multiAlign)
    pairsList = alignInSeg.keys()
    piProb = pi_from_qmat(qMat)

    print('### initialize tree ###')
    segRateDict = {0: rate}  # only one segment
    bDict = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb,
                             qMat, cList, qRates)
    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()

    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateRateFixdRateTimesb:
            print('### updating insertion and deletion rate when dRate*b is fixed ###')
            dRate = rate[1]
            print(tree.length())
            # NOTE(review): this call hard-codes qRates=[1.] and ignores the
            # caller's qRates, unlike the other helper calls -- confirm.
            rateNew, tree = opt_drate_fix_drate_times_b(
                multiAlign, dRate, tree, qMat, piProb, cList, qRates=[1.])
            print(tree.length())
            print(rate)
            print(rateNew)
            print((np.array(rateNew) - np.array(rate)) / np.array(rate))
            dRateRelativeDifFixdRateTimesb = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            dRateRelativeDifFixdRateTimesb = 0

        if updateRate:
            print('### updating insertion and deletion rate ###\n')
            rateNew = opt_drate(multiAlign, tree, qMat, piProb, cList, qRates)
            print(rate)
            print(rateNew)
            print((np.array(rateNew) - np.array(rate)) / np.array(rate))
            dRateRelativeDif = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            print('rate is fixed: %s' % (rate,))
            dRateRelativeDif = 0

        if updateQ:
            print('### updating rate matrix Q ###\n')
            qMatNew, piProbNew = opt_qmat_em_full(
                qMat, cList, inputLoc, outputLoc, javaDirectory,
                modelDirectory, eStepFile, parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print('### fixing rate matrix Q ###\n')
            qMatRelativeDif = 0

        print('### updating tree ###')
        segRateDict = {0: rate}  # only one segment
        bDictNew = opt_nlists_bonly(pairsList, alignInSeg, segRateDict,
                                    piProb, qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        print((np.array(bDictNew.values()) - np.array(bDict.values())) /
              np.array(bDict.values()))
        bDict = bDictNew
        tree = tree_use_r_for_unknown_number_of_leaves(
            bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix,
            rooted=True)
        print('iter=%s: dRate diff = %s, dRate diff (fix dRate*b) = %s, Q diff = %s, bDict diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesb,
            qMatRelativeDif, bDictRelativeDif))
        dif = max(dRateRelativeDif, qMatRelativeDif, bDictRelativeDif,
                  dRateRelativeDifFixdRateTimesb)
        tree.reroot_at_midpoint()

        nllkNew = nllk_rate(rate, multiAlign, tree, qMat, piProb, cList)
        nllkDif = nllk - nllkNew
        print('llk increase = %s' % (nllkDif,))
        nllk = nllkNew
        write_tree(tree, outTreeFile)
        iterNum += 1

        # if dRateRelativeDifFixdRateTimesb is small, then skip that update
        # NOTE(review): the threshold is 0.5 here but 0.05 in the analogous
        # check of opt_geopip_full -- confirm which is intended.
        if dRateRelativeDifFixdRateTimesb < 0.5:
            updateRateFixdRateTimesb = False
        if nllkDif <= 0:
            print('Log-Lilkelihood is decreasing! BREAK')
            break

    # Report the cap only when the loop stopped on it without converging;
    # converging exactly on the last permitted iteration counts as success.
    if (iterNum >= iterMax) and (dif > tol) and (bDictRelativeDif > bTol):
        print('maximum iteration %d reached' % (iterMax))
    else:
        print('optimization sucess!')
    rate = [rate]
    return rate, qMat, bDict, tree