def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName): thres = 10 if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["redundantRvsQ", fileR, fileQ, ""]], houseKeeper.globalParallel) dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut") lenDicR = IORobot.obtainLength(folderName, fileR) lenDicQ = IORobot.obtainLength(folderName, fileQ) isRedundantList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[ 7], eachitem[8] l1, l2 = lenDicR[name1], lenDicQ[name2] if abs(l2 - match2) < thres: isRedundantList.append(name2) #print lenDicQ nonRedundantList = obtainComplement(lenDicQ, isRedundantList) print nonRedundantList IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList) os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName): thres = 10 if True: alignerRobot.useMummerAlignBatch( mummerLink, folderName, [["redundantRvsQ", fileR, fileQ, ""]], houseKeeper.globalParallel ) dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut") lenDicR = IORobot.obtainLength(folderName, fileR) lenDicQ = IORobot.obtainLength(folderName, fileQ) isRedundantList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8] l1, l2 = lenDicR[name1], lenDicQ[name2] if abs(l2 - match2) < thres: isRedundantList.append(name2) # print lenDicQ nonRedundantList = obtainComplement(lenDicQ, isRedundantList) print nonRedundantList IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList) os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
def removeEmbedded(folderName , mummerLink): print "removeEmbedded" thres = 10 os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta") os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") if not os.path.isfile(folderName + "selfOut"): alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel ) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName lenDic = IORobot.obtainLength(folderName, 'contigs.fasta') removeList = alignerRobot.extractMumDataAndRemove(folderName,"selfOut",lenDic,thres) nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName): thres = 10 os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta > " + folderName + inputFilename + "2.fasta") os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta") if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[ mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", "" ]], houseKeeper.globalParallel) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, inputFilename + '.fasta') removeList = [] shortEmbedClusterDic = {} for eachitem in lenDic: shortEmbedClusterDic[eachitem] = clusterElem(eachitem) for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[ 7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2]) nameList = obtainComplement(lenDic, removeList) returnList = [] for eachitem in nameList: if find(shortEmbedClusterDic[eachitem]).id == eachitem: returnList.append(eachitem) print "len(nameList), len(returnList)", len(nameList), len(returnList) IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName): thres = 10 os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta > " + folderName + inputFilename + "2.fasta") os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta") if True: alignerRobot.useMummerAlignBatch( mummerLink, folderName, [[mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", ""]], houseKeeper.globalParallel, ) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, inputFilename + ".fasta") removeList = [] shortEmbedClusterDic = {} for eachitem in lenDic: shortEmbedClusterDic[eachitem] = clusterElem(eachitem) for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2]) nameList = obtainComplement(lenDic, removeList) returnList = [] for eachitem in nameList: if find(shortEmbedClusterDic[eachitem]).id == eachitem: returnList.append(eachitem) print "len(nameList), len(returnList)", len(nameList), len(returnList) IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
def removeEmbedded(folderName, mummerLink): print "removeEmbedded" thres = 10 os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta") os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") if True: print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta" alignerRobot.useMummerAlignBatch( mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName print "removeEmbedded: Extracting MUMmer data from delta files to selfOut" dataList = alignerRobot.extractMumData(folderName, "selfOut") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, 'contigs.fasta') removeList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[ 7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta" IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink): print "removeEmbedded" thres = 10 command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName os.system(command) command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName os.system(command) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[ "self", houseKeeper.globalContigName, houseKeeper.globalContigName, "" ]], houseKeeper.globalParallel) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName dataList = alignerRobot.extractMumData(folderName, "selfOut") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName) removeList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[ 7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
def removeEmbedded(folderName , mummerLink): print "removeEmbedded" thres = 10 command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName os.system(command) command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName os.system(command) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", houseKeeper.globalContigName, houseKeeper.globalContigName, ""]], houseKeeper.globalParallel) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName dataList = alignerRobot.extractMumData(folderName, "selfOut") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName) removeList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink): print "removeEmbedded" thres = 10 os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta") os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") if True: print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta" alignerRobot.useMummerAlignBatch( mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel ) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName print "removeEmbedded: Extracting MUMmer data from delta files to selfOut" dataList = alignerRobot.extractMumData(folderName, "selfOut") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, "contigs.fasta") removeList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta" IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)