def GTFinder(folderName, inputfile, mummerPath):
    '''
    Input : folderName, inputfile (name of the contig FASTA file), mummerPath
            reference.fasta is passed to the aligner by name
            (presumably it lives under folderName as well -- confirm)
    Output : nothing is returned. The list
                GTMap = [[contigName, rangeList], ...]
             is dumped to folderName + inputfile + "GTMap.json", where
             rangeList is the value returned by
             intervalunion.reportMisAssemblyIntervals (per the original
             notes : [[start1, end1], [start2, end2], ...])
    Algorithm :
        1) Align inputfile against reference.fasta with MUMmer
        2) Group the alignment records by their field -2 (the contig name)
        3) For each contig, sort its records by field 0, build the interval
           cover and report the mis-assembly intervals

    Format of an alignment record, as noted by the original author :
        1 765 | 11596 10822 | 765 775 | 84.25 | scf7180000000702 ref_NC_001133_
    '''
    coverThres = 100
    alignmentName = "groundTruthMatchFixer" + inputfile
    byContigName = itemgetter(-2)

    ### Finding the alignment
    alignerRobot.useMummerAlign(mummerPath, folderName, alignmentName,
                                inputfile, "reference.fasta", False, "", False)
    records = alignerRobot.extractMumData(folderName, alignmentName + "Out")
    contigLengths = IORobot.obtainLength(folderName, inputfile)

    ### Parsing the alignment
    # groupby only merges adjacent records, hence the sort on the same key.
    records.sort(key=byContigName)

    GTMap = []
    for contigName, group in groupby(records, byContigName):
        hits = sorted(group, key=itemgetter(0))
        cover = intervalunion.intervalCover(hits, coverThres)
        misAssembled = intervalunion.reportMisAssemblyIntervals(
            cover, contigLengths[contigName], coverThres)
        GTMap.append([contigName, misAssembled])

    with open(folderName + inputfile + "GTMap.json", 'w') as outfile:
        json.dump(GTMap, outfile)
def generateAssociatedReadDic(folderName):
    '''
    Input : folderName, holding the alignment outputs "outAbunNNOut"
            (NN = 01 .. houseKeeper.globalParallelFileNum) and improved3.fasta
    Output : nothing is returned. contigToReadsDic = {contigName : [readName, ...]}
             is dumped to folderName + "contigToReadsDic.json"
    Algorithm :
        1) Gather the alignment records of every "outAbunNNOut" file
        2) Start with an empty read list for every contig of improved3.fasta
        3) Group the records by field -1 (the read name) and attach each read
           to the contig (field -2) of its record with the largest field -4
           (presumably the match length -- confirm against extractMumData);
           on ties the first such record wins
    '''
    byReadName = itemgetter(-1)

    records = []
    for fileIndex in range(1, houseKeeper.globalParallelFileNum + 1):
        # File indices are zero-padded to two digits: 01, 02, ..., 10, 11, ...
        records.extend(alignerRobot.extractMumData(
            folderName, "outAbun" + str(fileIndex).zfill(2) + "Out"))
    records.sort(key=byReadName)

    contigNames = IORobot.obtainLength(folderName, "improved3.fasta")
    contigToReadsDic = dict((contigName, []) for contigName in contigNames)

    for readName, group in groupby(records, byReadName):
        bestLen, bestContig = 0, ""
        for hit in group:
            if hit[-4] > bestLen:
                bestLen, bestContig = hit[-4], hit[-2]
        # Raises KeyError when the winning contig is not in improved3.fasta
        # (or when no record had a positive field -4, leaving bestContig "").
        contigToReadsDic[bestContig].append(readName)

    with open(folderName + "contigToReadsDic.json", 'w') as outfile:
        json.dump(contigToReadsDic, outfile)
def loadRListDic(folderName):
    '''
    Input : folderName, holding the alignment outputs "outAbunNNOut"
            (NN = 01 .. houseKeeper.globalParallelFileNum) and improved3.fasta
    Output : RListDic = {contigName : [readName, ...]}, with one entry per
             contig that appears in the alignment records. A read is kept
             when its record starts within the first 10000 positions of the
             contig (field 0) or ends within the last 10000 (field 1);
             each list is passed through abunHouseKeeper.getDistinct
    '''
    endThres = 10000
    byContigName = itemgetter(-2)

    records = []
    for fileIndex in range(1, houseKeeper.globalParallelFileNum + 1):
        # File indices are zero-padded to two digits: 01, 02, ..., 10, 11, ...
        records.extend(alignerRobot.extractMumData(
            folderName, "outAbun" + str(fileIndex).zfill(2) + "Out"))
    records.sort(key=byContigName)

    contigLengths = IORobot.obtainLength(folderName, "improved3.fasta")

    RListDic = {}
    for contigName, group in groupby(records, byContigName):
        # The contig length is looked up only when the first test fails,
        # matching the short-circuit behaviour of the original condition.
        nearEndReads = [
            hit[-1] for hit in group
            if hit[0] < endThres
            or hit[1] > contigLengths[contigName] - endThres
        ]
        RListDic[contigName] = abunHouseKeeper.getDistinct(nearEndReads)

    return RListDic
def findGroundTruth(folderName, mummerPath):
    '''
    Input : folderName, mummerPath
    Output : succList = [[contigName, nextContigName], ...] : every pair of
             contigs whose matched alignments are consecutive once the
             matches are sorted along the same reference sequence
    Algorithm :
        1) Align reference.fasta against improved3_Double.fasta with MUMmer
        2) Keep the records that isMatch labels 'f' or 'r' (presumably
           forward / reverse matches -- confirm against isMatch); 'r' records
           are re-expressed on a mirrored reference named refName + "_r"
        3) Sort the kept records, group them by reference name and report
           each pair of neighbouring contigs

    Format of an alignment record, as noted by the original author :
        1 765 | 11596 10822 | 765 775 | 84.25 | ref_NC_001133_ scf7180000000702
    '''
    alignerRobot.useMummerAlign(mummerPath, folderName, "groundTruthMatch",
                                "reference.fasta", "improved3_Double.fasta",
                                False, "", False)
    dataList = alignerRobot.extractMumData(folderName, "groundTruthMatch" + "Out")

    lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
    lenDicRef = IORobot.obtainLength(folderName, "reference.fasta")

    dataList.sort(key=itemgetter(-1))

    # Format of newList : [refName, refStart, refEnd, contigName]
    newList = []
    for eachitem in dataList:
        # Classify each record once instead of once per branch.
        orientation = isMatch(eachitem, lenDic)
        refName, contigName = eachitem[-2], eachitem[-1]

        if orientation == 'f':
            newList.append([refName, eachitem[0], eachitem[1], contigName])
        elif orientation == 'r':
            # Mirror the interval onto the "_r" reference: [start, end]
            # becomes [refLen - end, refLen - start]. The end used to be
            # computed from eachitem[1] as well, which collapsed every
            # reverse match to a zero-length interval.
            refLen = lenDicRef[refName]
            newList.append([refName + "_r",
                            refLen - eachitem[1],
                            refLen - eachitem[0],
                            contigName])

    newList.sort()

    succList = []
    for key, items in groupby(newList, itemgetter(0)):
        tmpList = list(items)
        for current, following in zip(tmpList, tmpList[1:]):
            succList.append([current[-1], following[-1]])

    return succList
def computeScore(folderName, eachMatching, lambdas, interiors, readMatching, constants, isDebug, mummerLink):
    '''
    Score one candidate matching of left/right contigs.

    Input : folderName    : prefix of every file read or written here
            eachMatching  : list of [leftContigName, rightContigName] pairs
            lambdas       : one value per pair, used in the abundance score
            interiors     : list of interior sequences (only its length is used;
                            the sequences are re-read from interiors.fasta)
            readMatching  : [read2templateDic, template2readDic]
            constants     : not used in this function
            isDebug       : when true, skip the alignment and reuse
                            readAnchorDic.json from a previous run
            mummerLink    : passed through to the MUMmer wrappers
    Output : score (a real number) = editScore + abunScore ; the three values
             and lambdas are also printed
    Algorithm :
        1) Compute the total edits scores (from interiors, readMatching = [read2templateDic, template2readDic], and RList.fasta)
        2) Compute the abundance scores (from lambda)
        3) Combine them to give the final score
    '''
    score = 0

    # 1) Edit score
    editScore = 0
    q = 0.01  # constant of the per-edit log weight log(q / (1 - 2q)) used below

    if not isDebug:
        #a) Perform an alignment and parse the results
        alignerRobot.useMummerAlign(mummerLink, folderName, "interiorAnchor",
                                    "RList_Double.fasta", "interiors.fasta",
                                    False, "", False)

        # readAnchorDic : readName -> first four fields of its best record
        # (unpacked below as readStart, readEnd, templateStart, templateEnd)
        readAnchorDic = {}
        dataList = alignerRobot.extractMumData(folderName, "interiorAnchor" + "Out")
        thres = 30  # not used below

        # groupby only merges adjacent records, hence the sort on the same key
        dataList.sort(key=itemgetter(-2))
        for key, items in groupby(dataList, itemgetter(-2)):
            maxMatch = 0
            for eachitem in items:
                # NOTE(review): the first test checks whether the read name is a
                # member of its own entry readMatching[0][key]; possibly
                # "key in readMatching[0]" was intended -- confirm. As written it
                # raises KeyError when key is missing from readMatching[0].
                if key in readMatching[0][key] and len(
                        readMatching[0]
                    [key]) > 0 and eachitem[-1] == readMatching[0][key][0]:
                    # keep the record with the largest field 4
                    if eachitem[4] > maxMatch:
                        maxMatch = eachitem[4]
                        readAnchorDic[key] = [
                            eachitem[0], eachitem[1], eachitem[2], eachitem[3]
                        ]

        # Saved so that a later run with isDebug set can skip the alignment
        with open(folderName + "readAnchorDic.json", 'w') as outfile:
            json.dump(readAnchorDic, outfile)
    else:
        readAnchorDic = readInJSON(folderName, "readAnchorDic.json")

    #b) Perform careful edit distance computation
    interiorsDic = IORobot.loadContigsFromFile(folderName, "interiors.fasta")
    readsDic = IORobot.loadContigsFromFile(folderName, "RList_Double.fasta")

    for i in range(len(interiors)):
        tmpScore = 0
        for eachitem in readMatching[1]["Segkk" + str(i)]:
            readName = eachitem[-2]
            #print readName
            if readName in readAnchorDic:
                readStart, readEnd, templateStart, templateEnd = readAnchorDic[
                    readName]
                # the "- 1" converts the stored start into a 0-based slice start
                tmpScore += Levenshtein.distance(
                    readsDic[readName][readStart - 1:readEnd],
                    interiorsDic["Segkk" + str(i)][templateStart - 1:templateEnd])

        editScore += math.log(1.0 * q / (1 - 2 * q)) * tmpScore

    # 2) Abundance score
    ### Need to correct the errors
    NiList = []
    internalReads = []
    for i in range(len(lambdas)):
        # NOTE(review): only the first entry of the template's read list is
        # used, and its elements are appended one by one; this raises
        # IndexError when the list is empty -- confirm this is intended.
        internalReads += readMatching[1]["Segkk" + str(i)][0]

    internalReadsSet = set(internalReads)

    contigToReadsDic = readInJSON(folderName, "contigToReadsDic.json")

    # Ni : reads of the two flanking contigs that are not in internalReadsSet,
    # plus the number of entries matched to template i
    for i in range(len(eachMatching)):
        leftContig, rightContig = convertName(eachMatching[i][0]), convertName(
            eachMatching[i][1])
        Ni = len(set(contigToReadsDic[leftContig]) - internalReadsSet) + \
             len(set(contigToReadsDic[rightContig]) - internalReadsSet) + \
             len(readMatching[1]["Segkk" + str(i)])
        NiList.append(Ni)

    # Li : length of left + interior + right, adjusted by the two junction
    # alignments
    LiList = []
    contigsDic = IORobot.loadContigsFromFile(folderName, "improved3_Double.fasta")
    for i in range(len(eachMatching)):
        leftContig, rightContig = eachMatching[i][0], eachMatching[i][1]
        left, middle, right = contigsDic[leftContig], interiorsDic[
            "Segkk" + str(i)], contigsDic[rightContig]
        totalLen = len(left) + len(middle) + len(right)

        # NOTE(review): the first element returned by IORobot.align is ADDED
        # to the length; if it is an overlap size one would expect it to be
        # subtracted -- confirm against IORobot.align.
        overlap = IORobot.align(left, middle, folderName, mummerLink)
        totalLen += overlap[0]
        overlap = IORobot.align(middle, right, folderName, mummerLink)
        totalLen += overlap[0]

        LiList.append(totalLen)

    abunScore = 0
    for i in range(len(lambdas)):
        # NOTE(review): under Python 2, "/" truncates when both operands are
        # integers, which would make math.log fail on 0 -- lambdas are
        # presumably floats; confirm with the caller.
        abunScore += math.log(lambdas[i] / LiList[i]) * NiList[i]

    # 3) Final score
    score = editScore + abunScore
    print score, editScore, abunScore, lambdas
    return score
def preparation(folderName): ''' Prepare RList.fasta, contigLeft.json, contigRight.json, intermediate.fasta This step will not be needed in production as it should be automatically given or will follow a different logic of generation ''' CLeftList, CRightList = [], [] RList = [] templateList = [] contigReadGraph = "phaseStringGraph1" G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta") N1 = len(lenDic) kthres, edgeThres = 3, 1 G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) if True: adj = [[] for i in range(N1)] for i in range(N1): tmpList = abunGraphLib.findAllReachable(i, N1, G) for j in tmpList: if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres: adj[i].append(j) print adj if True: CLeftList, CRightList = [0, 6], [4, 8] RList = [] templateList = [] numberOfFiles = houseKeeper.globalParallelFileNum dataList = [] for i in range(1, 1 + numberOfFiles): if i < 10: indexOfMum = "0" + str(i) else: indexOfMum = str(i) dataList = dataList + alignerRobot.extractMumData( folderName, "outAbun" + str(indexOfMum) + "Out") middleList = [2] CLeftNameList, CRightNameList, middleNameList \ = [abunHouseKeeper.parseIDToName(i, 'C', 0) for i in CLeftList] \ , [abunHouseKeeper.parseIDToName(i, 'C', 0) for i in CRightList] \ , [abunHouseKeeper.parseIDToName(i, 'C', 0) for i in middleList] dataList.sort(key=itemgetter(-2)) for key, items in groupby(dataList, itemgetter(-2)): print key if int(key[5:]) == 1: for eachitem in items: RList.append(eachitem[-1]) #print eachitem[-4] #print "len(RList) : ", len(RList) RList = abunHouseKeeper.getDistinct(RList) print "len(RList) : ", len(RList) lenDic = IORobot.obtainLength(folderName, "improved3.fasta") print lenDic["Segkk1"] # print RList IORobot.putListToFileO(folderName, "raw_reads.fasta", "RList", RList) ctgList = ["Contig0_p", "Contig3_p"] with open(folderName + "contigLeft.json", 'w') as outfile: json.dump(ctgList, 
outfile) ctgList = ["Contig2_p", "Contig4_p"] with open(folderName + "contigRight.json", 'w') as outfile: json.dump(ctgList, outfile) contigDic = IORobot.loadContigsFromFile(folderName, "improved3_Double.fasta") #addNoise(contigDic["Contig1_p"]) ### no noise # IORobot.writeSegOut([contigDic["Contig1_p"]], folderName, "intermediate.fasta") ### with noise noisyIntermediate = dataGenLib.createANoisyRead( len(contigDic["Contig1_p"]), 0.01, contigDic["Contig1_p"]) IORobot.writeSegOut([noisyIntermediate], folderName, "intermediate.fasta") IORobot.writeSegOut([contigDic["Contig1_p"]], folderName, "intermediateNoiseless.fasta")
def chopUpReads(folderName, mummerLink): print "chopUpReads" interiors = [] ### Initializtion read2templateDic = readInJSON(folderName, "read2templateDic.json") template2readDic = readInJSON(folderName, "template2readDic.json") for eachitem in template2readDic: print "Length", eachitem, len(template2readDic[eachitem]) #assert(False) dataList = alignerRobot.extractMumData(folderName, "templateAnchor" + "Out") lenDicTemplates = IORobot.obtainLength(folderName, "templates.fasta") templatesDic = IORobot.loadContigsFromFile(folderName, "templates.fasta") readsDic = IORobot.loadContigsFromFile(folderName, "RList_Double.fasta") dataList.sort(key=itemgetter(-1)) ### Set up bins templateBeginEndDic = {} ell = 50 for key, items in groupby(dataList, itemgetter(-1)): begin, end = 10**9, -1 for eachitem in items: if eachitem[2] < begin: begin = eachitem[2] if eachitem[3] > end: end = eachitem[3] templateBeginEndDic[key] = [begin, end] print templateBeginEndDic GTDic = IORobot.loadContigsFromFile(folderName, "GTDic.fasta") for i in range(len(lenDicTemplates)): nameOfTemplate = "Segkk" + str(i) begin, end = templateBeginEndDic[nameOfTemplate] numberOfBins = int(math.ceil((end - begin) * 1.0 / ell)) print "numberOfBins", numberOfBins bins = [ consensusBins(j, begin + j * ell, min(begin + ell * (j + 1), end)) for j in range(numberOfBins) ] temp2readAlignDic = loadAlignment(folderName, nameOfTemplate, template2readDic[nameOfTemplate], mummerLink) #for eachdebug in temp2readAlignDic: # print temp2readAlignDic[eachdebug] ### Align reads to bins for eachalign in template2readDic[nameOfTemplate]: templateStart, templateEnd, readStart, readEnd = eachalign[ 2], eachalign[3], eachalign[0], eachalign[1] readName = eachalign[-2] #print readName, templateStart, templateEnd, readStart, readEnd , nameOfTemplate #assert(False) indexOfBin = min(math.ceil((templateStart - begin) * 1.0 / ell), numberOfBins - 1) indexOfBin = int(indexOfBin) while bins[indexOfBin].end < templateEnd: binStart, binEnd 
= bins[indexOfBin].begin, bins[indexOfBin].end readSegStart, readSegEnd = temp2readAlignDic[readName][ binStart], temp2readAlignDic[readName][binEnd] bins[indexOfBin].addToReadList( [readName, readSegStart, readSegEnd]) indexOfBin += 1 timestart = time.time() returnString = localConsensus(folderName, bins, readsDic, templatesDic, nameOfTemplate) print time.time() - timestart #assert(False) interiors.append(returnString) print "TemplateDist : ", Levenshtein.distance( templatesDic[nameOfTemplate], GTDic[nameOfTemplate]) print "CleanedDist : ", Levenshtein.distance(returnString, GTDic[nameOfTemplate]) #print returnString[10257-3:10257+3], GTDic[nameOfTemplate][10259-3:10259+3] #for eachedit in Levenshtein.editops(returnString, GTDic[nameOfTemplate]): # print eachedit # assert(False) IORobot.writeSegOut(interiors, folderName, "interiors.fasta") return interiors
def findAnchors(folderName, prevIteration, isDebug, mummerLink): ''' Input: IORobot.writeSegOut(ctgList, folderName, "templates.fasta") IORobot.putListToFileO(folderName, "raw_reads.fasta", "RList", RList) Output : the assignmentDic and lookUpDic ''' if not isDebug: alignerRobot.useMummerAlign(mummerLink, folderName, "templateAnchor", "RList_Double.fasta", "templates.fasta", False, "", False) dataList = alignerRobot.extractMumData(folderName, "templateAnchor" + "Out") lenDicReads = IORobot.obtainLength(folderName, "RList_Double.fasta") lenDicTemplates = IORobot.obtainLength(folderName, "templates.fasta") templatesDic = IORobot.loadContigsFromFile(folderName, "templates.fasta") readsDic = IORobot.loadContigsFromFile(folderName, "RList_Double.fasta") #print templatesDic["Segkk0"][1144:1144+ 50] #print readsDic["Segkk11098"][47:47+50] #print Levenshtein.distance(templatesDic["Segkk0"][1144:1144+ 50], readsDic["Segkk11098"][47:47+50]) #assert(False) read2templateDic = {} template2readDic = {} for eachitem in lenDicTemplates: template2readDic[eachitem] = [] for eachitem in lenDicReads: read2templateDic[eachitem] = [] thres = 30 dataList.sort(key=itemgetter(-2)) for key, items in groupby(dataList, itemgetter(-2)): L = lenDicReads[key] tmpList = [] for eachitem in items: if eachitem[4] > L - thres and eachitem[2] < eachitem[3]: tmpList.append(eachitem) if len(tmpList) >= 1: returnItem = resolveCompetingTemplates(folderName, tmpList, key, templatesDic, readsDic, prevIteration[0]) readName, templateName = returnItem[-2], returnItem[-1] read2templateDic[readName].append(returnItem[-1]) template2readDic[templateName].append(returnItem) with open(folderName + "read2templateDic.json", 'w') as outfile: json.dump(read2templateDic, outfile) with open(folderName + "template2readDic.json", 'w') as outfile: json.dump(template2readDic, outfile) print len(dataList), len(lenDicReads), len(lenDicTemplates), len( read2templateDic) # assert(False) else: read2templateDic = 
readInJSON(folderName, "read2templateDic.json") template2readDic = readInJSON(folderName, "template2readDic.json") return [read2templateDic, template2readDic]
def BResolvePreparation(folderName, inList, outList, G, Grev, N1, mummerLink): print "BResolvePreparation" # format : resolvedList, brResolvedList, inList, outList [] [[3, 1], [3, 7]] [6] [3, 15] # resolvedList in standard format ... just that inList, outList has unnecessary *2 for head/tail difference # Input : brtest/ [0, 8] [5, 13] # print folderName, inList, outList resolvedList = [] if len(inList) > 1 and len(outList) > 1: # prepare left/righ contigs contigLeft = [] for eachitem in inList: contigLeft.append( abunHouseKeeper.parseIDToName(eachitem / 2, 'C', 0)) contigRight = [] for eachitem in outList: contigRight.append( abunHouseKeeper.parseIDToName(eachitem / 2, 'C', 0)) print "contigLeft, contigRight", contigLeft, contigRight with open(folderName + "contigLeft.json", 'w') as outfile: json.dump(contigLeft, outfile) with open(folderName + "contigRight.json", 'w') as outfile: json.dump(contigRight, outfile) # prepare RList RListDic = loadRListDic(folderName) RList = [] for eachkey in contigLeft + contigRight: nodeIndex = abunHouseKeeper.parseEdgeNameToID(eachkey, 'C') nodeName = "Segkk" + str(nodeIndex / 2) RList = RList + RListDic[nodeName] RList = abunHouseKeeper.getDistinct(RList) IORobot.putListToFileO(folderName, "raw_reads.fasta", "RList", RList) IORobot.writeToFile_Double1(folderName, "RList.fasta", "RList_Double.fasta", "contig") # prepare intermediate ### Look for a path and then join here. 
pathList = findPathList(folderName, G, N1, contigLeft, contigRight) paths = findAPair(pathList) path1, path2 = findPathSegments(folderName, paths, N1, mummerLink) IORobot.writeSegOut([path1], folderName, "path1.fasta") IORobot.writeSegOut([path2], folderName, "path2.fasta") alignerRobot.useMummerAlign(mummerLink, folderName, "comparison", "path1.fasta", "path2.fasta", False, "", False) dataList = alignerRobot.extractMumData(folderName, "comparison" + "Out") dataList.sort(key=itemgetter(-2)) begin, end = 1, 100 for key, items in groupby(dataList, itemgetter(-2)): maxLen = -1 for eachitem in items: if eachitem[4] > maxLen: begin, end = eachitem[0], eachitem[1] maxLen = eachitem[4] path1Dic = IORobot.loadContigsFromFile(folderName, "path1.fasta") IORobot.writeSegOut([path1Dic["Segkk0"][begin - 1:end]], folderName, "intermediate.fasta") ratioScore, matching, contentForBetterInteriorToFlank = EMFlow( folderName, mummerLink) #assert(False) if 1 / ratioScore > 1.001: print "kkbug score", ratioScore for eachsub in matching: resolvedList.append([ abunHouseKeeper.parseEdgeNameToID(eachsub[0], 'C'), abunHouseKeeper.parseEdgeNameToID(eachsub[1], 'C') ]) return resolvedList