def writeToFile_Double1(folderName, fileName1, fileName2, option="contig"):
    """Write every sequence of a FASTA file in both orientations.

    Reads ``folderName + fileName1`` and writes ``folderName + fileName2``.
    Each collected sequence is emitted twice: as ``<header><i>_p`` (the
    sequence as read) and ``<header><i>_d`` (the result of
    ``houseKeeper.reverseComplement`` on it), where ``i`` is the 0-based
    position of the sequence among those collected. The original record
    names are not preserved.

    Args:
        folderName: Path prefix. It is joined to the file names by plain
            string concatenation, so it must already end with a separator.
        fileName1: Name of the input FASTA file.
        fileName2: Name of the output FASTA file.
        option: ``"contig"`` selects the ``>Contig`` header prefix; any other
            value selects ``>Read``.

    Raises:
        IOError: If the input cannot be opened. The input is read completely
            before the output is opened, so a missing input no longer leaves
            a truncated output file behind.
    """
    readSet = []
    pieces = []
    with open(folderName + fileName1, 'r') as fOriginal:
        for rawLine in fOriginal:
            line = rawLine.rstrip()
            if not line:
                # A blank line ends parsing. Kept on purpose: the other FASTA
                # readers in this file stop at the same place, and they must
                # agree on how the records are numbered.
                break
            if line[0] == '>':
                # Header: close the sequence collected so far (if any).
                if pieces:
                    readSet.append("".join(pieces))
                    pieces = []
            else:
                # Collect in a list and join once, instead of re-copying the
                # growing sequence on every line.
                pieces.append(line)

    # The trailing sequence is always appended, even when it is empty (empty
    # file, or a header with no sequence at the very end).
    readSet.append("".join(pieces))

    print("len(readSet) %s" % len(readSet))

    if option == "contig":
        header = ">Contig"
    else:
        header = ">Read"

    with open(folderName + fileName2, 'w') as f2:
        for dum, eachcontig in enumerate(readSet):
            f2.write(header + str(dum) + "_p\n")
            f2.write(eachcontig + '\n')
            f2.write(header + str(dum) + "_d\n")
            f2.write(houseKeeper.reverseComplement(eachcontig) + '\n')
def _writeContigEndSegments(fout, runningIndex, contig, thres):
    """Write both ends of ``contig`` and their reverse complements.

    Four records are written through ``IORobot.writeToFile`` with consecutive
    indices starting at ``runningIndex``, in this order: first ``thres``
    characters, last ``thres`` characters, reverse complement of the first
    ``thres`` characters, reverse complement of the last ``thres`` characters.

    Returns:
        The next unused record index (``runningIndex + 4``).
    """
    head = contig[0:thres]
    tail = contig[-thres:]
    IORobot.writeToFile(fout, runningIndex, head)
    IORobot.writeToFile(fout, runningIndex + 1, tail)
    IORobot.writeToFile(fout, runningIndex + 2, houseKeeper.reverseComplement(head))
    IORobot.writeToFile(fout, runningIndex + 3, houseKeeper.reverseComplement(tail))
    return runningIndex + 4


def formRelatedReadsFile(folderName, mummerLink):
    """Extract the raw reads that align across contig ends.

    Steps, all relative to ``folderName`` (a path prefix that must already end
    with a separator, since paths are built by string concatenation):

    1. Read ``improved.fasta`` and write the first/last 400 characters of each
       contig, plus their reverse complements, to ``improvedTrunc.fasta``.
    2. Write ``improved_Double.fasta`` via ``IORobot.writeToFile_Double1``.
    3. Split ``houseKeeper.globalReadName`` into
       ``max(20, houseKeeper.globalParallel)`` parts with
       ``fasta-splitter.pl`` (looked up next to ``sys.argv[0]``) and align
       every part against ``improvedTrunc.fasta`` with
       ``alignerRobot.useMummerAlignBatch``.
    4. Parse the resulting ``fromMumNN`` files and keep the names of reads
       that have at least two alignment rows whose first two coordinates span
       an end segment (start < 10 and end > 390).
    5. Write those names to ``associatedNames.txt``, the first 12000 of them
       to ``associatedNames2.txt``, pull the matching records into
       ``relatedReads.fasta`` with a perl one-liner, and write
       ``relatedReads_Double.fasta``.

    Args:
        folderName: Working-directory prefix (see above).
        mummerLink: Passed through to ``alignerRobot.useMummerAlignBatch``.

    Returns:
        None. All results are files.
    """
    print(">formRelatedReadsFile")

    thres = 400
    endThres = 10

    # ## Extract heads of the contigs
    runningIndex = 0
    contigPieces = []
    with open(folderName + "improved.fasta", 'r') as f:
        with open(folderName + "improvedTrunc.fasta", 'w') as f2:
            for temp in f:
                if temp[-1] == '\n':
                    temp = temp[0:-1]
                if not temp:
                    # Indexing temp[0] on an empty line would raise
                    # IndexError. Stop here instead, which is how the other
                    # FASTA readers in this file treat a blank line, so the
                    # contig numbering stays aligned with theirs.
                    break
                if temp[0] == '>':
                    if contigPieces:
                        runningIndex = _writeContigEndSegments(
                            f2, runningIndex, "".join(contigPieces), thres)
                        contigPieces = []
                else:
                    contigPieces.append(temp)

            # The trailing contig is flushed unconditionally, even when empty.
            _writeContigEndSegments(f2, runningIndex, "".join(contigPieces), thres)

    # ## Write double stranded reads
    IORobot.writeToFile_Double1(folderName, "improved.fasta", "improved_Double.fasta", "contig")

    # ## Apply MUMMER on them using cleanedReads against them
    numberOfFiles = max(20, houseKeeper.globalParallel)

    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + houseKeeper.globalReadName
    os.system(command)

    # Part suffixes are zero-padded to two digits ("01" ... "20").
    partSuffixes = ["%02d" % partNumber for partNumber in range(1, numberOfFiles + 1)]

    workerList = []
    for indexOfMum in partSuffixes:
        outputName = "outGapFillRaw" + indexOfMum
        referenceName = "improvedTrunc.fasta"
        # [0:-6] drops the last six characters of the read file name
        # (presumably the ".fasta" extension; verify against the configuration).
        queryName = houseKeeper.globalReadName[0:-6] + ".part-" + indexOfMum + ".fasta"
        specialName = "fromMum" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])

    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, True)

    # ## Collect the names of reads that cover a whole end segment
    assoiatedReadIndex = []
    for indexOfMum in partSuffixes:
        with open(folderName + "fromMum" + indexOfMum, 'r') as f:
            # Skip five header lines; the sixth line read is the first data row.
            for _ in range(6):
                tmp = f.readline()

            while len(tmp) > 0:
                infoArr = tmp.split('|')
                # Last '|' field is "<reference name>\t<read name>\n".
                myArr = infoArr[-1].split('\t')
                # First '|' field holds the start/end coordinates.
                pos = [int(eachitem) for eachitem in infoArr[0].split(" ") if len(eachitem) > 0]
                startPos = pos[0]
                endPos = pos[1]

                if startPos < endThres and endPos > thres - endThres:
                    # Stored with its trailing newline; stripped when written.
                    assoiatedReadIndex.append(myArr[1])

                tmp = f.readline()

    assoiatedReadIndex.sort()

    ckIndex = 0
    oneItem = 0
    keyFound = set()
    with open(folderName + "associatedNames.txt", 'w') as f:
        for key, items in groupby(assoiatedReadIndex):
            countItem = sum(1 for _ in items)
            if countItem == 1:
                oneItem += 1
            else:
                key = key.rstrip()
                if key not in keyFound:
                    f.write(key + '\n')
                    keyFound.add(key)
            # NOTE(review): counted once per group here; the collapsed source
            # does not show the nesting. Only the diagnostic below depends on it.
            ckIndex += 1

    print("ckIndex,oneItem:  %s %s" % (ckIndex, oneItem))

    # Keep at most the first maxCount names.
    maxCount = 12000
    with open(folderName + "associatedNames.txt", 'r') as fFilter:
        with open(folderName + "associatedNames2.txt", 'w') as fout:
            for lineNumber, line in enumerate(fFilter):
                if lineNumber >= maxCount:
                    break
                fout.write(line)

    # perl filter: the first file supplies the wanted record names, the second
    # is the FASTA file to select from.
    command = "perl -ne 'if(/^>(\\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "associatedNames2.txt " + folderName + houseKeeper.globalReadName + " > " + folderName + "relatedReads.fasta"
    os.system(command)

    IORobot.writeToFile_Double1(folderName, "relatedReads.fasta", "relatedReads_Double.fasta", "read")
def writeContigReadCombine(blockedSet, dataSet, folderName, rawReadList, numberOfContig, contigList, leftConnect, option, rightConnect, mummerLink):
    """Stitch contigs and the raw reads linking them into ``improved2.fasta``.

    Contigs are loaded from ``folderName + "improved.fasta"``. The raw reads
    named in ``rawReadList`` are pulled out of ``relatedReads_Double.fasta``
    into ``rawToAppend.fasta`` (via ``appendRaw.txt`` and a perl one-liner)
    and held in a dictionary. Each entry of ``contigList`` then becomes one
    ``>Segkk<i>`` record: its contigs are written in order, and between two
    consecutive contigs either a slice of the linking raw read is inserted or
    the leading part of the second contig is skipped, depending on the
    alignments returned by ``IORobot.align``.

    Contigs are referred to by an oriented id ``k``: ``k // 2`` is the index
    of the contig in ``improved.fasta`` and ``k % 2`` the orientation (0 = as
    stored, 1 = reverse complement).

    Args:
        blockedSet: Concatenated with ``dataSet`` and passed to
            ``loggingReadsToRepeat``.
        dataSet: See ``blockedSet``.
        folderName: Path prefix; must already end with a separator.
        rawReadList: Iterable of read names, written one per line to
            ``appendRaw.txt``.
        numberOfContig: Number of oriented ids (two per contig).
        contigList: List of lists of oriented ids. An id whose contig was
            already used by an earlier entry is dropped.
        leftConnect: Indexed by oriented id; element ``[1]`` is read.
        option: ``'polish'`` only prints "Missing polish"; the written
            sequence is the same for every value.
        rightConnect: Indexed by oriented id; element ``[1]`` is ``-1`` when
            the contig has no outgoing link, otherwise element ``[2]`` is the
            name of the linking raw read.
        mummerLink: Passed through to ``IORobot.align``.

    Returns:
        None. Output goes to ``improved2.fasta`` and to whatever
        ``blockExtraStored`` writes.
    """
    # ## repeat aware logging
    myExtraLinkList = loggingReadsToRepeat(blockedSet + dataSet, contigList)
    # ## end repeat aware logging

    # ## Load the contigs (parsing stops at the first blank line and the
    # trailing sequence is always appended, even when empty)
    readSet = []
    pieces = []
    with open(folderName + "improved.fasta", 'r') as fOriginal:
        for rawLine in fOriginal:
            line = rawLine.rstrip()
            if not line:
                break
            if line[0] == '>':
                if pieces:
                    readSet.append("".join(pieces))
                    pieces = []
            else:
                pieces.append(line)
    readSet.append("".join(pieces))

    # ## Put the needed rawReads into the RAM using Dictionary
    with open(folderName + "appendRaw.txt", 'w') as fAppendRaw:
        for eachraw in rawReadList:
            fAppendRaw.write(eachraw)
            fAppendRaw.write('\n')

    command = "perl -ne 'if(/^>(\\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "appendRaw.txt " + folderName + "relatedReads_Double.fasta > " + folderName + "rawToAppend.fasta"
    os.system(command)

    rawRead = {}
    tmpName = ""
    pieces = []
    with open(folderName + "rawToAppend.fasta", 'r') as fOriginal:
        for rawLine in fOriginal:
            line = rawLine.rstrip()
            if not line:
                break
            if line[0] == '>':
                if pieces:
                    rawRead[tmpName] = "".join(pieces)
                    pieces = []
                tmpName = line[1:]
            else:
                pieces.append(line)
    # The trailing record is always stored (under "" when the file is empty).
    rawRead[tmpName] = "".join(pieces)
    # ## End

    # ## Use each contig once and remember where and in which strand it lands
    contigUsed = [False] * (numberOfContig // 2)
    storedStrand = [[-1, 'n'] for _ in range(numberOfContig)]
    finalList = []
    for eachContig in contigList:
        tmpList = []
        for eachitem in eachContig:
            readNum = eachitem // 2
            if not contigUsed[readNum]:
                tmpList.append(eachitem)
                contigUsed[readNum] = True
                # ## mark ouput strandinfo
                storedStrand[eachitem] = [len(finalList), 'p']
        if len(tmpList) > 0:
            finalList.append(tmpList)

    # An oriented id that was not written itself takes the output index of
    # its opposite-strand partner and is marked 'd'.
    for kkk in range(len(storedStrand)):
        if storedStrand[kkk][1] == 'n':
            if kkk % 2 == 0:
                partner = kkk + 1
            else:
                partner = kkk - 1
            storedStrand[kkk][0] = storedStrand[partner][0]
            storedStrand[kkk][1] = 'd'

    # ## begin stored output blocked pairs
    blockExtraStored(storedStrand, myExtraLinkList, folderName)
    # ## end output blocked pairs stored

    with open(folderName + "improved2.fasta", 'w') as fImproved:
        for dummyIndex, eachcontig in enumerate(finalList):
            fImproved.write(">Segkk" + str(dummyIndex) + '\n')

            # State carried from the previous segment's outgoing link. The
            # negative values are placeholders until a link has been seen.
            linkOffset = -1997    # rightConnect[prev][1]
            linkReadLen = -1998   # length of the linking raw read
            linkRead = -1999      # sequence of the linking raw read
            prevSeg = None        # previous contig, already oriented

            for hidum, eachseg in enumerate(eachcontig):
                readNum = eachseg // 2
                # Orient the contig once per segment and reuse it below
                # (assumes houseKeeper.reverseComplement is a pure function;
                # TODO confirm).
                if eachseg % 2 == 0:
                    currentSeg = readSet[readNum]
                else:
                    currentSeg = houseKeeper.reverseComplement(readSet[readNum])

                newStart = 0
                extraRead = ""

                # Begin hack
                # Pre-hack values; after the first segment they only feed the
                # "Before" diagnostic.
                # NOTE(review): treated as a live statement; the collapsed
                # source does not show whether it was commented out.
                x, y, l = linkOffset, leftConnect[eachseg][1], linkReadLen

                if hidum > 0:
                    # x comes from aligning the previous contig against the
                    # link read, y from the link read against this contig.
                    overlapX = IORobot.align(prevSeg, linkRead, folderName, mummerLink)
                    overlapY = IORobot.align(linkRead, currentSeg, folderName, mummerLink)

                    print("Before : x, y , l :  %s %s %s" % (x, y, l))
                    x = overlapX[1]
                    y = overlapY[0]
                    l = linkReadLen
                    print("After : x, y , l :  %s %s %s" % (x, y, l))
                    # End hack

                    if l < x + y:
                        # No slice of the link read is inserted: align the two
                        # contigs directly and skip the start of this one.
                        # begin hack
                        newStart = x + y - l  # old value, shown as "Before"
                        print("Before :  %s" % newStart)
                        overlapNewStart = IORobot.align(prevSeg, currentSeg, folderName, mummerLink)
                        newStart = overlapNewStart[1]
                        print("After :  %s" % newStart)
                        # end hack
                    else:
                        newStart = 0
                        if option == 'polish':
                            print("Missing polish")
                        # Slice of the link read inserted between the contigs.
                        extraRead = linkRead[x:l - y]
                        # NOTE(review): diagnostic placed after both option
                        # branches; nesting is not recoverable from the source.
                        print("%s %s" % (extraRead[0:10], len(extraRead)))

                fImproved.write(extraRead)
                fImproved.write(currentSeg[newStart:])

                if rightConnect[eachseg][1] != -1:
                    linkOffset = rightConnect[eachseg][1]
                    linkRead = rawRead[rightConnect[eachseg][2]]
                    linkReadLen = len(linkRead)

                prevSeg = currentSeg

            fImproved.write('\n')