Пример #1
0
def writeToFile_Double1(folderName, fileName1, fileName2, option="contig"):
    """Write each sequence of a FASTA file followed by its reverse complement.

    Reads ``folderName + fileName1``, joins the sequence lines of every
    record, and writes ``folderName + fileName2`` with two records per input
    sequence: ``<header><k>_p`` holding the sequence as read and
    ``<header><k>_d`` holding ``houseKeeper.reverseComplement`` of it, where
    ``k`` is the 0-based position of the sequence in the input.  The original
    record names are not carried over.

    Args:
        folderName: Path prefix, concatenated as-is with both file names.
        fileName1: Name of the input FASTA file.
        fileName2: Name of the output FASTA file.
        option: ``"contig"`` selects ``>Contig`` headers; any other value
            selects ``>Read`` headers.
    """
    # Read the input completely before opening the output, so a missing or
    # unreadable input no longer leaves a truncated output file behind.
    readSet = []
    pieces = []
    with open(folderName + fileName1, 'r') as fOriginal:
        for rawLine in fOriginal:
            line = rawLine.rstrip()
            if not line:
                # A blank line used to end parsing and silently drop every
                # record after it; skip it instead.
                continue
            if line[0] == '>':
                # Records without any sequence line are skipped.
                if pieces:
                    readSet.append("".join(pieces))
                    pieces = []
            else:
                pieces.append(line)
    # The trailing record is always appended (even when empty), matching the
    # numbering callers have seen so far.
    readSet.append("".join(pieces))

    print("len(readSet) %d" % len(readSet))

    header = ">Contig" if option == "contig" else ">Read"

    with open(folderName + fileName2, 'w') as f2:
        for dum, eachcontig in enumerate(readSet):
            f2.write(header + str(dum) + "_p\n")
            f2.write(eachcontig + '\n')
            f2.write(header + str(dum) + "_d\n")
            f2.write(houseKeeper.reverseComplement(eachcontig) + '\n')
Пример #2
0
def writeToFile_Double1(folderName, fileName1, fileName2, option="contig"):
    """Write each sequence of a FASTA file followed by its reverse complement.

    Reads ``folderName + fileName1``, joins the sequence lines of every
    record, and writes ``folderName + fileName2`` with two records per input
    sequence: ``<header><k>_p`` holding the sequence as read and
    ``<header><k>_d`` holding ``houseKeeper.reverseComplement`` of it, where
    ``k`` is the 0-based position of the sequence in the input.  The original
    record names are not carried over.

    Args:
        folderName: Path prefix, concatenated as-is with both file names.
        fileName1: Name of the input FASTA file.
        fileName2: Name of the output FASTA file.
        option: ``"contig"`` selects ``>Contig`` headers; any other value
            selects ``>Read`` headers.
    """
    # Read the input completely before opening the output, so a missing or
    # unreadable input no longer leaves a truncated output file behind.
    readSet = []
    pieces = []
    with open(folderName + fileName1, 'r') as fOriginal:
        for rawLine in fOriginal:
            line = rawLine.rstrip()
            if not line:
                # A blank line used to end parsing and silently drop every
                # record after it; skip it instead.
                continue
            if line[0] == '>':
                # Records without any sequence line are skipped.
                if pieces:
                    readSet.append("".join(pieces))
                    pieces = []
            else:
                pieces.append(line)
    # The trailing record is always appended (even when empty), matching the
    # numbering callers have seen so far.
    readSet.append("".join(pieces))

    print("len(readSet) %d" % len(readSet))

    header = ">Contig" if option == "contig" else ">Read"

    with open(folderName + fileName2, 'w') as f2:
        for dum, eachcontig in enumerate(readSet):
            f2.write(header + str(dum) + "_p\n")
            f2.write(eachcontig + '\n')
            f2.write(header + str(dum) + "_d\n")
            f2.write(houseKeeper.reverseComplement(eachcontig) + '\n')
Пример #3
0
def _writeTruncatedEnds(fout, runningIndex, contig, thres):
    """Write the head, the tail and their reverse complements of ``contig``.

    Four records are written through ``IORobot.writeToFile`` using four
    consecutive indices starting at ``runningIndex``; the next free index is
    returned.
    """
    head = contig[0:thres]
    tail = contig[-thres:]
    IORobot.writeToFile(fout, runningIndex, head)
    IORobot.writeToFile(fout, runningIndex + 1, tail)
    IORobot.writeToFile(fout, runningIndex + 2, houseKeeper.reverseComplement(head))
    IORobot.writeToFile(fout, runningIndex + 3, houseKeeper.reverseComplement(tail))
    return runningIndex + 4


def formRelatedReadsFile(folderName, mummerLink):
    """Collect the reads that align to contig ends into relatedReads.fasta.

    Steps, all relative to ``folderName``:

    1. Write the first and last 400 bases of every contig in
       ``improved.fasta`` (plus their reverse complements) to
       ``improvedTrunc.fasta``.
    2. Write ``improved_Double.fasta`` via ``IORobot.writeToFile_Double1``.
    3. Split ``houseKeeper.globalReadName`` with ``fasta-splitter.pl`` and
       align every part against ``improvedTrunc.fasta`` through
       ``alignerRobot.useMummerAlignBatch``.
    4. Scan the resulting ``fromMumNN`` tables and keep the read-name field
       of each row whose first two integers ``(start, end)`` satisfy
       ``start < 10`` and ``end > 390``.
    5. Write the names that were kept at least twice to
       ``associatedNames.txt``, the first 12000 of them to
       ``associatedNames2.txt``, extract those reads into
       ``relatedReads.fasta`` with a perl one-liner, and finally write
       ``relatedReads_Double.fasta``.

    Args:
        folderName: Path prefix, concatenated as-is with every file name.
        mummerLink: Passed through unchanged to
            ``alignerRobot.useMummerAlignBatch``.
    """
    print(">formRelatedReadsFile")

    thres = 400      # number of bases kept from each contig end
    endThres = 10    # slack allowed at both ends of an end-covering alignment
    maxCount = 12000  # cap on the number of names passed to the extraction

    # ## Extract heads and tails of the contigs
    runningIndex = 0
    pieces = []
    with open(folderName + "improved.fasta", 'r') as f, \
            open(folderName + "improvedTrunc.fasta", 'w') as f2:
        for rawLine in f:
            temp = rawLine[:-1] if rawLine.endswith('\n') else rawLine
            if not temp:
                # A blank line used to raise IndexError on temp[0].
                continue
            if temp[0] == '>':
                if pieces:
                    runningIndex = _writeTruncatedEnds(
                        f2, runningIndex, "".join(pieces), thres)
                    pieces = []
            else:
                pieces.append(temp)

        # The trailing contig is written unconditionally, as before.
        _writeTruncatedEnds(f2, runningIndex, "".join(pieces), thres)

    # ## Write double stranded contigs
    IORobot.writeToFile_Double1(folderName, "improved.fasta", "improved_Double.fasta", "contig")

    # ## Split the reads and align every part against the truncated contigs
    numberOfFiles = max(20, houseKeeper.globalParallel)

    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + houseKeeper.globalReadName
    os.system(command)

    # Part suffixes are zero-padded to two digits: 01, 02, ..., 10, 11, ...
    partIndices = ["%02d" % partNumber for partNumber in range(1, numberOfFiles + 1)]

    workerList = []
    for indexOfMum in partIndices:
        outputName = "outGapFillRaw" + indexOfMum
        referenceName = "improvedTrunc.fasta"
        # [0:-6] drops the last six characters of the read file name
        # (presumably the ".fasta" extension -- confirm against callers).
        queryName = houseKeeper.globalReadName[0:-6] + ".part-" + indexOfMum + ".fasta"
        specialName = "fromMum" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])

    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, True)

    # ## Collect the reads whose alignment covers a whole truncated end
    associatedReads = []
    for indexOfMum in partIndices:
        with open(folderName + "fromMum" + indexOfMum, 'r') as f:
            for lineNumber, tmp in enumerate(f):
                if lineNumber < 5:
                    # The first five lines are skipped (presumably the
                    # table header -- confirm against the aligner output).
                    continue
                if not tmp.strip():
                    # Blank rows carry no fields; they used to raise.
                    continue

                infoArr = tmp.split('|')
                # Second tab-separated entry of the last '|' field; kept
                # unstripped so grouping below behaves exactly as before.
                readField = infoArr[-1].split('\t')[1]

                pos = [int(item) for item in infoArr[0].split(" ") if len(item) > 0]
                startPos = pos[0]
                endPos = pos[1]

                if startPos < endThres and endPos > thres - endThres:
                    associatedReads.append(readField)

    associatedReads.sort()

    # ## Keep the names that were seen more than once
    ckIndex = 0   # number of distinct raw names
    oneItem = 0   # number of names seen exactly once (discarded)
    keyFound = set()
    keptNames = []
    for key, items in groupby(associatedReads):
        ckIndex += 1
        if sum(1 for _ in items) == 1:
            oneItem += 1
            continue
        key = key.rstrip()
        if key not in keyFound:
            keyFound.add(key)
            keptNames.append(key)

    print("ckIndex,oneItem:  %s %s" % (ckIndex, oneItem))

    with open(folderName + "associatedNames.txt", 'w') as f:
        f.writelines(name + '\n' for name in keptNames)

    # associatedNames2.txt holds the first maxCount lines of the file above.
    with open(folderName + "associatedNames2.txt", 'w') as fout:
        fout.writelines(name + '\n' for name in keptNames[:maxCount])

    # Raw string: the backslash in \S must reach perl untouched.
    command = r"perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "associatedNames2.txt " + folderName + houseKeeper.globalReadName + " > " + folderName + "relatedReads.fasta"
    os.system(command)

    IORobot.writeToFile_Double1(folderName, "relatedReads.fasta", "relatedReads_Double.fasta", "read")
Пример #4
0
def _iterFastaRecords(path):
    """Yield ``(name, sequence)`` pairs from the FASTA file at ``path``.

    ``name`` is the header line without its leading ``>``.  A record that has
    no sequence lines is skipped, except for the trailing record, which is
    always yielded (so an empty file yields ``("", "")``).  Blank lines are
    ignored.
    """
    name = ""
    pieces = []
    with open(path, 'r') as handle:
        for rawLine in handle:
            line = rawLine.rstrip()
            if not line:
                # A blank line used to end parsing and drop later records.
                continue
            if line[0] == '>':
                if pieces:
                    yield name, "".join(pieces)
                    pieces = []
                name = line[1:]
            else:
                pieces.append(line)
    yield name, "".join(pieces)


def writeContigReadCombine(blockedSet, dataSet, folderName, rawReadList, numberOfContig, contigList, leftConnect, option, rightConnect, mummerLink):
    """Stitch contigs and bridging raw reads into ``improved2.fasta``.

    Files are addressed as ``folderName + <name>``.  The function reads
    ``improved.fasta``, writes ``appendRaw.txt``, extracts the reads named in
    it from ``relatedReads_Double.fasta`` into ``rawToAppend.fasta`` with a
    perl one-liner, calls ``blockExtraStored`` with the strand bookkeeping,
    and writes one ``>Segkk<i>`` record per merged contig.

    Args:
        blockedSet, dataSet: Concatenated and handed to
            ``loggingReadsToRepeat`` together with ``contigList``.
        folderName: Path prefix for every file touched.
        rawReadList: Read names (strings), written one per line to
            ``appendRaw.txt``.
        numberOfContig: Number of segment ids; sizes the bookkeeping lists.
        contigList: List of lists of integer segment ids.  ``id // 2``
            indexes the sequences of ``improved.fasta`` and ``id % 2 == 1``
            selects the reverse complement.
        leftConnect: Indexed as ``leftConnect[id][1]``; the value is only
            echoed in a debug print.
        option: ``'polish'`` only triggers the "Missing polish" message.
        rightConnect: ``rightConnect[id][1] != -1`` marks a bridging read,
            whose name is ``rightConnect[id][2]``.
        mummerLink: Passed through to ``IORobot.align``.
    """
    myExtraLinkList = loggingReadsToRepeat(blockedSet + dataSet, contigList)

    readSet = [sequence for _, sequence in _iterFastaRecords(folderName + "improved.fasta")]

    # ## Put the needed rawReads into the RAM using Dictionary
    with open(folderName + "appendRaw.txt", 'w') as fAppendRaw:
        for eachraw in rawReadList:
            fAppendRaw.write(eachraw)
            fAppendRaw.write('\n')

    # Raw string: the backslash in \S must reach perl untouched.
    command = r"perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "appendRaw.txt " + folderName + "relatedReads_Double.fasta > " + folderName + "rawToAppend.fasta"
    os.system(command)

    # The input handle used to be left open; the helper closes it.
    rawRead = dict(_iterFastaRecords(folderName + "rawToAppend.fasta"))
    # ## End

    def orientedSequence(seg):
        # Even ids are taken as read, odd ids are reverse complemented.
        sequence = readSet[seg // 2]
        if seg % 2 == 0:
            return sequence
        return houseKeeper.reverseComplement(sequence)

    # Each underlying contig (id // 2) is emitted once, at its first use.
    contigUsed = [False] * (numberOfContig // 2)
    storedStrand = [[-1, 'n'] for _ in range(numberOfContig)]

    finalList = []
    for eachContig in contigList:
        tmpList = []
        for eachitem in eachContig:
            readNum = eachitem // 2
            if not contigUsed[readNum]:
                tmpList.append(eachitem)
                contigUsed[readNum] = True
                # ## mark output strand info: index of the merged contig
                storedStrand[eachitem] = [len(finalList), 'p']

        if len(tmpList) > 0:
            finalList.append(tmpList)

    # Ids that were never emitted take the merged-contig index of their
    # partner id (k+1 for even k, k-1 for odd k) and are marked 'd'.
    for kkk in range(len(storedStrand)):
        if storedStrand[kkk][1] == 'n':
            partner = kkk + 1 if kkk % 2 == 0 else kkk - 1
            storedStrand[kkk][0] = storedStrand[partner][0]
            storedStrand[kkk][1] = 'd'

    # ## begin stored output blocked pairs
    blockExtraStored(storedStrand, myExtraLinkList, folderName)
    # ## end output blocked pairs stored

    with open(folderName + "improved2.fasta", 'w') as fImproved:
        for dummyIndex, eachcontig in enumerate(finalList):
            fImproved.write(">Segkk" + str(dummyIndex) + '\n')

            # Placeholders until a segment with a bridging read is seen.
            tmpStore = -1997    # rightConnect[...][1] of the last bridged segment
            tmpStore2 = -1998   # length of the last bridging read
            tmpStore3 = -1999   # sequence of the last bridging read

            for hidum, eachseg in enumerate(eachcontig):
                # Pre-alignment values; only shown in the "Before" print.
                x, y, l = tmpStore, leftConnect[eachseg][1], tmpStore2

                newStart = 0
                extraRead = ""

                if hidum == 0:
                    # The first segment of a merged contig is written whole.
                    currentSeg = orientedSequence(eachseg)
                else:
                    previousSeg = orientedSequence(eachcontig[hidum - 1])

                    # Overlap of the previous segment with the bridging read,
                    # then of the bridging read with the current segment.
                    overlapX = IORobot.align(previousSeg, tmpStore3, folderName, mummerLink)
                    currentSeg = orientedSequence(eachseg)
                    overlapY = IORobot.align(tmpStore3, currentSeg, folderName, mummerLink)

                    print("Before : x, y , l :  %s %s %s" % (x, y, l))
                    x = overlapX[1]
                    y = overlapY[0]
                    l = tmpStore2
                    print("After : x, y , l :  %s %s %s" % (x, y, l))

                    if l < x + y:
                        # The two overlaps exceed the bridging read, so the
                        # segments are aligned to each other directly and the
                        # current one is written from overlap[1] onwards.
                        print("Before :  %s" % (x + y - l,))
                        overlapNewStart = IORobot.align(previousSeg, currentSeg, folderName, mummerLink)
                        newStart = overlapNewStart[1]
                        print("After :  %s" % (newStart,))
                    else:
                        if option == 'polish':
                            print("Missing polish")
                        # Part of the bridging read between the two overlaps.
                        extraRead = tmpStore3[x:l - y]

                print("%s %s" % (extraRead[0:10], len(extraRead)))

                fImproved.write(extraRead)
                fImproved.write(currentSeg[newStart:])

                if rightConnect[eachseg][1] != -1:
                    tmpStore = rightConnect[eachseg][1]
                    tmpStore3 = rawRead[rightConnect[eachseg][2]]
                    tmpStore2 = len(tmpStore3)

            fImproved.write('\n')