Пример #1
0
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ,
                              outputFileName):

    thres = 10

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName,
                                         [["redundantRvsQ", fileR, fileQ, ""]],
                                         houseKeeper.globalParallel)

    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]

        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)

    #print lenDicQ

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName +
              "SC_n.fasta")
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):

    thres = 10

    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [["redundantRvsQ", fileR, fileQ, ""]], houseKeeper.globalParallel
        )

    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]

        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)

    # print lenDicQ

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
Пример #3
0
def removeEmbedded(folderName , mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " + folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") 

    if not os.path.isfile(folderName + "selfOut"):
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName
    
    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')
    removeList = alignerRobot.extractMumDataAndRemove(folderName,"selfOut",lenDic,thres)
        
    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)
    
    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)
    
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
Пример #4
0
def removeRedundantWithFile(folderName, mummerLink, inputFilename,
                            mummerTmpName, outputFileName):
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta  > " +
              folderName + inputFilename + "2.fasta")

    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName +
              inputFilename + ".fasta")

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta",
            ""
        ]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, inputFilename + '.fasta')

    removeList = []

    shortEmbedClusterDic = {}

    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []

    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)

    print "len(nameList), len(returnList)", len(nameList), len(returnList)

    IORobot.putListToFileO(folderName, inputFilename + ".fasta",
                           outputFileName, returnList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta  > " + folderName + inputFilename + "2.fasta")

    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta")

    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink,
            folderName,
            [[mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", ""]],
            houseKeeper.globalParallel,
        )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, inputFilename + ".fasta")

    removeList = []

    shortEmbedClusterDic = {}

    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []

    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)

    print "len(nameList), len(returnList)", len(nameList), len(returnList)

    IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " +
              folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName +
              "contigs.fasta")

    if True:
        print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta"
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName,
            [["self", "contigs.fasta", "contigs.fasta", ""]],
            houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    print "removeEmbedded: Extracting MUMmer data from delta files to selfOut"
    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta"
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
Пример #7
0
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"

    thres = 10

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName
    os.system(command)

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName
    os.system(command)

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            "self", houseKeeper.globalContigName, houseKeeper.globalContigName,
            ""
        ]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed",
                           nameList)
def removeEmbedded(folderName , mummerLink):
    print "removeEmbedded"
    
    thres = 10
    
    command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName  + houseKeeper.globalReadName
    os.system(command)

    command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName  + houseKeeper.globalContigName
    os.system(command)


    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", houseKeeper.globalContigName, houseKeeper.globalContigName, ""]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName
    
    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    
    dataList = alignerRobot.transformCoor(dataList)
    
    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)
    
    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        
        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]
            
            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                
    
    
    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)
    
    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)
    
    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
    
Пример #9
0
 def runningTestSet(self ,myFolderName, ctexpected):
     print "Integration test on FinisherSC:  " + myFolderName
     self.sourceFolder = myFolderName
     os.system("mkdir " + self.testingFolder)
     
     for eachitem in self.listOfFiles:
         os.system("cp "+ self.sourceFolder + eachitem + " " +self.testingFolder)
     
     os.system("python finisherSC.py -par 4 "+ self.testingFolder + " "+ self.mummerPath)
     lenDic = IORobot.obtainLength(self.testingFolder, "/improved3.fasta")
     print lenDic
     assert(len(lenDic) == ctexpected)
     os.system("rm -rf "+ self.testingFolder)
Пример #10
0
 def runningTestSet(self ,myFolderName, ctexpected):
     print "Integration test on FinisherSC:  " + myFolderName
     self.sourceFolder = myFolderName
     os.system("mkdir " + self.testingFolder)
     
     for eachitem in self.listOfFiles:
         os.system("cp "+ self.sourceFolder + eachitem + " " +self.testingFolder)
     
     os.system("python finisherSC.py -par 4 "+ self.testingFolder + " "+ self.mummerPath)
     lenDic = IORobot.obtainLength(self.testingFolder, "/improved3.fasta")
     print lenDic
     assert(len(lenDic) == ctexpected)
     os.system("rm -rf "+ self.testingFolder)
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " + folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta")

    if True:
        print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta"
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel
        )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    print "removeEmbedded: Extracting MUMmer data from delta files to selfOut"
    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta"
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
Пример #12
0
def observeOverlap(folderName):

    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')
    matchThres = 10000
    nonMatchThres = 500
    count = 0

    newDataList = []
    for eachitem in dataList:
        name1, name2 = eachitem[-2], eachitem[-1]
        matchLen1, matchLen2 = eachitem[4], eachitem[5]
        start1, end1, start2, end2 = eachitem[0], eachitem[1], eachitem[
            2], eachitem[3]
        #        if name1!= name2 and min(lenDic[name1] - end1, lenDic[name2] - end2 ) > nonMatchThres \
        #        and min(start1, start2) > nonMatchThres \
        if name1!= name2 and ( min(lenDic[name1] - end1, lenDic[name2] - end2 ) > nonMatchThres \
        or min(start1, start2) > nonMatchThres ) \
        and matchLen1> matchThres:
            print "eachitem ", eachitem, lenDic[name1], lenDic[name2]
            count = count + 1
            newDataList.append(eachitem)

    print "Count: " + str(count)

    blkDic = getBreakPointFromDataList(folderName, newDataList)

    LCList = IORobot.loadContigsFromFile(folderName, "contigs.fasta")

    contigList = []

    for eachcontig in LCList:
        #print eachcontig
        if not eachcontig in blkDic:
            contigList = contigList + [LCList[eachcontig]]
        else:
            contigList = contigList + tmpBreakAcBkPts(LCList[eachcontig],
                                                      blkDic[eachcontig])

    print "len(contigList)", len(contigList)
    IORobot.writeSegOut(contigList, folderName, "breakChains.fasta")
Пример #13
0
def getBreakPointFromDataList(folderName, dataList):
    g = 1000
    blkDic = {}
    dataList.sort(key=itemgetter(-2))
    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    json_data = open(folderName + "modifiedOutliners.json", 'r')
    breakPtsDic = json.load(json_data)
    sep = 5000

    for key, items in groupby(dataList, itemgetter(-2)):
        contigName = key
        newList = []
        for eachitem in items:
            newList.append([eachitem[0], eachitem[1]])
        newList.sort()

        bktmp = [0]

        if newList[0][0] > g:
            if withinBound(sep, breakPtsDic[contigName], newList[0][0]):
                bktmp.append(newList[0][0])

        #bktmp.append(newList[0][0])
        for i in range(len(newList) - 1):
            if newList[i + 1][0] > newList[i][1] + g:
                if withinBound(sep, breakPtsDic[contigName],
                               newList[i + 1][0]):
                    bktmp.append(newList[i + 1][0])

        bktmp.append(lenDic[contigName])

        blkDic[contigName] = bktmp
        print "contigName: " + contigName
        print "bktmp:", bktmp
        print "breakPtsDic[contigName]", breakPtsDic[contigName]

    return blkDic
Пример #14
0
def extractEdgeSet(folderName, mummerLink, option="nopolish"):
    """Collect read-to-contig links and pass the best ones on for merging.

    Splits relatedReads_Double.fasta into parts with fasta-splitter.pl,
    aligns every part against the truncated contig ends
    (smaller_improvedContig.fasta) with MUMmer, gathers the link records from
    IORobot.obtainLinkInfoReadContig, filters them, keeps for every
    (field 0, field 1) pair the record with the largest field 2, and hands
    the result to writeContigReadCombine.

    Args:
        folderName: directory prefix; file names are appended to it by plain
            string concatenation.
        mummerLink: forwarded to the MUMmer helper calls.
        option: forwarded unchanged to writeContigReadCombine.
    """
    # Tasks: reconstruct the string  graph
    
    # Input : relatedReads_Double.fasta, conig_Double.fasta
    # Intermediate files: fromMum_overlap , fromMum_overlap
    # Output: connectivity of eachNode: InList, OutList [critical]
    #         connectivity of eachNode: arrow representation with size [optional]
    
    
    # ## Perform MUMMER alignment
    print ">Extract Edge set"
    contigOnlyLengthDic = IORobot.obtainLength(folderName, "improved.fasta")
    
    # print lengthDic
    lengthDic = IORobot.findContigLength(folderName, "improved")
    
    # Twice the number of sequences in improved.fasta; presumably one node
    # per orientation -- TODO confirm against formbestpair.
    numberOfContig = len(contigOnlyLengthDic)*2

    K = 400
    thres = 5
    
    
    # ## Apply MUMMER on them using cleanedReads against them
    IORobot.truncateEndOfContigs(folderName, "improved_Double.fasta", "smaller_improvedContig.fasta", 25000, lengthDic)
    dataSet = []
    
    # At least 20 read parts, more when the configured parallelism is higher.
    numberOfFiles = max(20, houseKeeper.globalParallel)
    

    if True:
        # fasta-splitter.pl is expected next to the running script.
        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "relatedReads_Double.fasta"
        os.system(command)
        
        
    workerList = [] 
    for dummyI in range(1, numberOfFiles + 1):
        # Part index, zero-padded to two digits (01, 02, ..., 10, ...).
        # NOTE(review): with 100 or more parts the padding stays at two
        # digits; confirm this still matches the splitter's file naming.
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        
        outputName, referenceName, queryName, specialName=  "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])
    
        
        
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,True)
        
        # alignerRobot.useMummerAlign(mummerLink, folderName, "outRefine", "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", True,  "fromMumRefine" + indexOfMum)
        
    
    # Gather the link records of every part into one list.
    for dummyI in range(1, numberOfFiles + 1):
        tmpSet = IORobot.obtainLinkInfoReadContig(dummyI, mummerLink, folderName,thres, lengthDic, K)
        dataSet = dataSet + tmpSet
    
    # ## repeat aware
    usableJunction = loadOpenList(folderName)
    dataSet, blockedSet = filterRepeatEnd(dataSet, usableJunction)
    # ## repeat aware end
    
    dataSet.sort()
    matchPair = formMatchPairFromReadInfo(dataSet)
    
    # Bug fix on repeat detection from reads alone
    matchPair = filterRepeatPair(matchPair)
    # end bug fix
    
    # print matchPair

    bestMatchPair = []
    
    # groupby only merges adjacent records; this assumes matchPair is still
    # ordered by its first two fields after the filtering above -- TODO confirm.
    for key, items in groupby(matchPair, itemgetter(0, 1)):
        # Keep the fields 3..5 of the record with the largest field 2.
        maxvalue = -1
        maxLenPair = []
        for eachitem in items:
            if eachitem[2] > maxvalue:
                maxvalue = eachitem[2]
                maxLenPair = [eachitem[3], eachitem[4], eachitem[5]]
        bestMatchPair.append([key[0], key[1], maxvalue, maxLenPair[0], maxLenPair[1], maxLenPair[2]])
    
    contigList, leftConnect, rightConnect, rawReadList = formbestpair(bestMatchPair,numberOfContig)
    print "contigList", contigList
    
    writeContigReadCombine(blockedSet, dataSet, folderName, rawReadList, numberOfContig, contigList, leftConnect, option, rightConnect, mummerLink)
def xPhased(folderName , mummerLink):
    # ## Repeat resolution  [Proxy for MB]
    # 1. Re-form the contig string graph with ALL connections from contigs only V
    # 2. Log down the reads and associated blocked contigs V 
    # 3. Use reads to connect;
    # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important);
    # 5. Read out contigs
    
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")
    
    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")
    
    confidenLenThres = 0 
    
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)
    
    for eachitem in dataSet:
        # print eachitem
        wt, myin, myout = eachitem
        myInData = myin[6:].split('_')
        myOutData = myout[6:].split('_')
        
        if myInData[1] == 'p':
            offsetin = 0
        else:
            offsetin = 1
        
        if myOutData[1] == 'p':
            offsetout = 0
        else:
            offsetout = 1
            
        i = int(myInData[0]) * 2 + offsetin
        j = int(myOutData[0]) * 2 + offsetout
        
        ck = False
        
        for eachedge in extraEdges:
            mystart, myend, len1, len2 = eachedge[0], eachedge[1], eachedge[2] , eachedge[3]
            if [i, j] == [mystart, myend] and min(len1, len2) >= wt and lenDic[myin] >= confidenLenThres and lenDic[myout] >= confidenLenThres:
                ck = True
                
        if ck:
            G.insertEdge(i, j, wt)
    
    
    # G.reportEdge()
    G.MBResolve()
    G.reportEdge()
    
    G.saveToFile(folderName, "condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"
    
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)
    
    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat 
    # [short cut : use contig creator : your job here is to get data into the correct formats]
    
    
    
    
    print "xPhased"
Пример #16
0
def xPhased(folderName , mummerLink):
    # ## Repeat resolution  [Proxy for MB]
    # 1. Re-form the contig string graph with ALL connections from contigs only V
    # 2. Log down the reads and associated blocked contigs V 
    # 3. Use reads to connect;
    # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important);
    # 5. Read out contigs
    
    print "xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta"
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")
    
    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")
    
    confidenLenThres = 0 
    
    print "xPhased: Building seqGraph"
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)
    
    for eachitem in dataSet:
        # print eachitem
        wt, myin, myout = eachitem
        myInData = myin[6:].split('_')
        myOutData = myout[6:].split('_')
        
        if myInData[1] == 'p':
            offsetin = 0
        else:
            offsetin = 1
        
        if myOutData[1] == 'p':
            offsetout = 0
        else:
            offsetout = 1
            
        i = int(myInData[0]) * 2 + offsetin
        j = int(myOutData[0]) * 2 + offsetout
        
        ck = False
        
        for eachedge in extraEdges:
            mystart, myend, len1, len2 = eachedge[0], eachedge[1], eachedge[2] , eachedge[3]
            if [i, j] == [mystart, myend] and min(len1, len2) >= wt and lenDic[myin] >= confidenLenThres and lenDic[myout] >= confidenLenThres:
                ck = True
                
        if ck:
            G.insertEdge(i, j, wt)
    
    
    # G.reportEdge()
    G.MBResolve()
    G.reportEdge()
    
    print "xPhased: Saving condensed seqGraph to condensedGraphMB.txt"
    G.saveToFile(folderName, "condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"
    
    print "xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta"
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)
    
    
    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat 
    # [short cut : use contig creator : your job here is to get data into the correct formats]
    
    
    
    
    print "xPhased"