Exemplo n.º 1
0
def performPhasing(folderName, mummerLink):
    print "performPhasing"
    '''
    1. Interface from alignmentBridge.py : 
        shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList)
        cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse  = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True)
    
    2. Format of input data data : 
        bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
    
    3. IO : 
        a) Input :
            repeatSpecification.txt, phasingSeedName_Double.fasta, graph G 
        b) Output :
            improved4.fasta
            
    3. Algorithm: 
        a) reformatNoisyReads 
        b) reformatToProcessList
        c) formShortToLongMapping
    
    '''

    json_data = open(folderName + 'repeatSpecification.txt', 'r')
    loadData = json.load(json_data)
    
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    
    lenDicRR = commonLib.obtainLength(folderName, "phasingSeedName_Double.fasta")
    
    lenDicCC = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    for eachitem in loadData:
        print eachitem
        flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3] 
        
        noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1)
        
        toProcessList = reformatToProcessList(folderName , flankingList, repeatList, dicFromOriginal, N1)

        shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal,dicToOriginal, lenDicCR, N1 )
        
        indelRobot = createIndelRobot(folderName)
        
        cleaner.cleaning([noisyReads, noisyReads] , shortToLongMap, toProcessList, indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True)
        
        if extendResult != -1:
            print "extendResult: ", extendResult
            assert(1==2)
Exemplo n.º 2
0
def filterEdge(adjacencyList, folderName, contigFilename):
    lenDic = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    thresFoPhase = 2000
    smallList, largeList = [], []
    for eachitem in lenDic:
        id = parseEdgeNameToID(eachitem, 'C')
        if lenDic[eachitem] < thresFoPhase:
            smallList.append(id)
        else:
            largeList.append(id)
    
    newAdjacencyList = [[] for i in range(len(adjacencyList))]
    
    for i in largeList:
        for eachitem in adjacencyList[i]:
######## IMPORTANT:
            if  eachitem in largeList and eachitem / 2 != i / 2:
######## NEED TO REMOVE IN PRODUCTION if True
                newAdjacencyList[i].append(eachitem)
    
    
    print "len(smallList)  , len(largeList): ", len(smallList)  , len(largeList)
    print "lenDic: ", lenDic
    
    for eachitem in newAdjacencyList:
        print "newAdjacencyList :", eachitem 
        
    return newAdjacencyList
Exemplo n.º 3
0
def getAllAssociatedReads(folderName, mummerLink):
    '''
    Input : relatedReads.fasta, raw_reads.fasta 
    Output : all_associated_reads.fasta
    
     Algorithm : 
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    forFastaName = "phasingSeedName"
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    N = 1
    
    for trial in range(N):
        print "trial", trial
        if False:
            command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + header + " " + folderName + referenceFile + " " + folderName + queryFile
            os.system(command)
            
            command = mummerLink + "show-coords -r " + folderName + header + ".delta > " + folderName + header + "Out"
            os.system(command)
        
        dataList = commonLib.extractMumData(folderName, header + "Out")
        filterList = []
        
        lenDicRR = commonLib.obtainLength(folderName, queryFile)
        
        print "len(dataList)", len(dataList)
        for eachitem in dataList:
            if checkSatisfy(eachitem, lenDicRR):
                filterList.append(eachitem)
            
        filterList.sort(key=itemgetter(-1))
        newReads = []
        
        for key, items in groupby(filterList, itemgetter(-1)):
            newReads.append(key)
                                    
        
        f = open(folderName + forFastaName + ".txt", 'w')
        
        for eachitem in newReads:
            f.write(eachitem + "\n")
        f.close()
            
        command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
        os.system(command)
Exemplo n.º 4
0
def colorNodes(folderName, mummerPath,sourceFilename, contigFilename, readsetFilename):
    print "colorNodes"
    lenDic = commonLib.obtainLength(folderName, sourceFilename+".fasta")
    print lenDic
    thresForShort = 15000
    shortList = []
    longList = []
    for eachitem in lenDic:
        if lenDic[eachitem] > thresForShort:
            longList.append(eachitem)
        else:
            shortList.append(eachitem)
    
    commonLib.putListToFileO(folderName, sourceFilename+".fasta", contigFilename, longList)
    commonLib.putListToFileO(folderName, sourceFilename+".fasta", readsetFilename, shortList)
Exemplo n.º 5
0
def connectContigs(toPhase, toRemove, toBR, folderName, mummerLink):
    print "\nConnect Contigs"
    tmpList = []
    delThres =20000
    lenDic = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    
    for eachitem in toRemove:
        tmpList.append(eachitem/2)
        
    tmpList.sort()
    removeContigIndexList = []
    for key, items in groupby(tmpList):
        name = "Contig"+ str(key)+"_p"
        if lenDic[name] < delThres:
            removeContigIndexList.append(2*key)
            removeContigIndexList.append(2*key+1)
        
    print "removeContigIndexList", removeContigIndexList
    
    
    
    ### toRemove ===> remove both strand when detected
    G = commonLib.seqGraph(len(lenDic))
    
    ### hack ! make the nodeIndexList to be empty for empty nodes

    for eachnode in G.graphNodesList:
        if eachnode.nodeIndex in removeContigIndexList:
            eachnode.nodeIndexList = []
    
    # form a graph, .condense, then use readContigOut
    ### add edge 
    for eachedge in toBR:
        i = eachedge[0]/2
        j = eachedge[1]/2
        wt = eachedge[3]+1
        print "i, j, wt", i, j, wt
        G.insertEdge(i, j, wt)
    
    tmpFileName = "xphasebonus"
    G.condense()
    G.saveToFile(folderName,tmpFileName )
    
    commonLib.readContigOut(folderName, mummerLink, tmpFileName, "improved3_Double.fasta", "improved4.fasta", tmpFileName+"Open")
Exemplo n.º 6
0
def checkPathLength(path, G, N1, folderName):
    
    lenDicRR = commonLib.obtainLength(folderName, "phasingSeedName_Double.fasta")
    sumLength = 0
    overlapLength = 0
    for index, i in zip(path, range(len(path))):
        header = "Read" + str((index - N1) / 2) + "_"
        if (index - N1) % 2 == 0:
            header = header + "p"
        else:
            header = header + "d"
        print "lenDicRR[header], ", lenDicRR[header], header 
        print (index - N1) * 2 + 1, (index - N1) * 2 + 2
        sumLength = sumLength + lenDicRR[header]
        
        if i != len(path) - 1:
            for eachnext in G.graphNodesList[index].listOfNextNodes:
                if eachnext[0] == path[i + 1]:
                    overlapLength = overlapLength + eachnext[1]
                    break 
    print sumLength, overlapLength, sumLength - overlapLength
Exemplo n.º 7
0
def defineRepeatAndFlanking(folderName, mummerLink,contigFilename,contigReadGraph,repeatFilename,repeatSpec ):
    '''
    Input : 
V        a) String graph : G                
V        b) Repeat Pairing : repeatList     
        
    Output : 
V        a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) 
V        b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
V        c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])  
        
    Algorithm : 
V        1. Find repeat by graph operations
V        2. Find flanking region by graph operations
V        3. Find associated reads by graph operations
    '''
    
    print "defineRepeatAndFlanking: "


    
    
    # 0. Load previous data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = formReverseGraph(G)
    
    json_data = open(folderName + repeatFilename, 'r')
    repeatList = json.load(json_data)
    
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)
    
    
    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
     
    bigDumpList = []
    
    print "len(repeatList)", len(repeatList) , repeatList
    for r in repeatList:
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)
        
        if ( len(rIn) == 2 and len(rOut) == 2) or (len(rIn) == 1 and len(rOut) == 1):
            print rIn, rOut
            if  (len(rIn) == 1 and len(rOut) == 1):
                rIn = [rIn[0], rIn[0]]
                rOut = [rOut[0], rOut[0]]
            
            # 1. Records reachable indices
            kkIn , kkOut = [], []
            for eachkk in rIn:
                kkIn.append(str(eachkk)+"_"+"in")
            
            for eachkk in rOut:
                kkOut.append(str(eachkk)+"_"+"out")
                
            
            markReachableIndices(G, Grev, kkIn, kkOut, N1)
            
            # 2. Marks inside nodes
            singleMissList, allPassList = markInsideNodes(G, kkIn, kkOut)
            for i in range(4): 
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex
            
            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = markInterior(G , myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            #checkPathLength(repeatPathway, G, N1, folderName)
            
            # 5. Find flanking region by shortest path search again
            flankingPathsList = markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList
            
            # 6. Find associated reads by graph node query
            flankingList, repeatList = markAssociatedReads(G, singleMissList, allPassList)
            
            # ## Experimental
            repeatList = allPassList
            
            # ## End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)
            
            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
        

     


    # 7. Format return and move on to the phasing 
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
Exemplo n.º 8
0
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun  ):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = filterEdge(adjacencyList, folderName, contigFilename)
    
    G2 = commonLib.seqGraph(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()
    
    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0 :
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
                
        
        repeatList.append([getDistinct(leftList), getDistinct(rightList)])
           
    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    
    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)
    
    
    assert(loadData == repeatList)
Exemplo n.º 9
0
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    commonLib.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    commonLib.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    
    
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = commonLib.extractMumData(folderName, header + "Out")
    dataListCC = filterData(dataListCC, lenDicCC)
    
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    
    lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta")

    dataListRR = commonLib.extractMumData(folderName, header + "Out")
    dataListRR = filterData(dataListRR, lenDicRR)

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = commonLib.extractMumData(folderName, header + "Out")
    dataListCR = filterData(dataListCR, lenDicCR)
            
    numberOfNodes = len(lenDicCR) 
    G = commonLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''
    
    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]
    
    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]
    
    
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)
    
    checkGraphLength(G, N1, lenDicRR)
    
    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
Exemplo n.º 10
0
def defineRepeatAndFlanking(folderName, mummerLink):
    '''
    Input : 
V        a) String graph : G                
V        b) Repeat Pairing : repeatList     
        
    Output : 
V        a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) 
V        b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
V        c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])  
        
    Algorithm : 
V        1. Find repeat by graph operations
V        2. Find flanking region by graph operations
V        3. Find associated reads by graph operations
    '''
    
    print "defineRepeatAndFlanking: "
    
    # 0. Load previous data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    Grev = formReverseGraph(G)
    
    json_data = open(folderName + 'phaseRepeat.txt', 'r')
    repeatList = json.load(json_data)
    
    lenDicCC = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    
    
    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
    
    bigDumpList = []
    
    print "len(repeatList)",len(repeatList) , repeatList
    for r in repeatList:
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)
        
        if len(rIn) == 2 and len(rOut) == 2:
            print rIn, rOut
        
            # 1. Records reachable indices
            kkIn , kkOut = [],[]
            '''
            for eachnext in G.graphNodesList[4].listOfNextNodes:
                print 4, eachnext
                kkIn.append(eachnext[0])

            for eachprev in G.graphNodesList[6].listOfPrevNodes:
                print 6, eachprev
                kkOut.append(eachprev[0])
                
            print set(kkIn).intersection(set(kkOut))
           
            print  len( G.graphNodesList[0].listOfNextNodes), len( G.graphNodesList[2].listOfNextNodes)
            print  len( G.graphNodesList[1].listOfPrevNodes), len( G.graphNodesList[3].listOfPrevNodes)
            
            print  len( Grev.graphNodesList[0].listOfPrevNodes), len( Grev.graphNodesList[2].listOfPrevNodes)
            print  len( Grev.graphNodesList[1].listOfNextNodes), len( Grev.graphNodesList[3].listOfNextNodes)
           ''' 
            markReachableIndices(G, Grev, rIn, rOut, N1)
            
            # 2. Marks inside nodes
            singleMissList, allPassList = markInsideNodes(G, rIn, rOut)
            for i in range(4): 
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex
            
            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = markInterior(G , myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            checkPathLength(repeatPathway,  G, N1, folderName)
            
            # 5. Find flanking region by shortest path search again
            flankingPathsList = markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList
            
            # 6. Find associated reads by graph node query
            flankingList, repeatList = markAssociatedReads(G, singleMissList, allPassList)
            
            ### Experimental
            repeatList = allPassList
            
            ### End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)
            
            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
            
            
    # 7. Format return and move on to the phasing 
    with open(folderName + 'repeatSpecification.txt', 'w') as outfile:
        json.dump(bigDumpList, outfile)
Exemplo n.º 11
0
def defineRegionOfInterest(folderName , mummerLink):
    '''
    Group contig ends that align to each other, derive repeats and bridges
    from those groups, and hand the result to connectContigs.

    Reads the alignment records of "phasingOut" and the contig lengths and
    sequences of improved3_Double.fasta from folderName. Builds three lists:
        toPhase  (newRepeatList)    : repeats that still have several copies
        toRemove (globalRemoveList) : halved ids of contigs appearing on both
                                      the in and the out side of one repeat
        toBR     (newBRList)        : [inId, outId, a1, a2] bridge records
    and finally calls connectContigs(toPhase, toRemove, toBR, folderName,
    mummerLink). Returns nothing; progress is printed along the way.
    '''
    
    # Form inInfo and outInfo for [ name and endUsed ]
    # Define [terminating loc] for inInfo and outInfo
    
    print "defineRegionOfInterest"
    # commonLib.writeToFile_Double1(folderName, "improved3.fasta", "improved3_Double.fasta", "contig")
    # commonLib.useMummerAlign(mummerLink, folderName, "phasing", "improved3_Double.fasta", "improved3_Double.fasta")
    # NOTE(review): with the two calls above commented out, "phasingOut" and
    # improved3_Double.fasta must already exist in folderName.
    
    dataList = commonLib.extractMumData(folderName, "phasing" + "Out")
    lenDic = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    
    print "Record length of contigs"
    for eachitem in lenDic:
        print lenDic[eachitem], eachitem
    
    print "\nPerform alignment and group associated contigs"
    # Convention : 0_p_L, 0_p_R, 0_d_L, 0_d_R 
    # Two cluster elements per entry of lenDic: even ids start with a
    # terminating location of 0, odd ids with the full contig length.
    N = len(lenDic) * 2
    clusterList = []
    for i in range(N):
        clusterList.append(clusterElem(i))
        if i % 2 == 0:
            clusterList[i].terminatingLoc = 0
        else:
            clusterList[i].terminatingLoc = lenDic[parseIDToName(i)[0:-2]]
    
    # One pass over the alignments: opposite-side matches are collected in
    # oppoPairList, same-side matches merge the two ends into one family.
    oppoPairList = []
    for eachitem in dataList:
        terminatingLoc, resultOfCk = checkSameSideRequirement(eachitem, lenDic)
        isOppMatch, pair = checkOppositeSideRequirement(eachitem, lenDic)
        
        if isOppMatch:
            index1 = parseContigName(pair[0], 'R')
            index2 = parseContigName(pair[1], 'L')
            oppoPairList.append([index1, index2, pair[2], pair[3]])
            
        
        if resultOfCk == 'L' or resultOfCk == 'R':
            index1 = parseContigName(eachitem[-2], resultOfCk)
            index2 = parseContigName(eachitem[-1], resultOfCk)
            
            union(clusterList[index1], clusterList[index2])
            
            # 'L' ends keep the largest terminating location seen so far,
            # 'R' ends keep the smallest one.
            if resultOfCk == 'L':
                if clusterList[index1].terminatingLoc < terminatingLoc[0]:
                    clusterList[index1].terminatingLoc = terminatingLoc[0]
                
                if clusterList[index2].terminatingLoc < terminatingLoc[1]:
                    clusterList[index2].terminatingLoc = terminatingLoc[1]
            
            elif resultOfCk == 'R':
                if clusterList[index1].terminatingLoc > terminatingLoc[0]:
                    clusterList[index1].terminatingLoc = terminatingLoc[0]
                
                if clusterList[index2].terminatingLoc > terminatingLoc[1]:
                    clusterList[index2].terminatingLoc = terminatingLoc[1]
            

    # Representatives: elements that find() maps to themselves.
    headList = []
    for eachitem in clusterList:
        if find(eachitem) == eachitem:
            headList.append(eachitem)
    
    for eachitem in headList:
        
        for eachsub in familyList(eachitem):
            print parseIDToName(eachsub.id), eachsub.terminatingLoc,
        print 
   
    nFamily = len(headList)
    
    # Define the match of inInfo vs outInfo [matchList]
    # Sorted first because groupby only merges adjacent equal keys; each
    # distinct (in, out) pair then casts exactly one vote in each direction.
    oppoPairList.sort()
    for key, items in groupby(oppoPairList, itemgetter(0, 1)):
        
        # print parseIDToName(key[0]), parseIDToName(key[1])
        find(clusterList[key[0]]).voteList.append(key[1])
        find(clusterList[key[1]]).voteList.append(key[0])
    
    
    # Only representatives with an odd id are asked for a successor.
    matchList = []
    for eachitem in headList:
        if eachitem.id % 2 == 1:
            successorIndex = eachitem.findSuccessor()
            if successorIndex != -1:
                matchList.append([eachitem.id, successorIndex])
            
    # One [inList, outList] entry per match; members are [id, terminatingLoc].
    repeatList = []
    for eachitem in  matchList:
        if eachitem[0] != -1 and eachitem[1] != -1:
            
            inList = []
            for eachsubitem in familyList(find(clusterList[eachitem[0]])):
                inList.append([eachsubitem.id, eachsubitem.terminatingLoc])
                
            outList = []
            for eachsubitem in familyList(find(clusterList[eachitem[1]])):
                outList.append([eachsubitem.id, eachsubitem.terminatingLoc])
            
             
            repeatList.append([inList, outList])
            
    # Filter the embedded contigs
    # A contig present on both sides of the same repeat (same id / 2) is
    # dropped from both lists and remembered in globalRemoveList.
    globalRemoveList = []
    
    for eachitem in repeatList:
        inList = eachitem[0]
        outList = eachitem[1]
        toRemoveList = []
        for aitem in inList:
            for bitem in outList:
                if aitem[0] / 2 == bitem[0] / 2 :
                    toRemoveList.append([aitem, bitem])
                    globalRemoveList.append(aitem[0] / 2)
                
        # Removal is deferred so the lists are not mutated while being iterated.
        for eachsub in toRemoveList:
            if eachsub[0] in inList:
                inList.remove(eachsub[0])
            if eachsub[1] in outList:
                outList.remove(eachsub[1])
        
    print "\nRepeats and in/out contigs"
    for i in range(len(repeatList)):
        print "(repeatList[i][0]),(repeatList[i][1]): ", (repeatList[i][0]), (repeatList[i][1]) 
    
    print "globalRemoveList: ", globalRemoveList
    print "oppoPairList", oppoPairList
    
    
    # Define the repeat 
    contigDic = commonLib.loadContigsFromFile(folderName, "improved3_Double.fasta") 
    newRepeatList = []
    newBRList = []
    
    print "\nRepeat interior and defining flanking region"
    for eachrepeat in repeatList:
        # ## Get the initial trial
        inReadList = []
        outReadList = []
        for eachitem in eachrepeat[0]:
            inReadList.append(eachitem[0])
        for eachitem in eachrepeat[1]:
            outReadList.append(eachitem[0])
        
        # First opposite-side pair that links an in member to an out member.
        tmpLink = []
        for eachoppoPair in oppoPairList:
            if eachoppoPair[0] in inReadList and eachoppoPair[1] in outReadList:
                tmpLink = eachoppoPair
                break
            
        if len(tmpLink) > 0:
            f1Read, f2Read = tmpLink[0], tmpLink[1]
            f1 , a1, f2, a2 = -1, tmpLink[2], -1 , tmpLink[3]
            
            # f1 / f2 : terminating locations recorded for the linked pair.
            for eachitem in eachrepeat[0]:
                if eachitem[0] == f1Read:
                    f1 = eachitem[1]
            for eachitem in eachrepeat[1]:
                if eachitem[0] == f2Read:
                    f2 = eachitem[1]
            
            print "f1Read, f2Read, f1, a1, f2, a2:\t ", f1Read, f2Read, f1, a1, f2, a2
            
            f1tilde , f2tilde = f1, f2
            # ## Refine it
            for myrecord in dataList:
                myid = parseContigName(myrecord[-2], 'R')       
                otherid = parseContigName(myrecord[-1], 'R')
                
                if myid == f1Read and otherid != myid and otherid in inReadList:
                    # NOTE(review): the result of checkSameSideRequirement is
                    # unpacked into two values earlier in this function; if it
                    # always returns such a pair, this test is always true.
                    # Confirm whether the second element was meant to be checked.
                    if checkSameSideRequirement(myrecord, lenDic):
                        myStart = myrecord[0] 
                        if myStart > f1tilde:
                            f1tilde = myStart
            
            for myrecord in dataList:
                myid = parseContigName(myrecord[-2], 'L')       
                otherid = parseContigName(myrecord[-1], 'L')
                
                if myid == f2Read and otherid != myid and otherid in outReadList:
                    # NOTE(review): same always-true concern as in the loop above.
                    if checkSameSideRequirement(myrecord, lenDic):
                        myEnd = myrecord[1] 
                        if myEnd < f2tilde:
                            f2tilde = myEnd
                            
            # ## Output the loc indices and read from the real contig to get the repeat out
            print "f1Read, f2Read, f1tilde, a1, f2tilde, a2:  \t", f1Read, f2Read, f1tilde, a1, f2tilde, a2
            
            
            f1Read_parsed = parseIDToName(f1Read)[0:-2]
            f2Read_parsed = parseIDToName(f2Read)[0:-2]
            
            print "f1Read_parsed, f2Read_parsed", f1Read_parsed, f2Read_parsed, lenDic[f1Read_parsed], lenDic[f2Read_parsed]
            
            # Either the tail of the first contig plus a slice of the second
            # one, or a single slice of the first contig.
            if a2 < f2tilde:
                repeatSegment = contigDic[f1Read_parsed][f1tilde:] + contigDic[f2Read_parsed][a2:f2tilde]
            else:
                repeatSegment = contigDic[f1Read_parsed][f1tilde:f2tilde - a2]
            
            
            print "len(repeatSegment)", len(repeatSegment)
            # ## Put to repeat, remove from repeat, add toBR 
            
            if a2 >= f2tilde or a1 <= f1tilde:
                # The linked pair becomes a bridge; the remaining copies are
                # collected in tmpSeg as [ins, outs, segment].
                newBRList.append([f1Read, f2Read, a1, a2])
                tmpSeg = [[], [], repeatSegment]
                for eachin in eachrepeat[0]:
                    if eachin[0] != f1Read:
                        tmpSeg[0].append(eachin)
                
                for eachout in eachrepeat[1]:
                    if eachout[0] != f2Read:
                        tmpSeg[1].append(eachout)
                
                
                if len(tmpSeg[0]) == 1 and len(tmpSeg[1]) == 1:
                    # TODO
                    inIndex = tmpSeg[0][0][0]
                    outIndex = tmpSeg[1][0][0]
                    found = False
                    
                    # if repeat exists , then fill in the blanks
                    # otherwise, fill 0, 0. 
                    a1New, a2New = -1, 0
                    # NOTE(review): this loop only appends to oppoPairList; it
                    # never sets found, a1New or a2New, so the "else" branch
                    # below cannot run and [inIndex, outIndex, -1, 0] is always
                    # appended. Looks unfinished (see the TODO above).
                    for  secondrecord in dataList:
                        isOppMatch, pair = checkOppositeSideRequirement(secondrecord, lenDic)
                        
                        if isOppMatch:
                            index1 = parseContigName(pair[0], 'R')
                            index2 = parseContigName(pair[1], 'L')
                            
                            if index1 == inIndex and index2 == outIndex:
                                oppoPairList.append([index1, index2, pair[2], pair[3]])
                        
                    if not found:
                        newBRList.append([inIndex, outIndex, -1, 0])
                    else:
                        newBRList.append([inIndex, outIndex, a1New, a2New])
                        
                elif len(tmpSeg[0]) >= 1 and len(tmpSeg[1]) >= 1:
                    newRepeatList.append(tmpSeg) 
            else:
                if len(eachrepeat[0]) == 1 and len(eachrepeat[1]) == 1:
                    newBRList.append([f1Read, f2Read, a1, a2])
                elif len(eachrepeat[0]) >= 1 and len(eachrepeat[1]) >= 1:
                    # NOTE(review): element order here is [segment, ins, outs],
                    # while tmpSeg above is [ins, outs, segment]; confirm which
                    # layout consumers of toPhase expect.
                    newRepeatList.append([ repeatSegment, eachrepeat[0], eachrepeat[1]])
            
    # Format output 
    # Rmk: if only 1 copy is left, addToBR; if 0 left, remove that repeat    
    toPhase = newRepeatList
    toRemove = globalRemoveList
    toBR = newBRList
    
    print "Items to be returned to next step:"
    print "toPhase", len(toPhase)
    print "toRemove", len(toRemove)
    print "toBR", len(toBR) , toBR
    
    connectContigs(toPhase, toRemove, toBR, folderName, mummerLink)
Exemplo n.º 12
0
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    print "resolvingTandem"
    '''
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    


    myContigsDic = commonLib.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")    
    lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    header = optTypeFileHeader + "RR"
    dataListRR = commonLib.extractMumData(folderName, header + "Out")
    dataListRR = newPhasing.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR: 
        if eachitem[1] > eachitem[3]:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    header = optTypeFileHeader + "CR"
    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    dataListCR = commonLib.extractMumData(folderName, header + "Out")
    dataListCR = newPhasing.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR: 
        if eachitem[1] > eachitem[3]:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic



    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    
    contigsTmp = commonLib.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = commonLib.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}
    
    
    
    for eachrepProfile in loadData:
        # 1) 
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
       
        # 2) 
        if isTerminate:
            v = returnPathList[-1]
            i =0 
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1
                
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        
        repeatContent = ""
    
        for kk in range(len(tandemPath[0:-1])): 
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            
            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"
            
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent +  myContigsDic[readName][0:-overlap]
            
        print "len(repeatContent)", len(repeatContent)
        
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge= repeatContentLarge + repeatContent
        fout.close()
        
        # 4)
        repeatReadList =  eachrepProfile[1]
        
        myList= []
        for eachitem in repeatReadList:
            
            readName = "Read" + str((eachitem- N1)/2) + "_"
    
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            myList.append(readName)
            
        commonLib.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)
        
        if True:
            commonLib.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")
        
        dataList = commonLib.extractMumData(folderName, mummerFile+"Out")
        
        
        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = 50 # Important parameters : FIX needed in production
        
        #lengthDic = commonLib.obtainLength(folderName, readsetFilename+"_Double.fasta")
        
        print "dataList[0]", dataList[0]
        dataList.sort(key = itemgetter(-1))
        for key, values in  groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
    
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        
    
        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
    
        # 6) 
        # a) find the starting point 
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"
        
        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"
        
        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        
        if True:
            commonLib.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")
        
        dataList = commonLib.extractMumData(folderName, "myFirstOverlap"+"Out")
        
        dataList.sort(key = itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        
        print maxItm
        repeatStart = maxItm[0]
        contigEnd = maxItm[2]
        # b) format return : prepare the repeat template 
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
        
    # 7) Combine all the repeat information and do the join
    
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
        
    
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i) 
    
    checkingList = [False for i in range(N1)]
    
    fout = open(folderName + "tademResolved.fasta", 'w')
    
    counter = 0
    for eachcontig in contigsTmp:
        id = newPhasing.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
        
            fout.write(">Segkk"+str(counter)+ "\n")
            
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1    
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True
    
    fout.close()