def formSeqGraph(folderName, mummerLink):
    """Build, condense and save the sequence graph, then read out contigs.

    Reads the connection lists "rightConnect.txt" and "leftConnect.txt" from
    folderName via readConnectList, inserts one edge per list entry into a
    graphLib.seqGraph, cleans and condenses the graph, saves it to
    "condensedGraph.txt", reloads the saved copy to compare it with the
    in-memory graph, and finally calls IORobot.readContigOut with
    "noEmbed_Double.fasta" as the contig file and "improved.fasta" /
    "openZone.txt" as the output names.

    Args:
        folderName: Folder handed to every read/write helper called here.
        mummerLink: Forwarded unchanged to IORobot.readContigOut.

    Returns:
        None.
    """
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("formSeqGraph")

    print("formSeqGraph: Reading in best successors and predecessors")
    rightConnect = readConnectList(folderName, "rightConnect.txt")
    leftConnect = readConnectList(folderName, "leftConnect.txt")

    print("formSeqGraph: Initializing seqGraph")
    # The graph is sized from the successor list only.
    G = graphLib.seqGraph(len(rightConnect))

    print("formSeqGraph: Adding edges to seqGraph")
    # Entry number `index` of rightConnect gives the edge index -> connector.
    for index, (connector, weight) in enumerate(rightConnect):
        G.insertEdge(index, connector, weight)

    # Entry number `index` of leftConnect gives the edge connector -> index.
    for index, (connector, weight) in enumerate(leftConnect):
        G.insertEdge(connector, index, weight)

    graphFileName = "condensedGraph.txt"

    G.cleanEdge()
    G.condense()
    print("formSeqGraph: Outputting seqGraph to condensedGraph.txt")
    G.saveToFile(folderName, graphFileName)
    G.checkSelfLoops()
    G.checkCompleteness()

    # Round-trip check: reload what was just written and compare it with G.
    G2 = graphLib.seqGraph(0)
    G2.loadFromFile(folderName, graphFileName)

    houseKeeper.compareGraphUnitTest(G, G2)
    G.reportDummyUsefulNode()
    G.reportEdge()

    contigFile = "noEmbed_Double.fasta"
    outContigFile = "improved.fasta"
    outOpenList = "openZone.txt"

    print("formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile,
                          outContigFile, outOpenList)
# Example #2
# 0
def formSeqGraph(folderName, mummerLink):
    """Build, condense and save the sequence graph, then read out contigs.

    Reads the connection lists "rightConnect.txt" and "leftConnect.txt" from
    folderName via readConnectList, inserts one edge per list entry into a
    graphLib.seqGraph, cleans and condenses the graph, saves it to
    "condensedGraph.txt", reloads the saved copy to compare it with the
    in-memory graph, and finally calls IORobot.readContigOut with
    "noEmbed_Double.fasta" as the contig file and "improved.fasta" /
    "openZone.txt" as the output names.

    Args:
        folderName: Folder handed to every read/write helper called here.
        mummerLink: Forwarded unchanged to IORobot.readContigOut.

    Returns:
        None.
    """
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("formSeqGraph")

    print("formSeqGraph: Reading in best successors and predecessors")
    rightConnect = readConnectList(folderName, "rightConnect.txt")
    leftConnect = readConnectList(folderName, "leftConnect.txt")

    print("formSeqGraph: Initializing seqGraph")
    # The graph is sized from the successor list only.
    G = graphLib.seqGraph(len(rightConnect))

    print("formSeqGraph: Adding edges to seqGraph")
    # Entry number `index` of rightConnect gives the edge index -> connector.
    for index, (connector, weight) in enumerate(rightConnect):
        G.insertEdge(index, connector, weight)

    # Entry number `index` of leftConnect gives the edge connector -> index.
    for index, (connector, weight) in enumerate(leftConnect):
        G.insertEdge(connector, index, weight)

    graphFileName = "condensedGraph.txt"

    G.cleanEdge()
    G.condense()
    print("formSeqGraph: Outputting seqGraph to condensedGraph.txt")
    G.saveToFile(folderName, graphFileName)
    G.checkSelfLoops()
    G.checkCompleteness()

    # Round-trip check: reload what was just written and compare it with G.
    G2 = graphLib.seqGraph(0)
    G2.loadFromFile(folderName, graphFileName)

    houseKeeper.compareGraphUnitTest(G, G2)
    G.reportDummyUsefulNode()
    G.reportEdge()

    contigFile = "noEmbed_Double.fasta"
    outContigFile = "improved.fasta"
    outOpenList = "openZone.txt"

    print("formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile,
                          outContigFile, outOpenList)
def xPhased(folderName, mummerLink):
    """Rebuild the contig graph from read-supported links and read out contigs.

    Obtains candidate links with IORobot.obtainLinkInfo(folderName,
    mummerLink, "improved2", "mb") and contig lengths with
    IORobot.obtainLength(folderName, "improved2_Double.fasta"). A link
    (wt, myin, myout) becomes an edge i -> j only when
    loadEdgeFromBlockedReads(folderName) returned at least one record with
    the same (start, end) whose min(record[2], record[3]) is >= wt, and both
    lenDic[myin] and lenDic[myout] are >= confidenLenThres (fixed at 0 here).
    The graph is then passed through G.MBResolve(), saved to
    "condensedGraphMB.txt", and IORobot.readContigOut is called with
    "improved3.fasta" / "openZoneMB.txt" as the output names.

    Args:
        folderName: Folder handed to every read/write helper called here.
        mummerLink: Forwarded to IORobot.obtainLinkInfo and
            IORobot.readContigOut.

    Returns:
        None.
    """
    # ## Repeat resolution  [Proxy for MB]
    # 1. Re-form the contig string graph with ALL connections from contigs only V
    # 2. Log down the reads and associated blocked contigs V
    # 3. Use reads to connect;
    # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important);
    # 5. Read out contigs

    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")

    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")

    confidenLenThres = 0

    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    # Index the read-derived edges once instead of rescanning the whole list
    # for every link. Only the largest min(len1, len2) per (start, end) is
    # needed, because a link is accepted as soon as ANY record with matching
    # endpoints reaches its weight.
    bestSupport = {}
    for eachedge in extraEdges:
        ends = (eachedge[0], eachedge[1])
        support = min(eachedge[2], eachedge[3])
        if ends not in bestSupport or support > bestSupport[ends]:
            bestSupport[ends] = support

    for wt, myin, myout in dataSet:
        # The first 6 characters of each name are skipped (presumably a
        # "Contig" prefix -- confirm); the rest is "<number>_<tag>...".
        inParts = myin[6:].split('_')
        outParts = myout[6:].split('_')

        # Tag 'p' selects node 2*number, any other tag selects 2*number + 1.
        i = int(inParts[0]) * 2 + (0 if inParts[1] == 'p' else 1)
        j = int(outParts[0]) * 2 + (0 if outParts[1] == 'p' else 1)

        support = bestSupport.get((i, j))

        # The length lookups stay last so that, as before, they only run for
        # links that already have sufficient read support.
        if (support is not None and support >= wt
                and lenDic[myin] >= confidenLenThres
                and lenDic[myout] >= confidenLenThres):
            G.insertEdge(i, j, wt)

    G.MBResolve()
    G.reportEdge()

    graphFileName = "condensedGraphMB.txt"
    G.saveToFile(folderName, graphFileName)
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"

    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)

    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into the correct formats]

    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("xPhased")
# Example #4
# 0
def xPhased(folderName, mummerLink):
    """Rebuild the contig graph from read-supported links and read out contigs.

    Obtains candidate links with IORobot.obtainLinkInfo(folderName,
    mummerLink, "improved2", "mb") and contig lengths with
    IORobot.obtainLength(folderName, "improved2_Double.fasta"). A link
    (wt, myin, myout) becomes an edge i -> j only when
    loadEdgeFromBlockedReads(folderName) returned at least one record with
    the same (start, end) whose min(record[2], record[3]) is >= wt, and both
    lenDic[myin] and lenDic[myout] are >= confidenLenThres (fixed at 0 here).
    The graph is then passed through G.MBResolve(), saved to
    "condensedGraphMB.txt", and IORobot.readContigOut is called with
    "improved3.fasta" / "openZoneMB.txt" as the output names. Progress
    messages are printed before each major step.

    Args:
        folderName: Folder handed to every read/write helper called here.
        mummerLink: Forwarded to IORobot.obtainLinkInfo and
            IORobot.readContigOut.

    Returns:
        None.
    """
    # ## Repeat resolution  [Proxy for MB]
    # 1. Re-form the contig string graph with ALL connections from contigs only V
    # 2. Log down the reads and associated blocked contigs V
    # 3. Use reads to connect;
    # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important);
    # 5. Read out contigs

    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")

    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")

    confidenLenThres = 0

    print("xPhased: Building seqGraph")
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    # Index the read-derived edges once instead of rescanning the whole list
    # for every link. Only the largest min(len1, len2) per (start, end) is
    # needed, because a link is accepted as soon as ANY record with matching
    # endpoints reaches its weight.
    bestSupport = {}
    for eachedge in extraEdges:
        ends = (eachedge[0], eachedge[1])
        support = min(eachedge[2], eachedge[3])
        if ends not in bestSupport or support > bestSupport[ends]:
            bestSupport[ends] = support

    for wt, myin, myout in dataSet:
        # The first 6 characters of each name are skipped (presumably a
        # "Contig" prefix -- confirm); the rest is "<number>_<tag>...".
        inParts = myin[6:].split('_')
        outParts = myout[6:].split('_')

        # Tag 'p' selects node 2*number, any other tag selects 2*number + 1.
        i = int(inParts[0]) * 2 + (0 if inParts[1] == 'p' else 1)
        j = int(outParts[0]) * 2 + (0 if outParts[1] == 'p' else 1)

        support = bestSupport.get((i, j))

        # The length lookups stay last so that, as before, they only run for
        # links that already have sufficient read support.
        if (support is not None and support >= wt
                and lenDic[myin] >= confidenLenThres
                and lenDic[myout] >= confidenLenThres):
            G.insertEdge(i, j, wt)

    G.MBResolve()
    G.reportEdge()

    print("xPhased: Saving condensed seqGraph to condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    G.saveToFile(folderName, graphFileName)
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"

    print("xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)

    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into the correct formats]

    print("xPhased")