def xPhased(folderName, mummerLink):
    """Resolve repeats on the contig graph using read-supported links.

    Steps performed, in order:
      1. Obtain the contig-to-contig links for "improved2" through
         IORobot.obtainLinkInfo (output prefix "mb") and the contig lengths
         from "improved2_Double.fasta".
      2. Load the edges recorded from blocked reads.
      3. Insert a link into the graph only when a blocked-read edge with the
         same (start, end) nodes supports it (see the filter below).
      4. Run G.MBResolve(), report the edges and save the graph to
         "condensedGraphMB.txt".
      5. Read the contigs out to "improved3.fasta" / "openZoneMB.txt".

    Args:
        folderName: working-folder argument handed to every helper call.
        mummerLink: passed through unchanged to the IORobot helpers.

    Returns:
        None; results are produced by the helper calls' side effects.
    """
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")

    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")

    # Minimum contig length required on both ends of a link; 0 lets every
    # contig through.
    confidenLenThres = 0

    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    # Index the read-derived edges once instead of rescanning the whole list
    # for every link.  For each (start, end) pair keep the largest
    # min(len1, len2): a link of weight wt is supported by some edge exactly
    # when that maximum is >= wt.
    # NOTE(review): assumes the start/end entries are hashable (ints) --
    # confirm against loadEdgeFromBlockedReads.
    readSupport = {}
    for eachedge in extraEdges:
        key = (eachedge[0], eachedge[1])
        support = min(eachedge[2], eachedge[3])
        if key not in readSupport or support > readSupport[key]:
            readSupport[key] = support

    for wt, myin, myout in dataSet:
        i = _linkNameToNode(myin)
        j = _linkNameToNode(myout)

        key = (i, j)
        # The length look-ups stay behind the support test so that lenDic is
        # only consulted for links that have a matching read edge.
        if (key in readSupport and readSupport[key] >= wt
                and lenDic[myin] >= confidenLenThres
                and lenDic[myout] >= confidenLenThres):
            G.insertEdge(i, j, wt)

    G.MBResolve()
    G.reportEdge()

    G.saveToFile(folderName, "condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"

    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)

    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into the correct formats]

    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and also parses under Python 3.
    print("xPhased")


def _linkNameToNode(name):
    """Convert a link name to its graph node index.

    The first six characters of the name are dropped and the remainder is
    split on '_': the first field is the contig number, the second the
    orientation tag.  The node index is number * 2, plus 1 unless the tag
    is 'p'.  (Presumably names look like "Contig217_d" -- verify against
    IORobot.obtainLinkInfo.)
    """
    fields = name[6:].split('_')
    offset = 0 if fields[1] == 'p' else 1
    return int(fields[0]) * 2 + offset
Пример #2
0
def xPhased(folderName, mummerLink):
    """Resolve repeats on the contig graph using read-supported links.

    Steps performed, in order (a progress message is printed before each
    major stage):
      1. Obtain the contig-to-contig links for "improved2" through
         IORobot.obtainLinkInfo (output prefix "mb") and the contig lengths
         from "improved2_Double.fasta".
      2. Load the edges recorded from blocked reads.
      3. Insert a link into the graph only when a blocked-read edge with the
         same (start, end) nodes supports it (see the filter below).
      4. Run G.MBResolve(), report the edges and save the graph to
         "condensedGraphMB.txt".
      5. Read the contigs out to "improved3.fasta" / "openZoneMB.txt".

    Args:
        folderName: working-folder argument handed to every helper call.
        mummerLink: passed through unchanged to the IORobot helpers.

    Returns:
        None; results are produced by the helper calls' side effects.
    """
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and also parses under Python 3.
    print("xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")

    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")

    # Minimum contig length required on both ends of a link; 0 lets every
    # contig through.
    confidenLenThres = 0

    print("xPhased: Building seqGraph")
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    # Index the read-derived edges once instead of rescanning the whole list
    # for every link.  For each (start, end) pair keep the largest
    # min(len1, len2): a link of weight wt is supported by some edge exactly
    # when that maximum is >= wt.
    # NOTE(review): assumes the start/end entries are hashable (ints) --
    # confirm against loadEdgeFromBlockedReads.
    readSupport = {}
    for eachedge in extraEdges:
        key = (eachedge[0], eachedge[1])
        support = min(eachedge[2], eachedge[3])
        if key not in readSupport or support > readSupport[key]:
            readSupport[key] = support

    for wt, myin, myout in dataSet:
        i = _linkNameToNode(myin)
        j = _linkNameToNode(myout)

        key = (i, j)
        # The length look-ups stay behind the support test so that lenDic is
        # only consulted for links that have a matching read edge.
        if (key in readSupport and readSupport[key] >= wt
                and lenDic[myin] >= confidenLenThres
                and lenDic[myout] >= confidenLenThres):
            G.insertEdge(i, j, wt)

    G.MBResolve()
    G.reportEdge()

    print("xPhased: Saving condensed seqGraph to condensedGraphMB.txt")
    G.saveToFile(folderName, "condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"

    print("xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)

    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into the correct formats]

    print("xPhased")


def _linkNameToNode(name):
    """Convert a link name to its graph node index.

    The first six characters of the name are dropped and the remainder is
    split on '_': the first field is the contig number, the second the
    orientation tag.  The node index is number * 2, plus 1 unless the tag
    is 'p'.  (Presumably names look like "Contig217_d" -- verify against
    IORobot.obtainLinkInfo.)
    """
    fields = name[6:].split('_')
    offset = 0 if fields[1] == 'p' else 1
    return int(fields[0]) * 2 + offset
def fetchSuccessor(folderName, mummerLink):
    """Pick the best successor and predecessor of every contig node and save them.

    The link list comes from IORobot.obtainLinkInfo (input "noEmbed", output
    prefix "greedy").  Each link is indexed as [overlap, fromName, toName].
    For every distinct fromName the link with the largest overlap gives that
    node's successor; for every distinct toName the link with the largest
    overlap gives that node's predecessor.  Ties keep the first link met.

    Two files are written, one line per node, formatted as
    "<node>,<neighbour>,<overlap>" with -1,-1 for nodes without a link:
      * folderName + 'rightConnect.txt'  (successors)
      * folderName + 'leftConnect.txt'   (predecessors)
    The paths are built by plain concatenation, so folderName has to end
    with a path separator.

    Args:
        folderName: working-folder prefix (see the note on concatenation).
        mummerLink: passed through unchanged to IORobot.obtainLinkInfo.

    Returns:
        None.

    Raises:
        AssertionError: if a node would receive a second successor or a
            second predecessor.
    """
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and also parses under Python 3.
    print("fetchSuccessor")

    print("Direct greedy")
    print("fetchSuccessor: Aligning non-contained contigs to themselves, output files are greedy*.delta")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "noEmbed", "greedy")

    # Each slot is [neighbour_node, overlap_length]; [-1, -1] means "none".
    leftConnect = [[-1, -1] for _ in range(numberOfContig)]
    rightConnect = [[-1, -1] for _ in range(numberOfContig)]

    # groupby only merges adjacent items, so sort by the grouping key first.
    dataSet.sort(reverse=True, key=itemgetter(1))

    print("fetchSuccessor: Finding best successors")
    for myName, items in groupby(dataSet, itemgetter(1)):
        lengthOfOverlap, connectorName = _bestOverlap(items, 2)

        prefixContig = _contigNameToNode(myName)
        suffixContig = _contigNameToNode(connectorName)

        assert(rightConnect[prefixContig][0] == -1)
        rightConnect[prefixContig][0] = suffixContig
        rightConnect[prefixContig][1] = lengthOfOverlap

    dataSet.sort(reverse=True, key=itemgetter(2))

    print("fetchSuccessor: Finding best predecessors")
    for myName, items in groupby(dataSet, itemgetter(2)):
        lengthOfOverlap, connectorName = _bestOverlap(items, 1)

        prefixContig = _contigNameToNode(connectorName)
        suffixContig = _contigNameToNode(myName)

        assert(leftConnect[suffixContig][0] == -1)
        leftConnect[suffixContig][0] = prefixContig
        leftConnect[suffixContig][1] = lengthOfOverlap

    print("fetchSuccessor: Outputting best successors to rightConnect.txt")
    _writeConnections(folderName + 'rightConnect.txt', rightConnect)

    print("fetchSuccessor: Outputting best predecessors to leftConnect.txt")
    _writeConnections(folderName + 'leftConnect.txt', leftConnect)


def _bestOverlap(items, nameIndex):
    """Return (largest overlap, name at nameIndex of that link) for one group.

    Starts from (-1, "") and only replaces the current best on a strictly
    larger overlap, so the first of several equal links is kept.
    """
    maxVal = -1
    connectorName = ""
    for eachsubitem in items:
        if eachsubitem[0] > maxVal:
            maxVal = eachsubitem[0]
            connectorName = eachsubitem[nameIndex]
    return maxVal, connectorName


def _contigNameToNode(name):
    """Convert a name such as "Contig217_d" to its node index.

    The name is split on '_'; the first field minus its first six characters
    is the contig number and the second field is the orientation tag.  The
    node index is number * 2, plus 1 unless the tag is 'p'.
    """
    fields = name.split('_')
    offset = 0 if fields[1] == 'p' else 1
    return int(fields[0][6:]) * 2 + offset


def _writeConnections(path, connections):
    """Write one "<index>,<neighbour>,<overlap>" line per entry to path."""
    # The with-block closes the file even if a write fails part-way.
    with open(path, 'w') as f:
        for dummyIndex, eachitem in enumerate(connections):
            f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n')
Пример #4
0
def fetchSuccessor(folderName, mummerLink):
    """Pick the best successor and predecessor of every contig node and save them.

    The link list comes from IORobot.obtainLinkInfo (input "noEmbed", output
    prefix "greedy").  Each link is indexed as [overlap, fromName, toName].
    For every distinct fromName the link with the largest overlap gives that
    node's successor; for every distinct toName the link with the largest
    overlap gives that node's predecessor.  Ties keep the first link met.

    Two files are written, one line per node, formatted as
    "<node>,<neighbour>,<overlap>" with -1,-1 for nodes without a link:
      * folderName + 'rightConnect.txt'  (successors)
      * folderName + 'leftConnect.txt'   (predecessors)
    The paths are built by plain concatenation, so folderName has to end
    with a path separator.

    Args:
        folderName: working-folder prefix (see the note on concatenation).
        mummerLink: passed through unchanged to IORobot.obtainLinkInfo.

    Returns:
        None.

    Raises:
        AssertionError: if a node would receive a second successor or a
            second predecessor.
    """
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and also parses under Python 3.
    print("fetchSuccessor")

    print("Direct greedy")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "noEmbed", "greedy")

    # Each slot is [neighbour_node, overlap_length]; [-1, -1] means "none".
    leftConnect = [[-1, -1] for _ in range(numberOfContig)]
    rightConnect = [[-1, -1] for _ in range(numberOfContig)]

    # groupby only merges adjacent items, so sort by the grouping key first.
    dataSet.sort(reverse=True, key=itemgetter(1))

    for myName, items in groupby(dataSet, itemgetter(1)):
        lengthOfOverlap, connectorName = _bestOverlap(items, 2)

        prefixContig = _contigNameToNode(myName)
        suffixContig = _contigNameToNode(connectorName)

        assert(rightConnect[prefixContig][0] == -1)
        rightConnect[prefixContig][0] = suffixContig
        rightConnect[prefixContig][1] = lengthOfOverlap

    dataSet.sort(reverse=True, key=itemgetter(2))

    for myName, items in groupby(dataSet, itemgetter(2)):
        lengthOfOverlap, connectorName = _bestOverlap(items, 1)

        prefixContig = _contigNameToNode(connectorName)
        suffixContig = _contigNameToNode(myName)

        assert(leftConnect[suffixContig][0] == -1)
        leftConnect[suffixContig][0] = prefixContig
        leftConnect[suffixContig][1] = lengthOfOverlap

    # ## Write to file:
    _writeConnections(folderName + 'rightConnect.txt', rightConnect)
    _writeConnections(folderName + 'leftConnect.txt', leftConnect)


def _bestOverlap(items, nameIndex):
    """Return (largest overlap, name at nameIndex of that link) for one group.

    Starts from (-1, "") and only replaces the current best on a strictly
    larger overlap, so the first of several equal links is kept.
    """
    maxVal = -1
    connectorName = ""
    for eachsubitem in items:
        if eachsubitem[0] > maxVal:
            maxVal = eachsubitem[0]
            connectorName = eachsubitem[nameIndex]
    return maxVal, connectorName


def _contigNameToNode(name):
    """Convert a name such as "Contig217_d" to its node index.

    The name is split on '_'; the first field minus its first six characters
    is the contig number and the second field is the orientation tag.  The
    node index is number * 2, plus 1 unless the tag is 'p'.
    """
    fields = name.split('_')
    offset = 0 if fields[1] == 'p' else 1
    return int(fields[0][6:]) * 2 + offset


def _writeConnections(path, connections):
    """Write one "<index>,<neighbour>,<overlap>" line per entry to path."""
    # The with-block closes the file even if a write fails part-way.
    with open(path, 'w') as f:
        for dummyIndex, eachitem in enumerate(connections):
            f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n')