Пример #1
0
def abunSplit(folderName, mummerLink, myCountDic):
    '''
    Split repeats using abundance information and write the resulting contigs.

    The repeat in/out pairs are read as JSON from
    folderName + "phaseRepeat.txt".  Each pair is matched with
    determindMatch, the matches are recorded as edges of a sequence graph,
    the graph is condensed and finally handed to
    IORobot.extractGraphToContigs.

    Parameters:
        folderName : directory prefix.  It is concatenated directly with the
                     file names, so it has to end with a path separator.
        mummerLink : forwarded unchanged to IORobot.extractGraphToContigs.
        myCountDic : abundance lookup forwarded to determindMatch.  The graph
                     is created with 2 * len(myCountDic) nodes.

    Files:
        "abun.fasta" and "improved3_Double.fasta" are the names passed to
        IORobot.extractGraphToContigs (presumably output and input contigs,
        confirm against IORobot).

    Algorithm (original outline):
    1. Load data from various sources [various json files]
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
    3. Use split results to generate contigs
        a) use a graph to capture the split results
        b) use reads to fill in any gaps
        c) read out the contigs
    '''
    # Context manager so the handle is released even if json.load raises.
    with open(folderName + "phaseRepeat.txt", 'r') as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2

    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
Пример #2
0
def abunSplit(folderName, mummerLink, myCountDic):
    """
    Split repeats using abundance information and write the resulting contigs.

    Loads the repeat in/out pairs (JSON) from folderName + "phaseRepeat.txt",
    resolves every pair with determindMatch, stores the resolved matches as
    edges of a sequence graph, condenses that graph and passes it to
    IORobot.extractGraphToContigs.

    Parameters:
        folderName : directory prefix; joined to file names by plain string
                     concatenation, so it must end with a path separator.
        mummerLink : forwarded unchanged to IORobot.extractGraphToContigs.
        myCountDic : abundance lookup given to determindMatch; the graph gets
                     2 * len(myCountDic) nodes.

    Files:
        "abun.fasta" and "improved3_Double.fasta" are handed to
        IORobot.extractGraphToContigs (presumably output / input contig
        files, confirm against IORobot).

    Algorithm (original outline):
    1. Load data from various sources [various json files]
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
    3. Use split results to generate contigs
        a) use a graph to capture the split results
        b) use reads to fill in any gaps
        c) read out the contigs
    """
    # The file is closed as soon as parsing finishes (or fails).
    with open(folderName + "phaseRepeat.txt", "r") as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2

    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
Пример #3
0
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph,
              contigFilename, readsetFilename):
    '''
    Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta
    Output : abunsplit.fasta
    
    Algorithm : 
    
    1. Load data from various sources [various json files]
    
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
    
    3. Use split results to generate contigs [may already exist in newPhasing.py ] 
        a) use a graph to capture the split results 
        b) use reads to fill in any gaps 
        c) read out the contigs 
    
    '''

    json_data = open(folderName + "phaseRepeat.txt", 'r')
    repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    print "N1", N1

    G = graphLib.seqGraph(N1)

    gapContentLookUpList = []

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName,
                                      contigReadGraph, N1)
        print "resolvedList", resolvedList
        gapContentLookUpList += generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph,
            contigFilename, readsetFilename)

        addEdges(G, resolvedList)

    gapContentLookUpDic = {}
    gapContentLookUpList.sort()

    for eachitem in gapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
            eachitem[2], eachitem[3], eachitem[4]
        ]
        print eachitem[2:4], len(eachitem[4])

    # some how change ASplitter here by appending necessary information

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                  contigFilename + "_Double.fasta",
                                  gapContentLookUpDic)
Пример #4
0
def XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1,
                mummerLink):

    if abunHouseKeeper.abunGlobalSplitParameterRobot.runXResolve:
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, contigReadGraph)

        Grev = abunGraphLib.formReverseGraphFast(G)

        if not abunHouseKeeper.abunGlobalRunEM:
            xResolvedList, brResolvedListforX = [[] for i in range(N1)
                                                 ], [[] for i in range(N1)]
            if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAbunX:
                if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerX > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLower = abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerX

                if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperX > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpper = abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperX

                xResolvedList = xNodeAdvResolving(Gnew, G, folderName,
                                                  myCountDic, lenDic)

            if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunBRX:

                if abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresX > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.BRThres = abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresX

                brResolvedListforX = xNodeBrResolving(Gnew, G, Grev,
                                                      folderName, N1)

            combinedList = resolveConflictX(xResolvedList, brResolvedListforX)
        else:
            combinedList = xNodeEMResolving(Gnew, G, Grev, folderName,
                                            myCountDic, lenDic, N1, mummerLink)

        print "combinedList", combinedList

        Gnew.xResolve(combinedList)
        Gnew.condense()
        Gnew.saveToFile(folderName, "xResolvedGraph")

        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(Gnew.mapDummyToRealDic, f)

        with open(folderName + "xResolvedSimplifiedList.json", 'w') as f:
            json.dump(Gnew.xResolvedSimplifiedList, f)

    else:
        Gnew.saveToFile(folderName, "xResolvedGraph")

        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(Gnew.mapDummyToRealDic, f)

        with open(folderName + "xResolvedSimplifiedList.json", 'w') as f:
            json.dump(Gnew.xResolvedSimplifiedList, f)
Пример #5
0
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):

    """
    Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta
    Output : abunsplit.fasta
    
    Algorithm : 
    
    1. Load data from various sources [various json files]
    
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
    
    3. Use split results to generate contigs [may already exist in newPhasing.py ] 
        a) use a graph to capture the split results 
        b) use reads to fill in any gaps 
        c) read out the contigs 
    
    """

    json_data = open(folderName + "phaseRepeat.txt", "r")
    repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    print "N1", N1

    G = graphLib.seqGraph(N1)

    gapContentLookUpList = []

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
        print "resolvedList", resolvedList
        gapContentLookUpList += generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename
        )

        addEdges(G, resolvedList)

    gapContentLookUpDic = {}
    gapContentLookUpList.sort()

    for eachitem in gapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])

    # some how change ASplitter here by appending necessary information

    G.condense()
    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abun.fasta", contigFilename + "_Double.fasta", gapContentLookUpDic
    )
Пример #6
0
def readContigForAbunSplit(folderName, mummerLink, contigFilename,
                           readsetFilename, N1, contigReadGraph):

    json_data = open(folderName + "mapDummyToRealDic.json", 'r')
    mapDummyToRealDic = json.load(json_data)

    G = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")

    gapContentLookUpDic = {}

    furtherGapList = []
    for i in range(N1):
        if len(G.graphNodesList[i].nodeIndexList) > 1:
            for j in range(len(G.graphNodesList[i].nodeIndexList) - 1):

                bk, fwd = G.graphNodesList[i].nodeIndexList[
                    j], G.graphNodesList[i].nodeIndexList[j + 1]

                key = str(bk) + "_" + str(fwd)

                if not key in gapContentLookUpDic:
                    furtherGapList.append([bk, fwd])

    with open(folderName + "furtherGapList.json", 'w') as f:
        json.dump(furtherGapList, f)

    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph,
        contigFilename, readsetFilename, mapDummyToRealDic)

    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
            eachitem[2], eachitem[3], eachitem[4]
        ]
        print eachitem[2:4], len(eachitem[4])

    #segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")

    print "Final step: really hacking a file"
    os.system("cp " + folderName + contigFilename + "_Double.fasta " +
              folderName + "tmpWithDummy.fasta")
    contigList = IORobot.readContigsFromFile(folderName,
                                             contigFilename + "_Double.fasta")

    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abunPre.fasta",
                                  "tmpWithDummy.fasta", gapContentLookUpDic,
                                  mapDummyToRealDic)

    if True:
        nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink,
                                                     "abunPre", "abunMum",
                                                     "abun")
Пример #7
0
def XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1):
    """
    Optionally X-resolve Gnew, then persist it.

    When abunGlobalSplitParameterRobot.runXResolve is set, the contig-read
    graph is loaded, the abundance based and the BR based resolvers are run
    (each only if enabled), their results are combined with
    resolveConflictX, applied to Gnew and Gnew is condensed.

    In every case Gnew is saved as "xResolvedGraph" and its
    mapDummyToRealDic / xResolvedSimplifiedList attributes are dumped as
    JSON into folderName.

    Side effect: the X-specific thresholds (AbunLowerX, AbunUpperX,
    BRThresX), when positive, overwrite the corresponding global thresholds
    on the parameter robot.
    """
    settings = abunHouseKeeper.abunGlobalSplitParameterRobot

    if settings.runXResolve:
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, contigReadGraph)

        Grev = abunGraphLib.formReverseGraphFast(G)

        # Defaults used when the respective resolver is switched off.
        xResolvedList = [[] for _ in range(N1)]
        brResolvedListforX = [[] for _ in range(N1)]

        if settings.toRunAbunX:
            if settings.AbunLowerX > 0:
                settings.AbunLower = settings.AbunLowerX

            if settings.AbunUpperX > 0:
                settings.AbunUpper = settings.AbunUpperX

            xResolvedList = xNodeAdvResolving(Gnew, G, folderName, myCountDic, lenDic)

        if settings.toRunBRX:
            if settings.BRThresX > 0:
                settings.BRThres = settings.BRThresX

            brResolvedListforX = xNodeBrResolving(Gnew, G, Grev, folderName, N1)

        combinedList = resolveConflictX(xResolvedList, brResolvedListforX)

        Gnew.xResolve(combinedList)
        Gnew.condense()

    # Persisting is identical whether or not the resolution step ran.
    Gnew.saveToFile(folderName, "xResolvedGraph")

    with open(folderName + "mapDummyToRealDic.json", "w") as dummyMapFile:
        json.dump(Gnew.mapDummyToRealDic, dummyMapFile)

    with open(folderName + "xResolvedSimplifiedList.json", "w") as simplifiedFile:
        json.dump(Gnew.xResolvedSimplifiedList, simplifiedFile)
Пример #8
0
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):

    json_data = open(folderName + "mapDummyToRealDic.json", "r")
    mapDummyToRealDic = json.load(json_data)

    G = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")

    gapContentLookUpDic = {}

    furtherGapList = []
    for i in range(N1):
        if len(G.graphNodesList[i].nodeIndexList) > 1:
            for j in range(len(G.graphNodesList[i].nodeIndexList) - 1):

                bk, fwd = G.graphNodesList[i].nodeIndexList[j], G.graphNodesList[i].nodeIndexList[j + 1]

                key = str(bk) + "_" + str(fwd)

                if not key in gapContentLookUpDic:
                    furtherGapList.append([bk, fwd])

    with open(folderName + "furtherGapList.json", "w") as f:
        json.dump(furtherGapList, f)

    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
    )

    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])

    # segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")

    print "Final step: really hacking a file"
    os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abunPre.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
    )

    if True:
        nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
Пример #9
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Find repeat pairs in the contig-read graph and hand them to
    adjListToRepeatList.

    Input : the graph stored under the name contigReadGraph in folderName,
            and the lengths of contigFilename + "_Double.fasta".
    Output: produced by adjListToRepeatList(newAdjacencyList, folderName,
            repeatFilename); per the original notes, repeat pairs such as
            { [ (1,2), (3,4) ] , [(5,6),(7,8)] }.

    Algorithm:
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat
           pairs

    Parameters:
        folderName      : directory prefix forwarded to the helpers.
        mummerLink      : not used by this function.
        contigFilename  : stem of the contig file.
        contigReadGraph : name of the graph to load.
        repeatFilename  : forwarded to adjListToRepeatList.
        optionToRun     : "tandem" keeps the reachability lists as they are,
                          "xphase" filters them with abunGraphLib.filterEdge.

    Raises:
        ValueError : if optionToRun is neither "tandem" nor "xphase".
                     Previously this surfaced as an UnboundLocalError only
                     after the whole reachability pass had run.
    '''
    # Fail fast, before any expensive graph work is done.
    if optionToRun not in ("tandem", "xphase"):
        raise ValueError(
            "Unknown optionToRun %r (expected 'tandem' or 'xphase')" % (optionToRun,))

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    N1 = len(lenDicCC)
    adjacencyList = [[] for _ in range(N1)]

    for i in range(N1):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        # Single-argument print: same output as before, valid on Python 2
        # and Python 3.
        print("i, adjacencyList[i] :  %s %s" % (i, adjacencyList[i]))

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    else:  # "xphase", guaranteed by the validation above
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
Пример #10
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Find repeat pairs in the contig-read graph and hand them to
    adjListToRepeatList.

    Input : the graph stored under the name contigReadGraph in folderName,
            and the lengths of contigFilename + "_Double.fasta".
    Output: produced by adjListToRepeatList(newAdjacencyList, folderName,
            repeatFilename); per the original notes, repeat pairs such as
            { [ (1,2), (3,4) ] , [(5,6),(7,8)] }.

    Algorithm:
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat
           pairs

    Parameters:
        folderName      : directory prefix forwarded to the helpers.
        mummerLink      : not used by this function.
        contigFilename  : stem of the contig file.
        contigReadGraph : name of the graph to load.
        repeatFilename  : forwarded to adjListToRepeatList.
        optionToRun     : "tandem" keeps the reachability lists as they are,
                          "xphase" filters them with abunGraphLib.filterEdge.

    Raises:
        ValueError : if optionToRun is neither "tandem" nor "xphase".
                     Previously this surfaced as an UnboundLocalError only
                     after the whole reachability pass had run.
    '''
    # Fail fast, before any expensive graph work is done.
    if optionToRun not in ("tandem", "xphase"):
        raise ValueError("Unknown optionToRun %r (expected 'tandem' or 'xphase')" % (optionToRun,))

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")

    N1 = len(lenDicCC)
    adjacencyList = [[] for _ in range(N1)]

    for i in range(N1):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        # Single-argument print: same output as before, valid on Python 2
        # and Python 3.
        print("i, adjacencyList[i] :  %s %s" % (i, adjacencyList[i]))

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    else:  # "xphase", guaranteed by the validation above
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)

    adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
Пример #11
0
def performPhasing(folderName, mummerLink):
    print "performPhasing"
    '''
    1. Interface from alignmentBridge.py : 
        shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList)
        cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse  = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True)
    
    2. Format of input data data : 
        bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
    
    3. IO : 
        a) Input :
            repeatSpecification.txt, phasingSeedName_Double.fasta, graph G 
        b) Output :
            improved4.fasta
            
    3. Algorithm: 
        a) reformatNoisyReads 
        b) reformatToProcessList
        c) formShortToLongMapping
    
    '''

    json_data = open(folderName + 'repeatSpecification.txt', 'r')
    loadData = json.load(json_data)
    
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    
    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    
    lenDicCC = IORobot.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    loadData = filterReverseComp(loadData, N1)
    
    toPhaseList = []
    
    if True:
        for eachitem in loadData:
            # print eachitem
            flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3] 
            
            noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1)
            
            toProcessList = reformatToProcessList(folderName , flankingList, repeatList, dicFromOriginal, N1)
    
            shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal, dicToOriginal, lenDicCR, N1)
            
            indelRobot = createIndelRobot(folderName)
            
            cleaner.cleaning([noisyReads, noisyReads] , shortToLongMap, toProcessList, indelRobot, "init")
            in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote")
            extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True)
            
            if extendResult != -1:
                print "extendResult: ", extendResult
                toPhaseList.append(eachitem + [extendResult])
            
        with open(folderName + 'toPhaseList.txt', 'w') as outfile:
            json.dump(toPhaseList, outfile)

    json_data = open(folderName + 'toPhaseList.txt', 'r')
    toPhaseList = json.load(json_data)
    
    outputResults(folderName, mummerLink, toPhaseList, N1, G)
Пример #12
0
def outputResults(folderName, mummerLink, toPhaseList, N1, G):
    '''    
    Algorithm :
    a) Write as contigs 
    b) Add back reverse complement 
    c) Create G2 as the readOut part 
    d) Output the contigs by a function call

    '''
    # a) 
    combinedName = "contigAndRead_Double.fasta"
    os.system("cp " + folderName + "improved3_Double.fasta " + folderName + combinedName)
    
    fout = open(folderName + combinedName, 'a')
    fin = open(folderName + "phasingSeedName_Double.fasta", 'r')

    tmp = fin.readline().rstrip()
    while len(tmp) > 0:
        if tmp[0] != ">":
            fout.write(tmp + "\n")
        else:
            infoArr = tmp[5:].split("_")
            fout.write(">Contig" + str(int(infoArr[0]) + N1 / 2))
            fout.write("_" + infoArr[1] + "\n")
        tmp = fin.readline().rstrip()
        
    fin.close()
    fout.close()

    # b)
    '''
    [28], [[2, 690, 28], [6, 126, 28], [28, 212, 0], [28, 216, 4]], 1
    
    [2 , 690, 28, 212, 0]
    '''
    
    completePhaseList = []
    for eachitem in toPhaseList:
        repeat = eachitem[-3]
        flanking = eachitem[-2]
        result = eachitem[-1]
        
        
        revrepeat = []
        for eachsub in eachitem[-3][-1::-1]:
            revrepeat.append(eachsub + pow(-1, eachsub))
            
        revflanking = [[] for i in range(4)] 
        
        for j in range(2):
            for eachsub in eachitem[-2][j + 2][-1::-1]:
                revflanking[j].append(eachsub + pow(-1, eachsub))
            for eachsub in eachitem[-2][j][-1::-1]:
                revflanking[j + 2].append(eachsub + pow(-1, eachsub))
            
        revresult = eachitem[-1]
        
        completePhaseList.append([repeat, flanking, result])
        completePhaseList.append([revrepeat, revflanking, revresult])
    
    print "completePhaseList", completePhaseList
    # c) 
    G2 = graphLib.seqGraph(N1)
    nameDic = {}
    for i in range(N1):
        nameDic[i] = i
        
    for eachitem in completePhaseList:
        repeat, flanking, result = eachitem[0] , eachitem[1] , eachitem[2]
        path = [[], []]
        
        if result == 0:
            path[0] = flanking[0][0:-1] + repeat + flanking[2][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[3][1:]
        else:
            path[0] = flanking[0][0:-1] + repeat + flanking[3][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[2][1:]
        
        print path[0] , path[1]
        for i  in range(2):
            eachpath = path[i]
            currentNode = G2.graphNodesList[eachpath[0]]
            
            for nextNodeIndex, ctr in zip(eachpath[1:], range(len(eachpath[1:]))):
                if ctr != len(eachpath[1:]) - 1:
                    myindex = len(G2.graphNodesList)
                    nameDic[myindex] = nextNodeIndex
                    
                    newNode = graphLib.seqGraphNode(myindex)
                    G2.graphNodesList.append(newNode)
                else:
                    newNode = G2.graphNodesList[nextNodeIndex]
                    
                wt = 0
                for eachck in G.graphNodesList[nameDic[currentNode.nodeIndex]].listOfNextNodes:
                    if eachck[0] == nextNodeIndex:
                        wt = eachck[1]
                        break
                    
                newNode.listOfPrevNodes.append([currentNode.nodeIndex, wt])
                currentNode.listOfNextNodes.append([newNode.nodeIndex, wt])
                
                currentNode = newNode
                
    graphFileName = "phaseGraphFinal"
    G2.condense()
    G2.saveToFile(folderName, graphFileName)
    
    IORobot.readContigOut(folderName, mummerLink, graphFileName, combinedName, "improved4.fasta", "outOpenListphaing", nameDic)
Пример #13
0
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    # Resolve tandem repeats: for every repeat profile stored in the JSON
    # file `repeatSpec`, build a repeat template, align reads against it to
    # estimate how many bases the repeat spans, and finally join the contigs
    # and write them to folderName + "tademResolved.fasta".
    #
    # Parameters (as used below):
    #   folderName        : directory prefix, concatenated directly with file names
    #   mummerPath        : forwarded to alignerRobot.useMummerAlign
    #   contigReadGraph   : name of the graph loaded into G
    #   contigFilename    : contig file stem ("_Double.fasta" is appended)
    #   readsetFilename   : read file stem ("_Double.fasta" is appended)
    #   optTypeFileHeader : prefix of the existing alignment outputs
    #                       (<prefix>RROut and <prefix>CROut)
    #   repeatSpec        : name of the JSON file holding the repeat profiles
    #
    # NOTE: the string literal after the print below is the author's outline;
    # because it follows a statement it is not an actual docstring.
    print "resolvingTandem"
    '''
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    # Alignment records are kept only when their field [2] is below thres.
    thres = 5 
    
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    # N1 = number of contig entries; node indices >= N1 are treated as reads
    # below (N1 is subtracted before building "Read..." names).
    N1 = len(lenDicCC)

    # Number of back-to-back copies of the repeat written into the template.
    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    


    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    # Read-to-read alignments: map "<name1>;<name2>" (last two fields) to
    # field [4], which is later used as the overlap length between reads.
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR: 
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    # Contig-to-read alignments, same key/value layout as above.
    header = optTypeFileHeader + "CR"
    # NOTE(review): lenDicCC is recomputed with the same arguments as above.
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    # Python 2 only: items() must return lists for "+" to work.
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR: 
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic



    # NOTE(review): this file handle is never closed.
    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    
    # Working copies of the sequences; contigsTmp is modified in place below.
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    # contig name -> repeat sequence to append when joining (step 7).
    happyTandemList = {}
    
    
    
    for eachrepProfile in loadData:
        # 1) 
        # Depth-first search starting from the first node of the profile's
        # last entry.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
       
        # 2) 
        if isTerminate:
            # The last node of the path is searched for from the front; the
            # suffix starting at its first occurrence becomes tandemPath.
            v = returnPathList[-1]
            i =0 
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1
                
            print returnPathList
            print tandemPath
        # NOTE(review): when isTerminate is false, tandemPath keeps the value
        # from the previous profile, or is undefined on the first one
        # (NameError below).
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        
        repeatContent = ""
    
        # Concatenate the reads along tandemPath, dropping the overlap with
        # the following read from the end of each one.
        for kk in range(len(tandemPath[0:-1])): 
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            # "/" is Python 2 integer division here.
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            # Even index -> suffix "p", odd index -> suffix "d".
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            
            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"
            
            # KeyError if this read pair was not kept in dataListRRDic.
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            # NOTE(review): an overlap of 0 makes the slice [0:-0] empty, so
            # the whole read would be dropped rather than kept.
            repeatContent = repeatContent +  myContigsDic[readName][0:-overlap]
            
        print "len(repeatContent)", len(repeatContent)
        
        # Write maxDuplicate copies of the repeat as a single FASTA record
        # and keep the same string in memory as repeatContentLarge.
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge= repeatContentLarge + repeatContent
        fout.close()
        
        # 4)
        # Names of the reads listed in the profile's second entry.
        repeatReadList =  eachrepProfile[1]
        
        myList= []
        for eachitem in repeatReadList:
            
            readName = "Read" + str((eachitem- N1)/2) + "_"
    
            # NOTE(review): parity is taken from eachitem, not eachitem - N1
            # as in step 3; the two agree only when N1 is even.
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            myList.append(readName)
            
        IORobot.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)
        
        # Align the selected reads against the repeat template.
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")
        
        dataList = alignerRobot.extractMumData(folderName, mummerFile+"Out")
        
        
        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        
        
        # print "dataList[0]", dataList[0]
        # Group the alignment records by their last field (sorted first, as
        # groupby requires) and add up the largest field [5] of each group.
        dataList.sort(key = itemgetter(-1))
        for key, values in  groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
    
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        
    
        print c, lrepeat, totalBasesMatch
        # Matched bases divided by (coverage * repeat length).
        # NOTE(review): raises ZeroDivisionError when c or lrepeat is 0.
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
    
        # 6) 
        # a) find the starting point 
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        # Same naming scheme as for reads: index/2 plus "_p" (even) or "_d" (odd).
        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"
        
        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"
        
        # Glue the start contig (minus its overlap) to the first read.
        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        
        # Align the combined sequence against the repeat template.
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")
        
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap"+"Out")
        
        # Pick the record with the largest field [5]; the preceding sort by
        # field [0] means the first such record wins on ties.
        dataList.sort(key = itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        
        print maxItm
        if len(maxItm) > 0 :
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            # No alignment found.
            # NOTE(review): contigEnd = -1 makes tmpCombine[0:contigEnd]
            # below drop the last character.
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template 
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        # Take ct*lrepeat bases of the duplicated repeat starting at
        # repeatStart, and cut the combined sequence at contigEnd.
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
        
    # 7) Combine all the repeat information and do the join
    
    # leaderList[i] is the index of the contig that entry i has been merged
    # into (initially itself).
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        
        # Append the resolved repeat to the leader's sequence.
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        
        # Unless the repeat loops back onto its own leader, append the end
        # contig as well, blank it and record its new leader.
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
        
    
    # leaderAgg[k] lists every index whose leader is k.
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i) 
    
    # Indexed by id/2, i.e. one flag shared by each even/odd index pair.
    checkingList = [False for i in range(N1)]
    
    fout = open(folderName + "tademResolved.fasta", 'w')
    
    counter = 0
    for eachcontig in contigsTmp:
        # NOTE(review): `id` shadows the builtin of the same name.
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
        
            # No newline is written after the sequence, so it relies on
            # contigsTmp values ending in one -- TODO confirm.
            fout.write(">Segkk"+str(counter)+ "\n")
            
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1    
            # Mark every contig merged under the same leader as written.
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True
    
    fout.close()
Пример #14
0
def BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic, lenDic):

    if abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve:
        print "abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve", abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve
        maxRThres = abunHouseKeeper.abunGlobalSplitParameterRobot.RThres

        repeatFinder.adjListToRepeatList(Gnew.adj, folderName, "phaseRepeatTR.txt")

        json_data = open(folderName + "phaseRepeatTR.txt", "r")
        repeatPairs = json.load(json_data)

        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, contigReadGraph)

        Grev = abunGraphLib.formReverseGraphFast(G)

        abunAnalysisList = []

        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList, brResolvedList = [], []

            if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAbunB:
                if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerB > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLower = (
                        abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerB
                    )

                if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperB > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpper = (
                        abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperB
                    )

                if not abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAggB:
                    resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
                else:

                    resolvedList = determindMatchAggregate(
                        inList, outList, myCountDic, folderName, contigReadGraph, N1, Gnew, lenDic
                    )

            if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunBRB:
                if abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresB > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.BRThres = (
                        abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresB
                    )

                brResolvedList = formBRReolve(folderName, inList, outList, G, Grev, True, N1)

            combinedList = abunHouseKeeper.getDistinct(resolvedList + brResolvedList)

            print "resolvedList, brResolvedList, inList, outList", resolvedList, brResolvedList, inList, outList

            print "resolveConflict(combinedList)", resolveConflict(combinedList)

            abunAnalysisList.append([inList, outList, resolvedList, brResolvedList, resolveConflict(combinedList)])
            if len(inList) <= maxRThres and len(outList) <= maxRThres and len(inList) > 0 and len(outList) > 0:
                # biResolvedCombineList += resolveConflict(combinedList)

                resolvedCombine = resolveConflict(combinedList)
                ### kkdebug
                Gnew.bipartiteLocalResolve(resolvedCombine, inList, outList, folderName)

        # json_data = open(folderName + "hackBRResolveList.json", 'r')
        # dataItem = json.load(json_data)
        # Gnew.bipartiteResolve(dataItem)

        ### end kkdebug
        Gnew.condense()

        with open(folderName + "biResolvedCombineList.json", "w") as f:
            json.dump(biResolvedCombineList, f)

        with open(folderName + "abunAnalysisList.json", "w") as f:
            json.dump(abunAnalysisList, f)

        # assert(1==2)

        return Gnew

    else:
        return Gnew
Пример #15
0
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename):

    ### Transitive reduction and remove double pointers
    N1 = len(myCountDic) * 2
    print "N1", N1
    kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres
    edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        tmpList = abunGraphLib.findAllReachable(i, N1, G)

        for j in tmpList:
            if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres:
                adj[i].append(j)

    ### Filter adaptor skipped case

    adaptorPair = []

    for i in range(len(adj)):
        if i % 2 == 0:
            if i + 1 in adj[i]:
                adj[i].remove(i + 1)
                adaptorPair.append([i, i + 1])
        elif i % 2 == 1:
            if i - 1 in adj[i]:
                adj[i].remove(i - 1)
                adaptorPair.append([i, i - 1])

    Gnew = abunGraphLib.seqGraphDynamic(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1997)

    for eachpair in adaptorPair:
        u, v = eachpair[0], eachpair[1]
        for x in Gnew.graphNodesList[u].listOfPrevNodes:
            xIndex = x[0]
            Gnew.removeEdge(xIndex, v)
        for y in Gnew.graphNodesList[v].listOfNextNodes:
            yIndex = y[0]
            Gnew.removeEdge(u, yIndex)

    ### Trying out the new component
    import toCondenseFixer

    Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName)

    Gnew.symGraph()
    ### End filter adaptor skipped case

    if abunHouseKeeper.abunGlobalSplitParameterRobot.runGraphSurgery:

        Gnew.initAdv()
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunCondenseRemove:
            Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename)

        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunDoubltPtr:
            Gnew.doubleEdgeReduction()

        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunTransitive:
            Gnew.transitiveReduction(
                folderName, mummerLink, contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta", G
            )

        Gnew.condense()
        Gnew.findAdjList()
    else:
        Gnew.initAdv()
        Gnew.condense()
        Gnew.findAdjList()

    return Gnew
Пример #16
0
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph,
                          contigFilename, readsetFilename):
    N1 = len(myCountDic) * 2
    print "N1", N1

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    if False:
        json_data = open(folderName + "phaseRepeat.txt", 'r')
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic,
                                          folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(
            folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(
            resolvedList), len(xResolvedList), len(biResolvedCombineList)

        with open(folderName + "resolvedList.json", 'w') as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(mapDummyToRealDic, f)

    if False:
        json_data = open(folderName + "resolvedList.json", 'r')
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph,
            contigFilename, readsetFilename, mapDummyToRealDic)
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
                eachitem[2], eachitem[3], eachitem[4]
            ]
            print eachitem[2:4], len(eachitem[4])

        with open(folderName + "gapContentLookUpDic.json", 'w') as f:
            json.dump(gapContentLookUpDic, f)

    if False:
        json_data = open(folderName + "resolvedList.json", 'r')
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    if False:
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        json_data = open(folderName + "gapContentLookUpDic.json", 'r')
        gapContentLookUpDic = json.load(json_data)

        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " +
                  folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(
            folderName, contigFilename + "_Double.fasta")

        f = open(folderName + "tmpWithDummy.fasta", 'a')
        for i in range(len(mapDummyToRealDic)):
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()

        IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                      "tmpWithDummy.fasta",
                                      gapContentLookUpDic, mapDummyToRealDic)
Пример #17
0
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun  ):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()
    
    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0 :
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
                
        
        repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)])
           
    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    
    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)
    
    
    assert(loadData == repeatList)
    
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    
    
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    
    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName ,referenceFile,  queryFile, mummerLink, header )
    
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p" :    
                print "debug" , eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p" :    
                print "debug" , eachitem
            
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []
    
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName ,referenceFile,  queryFile, mummerLink, header )
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
            
    numberOfNodes = len(lenDicCR) 
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''
    
    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]
    
    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]
    
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)
    
    checkGraphLength(G, N1, lenDicRR)
    
    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
    
    
    
Пример #19
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    adjacencyList = [[] for i in range(len(lenDicCC))]

    N1 = len(lenDicCC)

    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug

    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()

    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0:
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)

        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)

    assert (loadData == repeatList)
# Resolve tandem repeats: for every repeat profile in the `repeatSpec` JSON
# file, build a repeat template from a read loop, estimate the copy count from
# read alignments, then join the flanking contigs through the estimated repeat
# and write the result to folderName + "tademResolved.fasta".
#
# Parameters (as used below):
#   folderName        - folder prefix, concatenated directly with file names
#   mummerPath        - passed to alignerRobot.useMummerAlign
#   contigReadGraph   - name of the saved graph loaded into G
#   contigFilename    - base name; "<name>_Double.fasta" is read
#   readsetFilename   - base name; "<name>_Double.fasta" is read
#   optTypeFileHeader - prefix of existing "<prefix>RROut"/"<prefix>CROut" data
#   repeatSpec        - JSON file with the repeat profiles
# Returns nothing; results are written to files.
def resolvingTandem(
    folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec
):
    print "resolvingTandem"
    # NOTE(review): the string below follows a statement, so it is a plain
    # expression and not the function's docstring.
    """
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    """
    # 0 ) Load all the data
    # Alignment records whose field [2] is below thres are the ones kept in
    # the two overlap dictionaries built below.
    thres = 5

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    # Number of contig entries; node ids >= N1 are treated as reads below
    # (read index = id - N1).
    N1 = len(lenDicCC)

    # The repeat unit is written maxDuplicate times back to back to form the
    # alignment template.
    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    # Read/read overlaps: "<name1>;<name2>" -> field [4] of the record.
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    # Contig/read overlaps, same key/value layout as above.
    header = optTypeFileHeader + "CR"
    # NOTE(review): same call as the lenDicCC load a few lines up; looks redundant.
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())

    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    print dataListCRDic

    # NOTE(review): this file handle is never closed.
    json_data = open(folderName + repeatSpec, "r")
    loadData = json.load(json_data)

    # contigsTmp is edited in place below and finally written out.
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    # contig name -> repeat sequence to append after that contig (filled in 6b).
    happyTandemList = {}

    for eachrepProfile in loadData:
        # 1)
        # Start the path search from the first node of the profile's last entry.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)

        # 2)
        # Keep the part of the path from the first occurrence of its last
        # node onwards, i.e. the loop that closes on that node.
        # NOTE(review): when isTerminate is false, tandemPath is not assigned
        # here; the loop below then uses the previous iteration's value, or
        # raises NameError on the first iteration.
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    # Forces the while loop to stop after the first match.
                    i = len(returnPathList)
                i = i + 1

            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]

        # Concatenate each read of the loop minus its overlap with the next read.
        repeatContent = ""

        for kk in range(len(tandemPath[0:-1])):
            # Shift node ids by N1 to get read ids; id / 2 is the read number
            # and the parity picks the "_p" (even) or "_d" (odd) suffix.
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"

            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"

            # Raises KeyError if this read pair was not kept in dataListRRDic.
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            # NOTE(review): an overlap of 0 makes the slice [0:-0], which is empty.
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]

        print "len(repeatContent)", len(repeatContent)

        # Write the template: maxDuplicate copies of the repeat unit.
        fout = open(folderName + repeatTempFilename, "w")
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""

        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()

        # 4)
        # Collect the names of the profile's repeat reads and align them
        # against the template.
        repeatReadList = eachrepProfile[1]

        myList = []
        for eachitem in repeatReadList:

            readName = "Read" + str((eachitem - N1) / 2) + "_"

            # NOTE(review): parity is taken from the unshifted id here, while
            # step 3 uses the id shifted by N1; the two agree only when N1 is
            # even - confirm.
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)

        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)

        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")

        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")

        # 5)
        # Sum, over the groups sharing the same last field, the largest
        # field [5] of the group.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)

        # print "dataList[0]", dataList[0]
        # groupby only merges adjacent items, hence the sort on the same key.
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]

            # print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue

        print c, lrepeat, totalBasesMatch
        # Estimated copy count.  Raises ZeroDivisionError if c or lrepeat is 0.
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct

        # 6)
        # a) find the starting point
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1

        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"

        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"

        # Glue the start contig (minus its overlap) to the first read, then
        # align the result against the template.
        overlapFirst = dataListCRDic[contigName + ";" + readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]

        f1 = open(folderName + "firstOverlap.fasta", "w")
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()

        if True:
            alignerRobot.useMummerAlign(
                mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta"
            )

        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")

        # Pick the record with the largest field [5]; after the sort, ties
        # go to the smallest field [0].
        dataList.sort(key=itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi

        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            # No alignment found: start at the template origin; contigEnd = -1
            # makes the slice below drop the last character of tmpCombine.
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart
        # Repeat stretch of about ct copies cut from the template; slicing
        # silently caps it at the template length.
        happyTandemList[contigName] = repeatContentLarge[repeatStart : int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])

    # 7) Combine all the repeat information and do the join

    # leaderList[i] is the contig id whose sequence currently holds contig i;
    # every contig starts as its own leader.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]

        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)

        # Append the resolved repeat to the leader's sequence.
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]

        # Append the end contig too, unless it is the leader itself; the end
        # contig is then emptied and pointed at the leader.
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig

    # leaderAgg[k] lists every contig id whose leader is k.
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)

    # Indexed by id / 2 (integer division), so both ids of a pair share a slot.
    checkingList = [False for i in range(N1)]

    fout = open(folderName + "tademResolved.fasta", "w")

    counter = 0
    for eachcontig in contigsTmp:
        # NOTE(review): "id" shadows the builtin of the same name.
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C")
        if checkingList[id / 2] == False:

            fout.write(">Segkk" + str(counter) + "\n")

            # NOTE(review): no newline is written after the sequence; confirm
            # whether the stored sequences already end with one.
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            # Mark every contig merged under the same leader as written.
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True

    fout.close()
Пример #21
0
# Manual scratch driver: a series of experiments, each guarded by a literal
# "if False:" / "if True:" switch that is edited by hand.  As written, only
# the last section (removeRedundantWithFile on "May11TestB/") executes.
# Folder names and the "/usr/bin/" path are hard-coded.
def continuousIntegration():
	# Experiment (disabled): BFS_revisit on a small hand-built graph.
	if False:
		G = graphLib.seqGraph(10)
		for i in range(5):
			G.insertEdge(i,i+1,1997)
			G.insertEdge(i,i+2, 1997)

		resultList = abunGraphLib.BFS_revisit(1,3,G,1)

		print "resultList", resultList 

	# Experiment (disabled): formPathSeq on fixed path lists, plus a nested,
	# separately disabled path-enumeration check.
	if False : 

		folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile = \
			"Apr10Test/", "/usr/bin/", [[1, 486, 217], [1, 8642, 217], [1, 13465, 217]], [[1, 486, 217]], "improved3_Double.fasta", "phasingSeedName_Double.fasta"

		abunGraphLib.formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile)

		if False:
			lenDic = IORobot.obtainLength(folderName , contigFile)
			N1 = len(lenDic)

			print "N1", N1

			G = graphLib.seqGraph(0)
			G.loadFromFile(folderName, "phaseStringGraph1")

			adj = [[] for i in range(N1)]

			for i in range(N1): 
			    adj[i] = abunGraphLib.findAllReachable(i, N1, G)

			Gnew = abunGraphLib.seqGraphDynamic(N1)

			for i in range(N1):
			    for j in adj[i]:
			        Gnew.insertEdge(i,j,1997)


			Gnew.initAdv()    
			Gnew.doubleEdgeReduction()

			contigPaths = abunGraphLib.findAllPathK(1, 217, Gnew, 3)
			contigReadPaths = abunGraphLib.findAllPathK(1, 217, G, 5)

			print "contigPaths", contigPaths
			print "contigReadPaths", contigReadPaths

			# NOTE(review): called without arguments here, whereas
			# graphSurgery in this file passes five - confirm before enabling.
			Gnew.transitiveReduction()

	# Experiment (disabled): print the result of decideCut.
	if False:
		toDelete = abunGraphLib.decideCut("Apr10Test/", "/usr/bin/")
		print toDelete

	# Experiment (disabled): inspect a saved "xResolvedGraph" and print the
	# length and count entries for a few hand-picked node ids.
	if False:
		G = graphLib.seqGraph(0)
		G.loadFromFile("Apr10TestA/", "xResolvedGraph")

		if False:
			for i in range(len(G.graphNodesList)):

				v = G.graphNodesList[i]

				if len(v.nodeIndexList) > 0:
					print i , v.listOfPrevNodes , v.listOfNextNodes

		G.reportEdge()
		lenDic = IORobot.obtainLength("Apr10TestA/", "improved3_Double.fasta")
		mylist = [401, 207, 405, 407, 344]

		# NOTE(review): this handle is never closed.
		json_data = open("Apr10TestA/" + "myCountDic.json", 'r')
		myCountDic = json.load(json_data)

		for x in mylist:
			print x, lenDic["Contig"+str(x/2)+"_p"], myCountDic["Segkk"+str(x/2)]


	# Experiment (disabled): compare the node count of the saved graph minus
	# the dummy nodes against the number of contig entries.
	if False:
		folderName = "Apr10TestA/"
		G = graphLib.seqGraph(0)
		G.loadFromFile(folderName , "xResolvedGraph")

		# NOTE(review): this handle is never closed.
		json_data = open(folderName + "mapDummyToRealDic.json", 'r')
		mapDummyToRealDic = json.load(json_data)

		lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
		print len(G.graphNodesList)
		print len(mapDummyToRealDic)
		
		print "fake N1 , real N1 ", len(G.graphNodesList) - len(mapDummyToRealDic), len(lenDic)


	# Experiment (disabled): run abunSplitter.mainFlow on a test folder.
	if False:
		abunSplitter.mainFlow("Apr10TestB/", "/usr/bin/")

	# Experiment (disabled): removeEmbedded on a test folder.
	if False: 
		nonRedundantResolver.removeEmbedded("Apr10TestD/", "/usr/bin/")

	# Experiment (disabled): rebuild the adaptor-filtered contig graph (same
	# steps as the start of graphSurgery) and count the nodes that have
	# exactly two predecessors and two successors.
	if False:
		folderName, contigReadGraph = "Apr10TestA/", "phaseStringGraph1"
		G = graphLib.seqGraph(0)
		kthres, edgeThres = 3, 1
		G.loadFromFile(folderName, contigReadGraph)
		lenDic = IORobot.obtainLength(folderName , "improved3_Double.fasta")

		N1 = len(lenDic)

		adj = [[] for i in range(N1)]

		for i in range(N1): 
		    tmpList = abunGraphLib.findAllReachable(i, N1, G)
		    
		    for j in tmpList:
		        if len(abunGraphLib.findAllPathK(i,j,G,kthres)) >= edgeThres:
		            adj[i].append(j) 

		    #print i, adj[i]

	    ### Filter adaptor skipped case 

		adaptorPair = []

		# Drop edges between a node and its paired id (2k <-> 2k+1) and
		# remember them as adaptor pairs.
		for i in range(len(adj)):
		    if  i % 2 == 0:
		        if i + 1 in adj[i]:
		            adj[i].remove(i+1)
		            adaptorPair.append([i, i+1])
		    elif i % 2 ==1: 
		        if i-1 in adj[i] :
		            adj[i].remove(i-1)
		            adaptorPair.append([i, i-1])

		Gnew = abunGraphLib.seqGraphDynamic(N1)

		for i in range(N1):
		    for j in adj[i]:
		        Gnew.insertEdge(i,j,1997)

		for eachpair in adaptorPair:
		    u, v = eachpair[0], eachpair[1]
		    for x in Gnew.graphNodesList[u].listOfPrevNodes:
		        xIndex = x[0]
		        Gnew.removeEdge(xIndex, v)
		    for y in Gnew.graphNodesList[v].listOfNextNodes:
		        yIndex = y[0]
		        Gnew.removeEdge(u, yIndex)


        #Gnew.reportEdge()
		count2 = 0
		for i in range(len(Gnew.graphNodesList)):
			if  len(Gnew.graphNodesList[i].listOfPrevNodes) == 2 and  len(Gnew.graphNodesList[i].listOfNextNodes) == 2:
				count2 = count2 + 1
				print str(i)+"{color:red}"

		print "count2, ", count2

		### End filter adaptor skipped case 
	# Active section: the only code that runs as written.
	if True:
		nonRedundantResolver.removeRedundantWithFile("May11TestB/" , "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
def formReadContigStringGraph(folderName, mummerLink, contigFilename,
                              readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header,
                                    referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem

        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]

    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)

    checkGraphLength(G, N1, lenDicRR)

    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
Пример #23
0
def findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1):
    '''
    Load the graph saved as contigReadGraph in folderName and delegate the
    search between leftCtgIndex and rightCtgIndex to findPathBtwEndsFast.

    Output : whatever findPathBtwEndsFast returns for the loaded graph
    '''
    loadedGraph = graphLib.seqGraph(0)
    loadedGraph.loadFromFile(folderName, contigReadGraph)

    pathResult = findPathBtwEndsFast(folderName, leftCtgIndex, rightCtgIndex,
                                     loadedGraph, N1)
    return pathResult
Пример #24
0
def singleGapLookUp(eachmatchpair, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename):

    print eachmatchpair
    leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0], eachmatchpair[-1], 0, 0, ""

    succReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1)

    succReadsList = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    allPaths = abunGraphLib.findAllPathK(leftCtgIndex, rightCtgIndex, G, 5)
    # shuffle(allPaths)

    print "allPaths", allPaths

    possibleList = []
    for p in allPaths:
        noContig = True
        for pp in p[1:-1]:
            if pp < N1:
                noContig = False
        if noContig == True:
            possibleList.append(p)
    print "possibleList", possibleList

    minListLen = 1000
    for p in possibleList:
        if len(p) < minListLen:
            succReadsList = p
            minListLen = len(p)

    if len(succReadsList) > 0:
        succReadsList.pop(0)
        succReadsList.pop(-1)
    else:
        print "interesting item for future study"

    print "succReadsList", succReadsList

    if len(succReadsList) == 0:
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1)
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )

        print "overlap contig : ", overlap

        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""

    else:

        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1)
        print contigName
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        readName = abunHouseKeeper.parseIDToName(succReadsList[0], "R", N1)
        print readName
        rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )

        print "overlap start read : ", overlap

        leftEnd = len(leftSeg) - overlap[0]

        middleContent = ""

        for i in range(len(succReadsList) - 1):
            readName = abunHouseKeeper.parseIDToName(succReadsList[i], "R", N1)
            leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

            readName = abunHouseKeeper.parseIDToName(succReadsList[i + 1], "R", N1)
            rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

            overlap = IORobot.alignWithName(
                leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
            )
            print "overlap middle read : ", overlap
            middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]]

        readName = abunHouseKeeper.parseIDToName(succReadsList[-1], "R", N1)
        leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )
        print "overlap end read : ", overlap

        middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]]

    return [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
Пример #25
0
def findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1):
    '''
    Convenience wrapper around findPathBtwEndsFast: reads the graph named
    contigReadGraph from folderName first, then runs the search on it.

    Output : the value returned by findPathBtwEndsFast
    '''
    graphFromDisk = graphLib.seqGraph(0)
    graphFromDisk.loadFromFile(folderName, contigReadGraph)

    found = findPathBtwEndsFast(folderName, leftCtgIndex, rightCtgIndex,
                                graphFromDisk, N1)
    return found
Пример #26
0
def xNodeResolving(folderName, contigReadGraph):
    '''
    Input : contigGraph , abunInfo , folderName  

    Output: myresolvedList.json, gapContentLookUp.json, dummyNodeMapping.json

    Algorithm :
        1) Tranverse the graph 
            a) If the node can well be fixed with sd requirement met 
                i) Link it across and add the pair into the myresolvedList, gapContentLookUp
                ii) Add dummynodes and fill in the dummyNodeMapping 
        
        2) Format return and output as temp file 
    '''

    ### Init G, myCountDic, N1
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    with open(folderName + 'myCountDic.json') as f:
        myCountDic = json.load(f)

    N1 = len(myCountDic) * 2

    ### Add resolved edge

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    extraCounter = 0
    mapDummyToRealDic = {}
    resolvedList = []

    for v in Gnew.graphNodesList:

        inList = []
        for eachitem in v.listOfPrevNodes:
            inList.append(eachitem[0])

        outList = []
        for eachitem in v.listOfNextNodes:
            outList.append(eachitem[0])

        inListCt = getCtTwoToOne(inList, myCountDic)
        outListCt = getCtTwoToOne(outList, myCountDic)

        sizeList = []
        for eachitem in myCountDic:
            sizeList.append(myCountDic[eachitem])

        sd = np.std(sizeList)

        for eachIn in inListCt:
            matchedOut = satisfyMatch(eachIn, outListCt, sd)

            if matchedOut != -1:
                leftCtgIndex, rightCtgIndex = eachIn[0], v.nodeIndex
                inSuccReadsList = abunGraphLib.findPathBtwEnds(
                    folderName, leftCtgIndex, rightCtgIndex, contigReadGraph,
                    N1)

                leftCtgIndex, rightCtgIndex = v.nodeIndex, matchedOut
                outSuccReadsList = abunGraphLib.findPathBtwEnds(
                    folderName, leftCtgIndex, rightCtgIndex, contigReadGraph,
                    N1)

                if inSuccReadsList != None and outSuccReadsList != None:

                    resolvedList.append([eachIn[0]] + inSuccReadsList +
                                        [N1 + extraCounter])
                    print "in: ", resolvedList[-1]

                    resolvedList.append([N1 + extraCounter] +
                                        outSuccReadsList + [matchedOut])
                    print "out: ", resolvedList[-1]

                    mapDummyToRealDic[extraCounter] = v.nodeIndex
                    extraCounter = extraCounter + 1

    return resolvedList, mapDummyToRealDic
Пример #27
0
def formReadContigStringGraph(folderName,
                              mummerLink,
                              contigFilename,
                              readsetFilename,
                              optTypeFileHeader,
                              graphName,
                              needAlignment=True):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"

    #if needAlignment:
    #    alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    if needAlignment:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [[header, referenceFile, queryFile, ""]],
            houseKeeper.globalParallel)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    addDataToList(dataListCC, G, 0, 0, 'C', 'C')

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')

    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)

    Gnew.saveToFile(folderName, graphName)

    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
Пример #28
0
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    N1 = len(myCountDic) * 2
    print "N1", N1

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    if False:
        json_data = open(folderName + "phaseRepeat.txt", "r")
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(
            resolvedList
        ), len(xResolvedList), len(biResolvedCombineList)

        with open(folderName + "resolvedList.json", "w") as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(mapDummyToRealDic, f)

    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
        )
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
            print eachitem[2:4], len(eachitem[4])

        with open(folderName + "gapContentLookUpDic.json", "w") as f:
            json.dump(gapContentLookUpDic, f)

    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    if False:
        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        json_data = open(folderName + "gapContentLookUpDic.json", "r")
        gapContentLookUpDic = json.load(json_data)

        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

        f = open(folderName + "tmpWithDummy.fasta", "a")
        for i in range(len(mapDummyToRealDic)):
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()

        IORobot.extractGraphToContigs(
            G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
        )
Пример #29
0
def xNodeResolving(folderName, contigReadGraph):

    ### Init G, myCountDic, N1
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    with open(folderName + "myCountDic.json") as f:
        myCountDic = json.load(f)

    N1 = len(myCountDic) * 2

    ### Add resolved edge

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    extraCounter = 0
    mapDummyToRealDic = {}
    resolvedList = []

    for v in Gnew.graphNodesList:

        inList = []
        for eachitem in v.listOfPrevNodes:
            inList.append(eachitem[0])

        outList = []
        for eachitem in v.listOfNextNodes:
            outList.append(eachitem[0])

        inListCt = getCtTwoToOne(inList, myCountDic)
        outListCt = getCtTwoToOne(outList, myCountDic)

        sizeList = []
        for eachitem in myCountDic:
            sizeList.append(myCountDic[eachitem])

        sd = np.std(sizeList)

        for eachIn in inListCt:
            matchedOut = satisfyMatch(eachIn, outListCt, sd)

            if matchedOut != -1:
                leftCtgIndex, rightCtgIndex = eachIn[0], v.nodeIndex
                inSuccReadsList = abunGraphLib.findPathBtwEnds(
                    folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1
                )

                leftCtgIndex, rightCtgIndex = v.nodeIndex, matchedOut
                outSuccReadsList = abunGraphLib.findPathBtwEnds(
                    folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1
                )

                if inSuccReadsList != None and outSuccReadsList != None:

                    resolvedList.append([eachIn[0]] + inSuccReadsList + [N1 + extraCounter])
                    print "in: ", resolvedList[-1]

                    resolvedList.append([N1 + extraCounter] + outSuccReadsList + [matchedOut])
                    print "out: ", resolvedList[-1]

                    mapDummyToRealDic[extraCounter] = v.nodeIndex
                    extraCounter = extraCounter + 1

    return resolvedList, mapDummyToRealDic
Пример #30
0
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink,
                 readsetFilename, contigFilename):

    ### Transitive reduction and remove double pointers
    N1 = len(myCountDic) * 2
    print "N1", N1
    kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres
    edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        tmpList = abunGraphLib.findAllReachable(i, N1, G)

        for j in tmpList:
            if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres:
                adj[i].append(j)

    ### Filter adaptor skipped case

    adaptorPair = []

    for i in range(len(adj)):
        if i % 2 == 0:
            if i + 1 in adj[i]:
                adj[i].remove(i + 1)
                adaptorPair.append([i, i + 1])
        elif i % 2 == 1:
            if i - 1 in adj[i]:
                adj[i].remove(i - 1)
                adaptorPair.append([i, i - 1])

    Gnew = abunGraphLib.seqGraphDynamic(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1997)

    for eachpair in adaptorPair:
        u, v = eachpair[0], eachpair[1]
        for x in Gnew.graphNodesList[u].listOfPrevNodes:
            xIndex = x[0]
            Gnew.removeEdge(xIndex, v)
        for y in Gnew.graphNodesList[v].listOfNextNodes:
            yIndex = y[0]
            Gnew.removeEdge(u, yIndex)

    Gnew.reportEdge()
    ### Trying out the new component
    import toCondenseFixer
    Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName)

    Gnew.symGraph()
    #Gnew.reportEdge()
    ### End filter adaptor skipped case

    if abunHouseKeeper.abunGlobalSplitParameterRobot.runGraphSurgery:

        Gnew.initAdv()
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunCondenseRemove:
            Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename)

        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunDoubltPtr:
            Gnew.doubleEdgeReduction()

        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunTransitive:
            Gnew.transitiveReduction(folderName, mummerLink,
                                     contigFilename + "_Double.fasta",
                                     readsetFilename + "_Double.fasta", G)

        Gnew.condense()
        Gnew.findAdjList()
    else:
        Gnew.initAdv()
        Gnew.condense()
        Gnew.findAdjList()

    return Gnew