예제 #1
0
def rgCluster(graph, lowerBound, upperBound, enforceTrace = True):
    """Cluster graph nodes by region growing, starting from singletons.

    Every node index in ``range(lowerBound[s], upperBound[s])``, for each
    subset ``s``, starts out as its own one-element cluster. The bookkeeping
    structures created here are handed to ``buildHeap`` and ``crunchHeap``,
    which perform the actual merging. When ``enforceTrace`` is true the
    clusters are additionally passed through ``orderClusters``.

    Returns the resulting list of clusters.
    """
    clusters, weightMap = [], []
    nodeClusters, clusterPos, clusterPointers = {}, {}, {}
    cantConnects, absorbed = set(), set()

    for s in range(len(lowerBound)):
        first, stop = lowerBound[s], upperBound[s]
        for a in range(first, stop):
            idx = len(clusters)
            clusters.append([a])
            weightMap.append({})
            nodeClusters[a] = idx
            clusterPos[idx] = {s: a}
            # NOTE(review): the neighbour test compares the *cluster* index
            # with the *node* bounds; this only lines up if cluster i holds
            # node i (bounds contiguous and starting at 0) -- confirm.
            before = idx - 1 if idx > first else None
            after = idx + 1 if idx < stop - 1 else None
            clusterPointers[idx] = {s: (before, after)}

    heap = buildHeap(graph, nodeClusters, weightMap, lowerBound, upperBound)
    Configs.log("Built a heap of size {}..".format(len(heap)))
    crunchHeap(graph, heap, clusters, nodeClusters, clusterPos,
               clusterPointers, weightMap, cantConnects, absorbed,
               enforceTrace)

    if not enforceTrace:
        return clusters
    return orderClusters(graph, clusters, nodeClusters, lowerBound, upperBound)
예제 #2
0
def rgSearch(graph):
    """Run the region-growing search and store the result on ``graph.clusters``.

    The per-subset node ranges are derived from ``graph.subsetMatrixIdx`` and
    ``graph.subalignmentLengths``; trace enforcement is always requested.
    """
    Configs.log("Finding graph trace with region-growing search..")

    subsetIds = range(len(graph.context.subalignments))
    lowerBound = [graph.subsetMatrixIdx[s] for s in subsetIds]
    upperBound = [
        graph.subsetMatrixIdx[s] + graph.subalignmentLengths[s]
        for s in subsetIds
    ]
    graph.clusters = rgCluster(graph, lowerBound, upperBound, True)
예제 #3
0
def buildSubsetsKMH(context, subsetsDir):
    """Build the KMH decomposition and return the resulting subset paths.

    An initial tree and alignment are built under
    ``<subsetsDir>/initial_tree``. If ``buildInitialTreeAlign`` reports no
    unused taxa, the guide tree is decomposed directly. Otherwise seed
    subsets are produced under ``<subsetsDir>/seed_subsets`` and the unused
    taxa are handed to ``reassignTaxons``.
    """
    tempDir = os.path.join(subsetsDir, "initial_tree")
    sequencesPath = context.sequencesPath

    startMessage = "Building KMH decomposition on {} with skeleton size {}/{}.."
    Configs.log(startMessage.format(
        sequencesPath, Configs.decompositionSkeletonSize, 1000))
    time1 = time.time()

    initialTreePath, initialAlignPath, unusedTaxa = buildInitialTreeAlign(
        tempDir, sequencesPath)

    if len(unusedTaxa) > 0:
        # Some taxa were left out of the initial tree: decompose into seed
        # subsets first, then distribute the leftover taxa among them.
        subsetSeedDir = os.path.join(subsetsDir, "seed_subsets")
        if not os.path.exists(subsetSeedDir):
            os.makedirs(subsetSeedDir)
        subsetSeedPaths = treeutils.decomposeGuideTree(
            subsetSeedDir, initialAlignPath, initialTreePath, None,
            Configs.decompositionMaxNumSubsets)
        subsetPaths = reassignTaxons(subsetsDir, subsetSeedPaths,
                                     context.unalignedSequences, unusedTaxa)
    else:
        subsetPaths = treeutils.decomposeGuideTree(
            tempDir, initialAlignPath, initialTreePath,
            Configs.decompositionMaxSubsetSize,
            Configs.decompositionMaxNumSubsets)

    elapsed = time.time() - time1
    Configs.log("Built KMH decomposition on {} in {} sec..".format(
        context.sequencesPath, elapsed))

    return subsetPaths
예제 #4
0
 def readGraphFromFile(self, filePath):
     """Load the sparse matrix from a whitespace-separated text file.

     ``self.matrix`` is reset to ``self.matrixSize`` empty dicts. Every
     non-blank line must hold integer tokens; the first three are read as
     ``row column value`` and stored as ``self.matrix[row][column] = value``.
     Blank lines (for example a trailing empty line) are skipped instead of
     raising ``IndexError``.

     Raises:
         ValueError: if a token is not an integer.
         IndexError: if a non-blank line has fewer than three tokens, or
             the row index is outside the matrix.
     """
     self.matrix = [{} for _ in range(self.matrixSize)]
     with open(filePath) as f:
         for line in f:
             tokens = [int(token) for token in line.split()]
             if not tokens:
                 # Whitespace-only line: carries no entry.
                 continue
             self.matrix[tokens[0]][tokens[1]] = tokens[2]
     Configs.log("Read matrix from {}".format(filePath))
예제 #5
0
 def writeGraphToFile(self, filePath):
     """Write every stored matrix entry to ``filePath``.

     One line is emitted per entry, formatted as ``row key value``, rows in
     index order and keys in the row's own iteration order.
     """
     with open(filePath, 'w') as out:
         for row, entries in enumerate(self.matrix):
             for key in entries:
                 out.write("{} {} {}\n".format(row, key, entries[key]))
     Configs.log("Wrote matrix to {}".format(filePath))
예제 #6
0
def naiveClustering(graph):
    """Store the clustering produced by ``naiveCluster`` on ``graph.clusters``.

    Only the per-subset node ranges (start index and start + length) are
    passed to ``naiveCluster``.
    """
    Configs.log("Building a naive left-justified clustering..")

    lowerBound, upperBound = [], []
    for s in range(len(graph.context.subalignments)):
        start = graph.subsetMatrixIdx[s]
        lowerBound.append(start)
        upperBound.append(start + graph.subalignmentLengths[s])
    graph.clusters = naiveCluster(lowerBound, upperBound)
예제 #7
0
def atomizedClustering(graph):
    """Store the clustering produced by ``atomizedCluster`` on ``graph.clusters``.

    The per-subset node ranges are computed from ``graph.subsetMatrixIdx``
    and ``graph.subalignmentLengths`` and passed on unchanged.
    """
    Configs.log("Building a fully atomized clustering..")

    subsetIds = range(len(graph.context.subalignments))
    lowerBound = [graph.subsetMatrixIdx[s] for s in subsetIds]
    # Each upper bound is the subset's start index plus its length.
    upperBound = [
        start + graph.subalignmentLengths[s]
        for s, start in zip(subsetIds, lowerBound)
    ]
    graph.clusters = atomizedCluster(lowerBound, upperBound)
예제 #8
0
def rgFastClustering(graph):
    """Cluster the graph with ``rgFastCluster`` and write the clusters to disk.

    ``rgFastCluster`` is called with its fourth argument set to ``False``.
    The result is stored on ``graph.clusters`` and written to
    ``graph.clusterPath``.
    """
    Configs.log("Building a fast region-growing graph clustering..")

    lowerBound, upperBound = [], []
    for s in range(len(graph.context.subalignments)):
        start = graph.subsetMatrixIdx[s]
        lowerBound.append(start)
        upperBound.append(start + graph.subalignmentLengths[s])

    graph.clusters = rgFastCluster(graph, lowerBound, upperBound, False)
    graph.writeClustersToFile(graph.clusterPath)
예제 #9
0
def purgeDuplicateClusters(graph):
    """Drop repeated clusters from ``graph.clusters``, keeping first occurrences.

    Each cluster is sorted in place, so two clusters with the same members
    in a different order count as duplicates. ``graph.clusters`` is replaced
    with the list of surviving (now sorted) clusters.
    """
    seen = set()
    kept = []
    for cluster in graph.clusters:
        cluster.sort()  # in place: the kept clusters stay sorted
        signature = tuple(cluster)
        if signature in seen:
            continue
        seen.add(signature)
        kept.append(cluster)
    graph.clusters = kept
    Configs.log("Purged duplicate clusters. Found {} unique clusters..".format(
        len(graph.clusters)))
예제 #10
0
 def initialize(self, graph):
     """Fill the per-cluster and per-element lookup structures.

     For every cluster index ``i`` this records a one-element order list, the
     previous/next cluster indices (``None`` at either end), and for each
     member ``a``: its cluster, the ``(cluster, subset) -> element`` entry,
     and its neighbour weights via ``updateNeighborWeights``.
     """
     Configs.log("Initializing search context data structures..")
     for i in range(len(self.clusters)):
         self.clusterOrders[i] = [i]
         before = i - 1 if i > 0 else None
         after = i + 1 if i < len(self.clusters) - 1 else None
         self.clusterLL[i] = (before, after)
         for a in self.clusters[i]:
             # matSubPosMap yields (subset, position); only the subset is used.
             asub, _ = graph.matSubPosMap[a]
             self.elementClusters[a] = i
             self.clusterSubs[i, asub] = a
             self.updateNeighborWeights(None, i,
                                        self.getNeighborList(graph, a))
예제 #11
0
def runMlrMclClustering(graph):
    """Cluster the alignment graph with the external MLR-MCL tool.

    Existing files in ``graph.workingDir`` are reused: the tool only runs
    when ``clusters_mlr_mcl.txt`` is missing, and the graph file is only
    written when ``graph_mlr_mcl.txt`` is missing too. The clusters are then
    read back, stored on ``graph.clusters`` and written to
    ``graph.clusterPath``.
    """
    Configs.log("Running MLR-MCL alignment graph clustering..")
    workingDir = graph.workingDir
    graphPath = os.path.join(workingDir, "graph_mlr_mcl.txt")
    clusterPath = os.path.join(workingDir, "clusters_mlr_mcl.txt")

    needsClustering = not os.path.exists(clusterPath)
    if needsClustering and not os.path.exists(graphPath):
        writeGraphToFile(graph, graphPath)
    if needsClustering:
        mlrMclTask = external_tools.runMlrMcl(graphPath, 30000, 0.5, 4,
                                              graph.workingDir, clusterPath)
        mlrMclTask.run()

    graph.clusters = readClustersFromFile(clusterPath)
    graph.writeClustersToFile(graph.clusterPath)
예제 #12
0
def runCommand(**kwargs):
    """Run an external command through the shell and move its output files.

    Keyword arguments:
        command: the command string, executed with ``shell=True``.
        workingDir: directory used as the command's working directory.
        fileCopyMap: optional ``{srcPath: destPath}`` mapping; each source is
            moved to its destination after the command succeeds.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status; the command, exit code and combined stdout/stderr are
            logged first.
        KeyError: if ``command`` or ``workingDir`` is not supplied.
    """
    command = kwargs["command"]
    Configs.log("Running an external tool, command: {}".format(command))
    # NOTE(review): shell=True is kept because callers pass a single command
    # string; it must never be built from untrusted input.
    runner = subprocess.run(command,
                            shell=True,
                            cwd=kwargs["workingDir"],
                            universal_newlines=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    try:
        runner.check_returncode()
    except subprocess.CalledProcessError:
        # Only a failed exit status is a "command error"; interrupts such as
        # KeyboardInterrupt propagate without being mislabelled.
        Configs.error("Command encountered error: {}".format(command))
        Configs.error("Exit code: {}".format(runner.returncode))
        Configs.error("Output: {}".format(runner.stdout))
        raise
    for srcPath, destPath in kwargs.get("fileCopyMap", {}).items():
        shutil.move(srcPath, destPath)
예제 #13
0
def optimizeClusters(graph, clusters):
    """Repeat optimization passes until one yields no positive gain.

    Starts from ``clusters`` and their cost as reported by
    ``graph.computeClusteringCost``. After each pass with a positive gain
    the new clustering is adopted and the gain is subtracted from the
    tracked cost. Returns the last adopted clustering.
    """
    bestClusters = clusters
    bestCost = graph.computeClusteringCost(clusters)
    Configs.log(
        "Starting optimization from initial cost of {}..".format(bestCost))
    context = SearchContext(clusters)
    context.initialize(graph)

    passNum = 0
    while True:
        passNum = passNum + 1
        Configs.log("Starting optimization pass {}..".format(passNum))
        newClusters, gain = optimizationPass(graph, bestClusters, context)
        if not gain > 0:
            # No improvement in this pass: keep the previous best and stop.
            break
        bestClusters = newClusters
        bestCost = bestCost - gain
        Configs.log(
            "New clustering with a cost of {} over {} clusters..".format(
                bestCost, len(bestClusters)))

    Configs.log("Final optimized cost of {} over {} clusters..".format(
        bestCost, len(bestClusters)))
    return bestClusters
예제 #14
0
def readClustersFromFile(filePath):
    """Read a one-label-per-line cluster assignment file.

    Line ``n`` (0-based) holds the integer cluster label of element ``n``.
    Returns a list of clusters, each a list of element indices in file
    order, sorted by ascending cluster label. Labels do not have to be
    exactly ``0..k-1``: only labels that actually occur produce a cluster,
    so gaps in the numbering no longer raise ``KeyError``.

    Raises:
        ValueError: if a line is not an integer.
    """
    assignments = {}
    with open(filePath) as f:
        for num, line in enumerate(f):
            assignments.setdefault(int(line.strip()), []).append(num)
    # Sorted labels give the same order as range(len(assignments)) when the
    # labels are contiguous from 0, and tolerate gaps otherwise.
    clusters = [assignments[c] for c in sorted(assignments)]
    Configs.log("Found {} clusters..".format(len(clusters)))
    return clusters
예제 #15
0
def decomposeSequences(context):
    """Make sure ``context.subsetPaths`` is populated, decomposing if needed.

    Order of preference: subset paths already on the context, then
    subalignment paths already on the context, then existing
    ``subset_<n>.txt`` files (n = 1, 2, ...) in
    ``<workingDir>/decomposition``, and finally a fresh decomposition via
    ``buildDecomposition``.
    """
    time1 = time.time()

    if len(context.subsetPaths) > 0:
        Configs.log("Subset paths already provided, skipping decomposition..")

    elif len(context.subalignmentPaths) > 0:
        context.subsetPaths = context.subalignmentPaths
        Configs.log(
            "Subalignment paths already provided, skipping decomposition..")

    else:
        subsetsDir = os.path.join(context.workingDir, "decomposition")
        context.subsetPaths = []

        # Pick up consecutively numbered subset files, stopping at the first
        # missing number.
        n = 1
        filePath = os.path.join(subsetsDir, "subset_{}.txt".format(n))
        while os.path.exists(filePath):
            Configs.log("Detected existing subset file {}".format(filePath))
            context.subsetPaths.append(filePath)
            n = n + 1
            filePath = os.path.join(subsetsDir, "subset_{}.txt".format(n))

        if len(context.subsetPaths) == 0:
            buildDecomposition(context, subsetsDir)

    elapsed = time.time() - time1
    Configs.log("Decomposed {} into {} subsets in {} sec..".format(
        context.sequencesPath, len(context.subsetPaths), elapsed))
예제 #16
0
def requestBackboneTasks(context):
    """Choose the backbone alignments that will feed the alignment graph.

    User-supplied ``context.backbonePaths`` take priority; their sequences
    are read into ``context.backboneTaxa``. Otherwise the choice follows
    ``Configs.graphBuildMethod``:

    * ``"mafft"``: delegated to ``requestMafftBackbones``.
    * ``"subsethmm"``: the subalignments themselves, marked for extension
      in ``context.backboneExtend``.
    * ``"initial"``: the single ``initial_insert_align.txt`` under
      ``<workingDir>/decomposition/initial_tree``.

    Unless ``Configs.constrain`` is set or the method is ``"subsethmm"``,
    the subalignment paths are appended to the backbone paths.
    """
    if len(context.backbonePaths) > 0:
        Configs.log("Using {} user-defined backbone files..".format(
            len(context.backbonePaths)))
        for path in context.backbonePaths:
            context.backboneTaxa.update(sequenceutils.readFromFasta(path))

    elif Configs.graphBuildMethod == "mafft":
        Configs.log("Using {} MAFFT backbones..".format(Configs.mafftRuns))
        requestMafftBackbones(context)

    elif Configs.graphBuildMethod == "subsethmm":
        Configs.log(
            "Using {} HMM-extended subalignments as backbone files..".format(
                len(context.subalignmentPaths)))
        # backbonePaths now refers to the same list object as
        # subalignmentPaths; the extend() below is skipped for this method.
        context.backbonePaths = context.subalignmentPaths
        context.backboneExtend.update(context.backbonePaths)

    elif Configs.graphBuildMethod == "initial":
        Configs.log(
            "Using the initial decomposition alignment as the single backbone.."
        )
        initialAlignPath = os.path.join(context.workingDir, "decomposition",
                                        "initial_tree",
                                        "initial_insert_align.txt")
        context.backbonePaths = [initialAlignPath]

    if not Configs.constrain and Configs.graphBuildMethod != "subsethmm":
        context.backbonePaths.extend(context.subalignmentPaths)
예제 #17
0
def mergeSubalignments(context):
    """Run the full merge pipeline and log how long it took.

    Stages, in order: ``buildGraph``, ``clusterGraph``, ``findTrace``,
    ``optimizeTrace`` (the last three on ``context.graph``) and
    ``writeAlignment``.
    """
    Configs.log("Merging {} subaligments..".format(
        len(context.subalignmentPaths)))
    time1 = time.time()

    buildGraph(context)
    for graphStage in (clusterGraph, findTrace, optimizeTrace):
        graphStage(context.graph)
    writeAlignment(context)

    elapsed = time.time() - time1
    Configs.log("Merged {} subalignments into {} in {} sec..".format(
        len(context.subalignmentPaths), context.outputFile, elapsed))
예제 #18
0
    def buildNodeEdgeDataStructure(self):
        """Group each node's matrix edges by the subset of the other endpoint.

        ``self.nodeEdges[a][sub]`` becomes the list of ``(b, value)`` pairs
        from ``self.matrix[a]`` whose node ``b`` belongs to subset ``sub``,
        sorted by ``b``. Edges to nodes in ``a``'s own subset are left out.
        """
        Configs.log("Preparing node edge data structure..")
        numSubsets = len(self.subalignmentLengths)
        self.nodeEdges = {}

        for a in range(self.matrixSize):
            asub, _ = self.matSubPosMap[a]
            perSubset = [[] for _ in range(numSubsets)]
            self.nodeEdges[a] = perSubset
            for b, value in self.matrix[a].items():
                bsub, _ = self.matSubPosMap[b]
                if bsub != asub:
                    perSubset[bsub].append((b, value))
            for edges in perSubset:
                edges.sort(key=lambda edge: edge[0])
        Configs.log("Prepared node edge data structure..")
예제 #19
0
def decomposeTree(tree, maxSubsetSize, numSubsets):
    """Split ``tree`` at centroid edges until the stopping rule is met.

    The tree with the largest ``childs`` value is bipartitioned repeatedly.
    Splitting stops once there are ``numSubsets`` trees, or, when
    ``maxSubsetSize`` is not ``None``, once the largest tree has at most
    ``maxSubsetSize`` children. Returns the list of trees.
    """
    trees = [tree]
    while len(trees) < numSubsets:
        largestTree = max(trees, key=lambda t: t.childs)
        if maxSubsetSize is not None and largestTree.childs <= maxSubsetSize:
            break

        # Record the size before splitting, for the log message below.
        numChilds = largestTree.childs
        centroid = getCentroidEdge(largestTree)
        t1, t2 = bipartitionByEdge(largestTree, centroid)
        Configs.log(
            "Decomposing a tree with {} leaves into {} and {}..".format(
                numChilds, t1.childs, t2.childs))
        trees.remove(largestTree)
        trees.extend((t1, t2))
    return trees
예제 #20
0
def mwtSearch(graph):
    """Run the MWT heuristic search and store its clusters on ``graph.clusters``.

    The node-edge structure is built from the existing clusters when the
    graph already has some, and from the raw graph otherwise. The cost
    returned by ``mwtHeuristicSearch`` is discarded.
    """
    Configs.log("Finding graph trace with MWT heuristic search..")

    lowerBound, upperBound = [], []
    for s in range(len(graph.context.subalignments)):
        start = graph.subsetMatrixIdx[s]
        lowerBound.append(start)
        upperBound.append(start + graph.subalignmentLengths[s])

    if graph.clusters is not None and len(graph.clusters) > 0:
        graph.buildNodeEdgeDataStructureFromClusters()
    else:
        graph.buildNodeEdgeDataStructure()

    clusters, _cost = mwtHeuristicSearch(graph, lowerBound, upperBound)
    graph.clusters = clusters
예제 #21
0
def fmAlgorithm(graph):
    """Run ``fmPartition`` and store its clusters on ``graph.clusters``.

    The node-edge structure is built from the existing clusters when the
    graph already has some, and from the raw graph otherwise. The cost and
    cuts returned by ``fmPartition`` are discarded.
    """
    Configs.log("Finding graph trace with FM Algorithm..")

    subsetIds = range(len(graph.context.subalignments))
    lowerBound = [graph.subsetMatrixIdx[s] for s in subsetIds]
    upperBound = [
        graph.subsetMatrixIdx[s] + graph.subalignmentLengths[s]
        for s in subsetIds
    ]

    hasClusters = graph.clusters is not None and len(graph.clusters) > 0
    if hasClusters:
        graph.buildNodeEdgeDataStructureFromClusters()
    else:
        graph.buildNodeEdgeDataStructure()

    clusters, _cost, _cuts = fmPartition(graph, lowerBound, upperBound)
    graph.clusters = clusters
예제 #22
0
def writeGraphToFile(graph, filePath):
    """Write ``graph.matrix`` to ``filePath`` as an MLR-MCL graph file.

    The header line is ``<vertices> <edges> 1``, where the edge count is
    half the total number of stored entries. One line per matrix row
    follows, listing ``key+1 value`` for each entry of that row.
    """
    Configs.log("Writing MLR-MCL graph file to {}".format(filePath))
    lines = []
    edges = 0
    for row in graph.matrix:
        pairs = row.items()
        edges = edges + len(pairs)
        # Keys are shifted by one in the output (a + 1).
        lines.append(" ".join("{} {}".format(a + 1, b) for a, b in pairs))
    vertices = len(lines)
    # Each entry is counted once per stored direction, hence the halving.
    halvedEdges = int(edges / 2)

    with open(filePath, 'w') as textFile:
        textFile.write("{} {} 1\n".format(vertices, halvedEdges))
        textFile.writelines(line + "\n" for line in lines)

    Configs.log("Wrote graph with {} vertices and {} edges to {}".format(
        vertices, halvedEdges, filePath))
예제 #23
0
def mwtGreedySearch(graph):
    """Run the greedy MWT search and store its clusters on ``graph.clusters``.

    The search starts with the frontier at a copy of the lower bounds. The
    cost and cycles returned by ``greedySearch`` are discarded.
    """
    Configs.log("Finding graph trace with MWT greedy search..")

    lowerBound, upperBound = [], []
    for s in range(len(graph.context.subalignments)):
        start = graph.subsetMatrixIdx[s]
        lowerBound.append(start)
        upperBound.append(start + graph.subalignmentLengths[s])

    # Reuse existing clusters for the node-edge structure when there are any.
    if graph.clusters is not None and len(graph.clusters) > 0:
        graph.buildNodeEdgeDataStructureFromClusters()
    else:
        graph.buildNodeEdgeDataStructure()

    context = MwtSearchContext(lowerBound, upperBound)
    state = MwtSearchState()
    state.frontier = list(lowerBound)
    clusters, _cost, _cycles = greedySearch(graph, state, context)
    graph.clusters = clusters
예제 #24
0
def writeUnconstrainedAlignment(context):
    """Write the final alignment, one column per cluster in ``graph.clusters``.

    Every taxon starts as a row of ``'-'`` gaps. For each cluster member the
    taxon is looked up through ``context.subalignments[subset][0]`` and its
    next unconsumed unaligned character is placed in that cluster's column.
    The rows are then joined into strings and written as FASTA to
    ``context.outputFile``.
    """
    graph = context.graph
    numColumns = len(graph.clusters)
    alignment = {
        taxon: sequenceutils.Sequence(taxon, ['-'] * numColumns)
        for taxon in context.unalignedSequences
    }

    # Next unread position in each taxon's unaligned sequence.
    curIdxes = dict.fromkeys(context.unalignedSequences, 0)
    for idx, cluster in enumerate(graph.clusters):
        for b in cluster:
            bsub, _ = graph.matSubPosMap[b]
            taxon = context.subalignments[bsub][0]
            nextPos = curIdxes[taxon]
            alignment[taxon].seq[idx] = (
                context.unalignedSequences[taxon].seq[nextPos])
            curIdxes[taxon] = nextPos + 1

    for record in alignment.values():
        record.seq = "".join(record.seq)

    sequenceutils.writeFasta(alignment, context.outputFile)
    Configs.log("Wrote final alignment to {}".format(context.outputFile))
예제 #25
0
def buildDecomposition(context, subsetsDir):
    """Decompose the input sequences into subsets stored under ``subsetsDir``.

    ``subsetsDir`` is created when missing and the unaligned sequences are
    loaded on demand. The strategy is then chosen as follows:

    * random decomposition, when the strategy or the guide tree is
      ``"random"`` and ``Configs.outputPath`` equals ``context.outputFile``;
    * KMH, when the strategy is ``"kmh"``;
    * otherwise a guide tree is built and decomposed.

    The resulting paths are stored on ``context.subsetPaths``.
    """
    if not os.path.exists(subsetsDir):
        os.makedirs(subsetsDir)
    if context.unalignedSequences is None:
        context.unalignedSequences = sequenceutils.readFromFasta(
            context.sequencesPath, removeDashes=True)

    randomRequested = (Configs.decompositionStrategy == "random"
                       or context.guideTree == "random")
    if randomRequested and Configs.outputPath == context.outputFile:
        context.subsetPaths = randomDecomposition(
            subsetsDir, context.unalignedSequences,
            Configs.decompositionMaxNumSubsets)
        return

    if Configs.decompositionStrategy == "kmh":
        Configs.log("Decomposing {} with KMH..".format(context.sequencesPath))
        Configs.log("Targetting {} subsets..".format(
            Configs.decompositionMaxNumSubsets))
        context.subsetPaths = kmh.buildSubsetsKMH(context, subsetsDir)
        return

    guideTreePath = initial_tree.buildInitialTree(context, subsetsDir,
                                                  context.guideTree)
    sizeMessage = (
        "Using target subset size of {}, and maximum number of subsets {}..")
    Configs.log(sizeMessage.format(Configs.decompositionMaxSubsetSize,
                                   Configs.decompositionMaxNumSubsets))
    context.subsetPaths = treeutils.decomposeGuideTree(
        subsetsDir, context.sequencesPath, guideTreePath,
        Configs.decompositionMaxSubsetSize,
        Configs.decompositionMaxNumSubsets)
예제 #26
0
def addAlignmentFileToGraph(context, alignedFile):
    """Add the pairwise weights implied by one backbone alignment to the graph.

    The backbone is read from ``alignedFile``; its length is taken from the
    first sequence. If the file is listed in ``context.backboneExtend``, HMM
    extension tasks are submitted and their Stockholm outputs merged into
    the backbone as they complete. For every alignment column, each pair of
    entries ``(a, b)`` in that column's map adds ``avalue * bvalue`` to
    ``graph.matrix[a][b]``. With ``Configs.graphBuildRestrict`` set, pairs
    in the same subset but at different positions are skipped. The matrix
    is updated while holding ``graph.matrixLock``.
    """
    Configs.log("Feeding backbone {} to the graph..".format(alignedFile))
    backboneAlign = sequenceutils.readFromFasta(alignedFile)
    firstRecord = next(iter(backboneAlign.values()))
    alignmentLength = len(firstRecord.seq)

    if alignedFile in context.backboneExtend:
        extensionTasks = requestHmmExtensionTasks(context, backboneAlign,
                                                  alignedFile)
        task.submitTasks(extensionTasks)
        for finishedTask in task.asCompleted(extensionTasks):
            extended = sequenceutils.readFromStockholm(
                finishedTask.outputFile, includeInsertions=True)
            backboneAlign.update(extended)

    alignmap = backboneToAlignMap(context, backboneAlign, alignmentLength)
    Configs.log(
        "Constructed backbone alignment map from {}".format(alignedFile))

    graph = context.graph
    with graph.matrixLock:
        for l in range(alignmentLength):
            column = alignmap[l]
            for a, avalue in column.items():
                for b, bvalue in column.items():
                    if Configs.graphBuildRestrict:
                        asub, apos = graph.matSubPosMap[a]
                        bsub, bpos = graph.matSubPosMap[b]
                        sameSubsetOtherPos = asub == bsub and apos != bpos
                        if sameSubsetOtherPos:
                            continue

                    row = graph.matrix[a]
                    row[b] = row.get(b, 0) + avalue * bvalue
    Configs.log("Fed backbone {} to the graph.".format(alignedFile))
예제 #27
0
def findTrace(graph):
    """Load an existing trace, or compute one with the configured method.

    If ``graph.tracePath`` exists the clusters are read from it. Otherwise
    duplicate clusters and cluster violations are purged, the search named
    by ``Configs.graphTraceMethod`` is run (an unrecognised name runs no
    search), and the clusters are written to ``graph.tracePath``. The
    elapsed time, cluster count and clustering cost are logged.
    """
    time1 = time.time()

    if not os.path.exists(graph.tracePath):
        purgeDuplicateClusters(graph)
        purgeClusterViolations(graph)

        method = Configs.graphTraceMethod
        if method == "minclusters":
            minClustersSearch(graph)
        elif method == "fm":
            fmAlgorithm(graph)
        elif method == "mwtgreedy":
            mwtGreedySearch(graph)
        elif method == "mwtsearch":
            mwtSearch(graph)
        elif method == "rg":
            rgSearch(graph)
        elif method == "rgfast":
            rgFastSearch(graph)
        elif method == "naive":
            naiveClustering(graph)

        graph.writeClustersToFile(graph.tracePath)
    else:
        Configs.log("Found existing trace file {}".format(graph.tracePath))
        graph.readClustersFromFile(graph.tracePath)

    elapsed = time.time() - time1
    Configs.log("Found alignment graph trace in {} sec..".format(elapsed))
    Configs.log("Found a trace with {} clusters and a total cost of {}".format(
        len(graph.clusters), graph.computeClusteringCost(graph.clusters)))
예제 #28
0
def main():
    """Program entry point.

    Parses the arguments, builds the configuration, starts the task manager
    and runs the main alignment task. Any exception is logged together with
    its traceback rather than propagated, and the task manager is always
    stopped before the total run time is logged.
    """
    began = time.time()
    buildConfigs(parseArgs())
    Configs.log("MAGUS was run with: {}".format(" ".join(sys.argv)))

    try:
        manager.startTaskManager()
        mainAlignmentTask()
    except BaseException:
        # NOTE(review): deliberately catches everything, including
        # KeyboardInterrupt/SystemExit, so the run still ends with the
        # "finished" message -- confirm this is intended.
        Configs.error("MAGUS aborted with an exception..")
        Configs.error(traceback.format_exc())
    finally:
        manager.stopTaskManager()

    elapsed = time.time() - began
    Configs.log("MAGUS finished in {} seconds..".format(elapsed))
예제 #29
0
    def initializeHeap(self, graph):
        """Reset the move bookkeeping and rebuild the gain structure.

        ``elementMoves``, ``heap`` and ``locked`` are cleared. For every
        subset ``i``, ``gainStructure[i][j]`` is set to the running sum, over
        positions ``0..j``, of the weight stored for each node and the
        cluster it currently belongs to (0 when no weight is stored).
        ``getPositiveMoves`` is then called to collect candidate moves.
        """
        Configs.log("Reinitializing heap and all that stuff..")
        # Cleared up front so a failure below cannot leave stale gains behind.
        self.gainStructure = []
        self.elementMoves = {}
        self.heap = []
        self.locked = set()

        liveClusters = len(self.clusters) - len(self.deletedClusters)
        print("Working with {} clusters..".format(liveClusters))

        prefixGains = []
        for i in range(len(graph.context.subalignments)):
            firstNode = graph.subsetMatrixIdx[i]
            running = []
            for j in range(graph.subalignmentLengths[i]):
                node = firstNode + j
                weight = self.weights.get((node, self.elementClusters[node]),
                                          0)
                if j == 0:
                    running.append(weight)
                else:
                    running.append(weight + running[j - 1])
            prefixGains.append(running)
        self.gainStructure = prefixGains

        self.getPositiveMoves(graph)
        Configs.log("Starting with {} candidate moves..".format(len(
            self.heap)))
예제 #30
0
def optimizeTrace(graph):
    """Optionally optimize the trace held in ``graph.clusters``.

    When ``Configs.graphTraceOptimize`` is set, singleton clusters are added
    and ``graph.clusters`` is replaced by the result of ``optimizeClusters``;
    otherwise the step is skipped. The elapsed time is logged either way.
    """
    time1 = time.time()

    if not Configs.graphTraceOptimize:
        Configs.log("Skipping optimization pass..")
    else:
        Configs.log("Optimization pass..")
        graph.addSingletonClusters()
        graph.clusters = optimizeClusters(graph, graph.clusters)
        summary = "Optimized the trace to {} clusters with a total cost of {}"
        Configs.log(summary.format(
            len(graph.clusters),
            graph.computeClusteringCost(graph.clusters)))

    elapsed = time.time() - time1
    Configs.log("Finished optimization in {} sec..".format(elapsed))