Пример #1
0
def buildHmmScores(hmmPaths, queriesPath, scoreFileHmmFileMap):
    """Chunk the query sequences and create one scoring task per (HMM, chunk).

    The queries are read from ``queriesPath`` (with ``removeDashes=True``),
    split into chunks of at most 1000 taxa and written to a ``chunks_<base>``
    directory next to the query file. For each chunk and each HMM a score
    file path is chosen inside the HMM's own directory.

    Args:
        hmmPaths: iterable of HMM model file paths.
        queriesPath: FASTA file containing the query sequences.
        scoreFileHmmFileMap: dict filled in place, mapping every score
            output path to the HMM path that produces it.

    Returns:
        List of whatever ``getHmmScores`` returns, one per (chunk, HMM) pair,
        ordered chunk-major.
    """
    querySeqs = sequenceutils.readFromFasta(queriesPath, removeDashes=True)
    stem = os.path.basename(queriesPath).split('.')[0]
    chunkDir = os.path.join(os.path.dirname(queriesPath),
                            "chunks_{}".format(stem))
    if not os.path.exists(chunkDir):
        os.makedirs(chunkDir)

    chunkSize = 1000
    names = list(querySeqs.keys())

    # First write every chunk and record the jobs; tasks are created afterwards.
    jobs = []
    starts = range(0, len(names), chunkSize)
    for chunkNo, start in enumerate(starts, 1):
        members = names[start:start + chunkSize]
        chunkFile = os.path.join(dirName := chunkDir,
                                 "{}_chunk_{}.txt".format(stem, chunkNo)) \
            if False else os.path.join(
                chunkDir, "{}_chunk_{}.txt".format(stem, chunkNo))
        sequenceutils.writeFasta(querySeqs, chunkFile, members)
        scoreName = "{}_chunk_{}_score.txt".format(stem, chunkNo)
        for model in hmmPaths:
            scoreFile = os.path.join(os.path.dirname(model), scoreName)
            jobs.append((model, chunkFile, scoreFile))
            scoreFileHmmFileMap[scoreFile] = model

    return [getHmmScores(model, chunkFile, scoreFile)
            for model, chunkFile, scoreFile in jobs]
Пример #2
0
def buildInducedSubalignment(**kwargs):
    """Project a subalignment onto the final alignment's columns.

    Keyword Args:
        alignmentColumnsPath: text file whose first line lists the
            subalignment positions treated as insertions and whose remaining
            lines each list the subalignment positions mapped to one output
            column (whitespace-separated integers).
        subalignmentPath: FASTA file of the subalignment (dashes kept).
        outputFile: destination FASTA path. The result is written to a
            ``temp_``-prefixed sibling first and then moved into place.
    """
    columnsFile = kwargs["alignmentColumnsPath"]
    sourceFile = kwargs["subalignmentPath"]
    targetFile = kwargs["outputFile"]
    stagingFile = os.path.join(
        os.path.dirname(targetFile),
        "temp_{}".format(os.path.basename(targetFile)))

    with open(columnsFile) as handle:
        insertPositions = {
            int(tok) for tok in handle.readline().strip().split()
        }
        columns = [{int(tok) for tok in row.strip().split()}
                   for row in handle]

    source = sequenceutils.readFromFasta(sourceFile, removeDashes=False)
    width = len(columns)
    rows = {name: ['-'] * width for name in source}

    for colIdx, members in enumerate(columns):
        for name in source:
            row = rows[name]
            for pos in members:
                symbol = source[name].seq[pos]
                if symbol == '-':
                    continue
                # Insertion positions are emitted in lower case.
                if pos in insertPositions:
                    symbol = symbol.lower()
                # At most one non-gap letter may land in a given cell.
                assert row[colIdx] == '-'
                row[colIdx] = symbol

    for name in rows:
        rows[name] = sequenceutils.Sequence(name, "".join(rows[name]))
    sequenceutils.writeFasta(rows, stagingFile)
    shutil.move(stagingFile, targetFile)
Пример #3
0
def decomposeGuideTree(subsetsDir, sequencesPath, guideTreePath, maxSubsetSize,
                       maxNumSubsets):
    """Split the sequences into subset files following a guide tree.

    The newick guide tree is loaded, every edge is annotated with a ``childs``
    count (1 for an edge with no child edges, otherwise the sum over its child
    edges), and the tree is handed to ``decomposeTree``. The leaf labels of
    each resulting tree become one subset, written as
    ``subset_<n>.txt`` (1-based) inside ``subsetsDir``.

    Returns:
        List of the subset file paths, in the order the trees were returned.
    """
    allSeqs = sequenceutils.readFromFasta(sequencesPath, removeDashes=False)
    tree = dendropy.Tree.get(path=guideTreePath,
                             schema="newick",
                             preserve_underscores=True)
    tree.collapse_basal_bifurcation()

    # Child edges must already carry ``childs`` when their parent is visited,
    # hence the postorder traversal.
    for edge in tree.postorder_edge_iter():
        below = edge.head_node.child_edges()
        edge.childs = sum(e.childs for e in below) if len(below) > 0 else 1
    tree.childs = tree.seed_node.edge.childs
    pieces = decomposeTree(tree, maxSubsetSize, maxNumSubsets)

    leafLabels = [[leaf.taxon.label for leaf in piece.leaf_nodes()]
                  for piece in pieces]

    written = []
    for number, labels in enumerate(leafLabels, 1):
        path = os.path.join(subsetsDir, "subset_{}.txt".format(number))
        written.append(path)
        sequenceutils.writeFasta(allSeqs, path, labels)
    return written
Пример #4
0
def hmmAlignQueries(hmmPath, queriesPath):
    """Chunk the query sequences and create one HMM alignment task per chunk.

    Queries are read with ``removeDashes=True``, split into chunks of at most
    1000 taxa and written to ``chunks_<base>/<base>_chunk_<n>.txt`` next to
    the query file; each chunk's aligned output is named
    ``<base>_chunk_<n>_aligned.txt`` in the same directory.

    Returns:
        List of whatever ``buildHmmAlignment`` returns, one per chunk.
    """
    querySeqs = sequenceutils.readFromFasta(queriesPath, removeDashes=True)
    stem = os.path.basename(queriesPath).split('.')[0]
    chunkDir = os.path.join(os.path.dirname(queriesPath),
                            "chunks_{}".format(stem))
    if not os.path.exists(chunkDir):
        os.makedirs(chunkDir)
    chunkSize = 1000

    names = list(querySeqs.keys())
    outputByInput = {}
    for chunkNo, start in enumerate(range(0, len(names), chunkSize), 1):
        members = names[start:start + chunkSize]
        chunkFile = os.path.join(chunkDir,
                                 "{}_chunk_{}.txt".format(stem, chunkNo))
        alignedFile = os.path.join(
            chunkDir, "{}_chunk_{}_aligned.txt".format(stem, chunkNo))
        sequenceutils.writeFasta(querySeqs, chunkFile, members)
        outputByInput[chunkFile] = alignedFile

    # Tasks are only created once every chunk file has been written.
    return [buildHmmAlignment(hmmPath, chunkFile, alignedFile)
            for chunkFile, alignedFile in outputByInput.items()]
Пример #5
0
def assignBackboneTaxa(context, numTaxa, unalignedFile):
    """Pick up to ``numTaxa`` random taxa from every subset as the backbone.

    Each list in ``context.subsets`` is shuffled in place, and its first
    ``numTaxa`` entries are looked up in ``context.unalignedSequences``.
    The selection is written to ``unalignedFile`` and returned.

    Returns:
        Dict mapping the chosen taxon names to their unaligned sequences.
    """
    chosen = {}
    for members in context.subsets:
        random.shuffle(members)  # in place: the caller's subset order changes
        chosen.update((name, context.unalignedSequences[name])
                      for name in members[:numTaxa])
    sequenceutils.writeFasta(chosen, unalignedFile)
    return chosen
Пример #6
0
def randomDecomposition(subsetsDir, sequences, numSubsets):
    """Randomly deal the taxa of ``sequences`` into ``numSubsets`` subset files.

    The taxon names are shuffled and then distributed round-robin (every
    ``numSubsets``-th name goes to the same subset). Subset ``n`` (1-based)
    is written to ``subset_<n>.txt`` inside ``subsetsDir``.

    Returns:
        List of the subset file paths.
    """
    shuffled = list(sequences.keys())
    random.shuffle(shuffled)

    groups = []
    for offset in range(numSubsets):
        groups.append(shuffled[offset::numSubsets])

    paths = []
    for number, members in enumerate(groups, 1):
        path = os.path.join(subsetsDir, "subset_{}.txt".format(number))
        paths.append(path)
        sequenceutils.writeFasta(sequences, path, members)
    return paths
Пример #7
0
def reassignTaxons(subsetsDir, subsetSeedPaths, sequences, unusedTaxa):
    """Assign the unused taxa to the seed subset whose HMM scores them best.

    Steps:
      1. Write the unused taxa to ``unassigned_sequences.txt``.
      2. Build one HMM per seed subset (in an ``hmm_<name>`` directory next to
         the seed file) and wait for those tasks.
      3. Score the unassigned sequences against every HMM; for each taxon keep
         the HMM with the highest ``scores[1]`` value.
      4. Combine each HMM's newly assigned taxa with the taxa of its seed
         subset and write them as ``subset_<n>.txt`` in ``subsetsDir``.

    Returns:
        List of the written subset file paths.
    """
    unassignedFile = os.path.join(subsetsDir, "unassigned_sequences.txt")
    sequenceutils.writeFasta(sequences, unassignedFile, unusedTaxa)

    seedToModel = {}
    for seedPath in subsetSeedPaths:
        dirLabel = "hmm_{}".format(os.path.basename(seedPath)).replace(
            ".", "_")
        modelDir = os.path.join(os.path.dirname(seedPath), dirLabel)
        if not os.path.exists(modelDir):
            os.makedirs(modelDir)
        seedToModel[seedPath] = os.path.join(modelDir, "hmm_model.txt")
    buildTasks = hmmutils.buildHmms(seedToModel)
    task.submitTasks(buildTasks)
    task.awaitTasks(buildTasks)
    modelPaths = [built.outputFile for built in buildTasks]

    scoreFileToModel = {}
    scoringTasks = hmmutils.buildHmmScores(modelPaths, unassignedFile,
                                           scoreFileToModel)
    task.submitTasks(scoringTasks)

    topScore = {}
    topModel = {}
    lowest = -float("inf")
    for finished in task.asCompleted(scoringTasks):
        results = hmmutils.readSearchFile(finished.outputFile)
        for name, scores in results.items():
            candidate = scores[1]
            if candidate > topScore.get(name, lowest):
                topScore[name] = candidate
                topModel[name] = scoreFileToModel[finished.outputFile]

    membersByModel = {model: [] for model in modelPaths}
    for name, model in topModel.items():
        membersByModel[model].append(name)
    # The seed taxa themselves stay with their own subset.
    for seedPath, model in seedToModel.items():
        membersByModel[model].extend(
            name for name in sequenceutils.readFromFasta(seedPath))

    writtenPaths = []
    for number, members in enumerate(membersByModel.values(), 1):
        outPath = os.path.join(subsetsDir, "subset_{}.txt".format(number))
        writtenPaths.append(outPath)
        sequenceutils.writeFasta(sequences, outPath, members)

    return writtenPaths
Пример #8
0
def buildInitialTreeAlign(tempDir, sequencesPath):
    """Build (or reuse) the initial alignment and the tree estimated from it.

    If both ``initial_tree.tre`` and ``initial_align.txt`` already exist in
    ``tempDir`` they are reused; otherwise ``tempDir`` is wiped and recreated,
    the initial alignment is built and FastTree is run on it.

    Args:
        tempDir: working directory for the initial tree/alignment files.
        sequencesPath: FASTA file with the full set of input sequences.

    Returns:
        Tuple ``(outputTreePath, outputAlignPath, unusedTaxa)``. The same
        3-tuple shape is returned on both the cached and the freshly built
        path (previously the cached path returned only two values, so callers
        unpacking three values crashed whenever the results were reused).
    """
    outputTreePath = os.path.join(tempDir, "initial_tree.tre")
    outputAlignPath = os.path.join(tempDir, "initial_align.txt")

    if os.path.exists(outputTreePath) and os.path.exists(outputAlignPath):
        # Reusing cached results: recover the unused taxa as those input
        # sequences that are absent from the cached alignment.
        # NOTE(review): assumes buildInitialAlignment's ``unusedTaxa`` is
        # exactly "input taxa not placed in the initial alignment" - confirm.
        cachedAlign = sequenceutils.readFromFasta(outputAlignPath,
                                                  removeDashes=False)
        allSequences = sequenceutils.readFromFasta(sequencesPath,
                                                   removeDashes=True)
        unusedTaxa = [
            taxon for taxon in allSequences if taxon not in cachedAlign
        ]
        return outputTreePath, outputAlignPath, unusedTaxa

    # Start from a clean directory so stale partial results are never mixed in.
    if os.path.exists(tempDir):
        shutil.rmtree(tempDir)
    os.makedirs(tempDir)

    initialAlign, unusedTaxa = decomposer.initial_tree.buildInitialAlignment(
        sequencesPath, tempDir, Configs.decompositionSkeletonSize, 1000)
    sequenceutils.writeFasta(initialAlign, outputAlignPath)
    external_tools.runFastTree(outputAlignPath, tempDir, outputTreePath).run()

    return outputTreePath, outputAlignPath, unusedTaxa
Пример #9
0
def requestHmmExtensionTasks(context, backbone, alignedFile):
    """Create HMM alignment tasks for every sequence outside the backbone.

    An ``hmm_<alignedFile basename>`` directory is created under the graph's
    working directory. The sequences of ``context.unalignedSequences`` whose
    taxon is not in ``backbone`` are written there as ``queries.txt``, an HMM
    is built over ``alignedFile`` (run synchronously), and the alignment tasks
    for the queries are returned without being submitted.

    Returns:
        The tasks produced by ``hmmutils.hmmAlignQueries``.
    """
    modelDir = os.path.join(
        context.graph.workingDir,
        "hmm_{}".format(os.path.basename(alignedFile)))
    queriesFile = os.path.join(modelDir, "queries.txt")
    modelFile = os.path.join(modelDir, "hmm_model.txt")
    if not os.path.exists(modelDir):
        os.makedirs(modelDir)

    pending = {
        name: context.unalignedSequences[name]
        for name in context.unalignedSequences
        if name not in backbone
    }

    sequenceutils.writeFasta(pending, queriesFile)
    hmmutils.buildHmmOverAlignment(alignedFile, modelFile).run()
    return hmmutils.hmmAlignQueries(modelFile, queriesFile)
Пример #10
0
def writeUnconstrainedAlignment(context):
    """Write the final alignment with one output column per graph cluster.

    Every taxon starts as a row of gaps, one per cluster. For each node in a
    cluster, the node's subalignment index selects a taxon (the first entry of
    ``context.subalignments[index]``), and that taxon's next unaligned letter
    is placed in the cluster's column. The result is written to
    ``context.outputFile``.
    """
    graph = context.graph
    rows = {}
    cursor = {}  # next unread letter of each taxon's unaligned sequence
    for name in context.unalignedSequences:
        rows[name] = sequenceutils.Sequence(name, ['-'] * len(graph.clusters))
        cursor[name] = 0

    for column, cluster in enumerate(graph.clusters):
        for node in cluster:
            subIdx, _ = graph.matSubPosMap[node]
            name = context.subalignments[subIdx][0]
            position = cursor[name]
            rows[name].seq[column] = \
                context.unalignedSequences[name].seq[position]
            cursor[name] = position + 1

    # Collapse the per-column character lists into plain strings.
    for record in rows.values():
        record.seq = "".join(record.seq)

    sequenceutils.writeFasta(rows, context.outputFile)
    Configs.log("Wrote final alignment to {}".format(context.outputFile))
Пример #11
0
def buildInitialAlignment(sequences, tempDir, skeletonSize, initialAlignSize,
                          outputAlignPath):
    """Build the initial alignment: a MAFFT skeleton plus HMM-aligned extras.

    A skeleton of taxa is chosen and aligned with MAFFT into
    ``outputAlignPath``. Up to ``initialAlignSize - skeletonSize`` further
    randomly chosen taxa are then aligned against an HMM built over the
    skeleton and appended to ``outputAlignPath`` (insertions excluded). When
    ``Configs.graphBuildMethod == "initial"`` the same HMM alignments are also
    merged, with insertions, into ``initial_insert_align.txt`` in ``tempDir``.

    Args:
        sequences: mapping of taxon name to sequence.
        tempDir: working directory for intermediate files.
        skeletonSize: number of skeleton taxa requested from
            ``decomposer.chooseSkeletonTaxa``.
        initialAlignSize: target total number of taxa in the initial
            alignment; ``None`` or a value above ``len(sequences)`` means
            all sequences.
        outputAlignPath: destination FASTA path of the initial alignment.
    """
    skeletonPath = os.path.join(tempDir, "skeleton_sequences.txt")
    queriesPath = os.path.join(tempDir, "queries.txt")
    hmmDir = os.path.join(tempDir, "skeleton_hmm")
    hmmPath = os.path.join(hmmDir, "hmm_model.txt")
    initialInsertPath = os.path.join(tempDir, "initial_insert_align.txt")
    os.makedirs(hmmDir, exist_ok=True)

    if initialAlignSize is None or initialAlignSize > len(sequences):
        initialAlignSize = len(sequences)

    skeletonTaxa, remainingTaxa = decomposer.chooseSkeletonTaxa(
        sequences, skeletonSize)
    # Clamp at zero: with skeletonSize > initialAlignSize a negative bound
    # would make the slice below keep "all but the last k" taxa instead of
    # none, silently aligning far more sequences than requested.
    additional = max(0, initialAlignSize - skeletonSize)
    random.shuffle(remainingTaxa)
    remainingTaxa = remainingTaxa[:additional]

    sequenceutils.writeFasta(sequences, skeletonPath, skeletonTaxa)
    external_tools.runMafft(skeletonPath, None, tempDir, outputAlignPath,
                            Configs.numCores).run()

    if len(remainingTaxa) > 0:
        sequenceutils.writeFasta(sequences, queriesPath, remainingTaxa)
        hmmutils.buildHmmOverAlignment(outputAlignPath, hmmPath).run()
        hmmTasks = hmmutils.hmmAlignQueries(hmmPath, queriesPath)
        task.submitTasks(hmmTasks)
        # Merge each chunk as soon as its alignment task completes.
        for hmmTask in task.asCompleted(hmmTasks):
            hmmutils.mergeHmmAlignments([hmmTask.outputFile],
                                        outputAlignPath,
                                        includeInsertions=False)
            if Configs.graphBuildMethod == "initial":
                hmmutils.mergeHmmAlignments([hmmTask.outputFile],
                                            initialInsertPath,
                                            includeInsertions=True)
Пример #12
0
def writeUnpackedAlignment(context):
    """Assemble the final alignment from per-subalignment induced alignments.

    For every subalignment path a columns file is written into the graph's
    working directory: its first line holds that subalignment's insertion
    positions and each following line holds the positions belonging to one
    cluster. A ``buildInducedSubalignment`` task is created per subalignment;
    as tasks complete, their induced alignments are appended to a
    ``temp_``-prefixed file next to ``context.outputFile`` and the
    intermediate files are removed. The temp file is finally moved to
    ``context.outputFile``.
    """
    graph = context.graph
    finalPath = context.outputFile

    stagingPath = os.path.join(os.path.dirname(finalPath),
                               "temp_{}".format(os.path.basename(finalPath)))
    if os.path.exists(stagingPath):
        os.remove(stagingPath)

    # One (initially empty) position list per cluster, for each subalignment.
    columnsBySub = {}
    for subPath in context.subalignmentPaths:
        columnsBySub[subPath] = [[] for _ in graph.clusters]
    for colIdx, cluster in enumerate(graph.clusters):
        for node in cluster:
            subIdx, pos = graph.matSubPosMap[node]
            columnsBySub[context.subalignmentPaths[subIdx]][colIdx].append(pos)

    insertsBySub = {subPath: [] for subPath in context.subalignmentPaths}
    for node in graph.insertions:
        subIdx, pos = graph.matSubPosMap[node]
        insertsBySub[context.subalignmentPaths[subIdx]].append(pos)

    Configs.log("Assembling final alignment in {}".format(finalPath))
    pendingTasks = []
    for subPath in context.subalignmentPaths:
        subName = os.path.basename(subPath)
        columnsFile = os.path.join(context.graph.workingDir,
                                   "alignment_columns_{}".format(subName))
        # Insertions first, then one line per cluster.
        lines = [insertsBySub[subPath]] + columnsBySub[subPath]
        with open(columnsFile, 'w') as out:
            for positions in lines:
                out.write(" ".join(str(p) for p in positions) + "\n")

        inducedPath = os.path.join(graph.workingDir,
                                   "induced_{}".format(subName))
        taskArgs = {
            "alignmentColumnsPath": columnsFile,
            "subalignmentPath": subPath,
            "outputFile": inducedPath
        }
        pendingTasks.append(
            task.Task(taskType="buildInducedSubalignment",
                      outputFile=inducedPath,
                      taskArgs=taskArgs))

    task.submitTasks(pendingTasks)
    for finished in task.asCompleted(pendingTasks):
        induced = sequenceutils.readFromFasta(finished.outputFile,
                                              removeDashes=False)
        firstRecord = next(iter(induced.values()))
        Configs.log(
            "Appending induced alignment, {} sequences of length {}..".format(
                len(induced), len(firstRecord.seq)))
        sequenceutils.writeFasta(induced, stagingPath, append=True)

        # Intermediate files are no longer needed once appended.
        os.remove(finished.taskArgs["alignmentColumnsPath"])
        os.remove(finished.outputFile)
    shutil.move(stagingPath, finalPath)
    Configs.log("Wrote final alignment to {}".format(finalPath))
Пример #13
0
def combineHmmAlignments(alignFiles, outputAlignmentPath, includeInsertions):
    """Merge several Stockholm alignments in memory and write one FASTA file.

    Files are read in order; a taxon appearing in more than one file keeps
    the entry from the last file that contains it.
    """
    merged = {}
    for stockholmPath in alignFiles:
        records = sequenceutils.readFromStockholm(stockholmPath,
                                                  includeInsertions)
        merged.update(records)
    sequenceutils.writeFasta(merged, outputAlignmentPath, None)
Пример #14
0
def mergeHmmAlignments(alignFiles, outputAlignmentPath, includeInsertions):
    """Write each Stockholm alignment, one file at a time, to a FASTA file.

    Each file is read and written out before the next one is read.
    """
    for stockholmPath in alignFiles:
        records = sequenceutils.readFromStockholm(stockholmPath,
                                                  includeInsertions)
        # NOTE(review): the trailing True is presumably writeFasta's append
        # flag (it is passed as ``append=True`` elsewhere) - confirm.
        sequenceutils.writeFasta(records, outputAlignmentPath, None, True)