Exemplo n.º 1
0
def estimateTree(seqFiles, tree, iterations, doSubTreeBranchEstimation, treeArgs):
    """Iteratively align the sequences and re-estimate the tree from the alignment.

    Each iteration aligns seqFiles with makeAlignment using the current tree,
    estimates a new tree with calculateSemphyTreeEstimate (from the gapless
    columns only when there are more than treeArgs.COLUMN_MIN_GAPLESS_NO of
    them, otherwise from the whole alignment), re-derives seqFiles for the new
    tree with getSubtreeSeqs and relabels the tree. The loop stops early as
    soon as the tree/sequence-file string equals one produced earlier
    (including the starting tree).

    Arguments:
    seqFiles -- sequence files to align
    tree -- starting tree
    iterations -- maximum number of iterations; must be at least 1, otherwise
        no alignment is made and the final return fails because
        outputAlignment was never assigned
    doSubTreeBranchEstimation -- when true, each subtree returned by
        getSubtrees is re-estimated on its own (one iteration, no recursion
        into further subtrees) and the mean of the resulting rate corrections
        is applied to the tree with adjustTreeRates
    treeArgs -- options object; COLUMN_MIN_GAPLESS_NO and
        BRANCH_LENGTH_ESTIMATION_SUBTREE_DISTANCE are read here

    Returns the tuple (tree, seqFiles, outputAlignment). outputAlignment is
    the temp file holding the alignment of the last iteration run; it is not
    removed here, so the caller is responsible for it.
    """
    seqNo = len(seqFiles)
    #tree/sequence-file strings produced so far, used to detect a repeat
    treeStrings = [ printBinaryTree(tree, False) + " " + " ".join(seqFiles) ]
    for iteration in xrange(0, iterations):
        outputAlignment = getTempFile()
        makeAlignment(seqFiles, tree, outputAlignment, treeArgs)
        gaplessColumnNo, totalColumnNo = countGaplessColumns(outputAlignment)
        logger.info("Total number of gapless columns: %s " % gaplessColumnNo)
        if gaplessColumnNo > treeArgs.COLUMN_MIN_GAPLESS_NO: #number of gapless columns exceeds minimum required to do tree estimation
            gaplessOutputAlignment = getGaplessAlignment(outputAlignment, seqNo)
            tree = calculateSemphyTreeEstimate(gaplessOutputAlignment, treeArgs, seqNo)
            os.remove(gaplessOutputAlignment)
        elif totalColumnNo > 0:
            logger.info("Warning, insufficient columns to estimate tree using only gapless columns")
            tree = calculateSemphyTreeEstimate(outputAlignment, treeArgs, seqNo)
        else:
            #nothing to estimate from, so the current tree is kept unchanged
            logger.info("Warning, no alignment from which to estimate tree!!")
        logger.info("Found tree topology : %s " % printBinaryTree(tree, True))
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        labelTree(tree, strCounter([-1]))
        treeString = printBinaryTree(tree, False) + " " + " ".join(seqFiles)
        logger.info("On iteration : %i , found tree and seq files (ordered) : %s " % (iteration, treeString))
        if treeString in treeStrings:
            logger.info("Topology of tree is equal to one previously seen, so exiting")
            break
        #record this result; without this only the starting tree was ever compared against
        treeStrings.append(treeString)
        #the alignment of the last iteration is kept because it is returned
        if iteration+1 < iterations:
            os.remove(outputAlignment)
    #now scale by global estimates of branch length
    if doSubTreeBranchEstimation:
        subTrees = getSubtrees(tree, treeArgs.BRANCH_LENGTH_ESTIMATION_SUBTREE_DISTANCE)
        if len(subTrees) > 0:
            rateCorrections = []
            for subTree in subTrees:
                subTree2, seqFiles2, outputAlignment2 = estimateTree(getSubtreeSeqs(seqFiles, subTree), subTree, 1, False, treeArgs)
                os.remove(outputAlignment2)
                rateCorrections.append(calculateRateCorrection(subTree, subTree2))
            for subTree, subTreeRateCorrection in zip(subTrees, rateCorrections):
                logger.info("Rate correction for subtree: %s %s , is calculated as : %f ", \
                            printBinaryTree(subTree, True), \
                            " ".join(getSubtreeSeqs(seqFiles, subTree)), subTreeRateCorrection)
            rateCorrection = sum(rateCorrections)/len(rateCorrections)
            logger.info("Average rate correction is calculated as : %f ", rateCorrection)
            adjustTreeRates(tree, rateCorrection)
        else:
            logger.info("No suitable branches found for rate re-estimation")
    return tree, seqFiles, outputAlignment
Exemplo n.º 2
0
def makeAlignment(binaryTree, seqFiles, alignmentFile, outputScoreFile,
                  alignerArgs):
    """Run stitchAlignAndReconstruct for the given tree.

    The sequence files handed on are those selected by getChildSeqs for
    binaryTree, together with their count and the tree printed by
    printBinaryTree(binaryTree, True, False). The result of
    stitchAlignAndReconstruct is returned unchanged.
    """
    childSeqFiles = getChildSeqs(binaryTree, seqFiles)
    childSeqNo = len(childSeqFiles)
    treeString = printBinaryTree(binaryTree, True, False)
    return stitchAlignAndReconstruct(childSeqNo, childSeqFiles, treeString,
                                     alignmentFile, outputScoreFile,
                                     alignerArgs)
Exemplo n.º 3
0
def estimateTreeAlign(seqFiles, outputTreeFile, treeArgs):
    """Estimate a tree for the sequence files and write it to outputTreeFile.

    Starts from the star tree built by makeStarTree and refines it with
    estimateTree (treeArgs.ITERATION_NUMBER iterations, subtree branch length
    estimation controlled by treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION).
    When treeArgs.SPECIES_TREE_STRING is set, the tree is additionally rooted
    with calculateProbableRootOfGeneTree against the parsed species tree and
    the sequence files are re-derived for the rooted tree.

    Arguments:
    seqFiles -- sequence files; the input order is what treeArgs.LEAF_SPECIES
        is indexed by
    outputTreeFile -- path written with two lines: the tree, then the
        space-separated sequence files
    treeArgs -- options object

    Returns the tuple (tree, seqFiles, outputAlignment) where outputAlignment
    is the alignment file returned by estimateTree.
    """
    origSeqFileOrder = seqFiles[:]
    tree = makeStarTree(len(seqFiles), 0, treeArgs.DEFAULT_DISTANCE)
    binaryTree_depthFirstNumbers(tree)
    labelTree(tree, strCounter([-1]))
    tree, seqFiles, outputAlignment = estimateTree(seqFiles, tree, treeArgs.ITERATION_NUMBER, \
                                                    treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, treeArgs)
    seqFiles = list(seqFiles)
    if treeArgs.SPECIES_TREE_STRING is not None:
        logger.info("Predicting root of tree using species tree")
        speciesTree = newickTreeParser(treeArgs.SPECIES_TREE_STRING)
        binaryTree_depthFirstNumbers(speciesTree)
        logger.info("Parsed species tree: %s" %
                    printBinaryTree(speciesTree, True))
        i = [-1]  #one-element list so the closure below can update the counter

        def fn():
            #label generator: "<species>_<position>", the species being looked
            #up through the file's position in the original input order
            i[0] += 1
            j = origSeqFileOrder.index(seqFiles[i[0]])
            return "%s_%s" % (treeArgs.LEAF_SPECIES[j], str(i[0]))

        labelTree(tree, fn)
        #NOTE(review): processID keeps only the text before the first
        #underscore - confirm this is what the species tree labels expect
        tree, dupCount, lossCount = calculateProbableRootOfGeneTree(
            speciesTree, tree, processID=lambda x: x.split("_")[0])

        def fn2(tree):
            #restore the plain position as the leaf iD
            if tree.internal:
                fn2(tree.left)
                fn2(tree.right)
            else:
                #the position is the last "_" field; splitting from the right
                #keeps this correct for species names containing underscores
                tree.iD = tree.iD.rsplit('_', 1)[1]

        fn2(tree)
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        logger.info("Reconciled tree with root : %s %s " %
                    (printBinaryTree(tree, True), " ".join(seqFiles)))
        logger.info("Number of dups needed for reconcilliations : %s " %
                    dupCount)
        logger.info("Number of losses needed for reconcilliations : %s " %
                    lossCount)
    seqFiles = list(seqFiles)
    out = open(outputTreeFile, 'w')
    try:
        out.write("%s\n" % printBinaryTree(tree, True))
        out.write("%s\n" % " ".join(seqFiles))
    finally:
        #close the handle even if printing or writing fails
        out.close()
    logger.info("Finished estimate tree")
    return tree, seqFiles, outputAlignment
Exemplo n.º 4
0
def main():
    # Command-line entry point. Builds the default argument object, parses the
    # modifier arguments from sys.argv into it, obtains a binary tree (parsed
    # from NEWICK_TREE_STRING, or estimated from the sequence files) and, when
    # MAKE_FINAL_ALIGNMENT is set, runs nestAlign to write the final alignment.
    sys.stderr.write("Arguments received : %s \n" % "_".join(sys.argv))
    startTime = time.time()
    # Populate one argument object with the defaults of every component.
    alignerArgs = getDefaultArgs()
    addDefaultArgs(alignerArgs)
    addDefaultStitcherArgs(alignerArgs)
    addDefaultNesterArgs(alignerArgs)
    addDefaultEstimateTreeArgs(alignerArgs)
    i = loggerIndices
    removeReservedIndices(i, alignerArgs)
    # With fewer than two arguments after the script name, print the usage text
    # (including the help output of the external tools) and exit with status 0.
    if len(sys.argv) < 3:
        print "Ortheus.py [MODIFIER_ARGUMENTS]"
        print "Version: ", VERSION_NO
        print "A top level script for running Ortheus and Pecan to produce substitution and indel aware reconstructed chunks of genome"
        print "If you would like to contribute to this program's development please contact me at bjp (AT) ebi (DOT) ac (DOT) uk "
        print "Arguments:"
        i = printFirstMods(alignerArgs, i)
        i = printMods(alignerArgs, i)
        i = printModsStitcher(alignerArgs, i)
        i = printModsNester(alignerArgs, i)
        i = printEstimateTreeMods(alignerArgs, i)
        print "-------------Ortheus help string as follows (Changing these arguments may break the script)-------------"
        os.system("ortheus_core")
        print "-------------End Ortheus help string-------------"
        print "-------------Pecan help string as follows (Changing these arguments may break the script)-------------"
        os.system("%s bp.pecan.Pecan -help" % (alignerArgs.JAVA_PREFIX,))
        print "-------------End Pecan help string-------------"
        sys.exit(0)
        
    mods = sys.argv[1:]
    l = []
    # The parsers are chained: each receives the index returned by the previous
    # one, and `mods` and `l` swap positions from call to call.
    # NOTE(review): presumably each parser moves the arguments it does not
    # recognise from its first list into its last one - confirm against the
    # parse* helpers before reordering anything here.
    i = parseFirstMods(mods, alignerArgs, i, l)
    i = parseMods(l, alignerArgs, i, mods)
    i = parseModsStitcher(mods, alignerArgs, i, l)
    i = parseModsNester(l, alignerArgs, i, mods)
    i = parseEstimateTreeMods(mods, alignerArgs, i, l)
    # Anything still left in `l` is reported and aborts the run.
    # NOTE(review): `assert` is skipped when Python runs with -O, in which case
    # leftover arguments would be ignored silently.
    if len(l) != 0:
        logger.info("Ooops, remaining arguments %s ", " ".join(l))
        assert False  
    logger.info("Arguments received : %s " % " ".join(sys.argv))
    logger.info("Sequence files : %s " % " ".join(alignerArgs.SEQUENCE_FILES))
    if alignerArgs.EMPIRICALLY_ESTIMATE_CHARACTER_FREQUENCIES:
        alignerArgs.EXPECTED_CHARACTER_FREQUENCIES = empiricallyEstimateNucleotideFrequencies(alignerArgs.SEQUENCE_FILES)
        logger.info("Empirically estimated character frequencies : %s " % " ".join([ str(i) for i in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES ]))
    # Remove any existing score file; a missing file is not an error.
    try:
        os.remove(alignerArgs.OUTPUT_SCORE_FILE)
    except OSError:
        pass
    if alignerArgs.NEWICK_TREE_STRING != None:
        # A tree was supplied, so parse it instead of estimating one.
        binaryTree = newickTreeParser(alignerArgs.NEWICK_TREE_STRING)  
        logger.info("Newick tree read : %s " % printBinaryTree(binaryTree, True))
    else:
        # No tree supplied: estimate it, discard the alignment produced while
        # estimating, and adopt the sequence file list returned with the tree.
        binaryTree, seqFiles, outputAlignment = estimateTreeAlign(alignerArgs.SEQUENCE_FILES, alignerArgs.OUTPUT_TREE_FILE, alignerArgs)
        os.remove(outputAlignment) #for now, this should be
        alignerArgs.SEQUENCE_FILES = seqFiles
    if alignerArgs.MAKE_FINAL_ALIGNMENT:
        nestAlign(binaryTree, alignerArgs.SEQUENCE_FILES, alignerArgs.OUTPUT_FILE, alignerArgs.OUTPUT_SCORE_FILE, alignerArgs)        
    #logger.info("Finished, total time taken : %s (seconds)" % (time.time()-startTime))
    print "total_time %s " % (time.time()-startTime)
Exemplo n.º 5
0
def makeAlignment(seqFiles, tree, outputAlignmentFile, alignerArgs):
    """Align seqFiles into outputAlignmentFile, picking the aligner by input size.

    Fewer than 30 sequence files are aligned directly with makePecanAlignment.
    Otherwise nestAlign is run into temp files and splitOutAncestors writes
    outputAlignmentFile from its alignment; both temp files are then removed.
    Returns None.
    """
    if len(seqFiles) < 30:
        treeString = printBinaryTree(tree, True)
        makePecanAlignment(seqFiles, treeString, outputAlignmentFile, alignerArgs)
        return
    nestedAlignmentFile = getTempFile()
    nestedScoreFile = getTempFile()
    nestAlign(tree, seqFiles, nestedAlignmentFile, nestedScoreFile, alignerArgs)
    splitOutAncestors(nestedAlignmentFile, outputAlignmentFile)
    #the nested alignment and its scores were only intermediates
    for tempFile in (nestedAlignmentFile, nestedScoreFile):
        os.remove(tempFile)
Exemplo n.º 6
0
def makeAlignment(seqFiles, tree, outputAlignmentFile, alignerArgs):
    """Write an alignment of seqFiles to outputAlignmentFile.

    With fewer than 30 sequence files makePecanAlignment is called directly.
    With 30 or more, nestAlign produces an alignment in a temp file,
    splitOutAncestors derives outputAlignmentFile from it, and the temp
    alignment and score files are removed afterwards. Returns None.
    """
    useNester = not len(seqFiles) < 30
    if useNester:
        tempAlignment = getTempFile()
        tempScores = getTempFile()
        nestAlign(tree, seqFiles, tempAlignment, tempScores, alignerArgs)
        splitOutAncestors(tempAlignment, outputAlignmentFile)
        #clean up the intermediates of the nested run
        os.remove(tempAlignment)
        os.remove(tempScores)
    else:
        guideTree = printBinaryTree(tree, True)
        makePecanAlignment(seqFiles, guideTree, outputAlignmentFile,
                           alignerArgs)
Exemplo n.º 7
0
def estimateTreeAlign(seqFiles, outputTreeFile, treeArgs):
    """Estimate a tree for the sequence files and write it to outputTreeFile.

    Starts from the star tree built by makeStarTree and refines it with
    estimateTree (treeArgs.ITERATION_NUMBER iterations, subtree branch length
    estimation controlled by treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION).
    When treeArgs.SPECIES_TREE_STRING is set, the tree is additionally rooted
    with calculateProbableRootOfGeneTree against the parsed species tree and
    the sequence files are re-derived for the rooted tree.

    Arguments:
    seqFiles -- sequence files; the input order is what treeArgs.LEAF_SPECIES
        is indexed by
    outputTreeFile -- path written with two lines: the tree, then the
        space-separated sequence files
    treeArgs -- options object

    Returns the tuple (tree, seqFiles, outputAlignment) where outputAlignment
    is the alignment file returned by estimateTree.
    """
    origSeqFileOrder = seqFiles[:]
    tree = makeStarTree(len(seqFiles), 0, treeArgs.DEFAULT_DISTANCE)
    binaryTree_depthFirstNumbers(tree)
    labelTree(tree, strCounter([-1]))
    tree, seqFiles, outputAlignment = estimateTree(seqFiles, tree, treeArgs.ITERATION_NUMBER, \
                                                    treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, treeArgs)
    seqFiles = list(seqFiles)
    if treeArgs.SPECIES_TREE_STRING is not None:
        logger.info("Predicting root of tree using species tree")
        speciesTree = newickTreeParser(treeArgs.SPECIES_TREE_STRING)
        binaryTree_depthFirstNumbers(speciesTree)
        logger.info("Parsed species tree: %s" % printBinaryTree(speciesTree, True))
        i = [-1] #one-element list so the closure below can update the counter
        def fn():
            #label generator: "<species>_<position>", the species being looked
            #up through the file's position in the original input order
            i[0] += 1
            j = origSeqFileOrder.index(seqFiles[i[0]])
            return "%s_%s" % (treeArgs.LEAF_SPECIES[j], str(i[0]))
        labelTree(tree, fn)
        #NOTE(review): processID keeps only the text before the first
        #underscore - confirm this is what the species tree labels expect
        tree, dupCount, lossCount = calculateProbableRootOfGeneTree(speciesTree, tree, processID=lambda x : x.split("_")[0])
        def fn2(tree):
            #restore the plain position as the leaf iD
            if tree.internal:
                fn2(tree.left)
                fn2(tree.right)
            else:
                #the position is the last "_" field; splitting from the right
                #keeps this correct for species names containing underscores
                tree.iD = tree.iD.rsplit('_', 1)[1]
        fn2(tree)
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        logger.info("Reconciled tree with root : %s %s " % (printBinaryTree(tree, True), " ".join(seqFiles)))
        logger.info("Number of dups needed for reconcilliations : %s " % dupCount)
        logger.info("Number of losses needed for reconcilliations : %s " % lossCount)
    seqFiles = list(seqFiles)
    out = open(outputTreeFile, 'w')
    try:
        out.write("%s\n" % printBinaryTree(tree, True))
        out.write("%s\n" % " ".join(seqFiles))
    finally:
        #close the handle even if printing or writing fails
        out.close()
    logger.info("Finished estimate tree")
    return tree, seqFiles, outputAlignment
Exemplo n.º 8
0
def makeAlignment(binaryTree, seqFiles, alignmentFile, outputScoreFile, alignerArgs):
    """Run stitchAlignAndReconstruct on the sequence files chosen for binaryTree.

    getChildSeqs picks the sequence files for the tree; they are passed on with
    their count and the tree string from printBinaryTree(binaryTree, True,
    False). Whatever stitchAlignAndReconstruct returns is returned.
    """
    selectedSeqFiles = getChildSeqs(binaryTree, seqFiles)
    selectedSeqNo = len(selectedSeqFiles)
    newickString = printBinaryTree(binaryTree, True, False)
    result = stitchAlignAndReconstruct(selectedSeqNo, selectedSeqFiles, newickString,
                                       alignmentFile, outputScoreFile, alignerArgs)
    return result
Exemplo n.º 9
0
def nestAlign(binaryTree, leafSeqFiles, outputFile, outputScoreFile, alignerArgs):
    """Align a tree through a series of nested sub-tree alignments.

    The sub-trees returned by calculatePath are aligned one after another with
    makeAlignment. For every internal child of an aligned sub-tree, the child's
    part of the alignment and the child's own sequence are extracted into temp
    files and stored by node index; the sequence files list is passed to the
    later makeAlignment calls. Finally the whole tree is aligned into
    outputFile and mergeTogetherAllAlignments is run over the stored
    alignments.

    Arguments:
    binaryTree -- tree to align; child iDs are overwritten with node labels
        during the run and removeInternalIDs is applied before and after
    leafSeqFiles -- leaf sequence files, stored at the even node indices
    outputFile -- file receiving the alignment of the whole tree
    outputScoreFile -- passed through to every makeAlignment call
    alignerArgs -- options object; MAX_NODE_NO is read here

    Returns None.
    """
    logger.info("Starting Nester")
    maxNodeNo = alignerArgs.MAX_NODE_NO
    
    removeInternalIDs(binaryTree)
    
    logger.info("Binary tree : %s " % printBinaryTree(binaryTree, True, False))
    binaryTree_depthFirstNumbers(binaryTree)
    #midEnd of the root is used as the total number of nodes
    nodeNo = binaryTree.traversalID.midEnd
    logger.info("Labelled tree with numbers ")
    
    seqNo = len(leafSeqFiles)
    logger.info(" Sequence files : %s" % " ".join(leafSeqFiles))
    #assert seqNo*2 - 1 == nodeNo
    
    logger.info("Output file %s " % outputFile)
    
    labels = binaryTree_nodeNames(binaryTree)
    costs = calculateTreeNodeCosts(binaryTree)
    logger.info("Calculated node costs")
    for node in xrange(0, nodeNo):
        logger.info("Node : %s , reconstruction value : %f , %f" % (labels[node], costs[node], 1.0 - costs[node]))
    #treePath is the ordered list of sub-trees to align before the full tree
    pathCost, treePath = calculatePath(binaryTree, costs, maxNodeNo)
    logger.info(" Calculated nested path. Cost : %f , Path : %s" % (pathCost, " ".join([ labels[i.traversalID.mid] for i in treePath ])))
    assert len(leafSeqFiles) == seqNo
    #both lists are indexed by a node's traversalID.mid
    alignmentFiles = [None] * nodeNo
    seqFiles = [None] * nodeNo
    #leaves take the even indices; odd indices are filled in below for internal nodes
    for i in xrange(0, seqNo):
        seqFiles[i*2] = leafSeqFiles[i]
    logger.debug("About to start main nested loop")
    for subTree in treePath:
        assert subTree != binaryTree
        logger.info("Chosen sub tree to align : %s " % printBinaryTree(subTree, True, False))
        alignmentFile = getTempFile()
        startTime = time.time()
        makeAlignment(subTree, seqFiles, alignmentFile, outputScoreFile, alignerArgs)
        logger.info("Made alignment of subtree, time taken : %s (seconds)" % (time.time()-startTime))
        #get the two ancestors
        #traversal IDs local to this sub-tree, returned as a lookup keyed by
        #node rather than written onto the nodes (labelTree=False)
        subTreeTraversalIDs = binaryTree_depthFirstNumbers(subTree, labelTree=False, dontStopAtID=False)
        
        if subTree.left.internal:
            #NOTE(review): the two numeric arguments of extractSubAlignment are
            #presumably a [start, end) range of alignment rows - confirm
            offset = subTreeTraversalIDs[subTree].midStart
            childXAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, 0, subTreeTraversalIDs[subTree].mid-offset, childXAlignmentFile)
            alignmentFiles[subTree.left.traversalID.mid] = childXAlignmentFile
            logger.info("Extracted alignment of left child : %s " % printBinaryTree(subTree.left, True, False))
            
            #the left child starts where its parent starts, so the same offset applies
            assert offset == subTreeTraversalIDs[subTree.left].midStart
            childXSeqFile = getTempFile()
            #a range of width one at the child's own position
            extractSubAlignment(childXAlignmentFile, subTreeTraversalIDs[subTree.left].mid - offset, subTreeTraversalIDs[subTree.left].mid - offset + 1, childXSeqFile)
            seqFiles[subTree.left.traversalID.mid] = childXSeqFile
            logger.info("Extracted sequence of left child : %s " % printBinaryTree(subTree.left, True, False))
        
        if subTree.right.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childYAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, subTreeTraversalIDs[subTree].mid + 1 - offset, subTreeTraversalIDs[subTree].midEnd - offset, childYAlignmentFile)
            alignmentFiles[subTree.right.traversalID.mid] = childYAlignmentFile  
            logger.info("Extracted alignment of right child : %s " % printBinaryTree(subTree.right, True, False))
            
            #positions in the right child's alignment are relative to the child's own start
            offset = subTreeTraversalIDs[subTree.right].midStart
            childYSeqFile = getTempFile()
            extractSubAlignment(childYAlignmentFile, subTreeTraversalIDs[subTree.right].mid - offset, subTreeTraversalIDs[subTree.right].mid - offset + 1, childYSeqFile)
            seqFiles[subTree.right.traversalID.mid] = childYSeqFile  
            logger.info("Extracted sequence of right child : %s " % printBinaryTree(subTree.right, True, False))
        
        subTree.left.iD = labels[subTree.left.traversalID.mid] #labels tree, so we only print relevant bits
        subTree.right.iD = labels[subTree.right.traversalID.mid]
        os.remove(alignmentFile)
        logger.info("Finished loop and reduced tree to : %s " % printBinaryTree(subTree, True, False))
    startTime = time.time()
    #final alignment of the whole tree, written straight to the output file
    makeAlignment(binaryTree, seqFiles, outputFile, outputScoreFile, alignerArgs)
    logger.info("Finished final nested alignment, time taken : %s (seconds)" % (time.time()-startTime))
    alignmentFiles[binaryTree.traversalID.mid] = outputFile
    mergeTogetherAllAlignments(binaryTree, alignmentFiles, labels, [0])
    logger.info("Merged together all alignments")
    #odd indices hold the temp sequence files extracted above; the leaf files
    #at even indices belong to the caller and are left alone
    for i in xrange(1, nodeNo, 2):
        if seqFiles[i] != None:
            os.remove(seqFiles[i])
    removeInternalIDs(binaryTree)
    logger.info("Have cleaned up, and am returning")
Exemplo n.º 10
0
def estimateTree(seqFiles, tree, iterations, doSubTreeBranchEstimation,
                 treeArgs):
    """Iteratively align the sequences and re-estimate the tree from the alignment.

    Each iteration aligns seqFiles with makeAlignment using the current tree,
    estimates a new tree with calculateSemphyTreeEstimate (from the gapless
    columns only when there are more than treeArgs.COLUMN_MIN_GAPLESS_NO of
    them, otherwise from the whole alignment), re-derives seqFiles for the new
    tree with getSubtreeSeqs and relabels the tree. The loop stops early as
    soon as the tree/sequence-file string equals one produced earlier
    (including the starting tree).

    Arguments:
    seqFiles -- sequence files to align
    tree -- starting tree
    iterations -- maximum number of iterations; must be at least 1, otherwise
        no alignment is made and the final return fails because
        outputAlignment was never assigned
    doSubTreeBranchEstimation -- when true, each subtree returned by
        getSubtrees is re-estimated on its own (one iteration, no recursion
        into further subtrees) and the mean of the resulting rate corrections
        is applied to the tree with adjustTreeRates
    treeArgs -- options object; COLUMN_MIN_GAPLESS_NO and
        BRANCH_LENGTH_ESTIMATION_SUBTREE_DISTANCE are read here

    Returns the tuple (tree, seqFiles, outputAlignment). outputAlignment is
    the temp file holding the alignment of the last iteration run; it is not
    removed here, so the caller is responsible for it.
    """
    seqNo = len(seqFiles)
    #tree/sequence-file strings produced so far, used to detect a repeat
    treeStrings = [printBinaryTree(tree, False) + " " + " ".join(seqFiles)]
    for iteration in xrange(0, iterations):
        outputAlignment = getTempFile()
        makeAlignment(seqFiles, tree, outputAlignment, treeArgs)
        gaplessColumnNo, totalColumnNo = countGaplessColumns(outputAlignment)
        logger.info("Total number of gapless columns: %s " % gaplessColumnNo)
        if gaplessColumnNo > treeArgs.COLUMN_MIN_GAPLESS_NO:  #number of gapless columns exceeds minimum required to do tree estimation
            gaplessOutputAlignment = getGaplessAlignment(
                outputAlignment, seqNo)
            tree = calculateSemphyTreeEstimate(gaplessOutputAlignment,
                                               treeArgs, seqNo)
            os.remove(gaplessOutputAlignment)
        elif totalColumnNo > 0:
            logger.info(
                "Warning, insufficient columns to estimate tree using only gapless columns"
            )
            tree = calculateSemphyTreeEstimate(outputAlignment, treeArgs,
                                               seqNo)
        else:
            #nothing to estimate from, so the current tree is kept unchanged
            logger.info("Warning, no alignment from which to estimate tree!!")
        logger.info("Found tree topology : %s " % printBinaryTree(tree, True))
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        labelTree(tree, strCounter([-1]))
        treeString = printBinaryTree(tree, False) + " " + " ".join(seqFiles)
        logger.info(
            "On iteration : %i , found tree and seq files (ordered) : %s " %
            (iteration, treeString))
        if treeString in treeStrings:
            logger.info(
                "Topology of tree is equal to one previously seen, so exiting")
            break
        #record this result; without this only the starting tree was ever compared against
        treeStrings.append(treeString)
        #the alignment of the last iteration is kept because it is returned
        if iteration + 1 < iterations:
            os.remove(outputAlignment)
    #now scale by global estimates of branch length
    if doSubTreeBranchEstimation:
        subTrees = getSubtrees(
            tree, treeArgs.BRANCH_LENGTH_ESTIMATION_SUBTREE_DISTANCE)
        if len(subTrees) > 0:
            rateCorrections = []
            for subTree in subTrees:
                subTree2, seqFiles2, outputAlignment2 = estimateTree(
                    getSubtreeSeqs(seqFiles, subTree), subTree, 1, False,
                    treeArgs)
                os.remove(outputAlignment2)
                rateCorrections.append(
                    calculateRateCorrection(subTree, subTree2))
            for subTree, subTreeRateCorrection in zip(subTrees,
                                                      rateCorrections):
                logger.info("Rate correction for subtree: %s %s , is calculated as : %f ", \
                            printBinaryTree(subTree, True), \
                            " ".join(getSubtreeSeqs(seqFiles, subTree)), subTreeRateCorrection)
            rateCorrection = sum(rateCorrections) / len(rateCorrections)
            logger.info("Average rate correction is calculated as : %f ",
                        rateCorrection)
            adjustTreeRates(tree, rateCorrection)
        else:
            logger.info("No suitable branches found for rate re-estimation")
    return tree, seqFiles, outputAlignment
Exemplo n.º 11
0
def stitchReconstruct(seqNo, inputSeqFiles, treeString, outputFile, outputScoreFile, inputAlignmentFile, alignerArgs):
    """Run the external reconstruction command over an alignment, chunk by chunk.

    The alignment in inputAlignmentFile is consumed in chunks obtained from
    getNextAlignmentChunk. For each chunk the command starting with
    alignerArgs.RECONSTRUCTION_PREFIX is executed through os.system; its score
    file is appended to outputScoreFile and its output alignment is appended to
    one temp file per tree node. After the last chunk the per-node files are
    concatenated into outputFile and all temp files are removed.

    Arguments:
    seqNo -- number of leaf sequences; the tree must have seqNo * 2 - 1 nodes
    inputSeqFiles -- not referenced in this function
    treeString -- newick tree, parsed here and also passed to the command
    outputFile -- file receiving the complete reconstructed alignment
    outputScoreFile -- file that each chunk's score is appended to
    inputAlignmentFile -- multi-fasta alignment to process
    alignerArgs -- options object (RECONSTRUCTION_PREFIX, FAST_SETTING,
        RECONSTRUCTION_ARGS[_FAST], CAUTIOUS_ARGS,
        ALIGNMENT_CHUNK_MAX_COLUMN_SIZE, VITERBI_ALIGNMENT_COLUMN_GAP,
        EXPECTED_CHARACTER_FREQUENCIES)

    Returns None. Calls sys.exit(1) when the command fails for a chunk both
    with the normal and with the cautious arguments.
    """
    startTime = time.time() #epoch time in seconds
    
    logger.info("Starting Stitcher")
    reconstructionPrefix = alignerArgs.RECONSTRUCTION_PREFIX
    if alignerArgs.FAST_SETTING:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS_FAST
    else:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS
    cautiousArgs = alignerArgs.CAUTIOUS_ARGS
    alignmentChunkMaxSeqSize = alignerArgs.ALIGNMENT_CHUNK_MAX_COLUMN_SIZE
    viterbiAlignmentColumnGap = alignerArgs.VITERBI_ALIGNMENT_COLUMN_GAP
    #parse tree 
    binaryTree = newickTreeParser(treeString)
    binaryTree_depthFirstNumbers(binaryTree)
    logger.info("Newick tree read : %s " % printBinaryTree(binaryTree, True))
    labels = binaryTree_nodeNames(binaryTree)
    #labels at even positions are taken as the leaf labels
    leafLabels = [ labels[i] for i in xrange(0, len(labels)) if (i%2) == 0]
    #load alignment iterator
    alignmentReader = multiFastaRead(inputAlignmentFile, lambda x : x)
    #number of sequences, including ancestors
    nodeNumber = binaryTree.traversalID.midEnd
    assert nodeNumber == seqNo * 2 - 1
    #create output files
    outputFiles, outputIterators = getOpenSeqFiles(nodeNumber, getTempFile)
    #while has chunk
    previousAlignment = []
    alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(previousAlignment, alignmentReader, alignmentChunkMaxSeqSize, seqNo, leafLabels)
    #one tree-states file is shared by all chunks (passed with -u, and with -t from the second chunk on)
    tempTreeStatesFile = getTempFile()
    loopOptions = " "  
    logger.info("Starting main loop")
    characterFrequenciesString = " ".join([ str(i) for i in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES ])
    while alignmentSeqs != None:
        #for the chunk flagged as the end, the column gap passed with -s is set to 0
        if(end):
            viterbiAlignmentColumnGap = 0
        tempAncestorFile = getTempFile()
        tempScoreFile = getTempFile()
        #NOTE(review): the tree string and file names are interpolated into a
        #shell command line; only the tree string is quoted
        command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -n %s -x %s " % (reconstructionPrefix, treeString, alignmentFile, \
                                                                       " ".join(alignmentSeqs), tempTreeStatesFile, \
                                                                       viterbiAlignmentColumnGap, loopOptions, reconstructionArgs, tempAncestorFile, characterFrequenciesString, tempScoreFile)
        logger.info("Calling Ortheus with : %s", command)
        exitValue = os.system(command)
        if exitValue != 0:
            logger.info("Something went wrong calling Ortheus : %i ", exitValue)
            #if exitValue != 73:
            #    logger.info("Unrecognised issue, so am exiting to be cautious")
            #    sys.exit(1)
            logger.info("Going to retry with caution settings")
            #same command with cautiousArgs instead of reconstructionArgs; the
            #retry does not pass the -n character frequencies
            command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -x %s" % (reconstructionPrefix, treeString, alignmentFile, \
                                                                       " ".join(alignmentSeqs), tempTreeStatesFile, \
                                                                       viterbiAlignmentColumnGap, loopOptions, cautiousArgs, tempAncestorFile, tempScoreFile)
            logger.info("Calling Ortheus with : %s", command)
            if os.system(command):
                logger.info("Already tried caution, so have to go")
                sys.exit(1)
        logger.info("Completed reconstruction of chunk")
        appendScore(tempScoreFile, outputScoreFile)
        os.remove(tempScoreFile)
        #from the second chunk on, the tree-states file is also passed with -t
        loopOptions = " -t " + tempTreeStatesFile
        #the chunk output is read twice below, each time through a fresh reader built from the same offsets
        tempAncestorFastaOffsets = getMultiFastaOffsets(tempAncestorFile)
        previousAlignment = removeFromLeft(multiFastaRead(tempAncestorFile, lambda x : x, tempAncestorFastaOffsets), previousAlignment, nodeNumber, seqNo)
        appendToAlignment(multiFastaRead(tempAncestorFile, lambda x : x, tempAncestorFastaOffsets), outputIterators, nodeNumber)
        logger.info("Added reconstructed chunk to complete alignment")
        os.remove(tempAncestorFile)
        removeSeqFiles(alignmentSeqs, seqNo)
        os.remove(alignmentFile)
        logger.info("Cleaned up at end of loop")
        alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(previousAlignment, alignmentReader, alignmentChunkMaxSeqSize, seqNo, leafLabels)
    logger.info("Finished main loop")
    #load into single output file
    closeSeqIterators(outputIterators, nodeNumber)
    concatanateSeqFiles(outputFiles, outputFile, nodeNumber, labels)
    logger.info("Written out alignment to single file")
    #clean up
    os.remove(tempTreeStatesFile)
    removeSeqFiles(outputFiles, nodeNumber)
    logger.info("Cleaned up final files")
    logger.info("Finished, total time taken for stitcher: %s (seconds)" % (time.time()-startTime))
Exemplo n.º 12
0
def nestAlign(binaryTree, leafSeqFiles, outputFile, outputScoreFile,
              alignerArgs):
    """Align a tree through a series of nested sub-tree alignments.

    The sub-trees returned by calculatePath are aligned one after another with
    makeAlignment. For every internal child of an aligned sub-tree, the child's
    part of the alignment and the child's own sequence are extracted into temp
    files and stored by node index; the sequence files list is passed to the
    later makeAlignment calls. Finally the whole tree is aligned into
    outputFile and mergeTogetherAllAlignments is run over the stored
    alignments.

    Arguments:
    binaryTree -- tree to align; child iDs are overwritten with node labels
        during the run and removeInternalIDs is applied before and after
    leafSeqFiles -- leaf sequence files, stored at the even node indices
    outputFile -- file receiving the alignment of the whole tree
    outputScoreFile -- passed through to every makeAlignment call
    alignerArgs -- options object; MAX_NODE_NO is read here

    Returns None.
    """
    logger.info("Starting Nester")
    maxNodeNo = alignerArgs.MAX_NODE_NO

    removeInternalIDs(binaryTree)

    logger.info("Binary tree : %s " % printBinaryTree(binaryTree, True, False))
    binaryTree_depthFirstNumbers(binaryTree)
    #midEnd of the root is used as the total number of nodes
    nodeNo = binaryTree.traversalID.midEnd
    logger.info("Labelled tree with numbers ")

    seqNo = len(leafSeqFiles)
    logger.info(" Sequence files : %s" % " ".join(leafSeqFiles))
    #assert seqNo*2 - 1 == nodeNo

    logger.info("Output file %s " % outputFile)

    labels = binaryTree_nodeNames(binaryTree)
    costs = calculateTreeNodeCosts(binaryTree)
    logger.info("Calculated node costs")
    for node in xrange(0, nodeNo):
        logger.info("Node : %s , reconstruction value : %f , %f" %
                    (labels[node], costs[node], 1.0 - costs[node]))
    #treePath is the ordered list of sub-trees to align before the full tree
    pathCost, treePath = calculatePath(binaryTree, costs, maxNodeNo)
    logger.info(
        " Calculated nested path. Cost : %f , Path : %s" %
        (pathCost, " ".join([labels[i.traversalID.mid] for i in treePath])))
    assert len(leafSeqFiles) == seqNo
    #both lists are indexed by a node's traversalID.mid
    alignmentFiles = [None] * nodeNo
    seqFiles = [None] * nodeNo
    #leaves take the even indices; odd indices are filled in below for internal nodes
    for i in xrange(0, seqNo):
        seqFiles[i * 2] = leafSeqFiles[i]
    logger.debug("About to start main nested loop")
    for subTree in treePath:
        assert subTree != binaryTree
        logger.info("Chosen sub tree to align : %s " %
                    printBinaryTree(subTree, True, False))
        alignmentFile = getTempFile()
        startTime = time.time()
        makeAlignment(subTree, seqFiles, alignmentFile, outputScoreFile,
                      alignerArgs)
        logger.info("Made alignment of subtree, time taken : %s (seconds)" %
                    (time.time() - startTime))
        #get the two ancestors
        #traversal IDs local to this sub-tree, returned as a lookup keyed by
        #node rather than written onto the nodes (labelTree=False)
        subTreeTraversalIDs = binaryTree_depthFirstNumbers(subTree,
                                                           labelTree=False,
                                                           dontStopAtID=False)

        if subTree.left.internal:
            #NOTE(review): the two numeric arguments of extractSubAlignment are
            #presumably a [start, end) range of alignment rows - confirm
            offset = subTreeTraversalIDs[subTree].midStart
            childXAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, 0,
                                subTreeTraversalIDs[subTree].mid - offset,
                                childXAlignmentFile)
            alignmentFiles[subTree.left.traversalID.mid] = childXAlignmentFile
            logger.info("Extracted alignment of left child : %s " %
                        printBinaryTree(subTree.left, True, False))

            #the left child starts where its parent starts, so the same offset applies
            assert offset == subTreeTraversalIDs[subTree.left].midStart
            childXSeqFile = getTempFile()
            #a range of width one at the child's own position
            extractSubAlignment(
                childXAlignmentFile,
                subTreeTraversalIDs[subTree.left].mid - offset,
                subTreeTraversalIDs[subTree.left].mid - offset + 1,
                childXSeqFile)
            seqFiles[subTree.left.traversalID.mid] = childXSeqFile
            logger.info("Extracted sequence of left child : %s " %
                        printBinaryTree(subTree.left, True, False))

        if subTree.right.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childYAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile,
                                subTreeTraversalIDs[subTree].mid + 1 - offset,
                                subTreeTraversalIDs[subTree].midEnd - offset,
                                childYAlignmentFile)
            alignmentFiles[subTree.right.traversalID.mid] = childYAlignmentFile
            logger.info("Extracted alignment of right child : %s " %
                        printBinaryTree(subTree.right, True, False))

            #positions in the right child's alignment are relative to the child's own start
            offset = subTreeTraversalIDs[subTree.right].midStart
            childYSeqFile = getTempFile()
            extractSubAlignment(
                childYAlignmentFile,
                subTreeTraversalIDs[subTree.right].mid - offset,
                subTreeTraversalIDs[subTree.right].mid - offset + 1,
                childYSeqFile)
            seqFiles[subTree.right.traversalID.mid] = childYSeqFile
            logger.info("Extracted sequence of right child : %s " %
                        printBinaryTree(subTree.right, True, False))

        subTree.left.iD = labels[
            subTree.left.traversalID.
            mid]  #labels tree, so we only print relevant bits
        subTree.right.iD = labels[subTree.right.traversalID.mid]
        os.remove(alignmentFile)
        logger.info("Finished loop and reduced tree to : %s " %
                    printBinaryTree(subTree, True, False))
    startTime = time.time()
    #final alignment of the whole tree, written straight to the output file
    makeAlignment(binaryTree, seqFiles, outputFile, outputScoreFile,
                  alignerArgs)
    logger.info("Finished final nested alignment, time taken : %s (seconds)" %
                (time.time() - startTime))
    alignmentFiles[binaryTree.traversalID.mid] = outputFile
    mergeTogetherAllAlignments(binaryTree, alignmentFiles, labels, [0])
    logger.info("Merged together all alignments")
    #odd indices hold the temp sequence files extracted above; the leaf files
    #at even indices belong to the caller and are left alone
    for i in xrange(1, nodeNo, 2):
        if seqFiles[i] != None:
            os.remove(seqFiles[i])
    removeInternalIDs(binaryTree)
    logger.info("Have cleaned up, and am returning")