Пример #1
0
def OrthologuesWorkflow(workingDir_ogs, 
                       orthofinderResultsDir, 
                       speciesToUse, nSpAll, 
                       clustersFilename_pairs, 
                       tree_options,
                       msa_method,
                       tree_method,
                       nHighParallel,
                       nLowParrallel,
                       userSpeciesTree = None, 
                       qStopAfterSeqs = False,
                       qStopAfterAlign = False,
                       qStopAfterTrees = False, 
                       qMSA = False,
                       qPhyldog = False,
                       pickleDir=None):
    """
    Drive the gene-tree / species-tree / orthologue pipeline for one set of
    orthogroups and return a text summary of where results were written.

    Stages:
    1. Setup:
        - ogSet, results directory
        - DendroBLASTTrees object (db)
    2. Gene trees:
        - MSA / Phyldog path: msa.TreesForOrthogroups(...).DoTrees(...)
        - DendroBLAST path: db.ReadAndPickle() then db.RunAnalysis()
    3. Root species tree (rfd.GetRoot), or convert the user-supplied tree
    4. Reconciliation/Orthologues (ReconciliationAndOrthologues, once per root)
    5. Clean up (db.DeleteBlastMatrices, CleanWorkingDir)

    Parameters:
    - workingDir_ogs - forwarded to OrthoGroupsSet and msa.TreesForOrthogroups.
    - orthofinderResultsDir - prefix for the new results directory; the text
      "Orthologues_" is appended by plain string concatenation, so this
      presumably needs a trailing path separator (TODO confirm with callers).
    - speciesToUse, nSpAll, clustersFilename_pairs - forwarded to
      OrthoGroupsSet; speciesToUse is also passed to RunPhyldogAnalysis.
    - tree_options, msa_method, tree_method - forwarded to
      msa.TreesForOrthogroups (used only when qMSA or qPhyldog is set).
    - nHighParallel - passed to DoTrees, rfd.GetRoot and
      ReconciliationAndOrthologues.
    - nLowParrallel - passed to the DendroBLASTTrees constructor.
    - userSpeciesTree - when truthy, it is converted with
      ConvertUserSpeciesTree and used as the single rooted species tree
      instead of inferring and rooting one.
    - qStopAfterSeqs, qStopAfterAlign - early-exit flags; only examined on the
      qMSA / qPhyldog path.
    - qStopAfterTrees - return after the species tree step, skipping
      reconciliation.
    - qMSA, qPhyldog - select the MSA-based path; qPhyldog additionally hands
      over to trees_from_phyldog.RunPhyldogAnalysis and returns.
    - pickleDir - forwarded to OrthoGroupsSet and ReconciliationAndOrthologues.

    Returns:
    A string. Early exits build it inline; the other exits return whatever
    GetResultsFilesString produces (presumably also a string - TODO confirm).

    Variables:
    - ogSet - all the relevant information about the orthogroups, species etc.
    """
    ogSet = OrthoGroupsSet(workingDir_ogs, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor, pickleDir=pickleDir)
    
    # Class that is going to run the analysis needs to check the dependencies
#    if not CanRunOrthologueDependencies(workingDir_ogs, qMSA, qStopAfterTrees, userSpeciesTree == None): 
#        print("Orthogroups have been inferred but the dependencies for inferring gene trees and")
#        print("orthologues have not been met. Please review previous messages for more information.")
#        sys.exit()
    
    # All outputs of this run live under a fresh directory created from this prefix.
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")
    """ === 1 === ust = UserSpeciesTree
    MSA:               Sequences    Alignments                        GeneTrees    db    SpeciesTree
    Phyldog:           Sequences    Alignments                        GeneTrees    db    SpeciesTree  
    Dendroblast:                                  DistanceMatrices    GeneTrees    db    SpeciesTree
    MSA (ust):         Sequences    Alignments                        GeneTrees    db
    Phyldog (ust):     Sequences    Alignments                        GeneTrees    db      
    Dendroblast (ust):                            DistanceMatrices    GeneTrees    db        
    """
    if qMSA or qPhyldog:
        treeGen = msa.TreesForOrthogroups(tree_options, msa_method, tree_method, resultsDir, workingDir_ogs)
        # The last argument asks DoTrees to stop after alignment; it is also
        # forced on for Phyldog, which takes over from the alignments below.
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.Spec_SeqDict(), nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog) 
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0])
        elif qStopAfterAlign:
            print("")
            st = "\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0]
            st += "\nMultiple sequence alignments:\n   %s\n" % seqs_alignments_dirs[1]
            return st
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        # spTreeFN_ids / spTreeUnrootedFN are bound only when no user tree was
        # supplied; the rooting step further down is guarded by the same condition.
        if not userSpeciesTree:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            db.ReadAndPickle()
            spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
        if qPhyldog:
            trees_from_phyldog.RunPhyldogAnalysis(resultsDir + "WorkingDirectory/phyldog/", ogSet.OGs(), speciesToUse)
            # NOTE(review): no separator between "Running Phyldog" and the first
            # joined entry - confirm this is the intended message format.
            return "Running Phyldog" + "\n".join(seqs_alignments_dirs)       
    else:
        util.PrintUnderline("Calculating gene distances")
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        db.ReadAndPickle()
        nOGs, D, spTreeFN_ids, spTreeUnrootedFN = db.RunAnalysis()
    
    """ === 2 ===
    Check can continue with analysis 
    """
    # NOTE(review): this check only runs after the gene distances / trees above
    # have already been computed, and it applies even when a user tree is given.
    if len(ogSet.speciesToUse) < 4: 
        print("ERROR: Not enough species to infer species tree")
        util.Fail()
     
    """ === 3 ===
    MSA:               RootSpeciesTree
    Phyldog:           RootSpeciesTree    
    Dendroblast:       RootSpeciesTree  
    MSA (ust):         ConvertSpeciesTreeIDs
    Phyldog (ust):     ConvertSpeciesTreeIDs
    Dendroblast (ust): ConvertSpeciesTreeIDs
    """    
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree") 
        # userSpeciesTree is rebound to the converted tree; later truthiness
        # tests on it therefore look at the converted value.
        userSpeciesTree = ConvertUserSpeciesTree(db.workingDir + "Trees_ids/", userSpeciesTree, ogSet.SpeciesDict())
        # Single-element lists so the loops below handle this case like the inferred one.
        rootedSpeciesTreeFN = [userSpeciesTree]
        roots = [None]
        qMultiple = False
    else:
        util.PrintUnderline("Best outgroup(s) for species tree") 
        spDict = ogSet.SpeciesDict()
        # Second argument is the directory part of the path returned by db.TreeFilename_IDs(0).
        roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.TreeFilename_IDs(0))[0] + "/", rfd.GeneToSpecies_dash, nHighParallel, treeFmt = 1)
        if len(roots) > 1:
            print("Observed %d duplications. %d support the best roots and %d contradict them." % (len(clusters), nSupport, len(clusters) - nSupport))
            print("Best outgroups for species tree:")  
        else:
            print("Observed %d duplications. %d support the best root and %d contradict it." % (len(clusters), nSupport, len(clusters) - nSupport))
            print("Best outgroup for species tree:")  
        # One line per candidate root, listing its species looked up in spDict.
        for r in roots: print("  " + (", ".join([spDict[s] for s in r]))  )
        qMultiple = len(roots) > 1
        
    if qStopAfterTrees:
        if userSpeciesTree:
            # NOTE(review): this exit returns without calling
            # db.DeleteBlastMatrices() / CleanWorkingDir(), unlike the other exits below.
            st = ""
            if qMSA:
                st += "\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0]
                st += "\nMultiple sequence alignments:\n   %s\n" % seqs_alignments_dirs[1]
            st += "\nGene trees:\n   %s\n" % (resultsDir + "Gene_Trees/")
            return st
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            if len(roots) == 1:
                resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            else:
                resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        db.DeleteBlastMatrices()
        CleanWorkingDir(db.workingDir)
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)
    
    if qMultiple: util.PrintUnderline("\nAnalysing each of the potential species tree roots", True)
    resultsSpeciesTrees = []
    # One reconciliation pass per candidate rooted species tree.
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        util.PrintUnderline("Reconciling gene trees and species tree" + (" (root %d)"%i if qMultiple else "")) 
        if qMultiple: 
            # Several candidate roots: each gets its own results and recon-tree directory.
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        elif userSpeciesTree:
            # User tree: r is None and spDict was never bound, so no outgroup line is printed.
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        # The directory must exist before the species tree is written into it
        # (multi-root case) and before reconciliation output is produced.
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        ReconciliationAndOrthologues(db.TreeFilename_IDs, db.ogSet, speciesTree_fn, db.workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, i if qMultiple else None, pickleDir=pickleDir) 
    
    db.DeleteBlastMatrices()
    CleanWorkingDir(db.workingDir)
    util.PrintUnderline("Writing results files", True)
    
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)
Пример #2
0
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, speciesToUse,
                   nSpAll, clustersFilename_pairs, nProcesses):
    """Run the DendroBLAST -> species-tree rooting -> DLCpar -> orthologue pipeline.

    The function prints numbered progress banners as it goes. For every
    candidate root returned by ``rfd.GetRoot`` it creates a results directory,
    writes the rooted species tree with species names, runs ``RunDlcpar``,
    renames the reconciled gene trees and calls ``pt.get_orthologue_lists``.

    Args:
        orthofinderWorkingDir: Forwarded to ``OrthoGroupsSet`` and
            ``CanRunDependencies``.
        orthofinderResultsDir: Prefix for the new results directory;
            ``"Orthologues_"`` is appended by plain string concatenation, so a
            trailing path separator is presumably expected (TODO confirm).
        speciesToUse: Forwarded to ``OrthoGroupsSet``.
        nSpAll: Forwarded to ``OrthoGroupsSet``.
        clustersFilename_pairs: Forwarded to ``OrthoGroupsSet``.
        nProcesses: Passed to ``DendroBLASTTrees`` and ``rfd.GetRoot``.

    Returns:
        Whatever ``GetResultsFilesString`` returns for the list of rooted
        species-tree file paths written by this call.

    Exits the process (via ``util.Fail`` or ``sys.exit``) when fewer than four
    species are present or when ``CanRunDependencies`` reports failure.
    """
    ogSet = OrthoGroupsSet(orthofinderWorkingDir,
                           speciesToUse,
                           nSpAll,
                           clustersFilename_pairs,
                           idExtractor=util.FirstWordExtractor)
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    if not CanRunDependencies(orthofinderWorkingDir):
        print(
            "Orthogroups have been inferred but the dependencies for inferring gene trees and\northologues have not been met. Please review previous messages for more information."
        )
        # NOTE(review): a bare sys.exit() reports exit status 0 even though this
        # is a failure path; left unchanged because callers may rely on it.
        sys.exit()

    print("\n2. Calculating gene distances")
    print("-----------------------------")
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir +
                                                "Orthologues_")

    db = DendroBLASTTrees(ogSet, resultsDir, nProcesses)
    db.ReadAndPickle()
    # D and spPairs are returned by RunAnalysis but not used in this function.
    nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    spDict = ogSet.SpeciesDict()
    # Second argument is the directory part of db.treesPatIDs.
    roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(
        spTreeFN_ids,
        os.path.split(db.treesPatIDs)[0] + "/",
        rfd.GeneToSpecies_dash,
        nProcesses,
        treeFmt=1)
    if len(roots) > 1:
        print(
            "Observed %d duplications. %d support the best roots and %d contradict them."
            % (len(clusters), nSupport, len(clusters) - nSupport))
        print("Best outgroups for species tree:")
    else:
        print(
            "Observed %d duplications. %d support the best root and %d contradict it."
            % (len(clusters), nSupport, len(clusters) - nSupport))
        print("Best outgroup for species tree:")
    for r in roots:
        print("  " + (", ".join([spDict[s] for s in r])))

    qMultiple = len(roots) > 1
    if qMultiple:
        print("\nAnalysing each of the potential species tree roots.")
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        if qMultiple:
            # Several candidate roots: keep each one's output in its own directories.
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(
                resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        # Must exist before the species tree is written into it (multi-root case).
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn,
                            resultsSpeciesTrees[-1],
                            db.ogSet.SpeciesDict(),
                            qFixNegatives=True)

        print("\n5%s. Reconciling gene and species trees" %
              ("-%d" % i if qMultiple else ""))
        print("-------------------------------------" +
              ("--" if qMultiple else ""))
        print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs,
                                     speciesTree_fn, db.workingDir)
        os.mkdir(reconTreesRenamedDir)
        # range (not the Python-2-only xrange) so this loop also runs on Python 3.
        for iog in range(len(db.ogSet.OGs())):
            util.RenameTreeTaxa(dlcparResultsDir +
                                "OG%07d_tree_id.locus.tree" % iog,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                db.ogSet.Spec_SeqDict(),
                                qFixNegatives=False,
                                inFormat=8)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" %
              ("-%d" % i if qMultiple else ""))
        print("----------------------------------------" +
              ("--" if qMultiple else ""))
        pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir,
                                db.workingDir)

    CleanWorkingDir(db)
    print("\n7. Writing results files")
    print("------------------------")

    return GetResultsFilesString(resultsSpeciesTrees)
Пример #3
0
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir,
                   clustersFilename_pairs, nProcesses):
    """Build gene trees, root the species tree and write orthologue lists.

    Progress is reported through numbered banners on stdout. One results
    directory is created per candidate root returned by ``rfd.GetRoot``; the
    rooted species tree is written there, then ``RunDlcpar`` and
    ``pt.get_orthologue_lists`` are run for that root.

    Args:
        orthofinderWorkingDir: Forwarded to ``OrthoGroupsSet`` and
            ``CanRunDependencies``.
        orthofinderResultsDir: Prefix to which ``"Orthologues_"`` is appended
            before calling ``CreateNewWorkingDirectory``.
        clustersFilename_pairs: Forwarded to ``OrthoGroupsSet``.
        nProcesses: Passed to ``DendroBLASTTrees`` and ``rfd.GetRoot``.

    Returns:
        The value of ``GetResultsFilesString`` for the rooted species-tree
        paths written by this call.
    """
    orthogroups = OrthoGroupsSet(orthofinderWorkingDir,
                                 clustersFilename_pairs,
                                 idExtractor=orthofinder.FirstWordExtractor)
    if len(orthogroups.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        orthofinder.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    dependencies_ok = CanRunDependencies(orthofinderWorkingDir)
    if not dependencies_ok:
        orthofinder.Fail()

    print("\n2. Reading sequence similarity scores")
    print("-------------------------------------")
    run_dir = orthofinder.util.CreateNewWorkingDirectory(
        orthofinderResultsDir + "Orthologues_")

    dendro = DendroBLASTTrees(orthogroups, run_dir, nProcesses)
    dendro.ReadAndPickle()
    # Only the orthogroup count and the species-tree file are used below.
    n_ogs, _distances, _species_pairs, species_tree_ids_fn = dendro.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    species_names = orthogroups.SpeciesDict()
    gene_trees_dir = os.path.split(dendro.treesPatIDs)[0] + "/"
    roots, clusters, rooted_tree_fns, n_support = rfd.GetRoot(
        species_tree_ids_fn,
        gene_trees_dir,
        rfd.GeneToSpecies_dash,
        nProcesses,
        treeFmt=1)

    several_roots = len(roots) > 1
    n_duplications = len(clusters)
    if several_roots:
        summary_fmt = "Observed %d duplications. %d support the best roots and %d contradict them."
        heading = "Best outgroups for species tree:"
    else:
        summary_fmt = "Observed %d duplications. %d support the best root and %d contradict it."
        heading = "Best outgroup for species tree:"
    print(summary_fmt % (n_duplications, n_support, n_duplications - n_support))
    print(heading)
    for outgroup in roots:
        print("  " + ", ".join(species_names[sp] for sp in outgroup))

    if several_roots:
        print("\nAnalysing each of the potential species tree roots.")

    rooted_tree_outputs = []
    for idx, (outgroup, rooted_fn) in enumerate(zip(roots, rooted_tree_fns)):
        # Per-root decorations for the step banners (empty when there is one root).
        step_tag = "-%d" % idx if several_roots else ""
        extra_rule = "--" if several_roots else ""

        if several_roots:
            subdir = "Orthologues_for_potential_outgroup_%d/" % idx
        else:
            subdir = "Orthologues/"
        root_results_dir = run_dir + subdir
        os.mkdir(root_results_dir)

        rooted_tree_outputs.append(root_results_dir + "SpeciesTree_rooted.txt")
        dendro.RenameTreeTaxa(rooted_fn,
                              rooted_tree_outputs[-1],
                              dendro.ogSet.SpeciesDict(),
                              qFixNegatives=True)

        print("\n5%s. Reconciling gene and species trees" % step_tag)
        print("-------------------------------------" + extra_rule)
        print("Root: " + ", ".join(species_names[sp] for sp in outgroup))
        dlcpar_dir = RunDlcpar(dendro.treesPatIDs, orthogroups, n_ogs,
                               rooted_fn, dendro.workingDir)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" % step_tag)
        print("----------------------------------------" + extra_rule)
        pt.get_orthologue_lists(orthogroups, root_results_dir, dlcpar_dir,
                                dendro.workingDir)

    print("\n7. Writing results files")
    print("------------------------")

    return GetResultsFilesString(rooted_tree_outputs)