def StartFromTrees(self, wd1_list, wd2, base, clustersFilename_pairs, speciesTreeFN, qIsUSerSpeciesTree, user_name=None):
    """
    Set up the directories and log for an analysis that starts from existing trees.

    Records the supplied working directories on the instance, creates a new
    results directory with a "WorkingDirectory/" sub-directory inside it,
    starts the log and writes the species tree file name to it.

    A species tree that is not user-supplied is copied to the path returned
    by GetSpeciesTreeIDsRootedFN(). A user-supplied tree is not copied here;
    per the original note it must be converted immediately by the calling
    code.

    Args:
        wd1_list: stored as self.wd_base.
        wd2: stored as self.wd_trees.
        base: path prefix; the results directory name is built from
            base + "Results_" (+ user_name when given).
        clustersFilename_pairs: file name whose last len("_id_pairs.txt")
            characters are dropped to give self.clustersFilename.
        speciesTreeFN: species tree file name (logged, and copied unless
            user-supplied).
        qIsUSerSpeciesTree: true if the species tree was supplied by the user.
        user_name: optional suffix for the results directory name; when given
            the directory is requested with qDate=False.
    """
    self.wd_base = wd1_list
    self.wd_trees = wd2

    # NOTE(review): rd1 is used as a prefix below, so CreateNewWorkingDirectory
    # presumably returns a path ending in a separator - confirm in util.
    if user_name is None:
        self.rd1 = util.CreateNewWorkingDirectory(base + "Results_")
    else:
        self.rd1 = util.CreateNewWorkingDirectory(base + "Results_" + user_name, qDate=False)
    self.wd_current = self.rd1 + "WorkingDirectory/"
    os.mkdir(self.wd_current)

    # Drop the trailing "_id_pairs.txt"-sized suffix (the suffix itself is not checked).
    self.clustersFilename = clustersFilename_pairs[:-len("_id_pairs.txt")]

    self.StartLog()
    if not qIsUSerSpeciesTree:
        shutil.copy(speciesTreeFN, self.GetSpeciesTreeIDsRootedFN())
    self.WriteToLog("Species Tree: %s\n" % speciesTreeFN)
    self.LogWorkingDirectoryTrees()
def StartFromOrthogroupsOrSequenceSearch(self, wd_base_list, base, clustersFilename_pairs=None, user_name=None, userSpeciesTree=None):
    """
    Set up the directories and log for an analysis that starts from
    orthogroups or from a sequence search.

    Initialises wd_base, wd_trees and wd_current on the instance (plus rd1,
    and clustersFilename when clustersFilename_pairs is given), creates the
    results directory with its "WorkingDirectory/" sub-directory, creates an
    empty "Log.txt" in the results directory and starts the log.

    Args:
        wd_base_list: stored as self.wd_base.
        base: path prefix; the results directory name is built from
            base + "Results_" (+ user_name when given).
        clustersFilename_pairs: optional file name whose last
            len("_id_pairs.txt") characters are dropped to give
            self.clustersFilename.
        user_name: optional suffix for the results directory name; when given
            the directory is requested with qDate=False.
        userSpeciesTree: not used by this method.

    Raises:
        Exception: if self.wd_base is already non-empty.
    """
    if len(self.wd_base) != 0:
        raise Exception("Changing WorkingDirectory1")
    self.wd_base = wd_base_list

    if clustersFilename_pairs is not None:
        self.clustersFilename = clustersFilename_pairs[:-len("_id_pairs.txt")]

    if user_name is None:
        self.rd1 = util.CreateNewWorkingDirectory(base + "Results_")
    else:
        self.rd1 = util.CreateNewWorkingDirectory(base + "Results_" + user_name, qDate=False)
    self.wd_current = self.rd1 + "WorkingDirectory/"
    os.mkdir(self.wd_current)

    # Create (or truncate) an empty log file in the results directory.
    with open(self.rd1 + "Log.txt", 'wb'):
        pass

    # Trees are written to the new working directory.
    self.wd_trees = self.wd_current
    self.StartLog()
def OrthologuesFromTrees(groupsDir, workingDir, nHighParallel, speciesTree_fn = None, pickleDir=None):
    """
    Infer orthologues from existing gene trees, skipping all preceding steps.

    Args:
        groupsDir: directory containing the orthogroups file (passed to
            util.GetOGsFile).
        workingDir: the orthologues 'WorkingDirectory'; used as a string
            prefix, so it is expected to end with a path separator.
        nHighParallel: forwarded to ReconciliationAndOrthologues.
        speciesTree_fn: None if not supplied, otherwise a rooted species tree
            using the user's species names (not OrthoFinder IDs). When None,
            a single rooted species tree is looked for in
            workingDir + "Trees_ids/".
        pickleDir: forwarded to ReconciliationAndOrthologues.

    Returns:
        A message string naming the species-by-species orthologues directory.

    util.Fail() is called if the user tree does not exist, or if zero or
    more than one rooted species tree is found in the working directory.
    """
    # Check species tree
    qUserSpTree = speciesTree_fn is not None
    if qUserSpTree:
        if not os.path.exists(speciesTree_fn):
            print("\nERROR: %s does not exist\n" % speciesTree_fn)
            util.Fail()
    else:
        # etc (only need to determine if unique)
        possibilities = ["SpeciesTree_ids_0_rooted.txt", "SpeciesTree_ids_1_rooted.txt", "SpeciesTree_user_ids.txt"]
        found = [fn for fn in (workingDir + "Trees_ids/" + p for p in possibilities) if os.path.exists(fn)]
        if found:
            # As before, the last existing candidate is the one kept.
            speciesTree_fn = found[-1]
        if len(found) == 0:
            print("\nERROR: There is a problem with the specified directory. The rooted species tree %s or %s is not present." % (possibilities[0], possibilities[2]))
            print("Please rectify the problem or alternatively use the -s option to specify the species tree to use.\n")
            util.Fail()
        if len(found) > 1:
            print("\nERROR: There is more than one rooted species tree in the specified directory structure. Please use the -s option to specify which species tree should be used\n")
            util.Fail()

    def TreePatIDs(iog):
        """Return the ID-labelled gene tree file name for orthogroup index iog."""
        return workingDir + ("Trees_ids/OG%07d_tree_id.txt" % iog)

    reconTreesRenamedDir = workingDir + "Recon_Gene_Trees/"
    # For the Orthologues_Species/ directories; a fresh "Orthologues_*"
    # directory is always requested from util.
    resultsDir_new = util.CreateNewWorkingDirectory(workingDir + "../Orthologues" + "_")

    orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile(groupsDir)
    speciesToUse, nSpAll = util.GetSpeciesToUse(orthofinderWorkingDir + "SpeciesIDs.txt")
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor)

    if qUserSpTree:
        # Validate the user's tree against the species names, then convert it
        # (ConvertUserSpeciesTree returns the file name used from here on).
        speciesToUseNames = ogSet.SpeciesDict().values()
        CheckUserSpeciesTree(speciesTree_fn, speciesToUseNames)
        speciesTree_fn = ConvertUserSpeciesTree(workingDir + "Trees_ids/", speciesTree_fn, ogSet.SpeciesDict())

    util.PrintUnderline("Running Orthologue Prediction", True)
    util.PrintUnderline("Reconciling gene and species trees")
    ReconciliationAndOrthologues(TreePatIDs, ogSet, speciesTree_fn, workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, pickleDir=pickleDir)
    util.PrintUnderline("Writing results files")
    CleanWorkingDir(workingDir)
    return "Species-by-species orthologues directory:\n %s\n" % resultsDir_new
def CreateOutputDirFromStart_new(self, fasta_dir, base, user_name=None, old_wd_base_list=None):
    """
    Create the results and working directories for an analysis started from
    scratch, and start the log.

    The initial difference from the older layout is that results go in
    OrthoFinder/Results_DATE or USER_SPECIFIED/RESULTS_DATE, whereas before
    they went in Results_DATE or USER_SPECIFIED.

    If this is a composite analysis (-f + -b) then old_wd_base_list is not
    None: "SpeciesIDs.txt" and "SequenceIDs.txt" are copied from its first
    entry into the new working directory, that first entry is recorded in
    "previous_wd.txt", and the whole list is appended to self.wd_base.

    Args:
        fasta_dir: not used by this method.
        base: path prefix; the results directory name is built from
            base + "Results_" (+ user_name when given).
        user_name: optional suffix for the results directory name; when given
            the directory is requested with qDate=False.
        old_wd_base_list: first item is the working directory of a previous
            analysis to be extended. If that analysis itself extended others
            there will be further items in the list. Entries are used as
            string prefixes, so they are expected to end with a separator.
    """
    if user_name is None:
        self.rd1 = util.CreateNewWorkingDirectory(base + "Results_")
    else:
        self.rd1 = util.CreateNewWorkingDirectory(base + "Results_" + user_name, qDate=False)
    self.wd_current = self.rd1 + "WorkingDirectory/"
    os.mkdir(self.wd_current)

    # wd_base should contain the current directory and then the previous linked directories.
    self.wd_base = [self.wd_current]
    if old_wd_base_list is not None:
        previous_wd = old_wd_base_list[0]
        for fn in ("SpeciesIDs.txt", "SequenceIDs.txt"):
            shutil.copy(previous_wd + fn, self.wd_current + fn)
        # Log the first wd in the list; this can then be followed back to
        # previous ones. Opened in text mode: writing a str to a file opened
        # with 'wb' raises TypeError on Python 3.
        with open(self.wd_current + "previous_wd.txt", 'w') as outfile:
            outfile.write(previous_wd + "\n")
        self.wd_base.extend(old_wd_base_list)

    self.wd_trees = self.wd_current
    self.StartLog()
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, speciesToUse, nSpAll, clustersFilename_pairs, nProcesses):
    """
    Run the orthologue inference pipeline and return the results summary string.

    Steps, as announced on stdout: check dependencies, calculate gene
    distances and trees (DendroBLASTTrees), root the species tree
    (rfd.GetRoot), then for each candidate root reconcile the gene trees
    with the species tree (RunDlcpar) and infer orthologue lists
    (pt.get_orthologue_lists).

    Args:
        orthofinderWorkingDir: working directory passed to OrthoGroupsSet and
            to the dependency check.
        orthofinderResultsDir: path prefix under which a new "Orthologues_*"
            results directory is requested.
        speciesToUse, nSpAll, clustersFilename_pairs: forwarded to
            OrthoGroupsSet.
        nProcesses: parallelism value forwarded to DendroBLASTTrees and
            rfd.GetRoot.

    Returns:
        The value of GetResultsFilesString() for the rooted species tree
        file(s) written.

    Calls util.Fail() when fewer than four species are in use and
    sys.exit() when the dependency check fails.
    """
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor=util.FirstWordExtractor)
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    if not CanRunDependencies(orthofinderWorkingDir):
        print("Orthogroups have been inferred but the dependencies for inferring gene trees and\northologues have not been met. Please review previous messages for more information.")
        # NOTE(review): exits with status 0 even though this is a failure path - confirm intended.
        sys.exit()

    print("\n2. Calculating gene distances")
    print("-----------------------------")
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")
    db = DendroBLASTTrees(ogSet, resultsDir, nProcesses)
    db.ReadAndPickle()
    nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    spDict = ogSet.SpeciesDict()
    roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.treesPatIDs)[0] + "/", rfd.GeneToSpecies_dash, nProcesses, treeFmt=1)
    if len(roots) > 1:
        print("Observed %d duplications. %d support the best roots and %d contradict them." % (len(clusters), nSupport, len(clusters) - nSupport))
        print("Best outgroups for species tree:")
    else:
        print("Observed %d duplications. %d support the best root and %d contradict it." % (len(clusters), nSupport, len(clusters) - nSupport))
        print("Best outgroup for species tree:")
    for r in roots:
        print(" " + (", ".join([spDict[s] for s in r])))

    qMultiple = len(roots) > 1
    if qMultiple:
        print("\nAnalysing each of the potential species tree roots.")
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        # With several candidate roots every output is suffixed with the root index.
        if qMultiple:
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)

        # Step headings become e.g. "5-0." and get a longer underline when
        # there are multiple roots.
        stepSuffix = ("-%d" % i) if qMultiple else ""
        underlinePad = "--" if qMultiple else ""

        print("\n5%s. Reconciling gene and species trees" % stepSuffix)
        print("-------------------------------------" + underlinePad)
        print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir)
        os.mkdir(reconTreesRenamedDir)
        # range rather than xrange: xrange does not exist on Python 3.
        for iog in range(len(db.ogSet.OGs())):
            util.RenameTreeTaxa(dlcparResultsDir + "OG%07d_tree_id.locus.tree" % iog, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, db.ogSet.Spec_SeqDict(), qFixNegatives=False, inFormat=8)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" % stepSuffix)
        print("----------------------------------------" + underlinePad)
        pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir)

    # NOTE(review): the DendroBLASTTrees object itself is passed here, not a
    # directory path - confirm this matches CleanWorkingDir's signature.
    CleanWorkingDir(db)
    print("\n7. Writing results files")
    print("------------------------")
    return GetResultsFilesString(resultsSpeciesTrees)
def OrthologuesWorkflow(workingDir_ogs,
                       orthofinderResultsDir,
                       speciesToUse, nSpAll,
                       clustersFilename_pairs,
                       tree_options,
                       msa_method,
                       tree_method,
                       nHighParallel,
                       nLowParrallel,
                       userSpeciesTree = None,
                       qStopAfterSeqs = False,
                       qStopAfterAlign = False,
                       qStopAfterTrees = False,
                       qMSA = False,
                       qPhyldog = False,
                       pickleDir=None):
    """
    Drive the orthologue inference workflow and return a results message.

    Outline:
    1. Setup: the OrthoGroupsSet (ogSet - all the relevant information about
       the orthogroups, species etc.), the results directory and the
       DendroBLASTTrees object.
    2. Gene trees / species tree: via MSA (or Phyldog) tree inference, or
       via DendroBLAST (read scores, RunAnalysis: distance matrices, trees).
    3. Root the species tree, or convert the user-supplied one.
    4. Reconciliation / orthologues for each candidate root.
    5. Clean up.

    The qStopAfterSeqs / qStopAfterAlign / qStopAfterTrees flags return early
    with a message describing what was produced so far; the Phyldog route
    returns after launching the Phyldog analysis. Otherwise the value of
    GetResultsFilesString() is returned.
    """
    ogSet = OrthoGroupsSet(workingDir_ogs, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor, pickleDir=pickleDir)

    # The class that is going to run the analysis needs to check the
    # dependencies; no dependency check is performed here.
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")

    # === 1 ===
    # ust = UserSpeciesTree
    # MSA:               Sequences    Alignments                        GeneTrees    db    SpeciesTree
    # Phyldog:           Sequences    Alignments                        GeneTrees    db    SpeciesTree
    # Dendroblast:                                  DistanceMatrices    GeneTrees    db    SpeciesTree
    # MSA (ust):         Sequences    Alignments                        GeneTrees    db
    # Phyldog (ust):     Sequences    Alignments                        GeneTrees    db
    # Dendroblast (ust):                            DistanceMatrices    GeneTrees    db
    if qMSA or qPhyldog:
        treeGen = msa.TreesForOrthogroups(tree_options, msa_method, tree_method, resultsDir, workingDir_ogs)
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.Spec_SeqDict(), nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog)
        if qStopAfterSeqs or qStopAfterAlign:
            print("")
            summary = "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
            if not qStopAfterSeqs:
                summary += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            return summary
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        if not userSpeciesTree:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            db.ReadAndPickle()
            spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
        if qPhyldog:
            trees_from_phyldog.RunPhyldogAnalysis(resultsDir + "WorkingDirectory/phyldog/", ogSet.OGs(), speciesToUse)
            return "Running Phyldog" + "\n".join(seqs_alignments_dirs)
    else:
        util.PrintUnderline("Calculating gene distances")
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        db.ReadAndPickle()
        nOGs, D, spTreeFN_ids, spTreeUnrootedFN = db.RunAnalysis()

    # === 2 ===
    # Check can continue with analysis
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    # === 3 ===
    # MSA:               RootSpeciesTree
    # Phyldog:           RootSpeciesTree
    # Dendroblast:       RootSpeciesTree
    # MSA (ust):         ConvertSpeciesTreeIDs
    # Phyldog (ust):     ConvertSpeciesTreeIDs
    # Dendroblast (ust): ConvertSpeciesTreeIDs
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree")
        userSpeciesTree = ConvertUserSpeciesTree(db.workingDir + "Trees_ids/", userSpeciesTree, ogSet.SpeciesDict())
        rootedSpeciesTreeFN = [userSpeciesTree]
        roots = [None]
        qMultiple = False
    else:
        util.PrintUnderline("Best outgroup(s) for species tree")
        spDict = ogSet.SpeciesDict()
        roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.TreeFilename_IDs(0))[0] + "/", rfd.GeneToSpecies_dash, nHighParallel, treeFmt = 1)
        qMultiple = len(roots) > 1
        duplicationCounts = (len(clusters), nSupport, len(clusters) - nSupport)
        if qMultiple:
            print("Observed %d duplications. %d support the best roots and %d contradict them." % duplicationCounts)
            print("Best outgroups for species tree:")
        else:
            print("Observed %d duplications. %d support the best root and %d contradict it." % duplicationCounts)
            print("Best outgroup for species tree:")
        for outgroup in roots:
            print(" " + (", ".join([spDict[s] for s in outgroup])) )

    if qStopAfterTrees:
        if userSpeciesTree:
            # Nothing to root: just report where the outputs are.
            pieces = []
            if qMSA:
                pieces.append("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
                pieces.append("\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1])
            pieces.append("\nGene trees:\n %s\n" % (resultsDir + "Gene_Trees/"))
            return "".join(pieces)
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (_, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            treeName = "SpeciesTree_rooted.txt" if len(roots) == 1 else "SpeciesTree_rooted_at_outgroup_%d.txt" % i
            resultsSpeciesTrees.append(resultsDir + treeName)
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        db.DeleteBlastMatrices()
        CleanWorkingDir(db.workingDir)
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)

    if qMultiple:
        util.PrintUnderline("\nAnalysing each of the potential species tree roots", True)
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        util.PrintUnderline("Reconciling gene trees and species tree" + (" (root %d)"%i if qMultiple else ""))
        if qMultiple:
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        # A user-supplied tree has no inferred outgroup to report.
        if qMultiple or not userSpeciesTree:
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        ReconciliationAndOrthologues(db.TreeFilename_IDs, db.ogSet, speciesTree_fn, db.workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, i if qMultiple else None, pickleDir=pickleDir)

    db.DeleteBlastMatrices()
    CleanWorkingDir(db.workingDir)
    util.PrintUnderline("Writing results files", True)
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)