def GetIDsDict(orthofinderWorkingDir):
    """Return a dict mapping each sequence ID to "<species name>_<sequence name>".

    Sequence names are read from SequenceIDs.txt and species names from
    SpeciesIDs.txt. Both file names are appended directly to
    orthofinderWorkingDir, so the directory must end with a path separator.

    The names are first extracted with orthofinder.FirstWordExtractor. If that
    raises a RuntimeError whose message does not start with "ERROR", extraction
    is retried with orthofinder.FullAccession. Any other RuntimeError is
    reported as a duplicate ID and orthofinder.Fail() is called.

    The species index of a sequence is the part of its ID before the first
    underscore. The species name is the base name, without extension, of the
    file listed for that index in SpeciesIDs.txt.
    """
    # sequence IDs
    idsFilename = orthofinderWorkingDir + "SequenceIDs.txt"
    duplicateIdMessage = "ERROR: %s contains a duplicate ID. If %s was prepared manually then please check the IDs are correct. " % (idsFilename, idsFilename)
    try:
        idExtract = orthofinder.FirstWordExtractor(idsFilename)
        idDict = idExtract.GetIDToNameDict()
    except RuntimeError as error:
        # str(error) rather than error.message: the attribute was removed in
        # Python 3, while str() gives the same text on both major versions.
        message = str(error)
        print(message)
        if message.startswith("ERROR"):
            print(duplicateIdMessage)
            orthofinder.Fail()
        else:
            print("Tried to use only the first part of the accession in order to list the sequences in each orthologous group more concisely but these were not unique. Will use the full accession line instead.")
            try:
                idExtract = orthofinder.FullAccession(idsFilename)
                idDict = idExtract.GetIDToNameDict()
            except RuntimeError:
                # Same exception type as the first attempt. Unrelated failures
                # (e.g. an unreadable file) are no longer misreported as a
                # duplicate ID.
                print(duplicateIdMessage)
                orthofinder.Fail()

    # species names
    speciesDict = dict()
    # Text mode so each line is a str that can be split on ": " under both
    # Python 2 and Python 3.
    with open(orthofinderWorkingDir + "SpeciesIDs.txt", 'r') as idsFile:
        for line in idsFile:
            line = line.rstrip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            iSpecies, filename = line.split(": ", 1)
            # a leading "#" on the index is dropped so the entry is still usable
            iSpecies = iSpecies.replace("#", "")
            speciesName = os.path.splitext(os.path.split(filename)[1])[0]
            speciesDict[iSpecies] = speciesName
    idDict = {seqID: speciesDict[seqID.split("_")[0]] + "_" + name for seqID, name in idDict.items()}
    return idDict
def GetOGsFile(userArg):
    """returns the WorkingDirectory, ResultsDirectory and clusters_id_pairs filename

    userArg is either a results directory or the path to a specific
    clusters_OrthoFinder_*.txt_id_pairs.txt file.

    When a file is given, its containing directory is returned as both the
    working directory and the results directory, together with the file path
    itself.

    When a directory is given, the working directory is that directory or its
    "WorkingDirectory" sub-directory, whichever IsWorkingDirectory() accepts
    first. Exactly one clusters file and one OrthologousGroups*.txt file must
    then be found, and the tuple is (working directory, userArg with a
    trailing separator, clusters file).

    Every error path prints a message and calls orthofinder.Fail().
    """
    # None and "" both mean nothing was supplied; "" used to raise IndexError
    # at the trailing-separator check further down.
    if not userArg:
        print("ERROR: orthofinder_results_directory has not been specified")
        orthofinder.Fail()

    if os.path.isfile(userArg):
        # user has specified specific results file
        fn = os.path.split(userArg)[1]
        if ("clusters_OrthoFinder_" not in fn) or ("txt_id_pairs.txt" not in fn):
            print("ERROR:\n    %s\nis neither a directory or a clusters_OrthoFinder_*.txt_id_pairs.txt file." % userArg)
            orthofinder.Fail()
        # A bare file name has an empty directory part; fall back to the
        # current directory so that "" + os.sep does not point at the
        # filesystem root.
        orthofinderWorkingDir = (os.path.split(userArg)[0] or os.curdir) + os.sep
        if not IsWorkingDirectory(orthofinderWorkingDir):
            print("ERROR: cannot find files from OrthoFinder run in directory:\n   %s" % orthofinderWorkingDir)
            orthofinder.Fail()
        print("Generating trees for orthogroups in file:\n    %s" % userArg)
        return orthofinderWorkingDir, orthofinderWorkingDir, userArg

    # A directory was given: the remaining code builds paths by plain string
    # concatenation, so make sure it ends with a separator.
    if not userArg.endswith(os.path.sep):
        userArg += os.path.sep

    # find required files
    orthofinderWorkingDir = userArg
    if not IsWorkingDirectory(orthofinderWorkingDir):
        orthofinderWorkingDir = userArg + "WorkingDirectory" + os.sep
        if not IsWorkingDirectory(orthofinderWorkingDir):
            print("ERROR: cannot find files from OrthoFinder run in directory:\n   %s\nor\n   %s\n" % (userArg, orthofinderWorkingDir))
            orthofinder.Fail()

    # identify orthogroups file
    clustersFiles = glob.glob(orthofinderWorkingDir + "clusters_OrthoFinder_*.txt_id_pairs.txt")
    orthogroupFiles = glob.glob(orthofinderWorkingDir + "OrthologousGroups*.txt")
    if orthofinderWorkingDir != userArg:
        orthogroupFiles += glob.glob(userArg + "OrthologousGroups*.txt")
    # User may have specified a WorkingDirectory and results could be in directory above
    if len(orthogroupFiles) < len(clustersFiles):
        orthogroupFiles += glob.glob(userArg + ".." + os.sep + "OrthologousGroups*.txt")
    clustersFiles = sorted(clustersFiles)
    orthogroupFiles = sorted(orthogroupFiles)
    if len(clustersFiles) > 1 or len(orthogroupFiles) > 1:
        print("ERROR: Results from multiple OrthoFinder runs found\n")
        print("Tab-delimiter OrthologousGroups*.txt files:")
        for fn in orthogroupFiles:
            print("    " + fn)
        print("With corresponding cluster files:")
        for fn in clustersFiles:
            print("    " + fn)
        print("\nPlease run with only one set of results in directories or specifiy the specific clusters_OrthoFinder_*.txt_id_pairs.txt file on the command line")
        orthofinder.Fail()

    if len(clustersFiles) != 1 or len(orthogroupFiles) != 1:
        print("ERROR: Results not found in <orthofinder_results_directory> or <orthofinder_results_directory>/WorkingDirectory")
        print("\nCould not find:\n    OrthologousGroups*.txt\nor\n    clusters_OrthoFinder_*.txt_id_pairs.txt")
        orthofinder.Fail()

    print("Generating trees for orthogroups in file:\n    %s" % orthogroupFiles[0])
    print("and corresponding clusters file:\n    %s" % clustersFiles[0])
    return orthofinderWorkingDir, userArg, clustersFiles[0]

if __name__ == "__main__":
    print("\nOrthoFinder Alignments and Trees version %s Copyright (C) 2015 David Emms\n" % version)
    print("""    This program comes with ABSOLUTELY NO WARRANTY.
    This is free software, and you are welcome to redistribute it under certain conditions.
    For details please see the License.md that came with this software.\n""")
    if len(sys.argv) == 1 or sys.argv[1] == "--help" or sys.argv[1] == "help" or sys.argv[1] == "-h":
        PrintHelp()
        sys.exit()
        
    v = map(int, orthofinder.version.split("."))
    v = 100 * v[0] + 10*v[1] + v[2] 
    if v < 28: 
        print("ERROR: OrthoFinder program has not been updated, please update 'orthofinder.py' to the version %s\n" % version)
        orthofinder.Fail()

    # Get arguments    
    userDir = None
    nProcesses = None
    
    args = sys.argv[1:]    
    while len(args) != 0:
        arg = args.pop(0)
        if arg == "-t" or arg == "--threads":
            if len(args) == 0:
                print("Missing option for command line argument -t")
                orthofinder.Fail()
            arg = args.pop(0)
            try:
                nProcesses = int(arg)
示例#4
0
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir,
                   clustersFilename_pairs, nProcesses):
    """Run the orthologue inference steps and return the results summary.

    The orthogroups are loaded into an OrthoGroupsSet; fewer than four species
    is reported as an error. After the dependency check, a new
    "Orthologues_*" directory is created under orthofinderResultsDir, the
    DendroBLASTTrees analysis is run, and rfd.GetRoot proposes one or more
    roots for the species tree.

    For every proposed root a sub-directory is created, the rooted species
    tree is written there with species names, RunDlcpar is run and
    pt.get_orthologue_lists writes the orthologue lists.

    Returns the value of GetResultsFilesString() for the list of rooted
    species tree files that were written.
    """
    ogSet = OrthoGroupsSet(orthofinderWorkingDir,
                           clustersFilename_pairs,
                           idExtractor=orthofinder.FirstWordExtractor)
    nSpecies = len(ogSet.speciesToUse)
    if nSpecies < 4:
        print("ERROR: Not enough species to infer species tree")
        orthofinder.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    if not CanRunDependencies(orthofinderWorkingDir):
        orthofinder.Fail()

    print("\n2. Reading sequence similarity scores")
    print("-------------------------------------")
    outputRoot = orthofinder.util.CreateNewWorkingDirectory(
        orthofinderResultsDir + "Orthologues_")

    treeBuilder = DendroBLASTTrees(ogSet, outputRoot, nProcesses)
    treeBuilder.ReadAndPickle()
    # only the orthogroup count and the species tree file are used below
    nOGs, _D, _spPairs, spTreeFN_ids = treeBuilder.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    speciesNames = ogSet.SpeciesDict()

    def rootLabel(root):
        # comma-separated species names for one candidate root
        return ", ".join([speciesNames[s] for s in root])

    treesDir = os.path.split(treeBuilder.treesPatIDs)[0] + "/"
    roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(
        spTreeFN_ids,
        treesDir,
        rfd.GeneToSpecies_dash,
        nProcesses,
        treeFmt=1)

    qMultiple = len(roots) > 1
    nDuplications = len(clusters)
    counts = (nDuplications, nSupport, nDuplications - nSupport)
    if qMultiple:
        print("Observed %d duplications. %d support the best roots and %d contradict them." % counts)
        print("Best outgroups for species tree:")
    else:
        print("Observed %d duplications. %d support the best root and %d contradict it." % counts)
        print("Best outgroup for species tree:")
    for root in roots:
        print("  " + rootLabel(root))

    if qMultiple:
        print("\nAnalysing each of the potential species tree roots.")

    resultsSpeciesTrees = []
    for i, (root, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        # With several candidate roots, each gets its own numbered directory
        # and numbered step headings; the underline is widened to match.
        if qMultiple:
            stepSuffix = "-%d" % i
            underlinePad = "--"
            subDir = "Orthologues_for_potential_outgroup_%d/" % i
        else:
            stepSuffix = ""
            underlinePad = ""
            subDir = "Orthologues/"

        resultsDir_new = outputRoot + subDir
        os.mkdir(resultsDir_new)
        rootedTreePath = resultsDir_new + "SpeciesTree_rooted.txt"
        resultsSpeciesTrees.append(rootedTreePath)
        treeBuilder.RenameTreeTaxa(speciesTree_fn,
                                   rootedTreePath,
                                   treeBuilder.ogSet.SpeciesDict(),
                                   qFixNegatives=True)

        print("\n5%s. Reconciling gene and species trees" % stepSuffix)
        print("-------------------------------------" + underlinePad)
        print("Root: " + rootLabel(root))
        dlcparResultsDir = RunDlcpar(treeBuilder.treesPatIDs, ogSet, nOGs,
                                     speciesTree_fn, treeBuilder.workingDir)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" % stepSuffix)
        print("----------------------------------------" + underlinePad)
        pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir,
                                treeBuilder.workingDir)

    print("\n7. Writing results files")
    print("------------------------")

    return GetResultsFilesString(resultsSpeciesTrees)