def blast(parameters, logFile):
    '''
    Perform the homology search using the different BLAST package programs.
    This module offers retrocompatibility to legacy blast.

    Launches the configured BLAST binary over parameters["in_file"] against
    parameters["db_file"]; the raw report is left next to the run's other
    output files. Exits the interpreter on any execution failure.
    '''
    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    ## Get output file name and check whether it has been previously generated
    ## or not. It will also affect whether the variable REPLACE is set or not
    outFile = ("%s.homology.blast.out") % (oFile)

    ## If the output file exists and it is not set to replace it, just go back
    ## to the main function. Otherwise, set the replace parameter to True in
    ## order to replace any already generated file downstream
    if lookForFile(outFile) and not parameters["replace"]:
        return
    parameters["replace"] = True

    ## Generate command-line depending on which BLAST package is being used.
    if parameters["homology"][0] == "legacy_blast":
        binary = parameters["legacy_blast"][0]
        params = parameters[binary + "_params"]
        cmd = ("%s %s -e %s -d %s -i %s -o %s") % (parameters[binary], params,
            str(parameters["e_value"]), parameters["db_file"],
            parameters["in_file"], outFile)

    elif parameters["homology"][0] == "blast+":
        binary = parameters["blast+"][0]
        params = parameters[binary + "_params"]
        cmd = ("%s %s -evalue %s -db %s -query %s -out %s") % (
            parameters[binary], params, str(parameters["e_value"]),
            parameters["db_file"], parameters["in_file"], outFile)

    else:
        ## FIX: the original chain had no else-branch, so any other value of
        ## parameters["homology"][0] fell through and raised a confusing
        ## NameError on 'cmd'/'binary' below. Fail with an explicit message
        sys.exit(("ERROR: Unsupported homology search program '%s'") \
            % (parameters["homology"][0]))

    ## Log the host and the exact command-line before launching it
    name = getfqdn()
    print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (name, cmd), file = \
        logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell = True, stderr = logFile)
    except OSError as e:
        sys.exit("ERROR: Execution failed: " + str(e))

    ## A non-zero exit status from the BLAST binary aborts the whole pipeline
    if proc.wait() != 0:
        sys.exit(("ERROR: Execution failed: '%s'") % (parameters[binary]))

    ## Remove any error file generated during the legacy_blast execution - We
    ## try to delete this file only if it is empty
    if not lookForFile("error.log"):
        sp.call(("rm -f error.log"), shell = True)
def blast(parameters, logFile):
    '''
    Run the homology search step with either legacy BLAST or BLAST+ and leave
    the raw report next to the rest of the run's files. Kept for
    retrocompatibility with the legacy blast package.
    '''
    ## Compose the generic output prefix and the concrete report filename
    prefix = os.path.join(parameters["out_directory"], parameters["prefix"])
    outFile = ("%s.homology.blast.out") % (prefix)

    ## A pre-existing report is reused unless the run is in "replace" mode;
    ## once we decide to recompute, every downstream file must be redone too
    if lookForFile(outFile) and not parameters["replace"]:
        return
    parameters["replace"] = True

    ## Build the command-line for the configured BLAST flavour
    package = parameters["homology"][0]
    if package == "legacy_blast":
        binary = parameters["legacy_blast"][0]
        extra = parameters[binary + "_params"]
        cmd = ("%s %s -e %s -d %s -i %s -o %s") % (parameters[binary], extra,
            str(parameters["e_value"]), parameters["db_file"],
            parameters["in_file"], outFile)
    elif package == "blast+":
        binary = parameters["blast+"][0]
        extra = parameters[binary + "_params"]
        cmd = ("%s %s -evalue %s -db %s -query %s -out %s") % (
            parameters[binary], extra, str(parameters["e_value"]),
            parameters["db_file"], parameters["in_file"], outFile)

    ## Record host name and the exact command-line before launching it
    print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (getfqdn(), cmd),
        file = logFile)
    logFile.flush()

    try:
        process = sp.Popen(cmd, shell = True, stderr = logFile)
    except OSError as error:
        sys.exit("ERROR: Execution failed: " + str(error))

    ## Abort the whole pipeline on a non-zero exit status
    if process.wait() != 0:
        sys.exit(("ERROR: Execution failed: '%s'") % (parameters[binary]))

    ## legacy_blast may leave an "error.log" behind; drop it when it is empty
    if not lookForFile("error.log"):
        sp.call(("rm -f error.log"), shell = True)
def checkAlignment(ifile_1, ifile_2, iformat_1="fasta", iformat_2="fasta"):
    '''
    Read two given input files and check both contain the same sequences and
    the same residue strings for each identifier.
    Returns True when both files are consistent, False otherwise.
    '''
    ## We introduce a delay to ensure data is already written to disk.
    ## With high-computing facilities, sometimes there are some problems of
    ## writing to disk the already computed results
    if not lookForFile(ifile_1) or not lookForFile(ifile_2, attempts=5):
        return False

    ## Read both input files - removing ambiguous characters and checking for
    ## duplicate names. We use regular expressions for removing any character.
    ## FIX: Python-2 'print >> stream, ...' statements converted to the
    ## Python-3 print() function - the old form is a SyntaxError under
    ## Python 3 and inconsistent with the rest of the module
    inSeqs_1 = {}
    for record in SeqIO.parse(ifile_1, iformat_1):
        if record.id in inSeqs_1:
            print(("ERROR: Repeated sequence '%s' ['%s']") \
                % (record.id, ifile_1), file=sys.stderr)
            return False
        seq = re.sub(r'[^a-zA-Z]', '', str(record.seq))
        inSeqs_1.setdefault(record.id, seq.upper().strip())

    inSeqs_2 = {}
    for record in SeqIO.parse(ifile_2, iformat_2):
        if record.id in inSeqs_2:
            print(("ERROR: Repeated sequence '%s' ['%s']") \
                % (record.id, ifile_2), file=sys.stderr)
            return False
        seq = re.sub(r'[^a-zA-Z]', '', str(record.seq))
        inSeqs_2.setdefault(record.id, seq.upper().strip())

    ## If there are inconsistencies among sequence identifiers, inform about
    ## them
    if set(inSeqs_1.keys()) ^ set(inSeqs_2.keys()) != set():
        print(("ERROR: Non-overlapping sequences identifier detected " +
            "between input ['%s'] and output ['%s'] files ") % (ifile_1,
            ifile_2), file=sys.stderr)
        return False

    ## Check that sequences in both files contain the same residues
    for seq in inSeqs_1:
        if inSeqs_1[seq] != inSeqs_2[seq]:
            print(("ERROR: Different sequence composition for '%s' bet" +
                "ween input ['%s'] and output ['%s'] files") % (seq, ifile_1,
                ifile_2), file=sys.stderr)
            return False

    ## If everything is OK, inform about it
    return True
def checkAlignment(ifile_1, ifile_2, iformat_1 = "fasta", iformat_2 = "fasta"):
    '''
    Verify that two sequence files hold exactly the same data: identical
    identifiers and identical residue strings for each of them. Returns True
    when both files agree, False otherwise.
    '''
    ## A small retry window compensates for slow disk flushes on shared
    ## high-computing file-systems before declaring a file missing
    if not lookForFile(ifile_1) or not lookForFile(ifile_2, attempts = 5):
        return False

    def _load(fname, fformat):
        ## Load sequences keyed by identifier. Non-alphabetic characters are
        ## stripped so gaps/format artefacts never influence the comparison.
        ## Returns None as soon as a duplicated identifier is found
        collected = {}
        for record in SeqIO.parse(fname, fformat):
            if record.id in collected:
                print(("ERROR: Repeated sequence '%s' ['%s']") \
                    % (record.id, fname), file = sys.stderr)
                return None
            residues = re.sub(r'[^a-zA-Z]', '', str(record.seq))
            collected.setdefault(record.id, residues.upper().strip())
        return collected

    inSeqs_1 = _load(ifile_1, iformat_1)
    if inSeqs_1 is None:
        return False
    inSeqs_2 = _load(ifile_2, iformat_2)
    if inSeqs_2 is None:
        return False

    ## Any identifier present in only one of the two files is an error
    if set(inSeqs_1.keys()) ^ set(inSeqs_2.keys()) != set():
        print(("ERROR: Non-overlapping sequences identifier detected " +
            "between input ['%s'] and output ['%s'] files ") % (ifile_1,
            ifile_2), file = sys.stderr)
        return False

    ## Same identifiers: now require identical residue strings as well
    for ident in inSeqs_1:
        if inSeqs_1[ident] != inSeqs_2[ident]:
            print(("ERROR: Different sequence composition for '%s' bet" +
                "ween input ['%s'] and output ['%s'] files") % (ident,
                ifile_1, ifile_2), file = sys.stderr)
            return False

    ## Everything matched
    return True
def reverseSequences(binary, in_file, out_file, replace, logFile):
    '''
    Reverse the input sequences using readAl for that purpose
    '''
    ## Keep any previously generated result unless we are in "replace" mode
    if lookForFile(out_file) and not replace:
        return False

    ## readAl autodetects the input format and whether the sequences are
    ## aligned, so a single command-line covers every case
    cmd = ("%s -in %s -out %s -reverse") % (binary, in_file, out_file)

    ## Record host name and the exact command-line for traceability
    hostname = getfqdn()
    print(("###\n###\treadAl - reverse seqs"), file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (hostname, cmd), file=logFile)

    try:
        process = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
    except OSError as error:
        print("ERROR: Execution failed: " + str(error), file=sys.stderr)
        sys.exit(exit_codes["readal"])

    ## Abort the whole pipeline on a non-zero readAl exit status
    if process.wait() != 0:
        print(("ERROR: Execution failed: readAl"), file=sys.stderr)
        sys.exit(exit_codes["readal"])

    return True
def convertInputFile_Format(label, binary, in_file, out_file, out_format, \
    logFile, replace):
    '''
    Convert a given input file into the requested output format relying on an
    external converter identified by 'label'/'binary'. Returns True when a new
    file has been produced and False when a previous result was kept.
    '''
    ## Keep any previously generated result unless we are in "replace" mode
    if lookForFile(out_file) and not replace:
        return False

    ## A single command-line suffices: the converter detects the input format
    ## on its own, only the target format is requested
    cmd = ("%s -in %s -out %s -%s") % (binary, in_file, out_file, out_format)

    ## Record host name and the exact command-line for traceability
    hostname = getfqdn()
    print(("###\n###\t%s - get format") % (label.upper()), file = logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (hostname, cmd), file = logFile)
    logFile.flush()

    try:
        process = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
    except OSError as error:
        print("ERROR: Execution failed: " + str(error), file = sys.stderr)
        sys.exit(exit_codes[label])

    ## Abort the whole pipeline on a non-zero converter exit status
    if process.wait() != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
        sys.exit(exit_codes[label])

    return True
def convertInputFile_Format(label, binary, in_file, out_file, out_format, \
    logFile, replace):
    '''
    Convert a giving input file into a given output format

    Returns True when a new output file has been produced, False when a
    previously generated one was kept. Exits the interpreter (via
    exit_codes[label]) when the external converter cannot be launched or
    returns a non-zero status.
    '''
    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    ## Define the command-line for getting the input file format - the
    ## converter binary detects the input format itself, only the target
    ## format flag is passed
    cmd = ("%s -in %s -out %s -%s") % (binary, in_file, out_file, out_format)

    ## Log the host and the exact command-line before launching the converter
    name = getfqdn()
    print(("###\n###\t%s - get format") % (label.upper()), file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file=logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## A non-zero exit status from the converter aborts the whole pipeline
    if proc.wait() != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()), file=sys.stderr)
        sys.exit(exit_codes[label])

    return True
def reverseSequences(binary, in_file, out_file, replace, logFile): ''' Reverse the input sequences using readAl for that purpose ''' ## Check whether the output file already exists. If it is not set to replace ## it, just return to the calling function if lookForFile(out_file) and not replace: return False ## Define the command-line for getting the sequences reverse independently of ## being aligned or not and of the input format cmd = ("%s -in %s -out %s -reverse") % (binary, in_file, out_file) name = getfqdn() print(("###\n###\treadAl - reverse seqs"), file = logFile) print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file = logFile) try: proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile) except OSError as e: print("ERROR: Execution failed: " + str(e), file = sys.stderr) sys.exit(exit_codes["readal"]) if proc.wait() != 0: print(("ERROR: Execution failed: readAl"), file = sys.stderr) sys.exit(exit_codes["readal"]) return True
def trimmingAlignment(label, binary, parameters, out_file, logFile, replace, \
    in_file = None, compare_msa = None, force_refer_msa = None, cds = None):
    '''
    Function to trim a given multiple sequence alignment according to a number
    of parameters. It may also returns the output file in codons if appropiate
    parameters are used.

    Returns True when a new output file has been produced, False when a
    previously generated one was kept. Exits the interpreter (via
    exit_codes[label]) when the trimming binary cannot be launched or fails.
    '''
    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    cmd = ""
    ## Construct a customize trimAl command-line call
    ## If an input CDS file is set, generate the output alignment using such
    ## information
    if cds:
        cmd = ("%s -backtrans %s ") % (cmd, cds)
    ## Optional reference alignments - each flag is only appended when its
    ## corresponding argument was supplied by the caller
    if compare_msa:
        cmd = ("%s -compareset %s ") % (cmd, compare_msa)
    if force_refer_msa:
        cmd = ("%s -forceselect %s ") % (cmd, force_refer_msa)
    if in_file:
        cmd = ("%s -in %s ") % (cmd, in_file)
    ## Assemble the final command-line: binary + optional flags + output file
    ## + the free-form 'parameters' string
    cmd = ("%s %s -out %s %s") % (binary, cmd, out_file, parameters)

    ## Record the time and precise command-line
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tTrimming Input MSA\t%s") % (date), file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file=logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## A non-zero exit status aborts the whole pipeline
    if proc.wait() != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()), file=sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()
    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTime\t%s\n###") % (total), file=logFile)
    logFile.flush()

    return True
def trimmingAlignment(label, binary, parameters, out_file, logFile, replace, \
    in_file = None, compare_msa = None, force_refer_msa = None, cds = None):
    '''
    Trim a given multiple sequence alignment according to a number of
    parameters. It may also return the output file in codons when the
    appropriate parameters are used.
    '''
    ## Keep any previously generated result unless we are in "replace" mode
    if lookForFile(out_file) and not replace:
        return False

    ## Accumulate the optional trimAl arguments one by one; each flag is only
    ## added when its corresponding value has been provided by the caller.
    ## A CDS file triggers the back-translated (codon) output
    optional = ""
    if cds:
        optional = ("%s -backtrans %s ") % (optional, cds)
    if compare_msa:
        optional = ("%s -compareset %s ") % (optional, compare_msa)
    if force_refer_msa:
        optional = ("%s -forceselect %s ") % (optional, force_refer_msa)
    if in_file:
        optional = ("%s -in %s ") % (optional, in_file)
    cmd = ("%s %s -out %s %s") % (binary, optional, out_file, parameters)

    ## Record the launch time, host name and the precise command-line
    hostname = getfqdn()
    start = datetime.datetime.now()
    stamp = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tTrimming Input MSA\t%s") % (stamp), file = logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (hostname, cmd), file = logFile)
    logFile.flush()

    try:
        process = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
    except OSError as error:
        print("ERROR: Execution failed: " + str(error), file = sys.stderr)
        sys.exit(exit_codes[label])

    ## Abort the whole pipeline on a non-zero exit status
    if process.wait() != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()
    ## Report how long the trimming step took
    total = format_time(final - start if start else 0)
    print(("###\tTime\t%s\n###") % (total), file = logFile)
    logFile.flush()

    return True
def perform_tree(label, binary, parameters, in_file, out_file, stats_file, \
    logFile, replace):
    '''
    Function to format the command-line of different phylogenetic tree
    reconstruction programs and execute such command lines.

    NOTE(review): this variant ends right after launching the process; the
    complete implementation (waiting for the process and renaming the output
    files) lives in the fuller perform_tree definition in this file.
    '''
    ## Check whether the output file already exists. If it is not set to
    ## replace it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    ## Assemble the program-specific command-line
    if label in ["phyml", "codonphyml"]:
        cmd = ("%s -i %s %s") % (binary, in_file, parameters)
    elif label in ["fasttree"]:
        cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file, \
            out_file, in_file)
    elif label in ["raxml"]:
        ## RAxML needs a run suffix and an explicit random seed
        random_seed = randint(1, 10000)
        suffix = ("%s_%d") % (label, random_seed)
        cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, \
            in_file, parameters)
    else:
        sys.exit(exit_codes["generic"])

    ## Record the time and precise command-line.
    ## FIX: the Python-2 'print >> logFile, ...,' statements (trailing comma
    ## for newline suppression) and 'except OSError, e' are SyntaxErrors under
    ## Python 3 - converted to print(..., end=' ', file=...) and
    ## 'except OSError as e', consistent with the rest of the module
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end = ' ', \
        file = logFile)
    print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd), file = \
        logFile)
    logFile.flush()

    try:
        ## We add a small pipeline to avoid information written in the same line
        proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile, \
            stdin = sp.PIPE)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file = sys.stderr)
        sys.exit(exit_codes[label])
def replaceRareAminoAcids(in_file, out_file, replace, logFile, combinations, \
    back = False, in_format = "fasta"):
    '''
    Replace rare amino-acids occurrence by wildcards, and vice-versa. It will
    only works with input files in FASTA format.

    'combinations' is a whitespace-separated list of "src:dst" pairs; when
    'back' is set each pair is applied in the opposite direction (dst -> src).
    Returns True when the output file has been (re)generated, False when a
    previously generated one was kept.
    '''
    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    ## Build the substitution table from the "src:dst" pairs
    subs = {}
    for comb in map(strip, combinations.split()):
        ## Depending on the direction of the conversion, make it on one way or
        ## in the way around
        src, dst = comb.split(":")[::-1] if back else comb.split(":")
        subs.setdefault(src, dst)

    ## Record some stats about which amino-acids and how many times they have
    ## been detected
    stats = dict([(letter, 0) for letter in subs])

    ## FIX: count occurrences of the source letter BEFORE substituting it.
    ## The previous code counted the replacement character after substitution,
    ## so wildcards already present in the input sequence inflated the
    ## reported frequencies. A context manager also guarantees the output
    ## handle is closed even if parsing fails halfway
    with open(out_file, "w") as oFile:
        for record in SeqIO.parse(in_file, in_format):
            seq = str(record.seq)
            for letter in subs:
                stats[letter] += seq.count(letter)
                seq = seq.replace(letter, subs[letter])
            print((">%s\n%s") % (record.id, splitSequence(seq)), file = oFile)

    ## Report only the letters which have actually been replaced
    output = "|\t".join([("'%s' > '%s'\tfreq: %d") % (aa, subs[aa], stats[aa]) \
        for aa in stats if stats[aa] > 0])

    name = getfqdn()
    print(("###\n###\t[%s]\tSubstituting Rare Amino-Acids") % (name), file = \
        logFile)
    print(("###\tReport\t%s") % (output), file = logFile)
    logFile.flush()

    return True
def replaceRareAminoAcids(in_file, out_file, replace, logFile, combinations, \
    back = False, in_format = "fasta"):
    '''
    Replace rare amino-acids occurrence by wildcards, and vice-versa. It will
    only works with input files in FASTA format

    'combinations' is a whitespace-separated list of "src:dst" pairs; when
    'back' is set each pair is applied in the opposite direction (dst -> src).
    Returns True when the output file has been (re)generated, False when a
    previously generated one was kept.
    '''
    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    ## Build the substitution table from the "src:dst" pairs
    subs = {}
    for comb in map(strip, combinations.split()):
        ## Depending on the direction of the conversion, make it on one way or
        ## in the way around
        src, dst = comb.split(":")[::-1] if back else comb.split(":")
        subs.setdefault(src, dst)

    ## Record some stats about which amino-acids and how many times they have
    ## been detected
    stats = dict([(letter, 0) for letter in subs])

    oFile = open(out_file, "w")
    for record in SeqIO.parse(in_file, in_format):
        seq = str(record.seq)
        for letter in subs:
            seq = seq.replace(letter, subs[letter])
            ## NOTE(review): this counts the replacement character AFTER the
            ## substitution, so wildcards already present in the original
            ## sequence are included in the reported frequency - confirm this
            ## is the intended behaviour
            stats[letter] += seq.count(subs[letter])
        print((">%s\n%s") % (record.id, splitSequence(seq)), file = oFile)
    oFile.close()

    ## Report only the letters which have actually been replaced
    output = "|\t".join([("'%s' > '%s'\tfreq: %d") % (aa, subs[aa], stats[aa]) \
        for aa in stats if stats[aa] > 0])

    name = getfqdn()
    print(("###\n###\t[%s]\tSubstituting Rare Amino-Acids") % (name), file = \
        logFile)
    print(("###\tReport\t%s") % (output), file = logFile)
    logFile.flush()

    return True
def get_likelihood(label, stats_file):
    '''
    Parse the log-likelihood value from the stats/log file produced by the
    given phylogenetic program ('phyml', 'codonphyml', 'fasttree' or 'raxml').
    Returns the value as a float, or None when the file is missing or no
    value can be found.
    '''
    ## Check whether the STATS file is available or not
    if not lookForFile(stats_file):
        return None

    logLK = None

    ## PHYML/CodonPhyML - a single ". Log-likelihood" line holds the value.
    ## FIX: the "rU" open mode was removed in Python 3.11; plain "r" already
    ## performs universal-newline translation. Context managers close the
    ## handles deterministically
    if label in ["phyml", "codonphyml"]:
        with open(stats_file, "r") as stats:
            for line in stats:
                if not line.startswith(". Log-likelihood"):
                    continue
                logLK = float(list(map(strip, line.split()))[2])
                break

    ## FastTree - keep the lowest of all reported "LogLk" values
    elif label in ["fasttree"]:
        with open(stats_file, "r") as stats:
            for line in stats:
                if line.lower().find("loglk") == -1:
                    continue
                f = list(map(strip, line.split("\t")))
                ## FIX: narrow the bare 'except' to the two exceptions a
                ## malformed/short line can actually raise here
                try:
                    value = float(f[2])
                except (ValueError, IndexError):
                    continue
                logLK = value if not logLK or value < logLK else logLK

    ## RAxML - FIX: this branch previously ran for EVERY label (missing
    ## 'elif', despite the "RAXML" comment), re-reading the stats file and
    ## potentially clobbering the value parsed above whenever a
    ## "final ... score" line appeared
    elif label in ["raxml"]:
        with open(stats_file, "r") as stats:
            for line in stats:
                if not line.lower().startswith("final") or \
                    line.lower().find("score") == -1:
                    continue
                logLK = float(list(map(strip, line.split()))[-1])
                break

    ## Return the likelihood value for the current tree
    return logLK
def get_likelihood(label, stats_file):
    '''
    Extract the log-likelihood of a reconstructed tree from the stats file
    written by the program identified by 'label'. Returns a float, or None
    when the file is missing or holds no recognisable value.
    '''
    ## Check whether the STATS file is available or not
    if not lookForFile(stats_file):
        return None

    logLK = None

    ## PHYML/CodonPhyML report the value on a ". Log-likelihood" line.
    ## FIX: "rU" mode was removed in Python 3.11 - use "r" (universal
    ## newlines are the default) and a context manager to close the handle
    if label in ["phyml", "codonphyml"]:
        with open(stats_file, "r") as handle:
            for line in handle:
                if not line.startswith(". Log-likelihood"):
                    continue
                logLK = float(list(map(strip, line.split()))[2])
                break

    ## FastTree - track the minimum over every "LogLk" line
    elif label in ["fasttree"]:
        with open(stats_file, "r") as handle:
            for line in handle:
                if line.lower().find("loglk") == -1:
                    continue
                f = list(map(strip, line.split("\t")))
                ## FIX: bare 'except' narrowed to what a short or
                ## non-numeric field can actually raise
                try:
                    value = float(f[2])
                except (ValueError, IndexError):
                    continue
                logLK = value if not logLK or value < logLK else logLK

    ## RAxML - "final ... score" line. FIX: guarded with 'elif' so this scan
    ## no longer runs for every label and cannot overwrite the value parsed
    ## by the branches above
    elif label in ["raxml"]:
        with open(stats_file, "r") as handle:
            for line in handle:
                if not line.lower().startswith("final") or \
                    line.lower().find("score") == -1:
                    continue
                logLK = float(list(map(strip, line.split()))[-1])
                break

    ## Return the likelihood value for the current tree
    return logLK
## Refuse to overwrite the results of a previous run: the ROOT output folder
## must not exist yet
## NOTE(review): presumably the second positional argument disables folder
## creation - confirm against lookForDirectory's signature
if lookForDirectory(args.outDir, False):
    sys.exit(("ERROR: Output ROOT folder already exist '%s'") % (args.outDir))
args.outDir = os.path.abspath(args.outDir)

## ... and try to create it in case it doesn't exist
if not lookForDirectory(args.outDir, create = True):
    sys.exit(("ERROR: ROOT folder '%s' cannot be created") % (args.outDir))

## Create folders to store the jobs file and (potentially) the configuration
## file and input databases
lookForDirectory(os.path.join(args.outDir, "jobs"))
lookForDirectory(os.path.join(args.outDir, "Data"))
lookForDirectory(os.path.join(args.outDir, "BlastDB"))

## Check parameters related to files / directories
if not lookForFile(os.path.abspath(args.script)):
    sys.exit(("ERROR: Check input SCRIPT file '%s'") % (args.script))
args.script = os.path.abspath(args.script)

## Databases and configuration files will be, by default, copied into the new
## data structure. It will guarantee to have everything under the same ROOT
## folder
if not lookForFile(os.path.abspath(args.configFile)):
    sys.exit(("ERROR: Check input CONFIG file '%s'") % (args.configFile))
args.configFile = os.path.abspath(args.configFile)

## When the "copy" flag is active the configuration file will live inside the
## "jobs" sub-folder of the new ROOT directory; otherwise it is used in place
config = ("%s/jobs/%s") % (args.outDir, os.path.split(args.configFile)[1]) \
    if args.copy else args.configFile

if not lookForFile(os.path.abspath(args.dbFile)):
    sys.exit(("ERROR: Check input TARGET SEQUENCES file '%s'") % (args.dbFile))
## Get current directory - we will use this for normalizing input files and ## directories to their absolute paths current_directory = os.getcwd() ## Assign input parameters directly to the dictionary which will contain all ## current run configuration. parameters = {} parameters.setdefault("replace", args.replace) ## Assign which step is being executed. It is useful to know whether the log ## file should be replaced or not - even when the flag "replace" is set parameters.setdefault("step", 0) ## Check parameters related to files / directories if not lookForFile(args.inFile): sys.exit(("ERROR: Check input QUERY SEQUENCE/s file '%s'") % (args.inFile)) parameters.setdefault("in_file", os.path.abspath(args.inFile)) if not lookForFile(args.dbFile): sys.exit(("ERROR: Check input TARGET SEQUENCES file '%s' [Mode: HOMOLOGY " + "SEARCH]") % (args.dbFile)) parameters.setdefault("db_file", os.path.abspath(args.dbFile)) if args.cdsFile: if not lookForFile(args.cdsFile): sys.exit(("ERROR: Check input CDS file '%s'") % (args.cdsFile)) parameters.setdefault("cds", os.path.abspath(args.cdsFile)) if not lookForFile(args.configFile): sys.exit(("ERROR: Check input CONFIG file '%s'") % (args.configFile))
def phylogenetic_trees(parameters):
    '''
    Phylogenetic trees are reconstructed according to the input parameters.
    Once the different files have been generated, the function moves those
    files into a pre-established filename schema

    Returns the (possibly updated) 'parameters' dictionary. Exits the
    interpreter on configuration errors or when a downstream program fails.
    '''
    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    current_directory = os.getcwd()
    ## Change current directory to the output folder. Any temporary file will
    ## be generated therefore in this folder
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value
    if not "verbose" in parameters or parameters["verbose"] == 0:
        logFile = open(os.devnull, 'wb')
    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file - truncate only on a fresh first
        ## step, append otherwise
        mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
        logFile = open(oFile + ".log", mode)
    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tSTART\t" +
        "%s\n###") % (date), file=logFile)
    logFile.flush()

    ## Get which program will be used to reconstruct phylogenetic trees. Check
    ## such program is listed among the available binaries
    if not "tree" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is no definition for " +
            "the Phylogenetic TREE reconstruction step")

    prog = parameters["tree"][0]
    if not prog in parameters:
        sys.exit(
            ("ERROR: Selected program '%s' is not available accordding to the "
            "the configuration file") % (prog))

    ## Get binary as well as any default parameters for the selected program
    binary = parameters[prog]
    key = ("%s_params") % (prog)
    progr_params = parameters[key] if key in parameters else ""

    if not "evol_models" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is no definition for " +
            "the <evol_models> parameter")

    ## If the evolutionary model list is not appropiately formated, do it
    if isinstance(parameters["evol_models"], str):
        parameters["evol_models"] = list(
            map(strip, parameters["evol_models"].split()))

    ## Check if <numb_models parameters is defined and how many models are
    ## requested to be evaluated
    if not "numb_models" in parameters or parameters["numb_models"].lower() \
        == "all":
        parameters["numb_models"] = len(parameters["evol_models"])
    parameters["numb_models"] = int(parameters["numb_models"])

    if not parameters["numb_models"] in range(
        1, len(parameters["evol_models"]) + 1):
        sys.exit(
            ("ERROR: Check how many evolutionary models has been asked to re" +
            "construct '%d'") % (parameters["numb_models"]))

    ## Check whether "readAl" is available or not. It is useful for sequences
    ## manipulation independently of the input format.
    if not "readal" in parameters:
        sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

    ## Create a temporary FASTA file which will be used to detect the sequence
    ## number on the input alignment and the presence of rare amino-acids
    TEMPFILE = tempfile.NamedTemporaryFile()
    convertInputFile_Format("readal", parameters["readal"],
        parameters["in_file"], TEMPFILE.name, "fasta", logFile,
        parameters["replace"])
    TEMPFILE.flush()

    numSeqs, selenocys, pyrrolys = check_count_sequences(TEMPFILE.name)

    ## Set the minimum number of sequences required to reconstruct an alignment
    min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
        min_seqs_analysis)

    ## Finish when there are not enough sequences to make an alignment
    if numSeqs < min_seqs:
        print(("### INFO: It is necessary, at least, %d sequences to " +
            "to reconstruct an alignment (%d)") % (min_seqs, numSeqs),
            file=logFile)
        sys.exit(80)

    ## Check which approaches should be used for the phylogenetic
    ## reconstruction and whether there are specific program's parameters for
    ## them
    if not "tree_approach" in parameters:
        parameters["tree_approach"] = ["ml"]

    ## Remove potential duplicates and lowercase all approaches for the tree
    ## reconstruction
    parameters["tree_approach"] = set([p.lower() for p in \
        parameters["tree_approach"]])

    ## We will first loot for Neighbour Joining tree reconstruction, then for
    ## Maximum likelihood and then for any other approach defined in the config
    ## file
    tree_approaches = []
    if "nj" in parameters["tree_approach"]:
        tree_approaches.append("nj")
    if "ml" in parameters["tree_approach"]:
        tree_approaches.append("ml")
    others = parameters["tree_approach"] - set(["nj", "ml"])
    if others != set():
        tree_approaches += sorted(others)

    ## When using RAxML, it may crash when Selenocysteines or Pyrrolysines are
    ## present in the input alignment
    if prog in ["raxml"]:
        ## If Selenocysteines or Pyrrolysines are present, substitute them by
        ## "X"
        if selenocys or pyrrolys:
            out_file = ("%s.no_rare_aa") % (parameters["in_file"])
            if replaceRareAminoAcids(TEMPFILE.name, out_file,
                parameters["replace"], logFile, "U:X O:X"):
                parameters["replace"] = True
            parameters["in_file"] = out_file
    TEMPFILE.close()

    ## When using FastTree force the conversion of input alignment to FASTA
    ## format since it may crash reading standard interleave PHYLIP format
    ## files
    if prog in ["fasttree"]:
        in_file_format, aligned = getFileFormat("readal", parameters["readal"], \
            parameters["in_file"], logFile)
        if in_file_format != "fasta":
            out_file = ("%s.fa") % (parameters["in_file"])
            if (convertInputFile_Format("readal", parameters["readal"], \
                parameters["in_file"], out_file, "fasta", logFile,
                parameters["replace"])):
                parameters["replace"] = True
            parameters["in_file"] = out_file

    replace = parameters["replace"]
    selected_models = parameters["evol_models"]

    ## Reconstruct trees for each approach considering evolutionary models
    ## order according their likelihood values
    for approach in tree_approaches:

        ## Save results - we will use such data for selecting the best -if
        ## required- models fitting to the input data
        results = {}

        ## Format the choosen program's parameters according to the default
        ## ones and the specific ones for the current approach
        params = ("%s ") % (progr_params)
        params += parameters[approach] if approach in parameters else ""

        for model in selected_models:
            out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
            stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach, model)

            if prog in ["phyml"]:
                exec_params = ("%s -m %s") % (params, model)

            ## Get additional model -if any- for codons
            elif prog in ["codonphyml"]:
                exec_params = ("%s -m %s") % (params, model)
                add_model = [p.split()[1] for p in map(strip, exec_params.split("-")) \
                    if p.startswith("fmodel")]
                if len(add_model) == 1:
                    add_model = add_model.pop()
                    model = ("%s_%s") % (model, add_model)
                    out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog,
                        approach, model)
                    stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog,
                        approach, model)

            elif prog in ["fasttree"]:
                ## On FastTree is selected by default JTT model for AAs - so we
                ## don't set-up that model
                exec_params = ("%s -%s") % (params, model) if model.lower() != "jtt" \
                    and model.lower() != "jc" else params
                model = model.upper()

            ## In the case of RAxML, we would concatenate the model to an
            ## specific input parameter
            elif prog in ["raxml"]:
                final_model = model
                ## It is possible to add some suffixes to the evolutionary
                ## models in RAxML - There is not better/easy way to code this
                ## option
                if "raxml_model_suffix" in parameters:
                    final_model += parameters["raxml_model_suffix"]
                exec_params = " ".join([
                    ("-%s%s") % (p, final_model if p.startswith("m ") else "")
                    for p in map(strip, params.split("-")) if p
                ])

            ## Build the phylogenetic tree using any of the available methods
            ## and register if any downstream file should be redone.
            if perform_tree(prog, binary, exec_params, parameters["in_file"],
                out_file, stats_file, logFile, parameters["replace"]):
                replace = True

            ## Get the likelihood for each of the reconstructed models
            log_lk = get_likelihood(prog, stats_file)

            if not log_lk:
                print(("ERROR: Impossible to the Log likelihood values " +
                    "for '%s' model using this program '%s'") % (model, prog), file = \
                    sys.stderr)
                sys.exit(exit_codes[prog])

            results.setdefault(model, log_lk)

        ## Get the models sorted by their likelihood values
        records = sorted(iter(results.items()), key=itemgetter(1), reverse=True)

        ## Set the filename which stores the ranking
        rank_file = ("%s.tree.%s.rank.%s") % (oFile, prog, approach)

        update = False
        ## Check the content of the rankings file - if any.
        ## Marked the file as updatable if there is any discrepancy
        if not replace and lookForFile(rank_file):
            old_content = "\n".join([
                "\t".join(list(map(strip, line.split("\t"))))
                for line in open(rank_file, "rU")
            ])
            newly_generated = "\n".join([("%s\t%s") % (r[0], r[1])
                for r in records])
            ## Decide whether ranking file should be updated after comparing
            ## current content with newly generated content
            update = old_content != newly_generated

        ## If the file containing the ranking doesn't exist, generate it.
        ## Update the file content if the replace flag is set to true or the
        ## content has changed - since the phylogenetic tree reconstruction
        ## step is the most expensive one - in terms of time/memory consumption
        ## - we are not setting replace flag to True even when this file is
        ## generated/updated. On this way, we can take adventage of any tree
        ## generated in any downstream step.
        if not lookForFile(rank_file) or replace or update:
            out_file = open(rank_file, "w")
            print("\n".join([("%s\t%s") % (r[0], r[1]) for r in records]), file = \
                out_file)
            out_file.close()

            ## We could set the replace flag to True. However, if any tree has
            ## been generated 'de novo' during this iteration, then the flag is
            ## already set to True.
            #~ parameters["replace"] = True

        ## Select a given number of models for the next iteration - if any
        selected_models = [
            pair[0] for pair in records[:parameters["numb_models"]]
        ]

        ## Remove the Codon Frequency model from potential new iterations
        if prog in ["codonphyml"] and add_model:
            selected_models = [
                m.replace("_" + add_model, "") for m in selected_models
                if m.endswith(add_model)
            ]

    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tEND\t" +
        "%s") % (date), file=logFile)

    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTOTAL Time\tPhylogenetic Tree Reconstruction\t%s" +
        "\n###") % (total), file=logFile)

    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell=True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file") \
                % (oFile), file=sys.stderr)

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters
def perform_tree(label, binary, parameters, in_file, out_file, stats_file, \
  logFile, replace):

  '''
  Format the command-line for the supported phylogenetic tree reconstruction
  programs (PhyML, CodonPhyML, FastTree, RAxML), execute it, and rename the
  resulting tree/stats files according to the pipeline's own naming schema.

  Returns True when a tree has been computed, False when the expected output
  file already exists and 'replace' is not set. Exits the program - with the
  program-specific exit code - on any execution/renaming failure.
  '''

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  ## Build the program-specific command-line
  if label in ["phyml", "codonphyml"]:
    cmd = ("%s -i %s %s") % (binary, in_file, parameters)

  elif label in ["fasttree"]:
    cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file, \
      out_file, in_file)

  elif label in ["raxml"]:
    ## RAxML requires an explicit random seed - it is reused to label the run
    ## so its output files can be located and renamed afterwards
    random_seed = randint(1, 10000)
    suffix = ("%s_%d") % (label, random_seed)
    cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, in_file, \
      parameters)
  else:
    sys.exit(exit_codes["generic"])

  ## Record the time and precise command-line
  name = getfqdn()
  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")

  print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end = ' ', \
    file = logFile)
  print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd), file = \
    logFile)
  logFile.flush()

  try:
    ## We add a small pipeline to avoid information written in the same line
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile,
      stdin = sp.PIPE)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file = sys.stderr)
    sys.exit(exit_codes[label])

  ## Some of the wrapped programs may prompt for interactive confirmation -
  ## feed a couple of newlines plus "Y" so the run never blocks on stdin
  proc.communicate(b'\n\nY\n')

  if proc.wait() != 0:
    print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  final = datetime.datetime.now()
  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTime\t%s\n###") % (total), file = logFile)
  logFile.flush()

  ## Process program's output and rename output files according to our own
  ## scheme
  if label in ["phyml", "codonphyml"]:
    ## Since the resulting tree/stats file names have slightly changed between
    ## versions, we have to control for that
    tree_file = ("%s_%s_tree.txt") % (in_file, label)
    sts_file = ("%s_%s_stats.txt") % (in_file, label)
    if not lookForFile(tree_file, attempts = 2):
      tree_file = ("%s_%s_tree") % (in_file, label)
      sts_file = ("%s_%s_stats") % (in_file, label)

    try:
      sp.call(("mv %s %s") % (tree_file, out_file), shell = True)
      sp.call(("mv %s %s") % (sts_file, stats_file), shell = True)
    except OSError:
      print(("ERROR: Impossible to rename '%s' output files") \
        % (label.upper()), file = sys.stderr)
      sys.exit(exit_codes[label])

  elif label in ["raxml"]:
    try:
      sp.call(("mv RAxML_bestTree.%s %s") % (suffix, out_file), shell = True)
      sp.call(("mv RAxML_info.%s %s") % (suffix, stats_file), shell = True)
    except OSError:
      print(("ERROR: Impossible to rename RAxML output files"), file = \
        sys.stderr)
      sys.exit(exit_codes[label])

    ## Append any other RAxML auxiliary file to the stats file and remove it
    ## afterwards. FIX: use context managers so file handles are always closed,
    ## and plain "r" mode - the "rU" mode was removed in Python 3.11
    with open(stats_file, "a+") as oFile:
      for oth_file in listDirectory(os.path.split(stats_file)[0], suffix):
        fileName = os.path.split(oth_file)[1]
        hz_line = "#" * (len(fileName) + 4)
        print(("%s\n%s\n%s") % (hz_line, fileName, hz_line), file = oFile)
        with open(oth_file, "r") as src:
          print(("%s") % ("".join(src.readlines())), file = oFile)
        sp.call(("rm -f %s") % (oth_file), shell = True)

  return True
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile, \
  replace):

  '''
  Format the command-line of the different supported multiple sequence
  alignment programs and execute it. For programs without specific support in
  the pipeline a generic call is attempted.

  Returns True when an alignment has been computed and validated, False when
  the expected output file already exists and 'replace' is not set. Exits the
  program - with a program-specific exit code - on any failure.

  NOTE(review): the function name is misspelled ("perfom") but it is part of
  the public interface - renaming it would break callers elsewhere.
  '''

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  ## Build the program-specific command-line
  if label in ["muscle", "kalign"]:
    cmd = ("%s %s -in %s -out %s") % (binary, parameters, in_file, out_file)

  elif label in ["clustalw"]:
    cmd = ("%s %s -INFILE=%s -OUTFILE=%s") % (binary, parameters, in_file, \
      out_file)

  elif label in ["clustal_omega"]:
    cmd = ("%s %s --in %s --out %s") % (binary, parameters, in_file, out_file)

  ## MAFFT writes the alignment to stdout - redirect it to the output file
  ## elif label in ["mafft", "dialign_tx"]:
  elif label in ["mafft"]:
    cmd = ("%s %s %s > %s") % (binary, parameters, in_file, out_file)

  elif label in ["prank"]:
    cmd = ("%s %s -d=%s -o=%s") % (binary, parameters, in_file, out_file)

  ## Starting for newer DiAlign-TX versions
  elif label in ["dialign_tx"]:
    cmd = ("%s %s %s %s") % (binary, parameters, in_file, out_file)

  ## On t-coffee case, we need to set-up some ENV variables to be able to run
  ## smoothly the program
  elif label in ["t_coffee", "m_coffee"]:
    sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell = True)
    drc = ("/tmp/tcoffee/%s") % (getuser())
    sp.call(("mkdir -p -m0777 %s") % (drc), shell = True)
    os.putenv("LOCKDIR_4_TCOFFEE", drc)
    os.putenv("TMP_4_TCOFFEE", drc)
    cmd = ("%s %s %s -outfile %s") % (binary, in_file, parameters, out_file)

  ## In any other case, finish with a generic error
  else:
    sys.exit(exit_codes["generic"])

  ## Record the time and precise command-line
  name = getfqdn()
  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")

  print(("###\n###\t%s - Alignment\t%s") % (label.upper(), date), file = \
    logFile)
  print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file = logFile)
  logFile.flush()

  try:
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file = sys.stderr)
    sys.exit(exit_codes[label])

  if proc.wait() != 0:
    print(("ERROR: Execution failed: %s [exit code != -1]") \
      % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  final = datetime.datetime.now()
  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTime\t%s\n###") % (total), file = logFile)
  logFile.flush()

  ## If we are working with PRANK, move output file - which should have a suffix
  ## depending on the output format ("fas" by default, "nex"/"phy" when the
  ## -f= flag selected another format)
  if label in ["prank"]:
    suffix = "fas" if parameters.find("-f=") == -1 else \
      "nex" if parameters.find("-f=nexus") != -1 else "phy"
    if lookForFile(out_file + ".best." + suffix):
      sp.call(("mv %s.best.%s %s") % (out_file, suffix, out_file), shell = True)

  ## If any mode of t_coffee is used: t_coffee or m_coffee, we should remove the
  ## guide tree generate during the program execution
  if label in ["t_coffee", "m_coffee"]:
    guide_tree = ".".join(os.path.split(in_file)[1].split(".")[:-1])
    sp.call(("rm -f %s.dnd") % (guide_tree), shell = True)

  ## Check whether the output alignment has been already generated.
  ## In case something goes wrong, remove the output file and finish the
  ## current execution
  if not checkAlignment(in_file, out_file):
    print(("ERROR: Check input '%s' and output '%s' alignments") % (in_file, \
      out_file), file = sys.stderr)
    print(("ERROR: Execution failed: %s [file check]") % \
      (label.upper()), file = sys.stderr)
    # sp.call(("rm -f %s") % (out_file), shell = True)
    sys.exit(exit_codes[label])

  return True
def phylogenetic_trees(parameters):

  '''
  Phylogenetic trees are reconstructed according to the input parameters. Once
  the different files have been generated, the function moves those files into
  a pre-established filename schema.

  Returns the - possibly updated - 'parameters' dictionary so downstream steps
  can pick up the new input file / replace flag values.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value
  if not "verbose" in parameters or parameters["verbose"] == 0:
    ## FIX: open the null device in text mode. It was opened as 'wb', which
    ## makes every print(..., file = logFile) raise TypeError on Python 3
    ## since 'str' cannot be written to a binary stream
    logFile = open(os.devnull, 'w')
  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)
  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tSTART\t"
    + "%s\n###") % (date), file = logFile)
  logFile.flush()

  ## Get which program will be used to reconstruct phylogenetic trees. Check
  ## such program is listed among the available binaries
  if not "tree" in parameters:
    sys.exit("ERROR: Check your configuration file. There is no definition for "
      + "the Phylogenetic TREE reconstruction step")

  prog = parameters["tree"][0]
  if not prog in parameters:
    sys.exit(("ERROR: Selected program '%s' is not available accordding to the "
      "the configuration file") % (prog))

  ## Get binary as well as any default parameters for the selected program
  binary = parameters[prog]
  key = ("%s_params") % (prog)
  progr_params = parameters[key] if key in parameters else ""

  if not "evol_models" in parameters:
    sys.exit("ERROR: Check your configuration file. There is no definition for "
      + "the <evol_models> parameter")

  ## If the evolutionary model list is not appropiately formated, do it
  if isinstance(parameters["evol_models"], str):
    parameters["evol_models"] = list(map(strip,
      parameters["evol_models"].split()))

  ## Check if <numb_models> parameter is defined and how many models are
  ## requested to be evaluated
  if not "numb_models" in parameters or parameters["numb_models"].lower() \
    == "all":
    parameters["numb_models"] = len(parameters["evol_models"])
  parameters["numb_models"] = int(parameters["numb_models"])

  if not parameters["numb_models"] in range(1,
    len(parameters["evol_models"]) + 1):
    sys.exit(("ERROR: Check how many evolutionary models has been asked to re"
      + "construct '%d'") % (parameters["numb_models"]))

  ## Check whether "readAl" is available or not. It is useful for sequences
  ## manipulation independently of the input format.
  if not "readal" in parameters:
    sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

  ## Create a temporary FASTA file which will be used to detect the sequence
  ## number on the input alignment and the presence of rare amino-acids
  TEMPFILE = tempfile.NamedTemporaryFile()
  convertInputFile_Format("readal", parameters["readal"],
    parameters["in_file"], TEMPFILE.name, "fasta", logFile,
    parameters["replace"])
  TEMPFILE.flush()

  numSeqs, selenocys, pyrrolys = check_count_sequences(TEMPFILE.name)

  ## Set the minimum number of sequences required to reconstruct an alignment
  min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
    min_seqs_analysis)

  ## Finish when there are not enough sequences to make an alignment
  if numSeqs < min_seqs:
    print(("### INFO: It is necessary, at least, %d sequences to "
      + "to reconstruct an alignment (%d)") % (min_seqs, numSeqs),
      file = logFile)
    sys.exit(80)

  ## Check which approaches should be used for the phylogenetic reconstruction
  ## and whether there are specific program's parameters for them
  if not "tree_approach" in parameters:
    parameters["tree_approach"] = ["ml"]

  ## Remove potential duplicates and lowercase all approaches for the tree
  ## reconstruction
  parameters["tree_approach"] = set([p.lower() for p in \
    parameters["tree_approach"]])

  ## We will first look for Neighbour Joining tree reconstruction, then for
  ## Maximum likelihood and then for any other approach defined in the config
  ## file
  tree_approaches = []
  if "nj" in parameters["tree_approach"]:
    tree_approaches.append("nj")
  if "ml" in parameters["tree_approach"]:
    tree_approaches.append("ml")
  others = parameters["tree_approach"] - set(["nj", "ml"])
  if others != set():
    tree_approaches += sorted(others)

  ## When using RAxML, it may crash when Selenocysteines or Pyrrolysines are
  ## present in the input alignment
  if prog in ["raxml"]:
    ## If Selenocysteines or Pyrrolysines are present, substitute them by "X"
    if selenocys or pyrrolys:
      out_file = ("%s.no_rare_aa") % (parameters["in_file"])
      if replaceRareAminoAcids(TEMPFILE.name, out_file, parameters["replace"],
        logFile, "U:X O:X"):
        parameters["replace"] = True
      parameters["in_file"] = out_file
  TEMPFILE.close()

  ## When using FastTree force the conversion of input alignment to FASTA
  ## format since it may crash reading standard interleave PHYLIP format files
  if prog in ["fasttree"]:
    in_file_format, aligned = getFileFormat("readal", parameters["readal"], \
      parameters["in_file"], logFile)
    if in_file_format != "fasta":
      out_file = ("%s.fa") % (parameters["in_file"])
      if (convertInputFile_Format("readal", parameters["readal"], \
        parameters["in_file"], out_file, "fasta", logFile,
        parameters["replace"])):
        parameters["replace"] = True
      parameters["in_file"] = out_file

  replace = parameters["replace"]
  selected_models = parameters["evol_models"]

  ## FIX: initialize 'add_model' so the codonphyml-specific check after the
  ## models loop cannot raise NameError when the loop assigns nothing
  add_model = None

  ## Reconstruct trees for each approach considering evolutionary models order
  ## according their likelihood values
  for approach in tree_approaches:

    ## Save results - we will use such data for selecting the best -if
    ## required- models fitting to the input data
    results = {}

    ## Format the choosen program's parameters according to the default ones
    ## and the specific ones for the current approach
    params = ("%s ") % (progr_params)
    params += parameters[approach] if approach in parameters else ""

    for model in selected_models:
      out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
      stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach, model)

      if prog in ["phyml"]:
        exec_params = ("%s -m %s") % (params, model)

      ## Get additional model -if any- for codons
      elif prog in ["codonphyml"]:
        exec_params = ("%s -m %s") % (params, model)
        add_model = [p.split()[1] for p in map(strip, exec_params.split("-")) \
          if p.startswith("fmodel")]
        if len(add_model) == 1:
          add_model = add_model.pop()
          model = ("%s_%s") % (model, add_model)
          out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
          stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach, model)

      elif prog in ["fasttree"]:
        ## On FastTree is selected by default JTT model for AAs - so we don't
        ## set-up that model
        exec_params = ("%s -%s") % (params, model) if model.lower() != "jtt" \
          and model.lower() != "jc" else params
        model = model.upper()

      ## In the case of RAxML, we would concatenate the model to an specific
      ## input parameter
      elif prog in ["raxml"]:
        final_model = model
        ## It is possible to add some suffixes to the evolutionary models
        ## in RAxML - There is not better/easy way to code this option
        if "raxml_model_suffix" in parameters:
          final_model += parameters["raxml_model_suffix"]
        exec_params = " ".join([("-%s%s") % (p, final_model
          if p.startswith("m ") else "") for p in map(strip, params.split("-"))
          if p])

      ## Build the phylogenetic tree using any of the available methods and
      ## register if any downstream file should be redone.
      if perform_tree(prog, binary, exec_params, parameters["in_file"],
        out_file, stats_file, logFile, parameters["replace"]):
        replace = True

      ## Get the likelihood for each of the reconstructed models
      log_lk = get_likelihood(prog, stats_file)

      if not log_lk:
        print(("ERROR: Impossible to the Log likelihood values "
          + "for '%s' model using this program '%s'") % (model, prog), file = \
          sys.stderr)
        sys.exit(exit_codes[prog])

      results.setdefault(model, log_lk)

    ## Get the models sorted by their likelihood values
    records = sorted(results.items(), key = itemgetter(1), reverse = True)

    ## Set the filename which stores the ranking
    rank_file = ("%s.tree.%s.rank.%s") % (oFile, prog, approach)

    update = False
    ## Check the content of the rankings file - if any.
    ## Marked the file as updatable if there is any discrepancy
    ## FIX: "rU" open mode was removed in Python 3.11 - use plain "r"
    if not replace and lookForFile(rank_file):
      old_content = "\n".join(["\t".join(list(map(strip, line.split("\t"))))
        for line in open(rank_file, "r")])
      newly_generated = "\n".join([("%s\t%s") % (r[0], r[1]) for r in records])
      ## Decide whether ranking file should be updated after comparing current
      ## content with newly generated content
      update = old_content != newly_generated

    ## If the file containing the ranking doesn't exist, generate it.
    ## Update the file content if the replace flag is set to true or the
    ## content has changed - since the phylogenetic tree reconstruction step is
    ## the most expensive one - in terms of time/memory consumption - we are
    ## not setting replace flag to True even when this file is
    ## generated/updated. On this way, we can take adventage of any tree
    ## generated in any downstream step.
    if not lookForFile(rank_file) or replace or update:
      out_file = open(rank_file, "w")
      print("\n".join([("%s\t%s") % (r[0], r[1]) for r in records]), file = \
        out_file)
      out_file.close()
      ## We could set the replace flag to True. However, if any tree has been
      ## generated 'de novo' during this iteration, then the flag is already
      ## set to True.
      #~ parameters["replace"] = True

    ## Select a given number of models for the next iteration - if any
    selected_models = [pair[0] for pair in records[:parameters["numb_models"]]]

    ## Remove the Codon Frequency model from potential new iterations
    if prog in ["codonphyml"] and add_model:
      selected_models = [m.replace("_" + add_model, "") for m in
        selected_models if m.endswith(add_model)]

  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tEND\t"
    + "%s") % (date), file = logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tPhylogenetic Tree Reconstruction\t%s"
    + "\n###") % (total), file = logFile)

  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file") \
        % (oFile), file = sys.stderr)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
def homology(parameters):

  '''
  Perform the full homology-search step: run the selected search tool (BLAST
  package or HMMER package), filter the raw results, and dump the selected
  sequences - plus an MD5 key and, optionally, their associated CDS - into the
  pipeline's output files.

  Returns the 'parameters' dictionary updated with the new input file - and
  CDS file when applicable - for downstream steps.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value
  if not "verbose" in parameters or parameters["verbose"] == 0:
    ## FIX: open the null device in text mode. It was opened as 'wb', which
    ## makes every print(..., file = logFile) raise TypeError on Python 3
    logFile = open(os.devnull, 'w')
  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)
  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tSTART\t%s\n###") % (date), file = logFile)
  logFile.flush()

  ## Get which tool will be used to perform the homology search. Check such
  ## tool is listed among the available binaries
  if not "homology" in parameters:
    sys.exit("ERROR: Check your configuration file. There is not tool set for "
      + "the homology search")

  if not parameters["homology"][0] in parameters:
    ## FIX: the tool name was never interpolated - the message used to print a
    ## literal '%s'
    sys.exit(("ERROR: Check your configuration file. This tool '%s' is not "
      + "among available methods") % (parameters["homology"][0]))

  ## Check whether if an special mode has been selected - for instance
  ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
  ## If not mode is define, we will work with a datatype - normally proteins
  if "cds" in parameters and not parameters["residue_datatype"] in \
    ["prot2codon", "prot2nuc"]:
    sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
      + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

  if not "cds" in parameters and parameters["residue_datatype"] in \
    ["prot2codon", "prot2nuc"]:
    sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
      + "'prot2nuc', an input CDS file is needed")

  ## If the homology search will use any program from the BLAST package, check
  ## whether the TARGET SEQUENCES file has been already formatted.
  if parameters["homology"][0] in ["legacy_blast", "blast+"]:

    ## Get database sequence type - p: protein or n:nucleotide
    dt = "p" if parameters["residue_datatype"].startswith("prot") else "n"

    ## Check if BLAST DB associated files already exist or not
    for extension in ["hr", "in", "sq"]:
      filename = ("%s.%s%s") % (parameters["db_file"], dt, extension)

      ## If the input file doesn't exist check whether input database has been
      ## split into different volumes
      if not lookForFile(filename):
        alternative = ("%s.00.%s%s") % (parameters["db_file"], dt, extension)
        if not lookForFile(alternative):
          db_file = parameters["db_file"]
          sys.exit(("ERROR: Check your input TARGET SEQUENCES file '%s' has "
            + "been formated using 'formatdb'/'makeblastdb'") % (db_file))

    ## If the homology search step should be perfomed using BLAST, call the
    ## appropiate function
    blast(parameters, logFile)
    tag = "blast"

  elif parameters["homology"][0] in ["phmmer", "jackhmmer", "hmmer_search"]:
    hmmer(parameters, logFile)
    ## Set the tag for the output files
    tag = "hmmer"

  ## Check whether the output file contains any result.
  ## FIX: "rU" open mode was removed in Python 3.11 - use plain "r"; the
  ## handle is closed via the context manager instead of being leaked
  homologs = 0
  inFile = ("%s.homology.%s.out") % (oFile, tag)
  with open(inFile, "r") as search_output:
    for line in search_output:
      if not line.strip() or line.startswith("#"):
        continue
      homologs += 1

  if not homologs:
    print(("INFO: NO Homologous sequences found for '%s'") % \
      parameters["prefix"], file = sys.stderr)
    sys.exit(80)

  ## Filter homology search data. A dictionary containing selected sequences,
  ## including the sequences themselves
  selected_sequences = filter_results(parameters, logFile)

  ## Generate a MD5 file containing selected sequences for the current run.
  ## MD5s are used to recompute the same phylogenetic tree starting from other
  ## seqs - with identical similarity search results - in the set of homologs
  outFile = ("%s.seqs.md5") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    ## FIX: hashlib.md5 requires bytes on Python 3 - encode the joined ids
    seqs_md5 = md5("".join(sorted(selected_sequences.keys()))
      .encode("utf-8")).hexdigest()
    print(("%s\t%s") % (parameters["prefix"], seqs_md5), file = \
      open(outFile, "w"))

  ## Generate a file containing the selected sequences after performing the
  ## homology search and filtering its output according to a set of parameters.
  outFile = ("%s.seqs") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    output_file = open(outFile, "w")
    for seqId in sorted(selected_sequences):
      print((">%s\n%s") % (seqId, selected_sequences[seqId][1]), file = \
        output_file)
    output_file.close()

  ## If a CDS input file is set, use it to associate to homologous protein
  ## sequences their corresponding CDS
  if parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]:
    cdsFile = ("%s.seqs_cds") % (oFile)

    ## Check whether the file already exists or not.
    if not lookForFile(cdsFile) or parameters["replace"]:
      parameters["replace"] = True

      output_file = open(cdsFile, "w")
      found = set()
      for record in SeqIO.parse(parameters["cds"], "fasta"):
        if not record.id in selected_sequences:
          continue
        seq = splitSequence(str(record.seq))
        print((">%s\n%s") % (record.id, seq), file = output_file)
        found.add(record.id)
      output_file.close()

      if set(selected_sequences.keys()) - found != set():
        missed = ",".join(sorted(set(selected_sequences.keys()) - found))
        sys.exit(("ERROR: Check your input CDS file '%s'. Impossible to find "
          "homologs sequences [missing:'%s']") % (parameters["cds"], missed))

  ## Print how much time was needed to perform the whole homology search step
  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tEND\t%s") % (date), file = logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tHomology\t%s\n###") % (total), file = logFile)

  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file") \
        % (oFile), file = sys.stderr)

  ## Update the input file parameter and return the dictionary containing all
  ## parameters. Those parameters may be used in other steps
  parameters["in_file"] = outFile

  ## Update the associate CDS file with the resulting cds file. It will be
  ## used to make the back-translation in a hypothetical MSA step
  if parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]:
    parameters["cds"] = ("%s.seqs_cds") % (oFile)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
def filter_results(parameters, logFile):

  '''
  Filter Homology search results taking into account which package was used to
  perform the search. Depending on the package only e-values (HMMER) or
  e-value plus coverage - ratio of aligned region between query and target
  sequences vs. query sequence length - (BLAST) are used.

  Returns a dictionary - keyed by sequence identifier - with the finally
  selected sequences as retrieved from the input sequences database.
  '''

  ## Local import: cmp_to_key adapts the project's old-style comparison
  ## functions (sort_blast_hits/sort_hmmer_hits) to Python 3's key-based sort
  from functools import cmp_to_key

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  ## Get tag for the input/output file. It will depend on which method has been
  ## used to perform the homology seach
  tag = "hmmer" if parameters["homology"][0] in ["phmmer", "jackhmmer", \
    "hmmer_search"] else "blast" if parameters["homology"][0] in \
    ["legacy_blast", "blast+"] else ""

  ## Get input file
  inFile = ("%s.homology.%s.out") % (oFile, tag)

  ## If input file doesn't exist, just go back to the main function
  if not lookForFile(inFile):
    sys.exit(("ERROR: Check previously generated file '%s'") % (inFile))

  ## Get input file name and check whether has been previously generated or
  ## not. It will also affect whether the variable REPLACE is set or not
  outFile = ("%s.homology.%s.filter") % (oFile, tag)

  ## If output file exist and it is not set to replace it, then load the
  ## selected sequences and go back to the main function. Otherwise, set the
  ## replace parameter to TRUE in other to replace any already generated file
  ## downstream
  ## FIX: "rU" open mode was removed in Python 3.11 - plain "r" is used below
  if lookForFile(outFile) and not parameters["replace"]:

    ## Get selected sequences. It will be used to produce MD5s key as well as
    ## to generate the sequences FASTA file
    target_sequences = set()
    for line in open(outFile, "r"):
      ## Parse line
      f = list(map(strip, line.split()))
      parsed = [elem for elem in parseComments([e for e in f if e]) if elem]
      ## Include only target sequences - we assume query sequence had been
      ## include as part of the filtered results
      target_sequences.add(parsed[0] if tag == "hmmer" else parsed[1])

    ## We read selected sequences from input database and return it to the
    ## main function
    selected_sequences = read_database(parameters["db_file"], target_sequences)
    return selected_sequences

  ## We set the replace flag to true in order to reconstruct any downstream
  ## file
  parameters["replace"] = True

  input_lines, target_sequences, query_line = [], set(), None
  for line in open(inFile, "r"):

    ## Parse line
    parsed_line = [element for element in parseComments([e for e in map(strip, \
      line.split()) if e]) if element]

    ## Discard empty lines or those starting by "#"
    if not parsed_line:
      continue

    ## Detect the target sequence which is placed at different columns
    ## depending whether it is blast or hmmer package which generated the
    ## output
    target = parsed_line[0] if tag == "hmmer" else parsed_line[1]

    ## We also include the query sequence, it is only important for the BLAST-
    ## based search
    query = parsed_line[2] if tag == "hmmer" else parsed_line[0]

    ## Store the self-hit line - on this way we make sure we will include the
    ## query protein among the finally selected sequences despite any cut-off
    if target == query and not query_line:
      query_line = [parsed_line]

    ## If current target sequence has not been found yet, register it
    if not target in target_sequences:
      input_lines.append(parsed_line)
      target_sequences.add(target)

  ## We make sure query sequence is included
  sequences = read_database(parameters["db_file"], target_sequences)
  seed_seqs = read_database(parameters["in_file"])

  ## Depending on how the search was performed, we will filter-out data
  ## by e-values and coverage (BLAST only) or not
  e_value = float(parameters["e_value"])
  coverage = float(parameters["coverage"])
  hits = -1 if not "hits" in parameters or parameters["hits"] == "no_limit" \
    else int(parameters["hits"])

  accepted_lines, accepted_targets = [], set()
  for line in input_lines:

    ## If the current target has been already found, move to next hit
    target = line[0] if tag == "hmmer" else line[1]
    if target in accepted_targets:
      continue

    ## Depending on the package, filter just by two e-values (sequence and
    ## best found domain) or by sequence e-value + coverage between sequences
    if tag == "hmmer":
      if float(line[4]) > e_value or float(line[7]) > e_value:
        continue

    elif tag == "blast":
      ## To make sure we have the seed sequence used to perform the homology
      ## search, we read it independently of the input sequence database
      seed = line[0]
      seedSeq = (seed_seqs[seed] if seed in seed_seqs else sequences[seed])[0]
      ## NOTE(review): assumes element [0] of a database entry holds the seed
      ## sequence length and columns 6/7 the alignment start/end coordinates -
      ## verify against read_database
      covTarget = ((int(line[7]) - int(line[6])) + 1) / float(seedSeq)
      if covTarget < coverage or float(line[-2]) > e_value:
        continue

    ## Store current line and target sequence
    accepted_lines.append(line)
    accepted_targets.add(target)

  ## Sort by e-values (and bit-score for BLAST only) accepted lines.
  ## FIX: list.sort() no longer accepts a comparison function as a positional
  ## argument in Python 3 - wrap the old-style comparators with cmp_to_key
  accepted_lines.sort(key = cmp_to_key(sort_blast_hits if tag == "blast" \
    else sort_hmmer_hits))

  ## Recover query ID from query line. Including the starting sequence depends
  ## on the configuration
  query = None
  if query_line:
    query = query_line[0][2] if tag == "hmmer" else query_line[0][0]

  ## FIX: the extra 'query_line' guard avoids a 'None + list' TypeError when
  ## 'force_seed_sequence' is set but no self-hit was present in the output
  if query_line and not query in accepted_targets and \
    parameters["force_seed_sequence"]:
    accepted_lines = query_line + accepted_lines

  ## Limit the number of accepted hits - if requested
  if hits != -1 and len(accepted_lines) > hits:
    accepted_lines = accepted_lines[:hits]

  ## Get selected sequences. It will be used to produce MD5s key as well as to
  ## generate the sequences FASTA file
  selected_sequences = {}
  for line in accepted_lines:
    sequence_id = line[0] if tag == "hmmer" else line[1]
    selected_sequences.setdefault(sequence_id, sequences[sequence_id])

  ## Dump the accepted lines - the output handle is closed via the context
  ## manager instead of being leaked
  out = ["\t".join([str(x).ljust(6) for x in l]) for l in accepted_lines]
  with open(outFile, "w") as filter_file:
    print("\n".join(out), file = filter_file)

  return selected_sequences
def hmmer(parameters, logFile):
  ''' Perform the homology search using one of the HMMER package programs:
      phmmer, jackhmmer or hmmsearch.

      For "hmmsearch", the input Multiple Sequence Alignment is first converted
      to FASTA (via readAl) and compiled into an HMM profile with hmmbuild; the
      profile then replaces parameters["in_file"] for the search itself.

      Parameters
      ----------
      parameters : dict
        Pipeline configuration. Reads (at least) "out_directory", "prefix",
        "replace", "homology", "hmmer_params", "e_value", "in_file", "db_file";
        may also read "readal", "hmmbuild" and "residue_datatype". Mutates
        "replace" (set to True) and, for hmmsearch, "in_file".
      logFile : file-like
        Open stream where command lines and program output are logged.

      Side effects: runs external programs through the shell and writes
      "<prefix>.homology.hmmer.out" (and possibly ".hmm") in the output folder.
      Exits the interpreter on any execution failure.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  ## Get output file name and check whether it has been previously generated.
  ## It will also affect whether the variable REPLACE is set or not
  outFile = ("%s.homology.hmmer.out") % (oFile)

  ## If the output file exists and it is not set to be replaced, just go back
  ## to the main function. Otherwise, force downstream regeneration
  if lookForFile(outFile) and not parameters["replace"]:
    return
  parameters["replace"] = True

  ## If we are asked to perform a HMM search using a Multiple Sequence
  ## Alignment as input rather than a single sequence, we need first to
  ## construct a HMM to perform the search
  if parameters["homology"][0] == "hmmsearch":
    if "readal" not in parameters or "hmmbuild" not in parameters:
      sys.exit(("ERROR: Check your CONFIG file to search whether 'readAl' and "
        + "'hmmbuild' are available"))

    ## Create a temporary FASTA file which will be used as input for HMMBuild
    TEMPFILE = tempfile.NamedTemporaryFile()
    convertInputFile_Format("readal", parameters["readal"],
      parameters["in_file"], TEMPFILE.name, "fasta", logFile,
      parameters["replace"])
    TEMPFILE.flush()

    ## Generate the profile.
    ## Set the residue type to amino-acids if the search is performed using
    ## proteins; otherwise, let hmmbuild guess it
    dt = "--amino" if parameters["residue_datatype"].startswith("prot") else ""

    hmmFile = ("%s.homology.hmmer.hmm") % (oFile)
    cmd = ("%s --informat afa %s %s %s") % (parameters["hmmbuild"], dt,
      hmmFile, TEMPFILE.name)

    name = getfqdn()
    print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (name, cmd), file = \
      logFile)
    logFile.flush()

    try:
      proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
    except OSError as e:
      sys.exit("ERROR: Execution failed: " + str(e))

    if proc.wait() != 0:
      sys.exit(("ERROR: Execution failed: '%s'") % ("hmmbuild"))

    ## We update the input file for performing the HMM-based homology search
    parameters["in_file"] = hmmFile
    TEMPFILE.close()

  ## Generate command-line depending on HMMER specific program and parameters
  binary = parameters["homology"][0]
  params = parameters["hmmer_params"]
  cmd = ("%s %s -E %s --tblout %s %s %s") % (parameters[binary], params, \
    str(parameters["e_value"]), outFile, parameters["in_file"], \
    parameters["db_file"])

  name = getfqdn()
  print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (name, cmd), file = \
    logFile)
  logFile.flush()

  try:
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
  except OSError as e:
    sys.exit("ERROR: Execution failed: " + str(e))

  if proc.wait() != 0:
    sys.exit(("ERROR: Execution failed: '%s'") % (parameters[binary]))
def hmmer(parameters, logFile):
  ''' Run the homology search through the HMMER package (phmmer, jackhmmer or
      hmmsearch). When hmmsearch is selected, an HMM profile is first built
      from the input alignment and used as the query.

      NOTE(review): this definition appears to be a duplicate of an identical
      hmmer() defined earlier in this file - at import time this later copy
      silently wins; consider removing one of them.
  '''

  ## Generic output prefix and final results file
  prefix_path = os.path.join(parameters["out_directory"], parameters["prefix"])
  results_file = ("%s.homology.hmmer.out") % (prefix_path)

  ## Nothing to do when results already exist and replacement is disabled.
  ## Otherwise force regeneration of every downstream file
  if lookForFile(results_file) and not parameters["replace"]:
    return
  parameters["replace"] = True

  ## hmmsearch takes an HMM profile, so build one from the input MSA first
  if parameters["homology"][0] == "hmmsearch":
    if not ("readal" in parameters and "hmmbuild" in parameters):
      sys.exit(("ERROR: Check your CONFIG file to search whether 'readAl' and "
        + "'hmmbuild' are available"))

    ## Temporary FASTA dump of the alignment, consumed by hmmbuild
    msa_fasta = tempfile.NamedTemporaryFile()
    convertInputFile_Format("readal", parameters["readal"],
      parameters["in_file"], msa_fasta.name, "fasta", logFile,
      parameters["replace"])
    msa_fasta.flush()

    ## Force amino-acid mode for protein data; otherwise let hmmbuild guess
    datatype_flag = ("--amino"
      if parameters["residue_datatype"].startswith("prot") else "")

    profile_file = ("%s.homology.hmmer.hmm") % (prefix_path)
    build_cmd = ("%s --informat afa %s %s %s") % (parameters["hmmbuild"],
      datatype_flag, profile_file, msa_fasta.name)

    host = getfqdn()
    print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (host, build_cmd),
      file = logFile)
    logFile.flush()

    try:
      build_proc = sp.Popen(build_cmd, shell = True, stderr = logFile,
        stdout = logFile)
    except OSError as e:
      sys.exit("ERROR: Execution failed: " + str(e))

    if build_proc.wait() != 0:
      sys.exit(("ERROR: Execution failed: '%s'") % ("hmmbuild"))

    ## From here on, the search input is the freshly built profile
    parameters["in_file"] = profile_file
    msa_fasta.close()

  ## Assemble and run the actual HMMER search command
  program = parameters["homology"][0]
  extra_params = parameters["hmmer_params"]
  search_cmd = ("%s %s -E %s --tblout %s %s %s") % (parameters[program],
    extra_params, str(parameters["e_value"]), results_file,
    parameters["in_file"], parameters["db_file"])

  host = getfqdn()
  print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (host, search_cmd),
    file = logFile)
  logFile.flush()

  try:
    search_proc = sp.Popen(search_cmd, shell = True, stderr = logFile,
      stdout = logFile)
  except OSError as e:
    sys.exit("ERROR: Execution failed: " + str(e))

  if search_proc.wait() != 0:
    sys.exit(("ERROR: Execution failed: '%s'") % (parameters[program]))
## NOTE(review): legacy Python-2 copy of perfomAlignment() - it still uses
## "print >>" and "except OSError, e" syntax, so it cannot run under Python 3.
## The text also looks truncated/garbled (it ends mid-statement), presumably a
## leftover from a 2to3 conversion; a working Python-3 perfomAlignment() exists
## later in this file. TODO: confirm and delete this dead copy.
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile, \ replace): ''' Function to format the command-line of different multiple sequence alignment programs and execute such command lines. It is also support a generic call for those programs which has no specific support in the pipeline ''' ## Check whether the output file already exists. If it is not set to replace ## it, just return to the calling function if lookForFile(out_file) and not replace: return False if label in ["muscle", "kalign"]: cmd = ("%s %s -in %s -out %s") % (binary, parameters, in_file, out_file) elif label in ["clustalw"]: cmd = ("%s %s -INFILE=%s -OUTFILE=%s") % (binary, parameters, in_file, \ out_file) elif label in ["clustal_omega"]: cmd = ("%s %s --in %s --out %s") % (binary, parameters, in_file, out_file) ## elif label in ["mafft", "dialign_tx"]: elif label in ["mafft"]: cmd = ("%s %s %s > %s") % (binary, parameters, in_file, out_file) elif label in ["prank"]: cmd = ("%s %s -d=%s -o=%s") % (binary, parameters, in_file, out_file) ## Starting for newer DiAlign-TX versions elif label in ["dialign_tx"]: cmd = ("%s %s %s %s") % (binary, parameters, in_file, out_file) ## On t-coffee case, we need to set-up some ENV variables to be able to run ## smoothly the program elif label in ["t_coffee", "m_coffee"]: sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell=True) drc = ("/tmp/tcoffee/%s") % (getuser()) sp.call(("mkdir -p -m0777 %s") % (drc), shell=True) os.putenv("LOCKDIR_4_TCOFFEE", drc) os.putenv("TMP_4_TCOFFEE", drc) cmd = ("%s %s %s -outfile %s") % (binary, in_file, parameters, out_file) ## In any other case, finish with a generic error else: sys.exit(exit_codes["generic"]) ## Record the time and precise command-line name = getfqdn() start = datetime.datetime.now() date = start.strftime("%H:%M:%S %m/%d/%y") print >> logFile, ("###\n###\t%s - Alignment\t%s") % (label.upper(), date) print >> logFile, ("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd) logFile.flush() try: proc = 
sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile) except OSError, e: print >> sys.stderr, "ERROR: Execution failed: " + str(e) sys.exit(exit_codes[label])
print >> sys.stderr, ("ERROR: Execution failed: %s [exit code != -1]") \ % (label.upper()) sys.exit(exit_codes[label]) final = datetime.datetime.now() ## We return a DELTA object comparing both timestamps total = format_time(final - start if start else 0) print >> logFile, ("###\tTime\t%s\n###") % (total) logFile.flush() ## If we are working with PRANK, move output file - which should have a suffix ## depending on the output format if label in ["prank"]: suffix = "fas" if parameters.find("-f=") == -1 else \ "nex" if parameters.find("-f=nexus") != -1 else "phy" if lookForFile(out_file + ".best." + suffix): sp.call(("mv %s.best.%s %s") % (out_file, suffix, out_file), shell=True) ## If any mode of t_coffee is used: t_coffee or m_coffee, we should remove the ## guide tree generate during the program execution if label in ["t_coffee", "m_coffee"]: guide_tree = ".".join(os.path.split(in_file)[1].split(".")[:-1]) sp.call(("rm -f %s.dnd") % (guide_tree), shell=True) ## Check whether the output alignment has been already generated. ## In case something goes wrong, remove the output file and finish the ## current execution if not checkAlignment(in_file, out_file): print in_file, out_file print >> sys.stderr, ("ERROR: Execution failed: %s [file check]") % \
def filter_results(parameters, logFile):
  ''' Filter homology-search results taking into account which package was
      used to perform the search. Depending on the package, only e-values
      (HMMER) or e-value plus coverage - ratio of aligned region between query
      and target sequences vs. query sequence length - (BLAST) are used.

      Parameters
      ----------
      parameters : dict
        Pipeline configuration. Reads "out_directory", "prefix", "homology",
        "replace", "db_file", "in_file", "e_value", "coverage",
        "force_seed_sequence" and optionally "hits". May set "replace" to True.
      logFile : file-like
        Log stream (currently unused here; kept for interface consistency).

      Returns
      -------
      dict mapping selected sequence id -> sequence record (as produced by
      read_database). Also writes "<prefix>.homology.<tag>.filter".
  '''
  ## list.sort() in Python 3 no longer accepts a comparison function
  ## positionally; the legacy comparators are adapted through cmp_to_key below
  from functools import cmp_to_key

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  ## Get tag for the input/output file depending on which method was used to
  ## perform the homology search
  tag = "hmmer" if parameters["homology"][0] in ["phmmer", "jackhmmer", \
    "hmmer_search"] else "blast" if parameters["homology"][0] in \
    ["legacy_blast", "blast+"] else ""

  ## Get input file - it must exist from the previous step
  inFile = ("%s.homology.%s.out") % (oFile, tag)
  if not lookForFile(inFile):
    sys.exit(("ERROR: Check previously generated file '%s'") % (inFile))

  outFile = ("%s.homology.%s.filter") % (oFile, tag)

  ## If the filtered file already exists and replacement is disabled, recover
  ## the previously selected sequences and return them
  if lookForFile(outFile) and not parameters["replace"]:
    target_sequences = set()
    ## "rU" open mode was removed in Python 3.11; universal newlines are the
    ## default for text mode anyway
    for line in open(outFile, "r"):
      ## Tokens produced by split() carry no surrounding whitespace, so the
      ## per-token strip is kept only for parity with the legacy code
      fields = [e.strip() for e in line.split()]
      parsed = [elem for elem in parseComments([e for e in fields if e])
        if elem]
      ## Include only target sequences - we assume the query sequence had been
      ## included as part of the filtered results
      target_sequences.add(parsed[0] if tag == "hmmer" else parsed[1])
    selected_sequences = read_database(parameters["db_file"], target_sequences)
    return selected_sequences

  ## We set the replace flag to true in order to reconstruct any downstream
  ## file
  parameters["replace"] = True

  input_lines, target_sequences, query_line = [], set(), None
  for line in open(inFile, "r"):
    ## Parse line, discarding empty tokens and comment markers
    parsed_line = [element for element in parseComments([e.strip() for e in \
      line.split() if e.strip()]) if element]

    ## Discard empty lines or those starting by "#"
    if not parsed_line:
      continue

    ## Target/query columns differ between BLAST and HMMER tabular outputs
    target = parsed_line[0] if tag == "hmmer" else parsed_line[1]
    query = parsed_line[2] if tag == "hmmer" else parsed_line[0]

    ## Store the self-hit line - on this way we make sure we will include the
    ## query protein among the finally selected sequences despite any cut-off
    if target == query and not query_line:
      query_line = [parsed_line]

    ## If the current target sequence has not been found yet, register it
    if target not in target_sequences:
      input_lines.append(parsed_line)
      target_sequences.add(target)

  ## We make sure the query sequence is included
  sequences = read_database(parameters["db_file"], target_sequences)
  seed_seqs = read_database(parameters["in_file"])

  ## Depending on how the search was performed, filter data by e-values and
  ## coverage (BLAST only) or by e-values alone
  e_value = float(parameters["e_value"])
  coverage = float(parameters["coverage"])
  hits = -1 if "hits" not in parameters or parameters["hits"] == "no_limit" \
    else int(parameters["hits"])

  accepted_lines, accepted_targets = [], set()
  for line in input_lines:
    ## If the current target has been already found, move to the next hit
    target = line[0] if tag == "hmmer" else line[1]
    if target in accepted_targets:
      continue

    if tag == "hmmer":
      ## Filter by sequence e-value and best-domain e-value
      if float(line[4]) > e_value or float(line[7]) > e_value:
        continue
    elif tag == "blast":
      ## To make sure we have the seed sequence used to perform the homology
      ## search, we read it independently of the input sequence database.
      ## NOTE(review): element [0] of the record is assumed to be the seed
      ## sequence length - confirm against read_database
      seed = line[0]
      seedSeq = (seed_seqs[seed] if seed in seed_seqs else sequences[seed])[0]
      covTarget = ((int(line[7]) - int(line[6])) + 1) / float(seedSeq)
      if covTarget < coverage or float(line[-2]) > e_value:
        continue

    ## Store current line and target sequence
    accepted_lines.append(line)
    accepted_targets.add(target)

  ## Sort by e-values (and bit-score for BLAST only) accepted lines.
  ## sort_blast_hits/sort_hmmer_hits are legacy cmp-style comparators
  accepted_lines.sort(key = cmp_to_key(sort_blast_hits if tag == "blast" \
    else sort_hmmer_hits))

  ## Recover the query ID from the query line. Including the starting sequence
  ## depends on the configuration
  query = None
  if query_line:
    query = query_line[0][2] if tag == "hmmer" else query_line[0][0]
    if query not in accepted_targets and parameters["force_seed_sequence"]:
      accepted_lines = query_line + accepted_lines

  if hits != -1 and len(accepted_lines) > hits:
    accepted_lines = accepted_lines[:hits]

  ## Get selected sequences. It will be used to produce MD5 keys as well as to
  ## generate the sequences FASTA file
  selected_sequences = {}
  for line in accepted_lines:
    sequence_id = line[0] if tag == "hmmer" else line[1]
    selected_sequences.setdefault(sequence_id, sequences[sequence_id])

  out = ["\t".join([str(x).ljust(6) for x in l]) for l in accepted_lines]
  ## Use a context manager so the filter file is flushed and closed
  with open(outFile, "w") as filter_file:
    print("\n".join(out), file = filter_file)

  return selected_sequences
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile, \
  replace):
  ''' Format the command-line of different multiple sequence alignment
      programs and execute it. It also supports a generic call for those
      programs which have no specific support in the pipeline.

      Parameters
      ----------
      label : str
        Aligner identifier, e.g. "muscle", "mafft", "t_coffee", ...
      binary : str
        Path/name of the aligner executable.
      parameters : str
        Extra command-line parameters for the aligner.
      in_file, out_file : str
        Input sequences file and output alignment file.
      logFile : file-like
        Log stream for command lines and timing.
      replace : bool
        When False and out_file exists, the call is a no-op.

      Returns
      -------
      True when a new alignment was generated; False when the existing output
      was kept. Exits the interpreter on execution or validation failure.
  '''

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  if label in ["muscle", "kalign"]:
    cmd = ("%s %s -in %s -out %s") % (binary, parameters, in_file, out_file)

  elif label in ["clustalw"]:
    cmd = ("%s %s -INFILE=%s -OUTFILE=%s") % (binary, parameters, in_file, \
      out_file)

  elif label in ["clustal_omega"]:
    cmd = ("%s %s --in %s --out %s") % (binary, parameters, in_file, out_file)

  ## elif label in ["mafft", "dialign_tx"]:
  elif label in ["mafft"]:
    cmd = ("%s %s %s > %s") % (binary, parameters, in_file, out_file)

  elif label in ["prank"]:
    cmd = ("%s %s -d=%s -o=%s") % (binary, parameters, in_file, out_file)

  ## Starting for newer DiAlign-TX versions
  elif label in ["dialign_tx"]:
    cmd = ("%s %s %s %s") % (binary, parameters, in_file, out_file)

  ## On t-coffee case, we need to set-up some ENV variables to be able to run
  ## smoothly the program
  elif label in ["t_coffee", "m_coffee"]:
    sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell=True)
    drc = ("/tmp/tcoffee/%s") % (getuser())
    sp.call(("mkdir -p -m0777 %s") % (drc), shell=True)
    ## os.putenv does not update os.environ (documented CPython caveat);
    ## assigning through os.environ keeps both views consistent and still
    ## propagates the variables to the child process
    os.environ["LOCKDIR_4_TCOFFEE"] = drc
    os.environ["TMP_4_TCOFFEE"] = drc
    cmd = ("%s %s %s -outfile %s") % (binary, in_file, parameters, out_file)

  ## In any other case, finish with a generic error
  else:
    sys.exit(exit_codes["generic"])

  ## Record the time and precise command-line
  name = getfqdn()
  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\t%s - Alignment\t%s") % (label.upper(), date), file = \
    logFile)
  print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file=logFile)
  logFile.flush()

  try:
    proc = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file=sys.stderr)
    sys.exit(exit_codes[label])

  if proc.wait() != 0:
    print(("ERROR: Execution failed: %s [exit code != -1]") \
      % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  final = datetime.datetime.now()
  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTime\t%s\n###") % (total), file=logFile)
  logFile.flush()

  ## If we are working with PRANK, move the output file - which should have a
  ## suffix depending on the output format
  if label in ["prank"]:
    suffix = "fas" if parameters.find("-f=") == -1 else \
      "nex" if parameters.find("-f=nexus") != -1 else "phy"
    if lookForFile(out_file + ".best." + suffix):
      sp.call(("mv %s.best.%s %s") % (out_file, suffix, out_file), shell=True)

  ## If any mode of t_coffee is used: t_coffee or m_coffee, we should remove
  ## the guide tree generated during the program execution
  if label in ["t_coffee", "m_coffee"]:
    guide_tree = ".".join(os.path.split(in_file)[1].split(".")[:-1])
    sp.call(("rm -f %s.dnd") % (guide_tree), shell=True)

  ## Check whether the output alignment has been correctly generated.
  ## In case something goes wrong, report it and finish the current execution
  if not checkAlignment(in_file, out_file):
    print(("ERROR: Check input '%s' and output '%s' alignments") % (in_file, \
      out_file), file = sys.stderr)
    print(("ERROR: Execution failed: %s [file check]") % \
      (label.upper()), file = sys.stderr)
    # sp.call(("rm -f %s") % (out_file), shell = True)
    sys.exit(exit_codes[label])

  return True
## NOTE(review): orphaned Python-2 fragment - it uses "print >>" syntax and
## references locals (label, in_file, out_file, stats_file, suffix) with no
## enclosing "def", so it looks like leftover text from the legacy
## perform_tree() before the 2to3 conversion. The complete Python-3
## perform_tree() follows below. TODO: confirm and delete this dead fragment.
final = datetime.datetime.now() ## We return a DELTA object comparing both timestamps total = format_time(final - start if start else 0) print >> logFile, ("###\tTime\t%s\n###") % (total) logFile.flush() ## Process program's output and rename output files according to our own ## scheme if label in ["phyml", "codonphyml"]: ## Since resulting tree/stats file have slightly changed between version, ## we have to control for that. tree_file = ("%s_%s_tree.txt") % (in_file, label) sts_file = ("%s_%s_stats.txt") % (in_file, label) if not lookForFile(tree_file, attempts=2): tree_file = ("%s_%s_tree") % (in_file, label) sts_file = ("%s_%s_stats") % (in_file, label) try: sp.call(("mv %s %s") % (tree_file, out_file), shell=True) sp.call(("mv %s %s") % (sts_file, stats_file), shell=True) except OSError: print >> sys.stderr, ("ERROR: Impossible to rename '%s' output files") \ % (label.upper()) sys.exit(exit_codes[label]) elif label in ["raxml"]: try: sp.call(("mv RAxML_bestTree.%s %s") % (suffix, out_file), shell=True)
def perform_tree(label, binary, parameters, in_file, out_file, stats_file, \
  logFile, replace):
  ''' Format the command-line of different phylogenetic tree reconstruction
      programs and execute it.

      Parameters
      ----------
      label : str
        Program identifier: "phyml", "codonphyml", "fasttree" or "raxml".
      binary : str
        Path/name of the executable.
      parameters : str
        Extra command-line parameters for the program.
      in_file, out_file, stats_file : str
        Input alignment, output tree file and output statistics file.
      logFile : file-like
        Log stream for command lines and timing.
      replace : bool
        When False and out_file exists, the call is a no-op.

      Returns
      -------
      True when a new tree was generated; False when the existing output was
      kept. Exits the interpreter on any execution failure.
  '''

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  if label in ["phyml", "codonphyml"]:
    cmd = ("%s -i %s %s") % (binary, in_file, parameters)

  elif label in ["fasttree"]:
    cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file, \
      out_file, in_file)

  elif label in ["raxml"]:
    ## RAxML demands an explicit random seed; the same value tags its output
    ## file names so they can be located afterwards
    random_seed = randint(1, 10000)
    suffix = ("%s_%d") % (label, random_seed)
    cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, \
      in_file, parameters)

  else:
    sys.exit(exit_codes["generic"])

  ## Record the time and precise command-line
  name = getfqdn()
  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end = ' ', \
    file = logFile)
  print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd), file = \
    logFile)
  logFile.flush()

  try:
    ## We add a small pipeline to avoid information written in the same line
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile,
      stdin = sp.PIPE)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file=sys.stderr)
    sys.exit(exit_codes[label])

  ## Feed the interactive prompts some programs (e.g. PhyML) may show
  proc.communicate(b'\n\nY\n')
  if proc.wait() != 0:
    print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  final = datetime.datetime.now()
  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTime\t%s\n###") % (total), file=logFile)
  logFile.flush()

  ## Process the program's output and rename output files according to our own
  ## scheme
  if label in ["phyml", "codonphyml"]:
    ## Since resulting tree/stats file names have slightly changed between
    ## versions, we have to control for that
    tree_file = ("%s_%s_tree.txt") % (in_file, label)
    sts_file = ("%s_%s_stats.txt") % (in_file, label)
    if not lookForFile(tree_file, attempts = 2):
      tree_file = ("%s_%s_tree") % (in_file, label)
      sts_file = ("%s_%s_stats") % (in_file, label)
    try:
      sp.call(("mv %s %s") % (tree_file, out_file), shell = True)
      sp.call(("mv %s %s") % (sts_file, stats_file), shell = True)
    except OSError:
      print(("ERROR: Impossible to rename '%s' output files") \
        % (label.upper()), file=sys.stderr)
      sys.exit(exit_codes[label])

  elif label in ["raxml"]:
    try:
      sp.call(("mv RAxML_bestTree.%s %s") % (suffix, out_file), shell = True)
      sp.call(("mv RAxML_info.%s %s") % (suffix, stats_file), shell = True)
    except OSError:
      print(("ERROR: Impossible to rename RAxML output files"), file = \
        sys.stderr)
      sys.exit(exit_codes[label])

    ## Append any remaining RAxML side files to the stats file and delete them.
    ## Context managers guarantee the handles are closed; "rU" open mode was
    ## removed in Python 3.11, plain "r" already handles universal newlines
    with open(stats_file, "a+") as oFile:
      for oth_file in listDirectory(os.path.split(stats_file)[0], suffix):
        fileName = os.path.split(oth_file)[1]
        hz_line = "#" * (len(fileName) + 4)
        print(("%s\n%s\n%s") % (hz_line, fileName, hz_line), file = oFile)
        with open(oth_file, "r") as extra:
          print(("%s") % ("".join(extra.readlines())), file = oFile)
        sp.call(("rm -f %s") % (oth_file), shell = True)

  return True
def homology(parameters):
  ''' Run the whole homology-search step: dispatch to BLAST or HMMER, filter
      the raw hits, and write the selected sequences (plus MD5 key and,
      optionally, their CDS) into the output folder.

      Parameters
      ----------
      parameters : dict
        Pipeline configuration. Reads "out_directory", "prefix", "verbose",
        "replace", "step", "homology", "residue_datatype", "db_file" and
        optionally "cds". Mutates "replace", "in_file" and possibly "cds".

      Returns
      -------
      The (mutated) parameters dict, with "in_file" pointing at the selected
      sequences FASTA file. Exits the interpreter on configuration or
      execution errors, and with code 80 when no homolog is found.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will
  ## therefore be generated in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value
  if "verbose" not in parameters or parameters["verbose"] == 0:
    ## Text mode ('w'), not 'wb': print() writes str objects and would raise
    ## a TypeError on a binary handle
    logFile = open(os.devnull, 'w')
  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)
  ## ALL/Stderr
  else:
    ## Any other verbosity value (original code handled only == 2 and left
    ## logFile undefined otherwise) logs to stderr
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tSTART\t%s\n###") % (date), file=logFile)
  logFile.flush()

  ## Get which tool will be used to perform the homology search. Check such
  ## tool is listed among the available binaries
  if "homology" not in parameters:
    sys.exit("ERROR: Check your configuration file. There is not tool set for "
      + "the homology search")

  if parameters["homology"][0] not in parameters:
    ## Interpolate the offending tool name (the legacy message contained a
    ## '%s' placeholder that was never filled in)
    sys.exit(("ERROR: Check your configuration file. This tool '%s' is not "
      + "among available methods") % (parameters["homology"][0]))

  ## Check whether a special mode has been selected - for instance
  ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
  if "cds" in parameters and parameters["residue_datatype"] not in \
    ["prot2codon", "prot2nuc"]:
    sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
      + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

  if "cds" not in parameters and parameters["residue_datatype"] in \
    ["prot2codon", "prot2nuc"]:
    sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
      + "'prot2nuc', an input CDS file is needed")

  ## If the homology search will use any program from the BLAST package, check
  ## whether the TARGET SEQUENCES file has been already formatted
  if parameters["homology"][0] in ["legacy_blast", "blast+"]:

    ## Get database sequence type - p: protein or n: nucleotide
    dt = "p" if parameters["residue_datatype"].startswith("prot") else "n"

    ## Check if BLAST DB associated files already exist or not
    for extension in ["hr", "in", "sq"]:
      filename = ("%s.%s%s") % (parameters["db_file"], dt, extension)

      ## If the input file doesn't exist, check whether the input database has
      ## been split into different volumes
      if not lookForFile(filename):
        alternative = ("%s.00.%s%s") % (parameters["db_file"], dt, extension)
        if not lookForFile(alternative):
          db_file = parameters["db_file"]
          sys.exit(("ERROR: Check your input TARGET SEQUENCES file '%s' has "
            + "been formated using 'formatdb'/'makeblastdb'") % (db_file))

    ## Perform the homology search using the appropriate BLAST function
    blast(parameters, logFile)
    tag = "blast"

  elif parameters["homology"][0] in ["phmmer", "jackhmmer", "hmmer_search"]:
    hmmer(parameters, logFile)
    ## Set the tag for the output files
    tag = "hmmer"

  ## Check whether the output file contains any result
  homologs = 0
  inFile = ("%s.homology.%s.out") % (oFile, tag)
  ## "rU" open mode was removed in Python 3.11; "r" behaves identically here
  for line in open(inFile, "r"):
    if not line.strip() or line.startswith("#"):
      continue
    homologs += 1

  if not homologs:
    print(("INFO: NO Homologous sequences found for '%s'") % \
      parameters["prefix"], file = sys.stderr)
    sys.exit(80)

  ## Filter homology search data. A dictionary containing selected sequences,
  ## including the sequences themselves
  selected_sequences = filter_results(parameters, logFile)

  ## Generate a MD5 file containing selected sequences for the current run.
  ## MD5s are used to recompute the same phylogenetic tree starting from other
  ## seqs - with identical similarity search results - in the set of homologs
  outFile = ("%s.seqs.md5") % (oFile)

  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True
    ## hashlib digests require bytes in Python 3; sequence ids are encoded as
    ## UTF-8 (NOTE(review): assumes md5 here is hashlib.md5 - confirm import)
    seqs_md5 = md5("".join(sorted(selected_sequences.keys())) \
      .encode("utf-8")).hexdigest()
    with open(outFile, "w") as md5_file:
      print(("%s\t%s") % (parameters["prefix"], seqs_md5), file = md5_file)

  ## Generate a file containing the selected sequences after performing the
  ## homology search and filtering its output according to a set of parameters
  outFile = ("%s.seqs") % (oFile)

  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True
    with open(outFile, "w") as output_file:
      for seqId in sorted(selected_sequences):
        print((">%s\n%s") % (seqId, selected_sequences[seqId][1]), file = \
          output_file)

  ## If a CDS input file is set, use it to associate to homologous protein
  ## sequences their corresponding CDS
  if parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]:
    cdsFile = ("%s.seqs_cds") % (oFile)

    if not lookForFile(cdsFile) or parameters["replace"]:
      parameters["replace"] = True
      found = set()
      with open(cdsFile, "w") as output_file:
        for record in SeqIO.parse(parameters["cds"], "fasta"):
          if record.id not in selected_sequences:
            continue
          seq = splitSequence(str(record.seq))
          print((">%s\n%s") % (record.id, seq), file=output_file)
          found.add(record.id)

      ## Every selected protein must have a CDS counterpart
      missing = set(selected_sequences.keys()) - found
      if missing:
        missed = ",".join(sorted(missing))
        sys.exit(("ERROR: Check your input CDS file '%s'. Impossible to find "
          "homologs sequences [missing:'%s']") % (parameters["cds"], missed))

  ## Print how much time was needed to perform the whole homology search step
  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tEND\t%s") % (date), file=logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tHomology\t%s\n###") % (total), file=logFile)

  ## We just close the logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell=True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell=True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell=True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file") \
        % (oFile), file = sys.stderr)

  ## Update the input file parameter and return the dictionary containing all
  ## parameters. Those parameters may be used in other steps
  parameters["in_file"] = outFile

  ## Update the associated CDS file with the resulting cds file. It will be
  ## used to make the back-translation in a hypothetical MSA step
  if parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]:
    parameters["cds"] = ("%s.seqs_cds") % (oFile)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters