def make_tabular(raw_handle, out_handle):
    """Parse promoter2 text output into tabular form, return query count.

    raw_handle - handle reading the raw promoter2 text output
    out_handle - handle to write tab separated output to

    Writes one line per prediction: identifier, position, score, likelihood.
    Returns the number of query sequences seen.  Calls stop_err (which
    aborts) if a data line has an unrecognised likelihood value.
    """
    identifier = None
    queries = 0
    for line in raw_handle:
        if not line.strip() or line == "Promoter prediction:\n":
            #Ignore blank lines and the overall header
            pass
        elif line[0] != " ":
            #An unindented line starts a new query record; use the first
            #word as the identifier (normalise tabs to spaces first)
            identifier = line.strip().replace("\t", " ").split(None, 1)[0]
            queries += 1
        elif line == " No promoter predicted\n":
            #End of a record
            identifier = None
        elif line == " Position Score Likelihood\n":
            #Per-record column header, must follow a query line
            assert identifier
        else:
            #Indented data line: position, score, likelihood (may have spaces)
            try:
                position, score, likelihood = line.strip().split(None, 2)
            except ValueError:
                #Fixed: was a Python 2 only print statement; the call form
                #gives the same output on both Python 2 and Python 3
                print("WARNING: Problem with line: %r" % line)
                continue
            if likelihood not in ["ignored",
                                  "Marginal prediction",
                                  "Medium likely prediction",
                                  "Highly likely prediction"]:
                stop_err("ERROR: Problem with line: %r" % line)
            out_handle.write("%s\t%s\t%s\t%s\n"
                             % (identifier, position, score, likelihood))
    return queries
def clean_tabular(raw_handle, out_handle):
    """Clean up tabular TMHMM output, returns output line count.

    raw_handle - handle reading TMHMM's short-format output
    out_handle - handle to write tab separated output to

    Strips the "key=" prefixes from the data columns giving plain tab
    separated values.  Calls stop_err (which aborts) on a line that does
    not have the expected six columns.
    """
    count = 0
    for line in raw_handle:
        if not line.strip() or line.startswith("#"):
            #Ignore any blank lines or comment lines
            continue
        parts = line.rstrip("\r\n").split("\t")
        try:
            identifier, length, expAA, first60, predhel, topology = parts
        except ValueError:
            #Fixed: was a bare except which would also hide unrelated
            #errors; only a wrong column count raises ValueError here
            stop_err("Bad line: %r" % line)
        assert length.startswith("len="), line
        length = length[4:]
        assert expAA.startswith("ExpAA="), line
        expAA = expAA[6:]
        assert first60.startswith("First60="), line
        first60 = first60[8:]
        assert predhel.startswith("PredHel="), line
        predhel = predhel[8:]
        assert topology.startswith("Topology="), line
        topology = topology[9:]
        out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n"
                         % (identifier, length, expAA, first60, predhel, topology))
        count += 1
    return count
def clean_tabular(raw_handle, out_handle):
    """Clean up tabular TMHMM output, returns output line count.

    raw_handle - handle reading TMHMM's short-format output
    out_handle - handle to write tab separated output to

    Strips the "key=" prefixes from the data columns giving plain tab
    separated values.  Calls stop_err (which aborts) on a line that does
    not have the expected six columns.
    """
    def _strip(value, prefix, line):
        #Check the expected "key=" prefix is present, return the value part
        assert value.startswith(prefix), line
        return value[len(prefix):]

    count = 0
    for line in raw_handle:
        if not line.strip() or line.startswith("#"):
            #Ignore any blank lines or comment lines
            continue
        parts = line.rstrip("\r\n").split("\t")
        if len(parts) != 6:
            #Fixed: was a bare except round tuple unpacking, which would
            #also hide unrelated errors; test the column count explicitly
            stop_err("Bad line: %r" % line)
        identifier, length, expAA, first60, predhel, topology = parts
        length = _strip(length, "len=", line)
        expAA = _strip(expAA, "ExpAA=", line)
        first60 = _strip(first60, "First60=", line)
        predhel = _strip(predhel, "PredHel=", line)
        topology = _strip(topology, "Topology=", line)
        out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n"
                         % (identifier, length, expAA, first60, predhel, topology))
        count += 1
    return count
def get_path_and_binary():
    """Locate the promoter installation directory and platform binary.

    Finds the "promoter" wrapper shell script via "which", parses its
    "setenv PROM <path>" line to recover the installation directory, then
    checks the platform specific binary exists under <path>/bin/.

    Returns a (path, binary) tuple of strings.  Calls stop_err (which
    aborts) if anything is missing.
    """
    platform = commands.getoutput("uname")  # e.g. Linux
    shell_script = commands.getoutput("which promoter")
    if not os.path.isfile(shell_script):
        stop_err("ERROR: Missing promoter executable shell script")
    path = None
    for line in open(shell_script):
        if line.startswith("setenv"):
            #Separator could then be tab or space!
            parts = line.rstrip().split(None, 2)
            if parts[0] == "setenv" and parts[1] == "PROM":
                path = parts[2]
    if not path:
        stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script)
    if not os.path.isdir(path):
        #Fixed: was stop_error (undefined name, would raise NameError)
        stop_err("ERROR: %r is not a directory" % path)
    bin = "%s/bin/promoter_%s" % (path, platform)
    if not os.path.isfile(bin):
        stop_err("ERROR: Missing promoter binary %r" % bin)
    return path, bin
promoter 2 with long descriptions in the FASTA header line (over 200 characters) which produces stray fragements of the description in the output file, making parsing non-trivial. TODO - Automatically extract the sequence containing a promoter prediction? """ import sys import os import commands import tempfile from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " "Got %i arguments." % (len(sys.argv)-1)) num_threads = thread_count(sys.argv[3],default=4) fasta_file = os.path.abspath(sys.argv[2]) tabular_file = os.path.abspath(sys.argv[3]) tmp_dir = tempfile.mkdtemp() def get_path_and_binary(): platform = commands.getoutput("uname") #e.g. Linux shell_script = commands.getoutput("which promoter") if not os.path.isfile(shell_script): stop_err("ERROR: Missing promoter executable shell script") path = None for line in open(shell_script): if line.startswith("setenv"): #could then be tab or space!
with a # character as used elsewhere in Galaxy. """ import sys import os import tempfile from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if "-v" in sys.argv or "--version" in sys.argv: """Return underlying PSORTb's version""" sys.exit(os.system("psort --version")) if len(sys.argv) != 8: stop_err("Require 7 arguments, number of threads (int), type (e.g. archaea), " "output (e.g. terse/normal/long), cutoff, divergent, input protein " "FASTA file & output tabular file") num_threads = thread_count(sys.argv[1], default=4) org_type = sys.argv[2] out_type = sys.argv[3] cutoff = sys.argv[4] if cutoff.strip() and float(cutoff.strip()) != 0.0: cutoff = "-c %s" % cutoff else: cutoff = "" divergent = sys.argv[5] if divergent.strip() and float(divergent.strip()) != 0.0: divergent = "-d %s" % divergent else: divergent = ""
#!/usr/bin/env python #Wrapper script to call WoLF PSORT from its own directory. import os import sys import subprocess saved_dir = os.path.abspath(os.curdir) os.chdir("/opt/WoLFPSORT_package_v0.2/bin") args = ["./runWolfPsortSummary"] + sys.argv[1:] return_code = subprocess.call(args) os.chdir(saved_dir) sys.exit(return_code) """ if len(sys.argv) != 5: stop_err("Require four arguments, organism, threads, input protein FASTA file & output tabular file") organism = sys.argv[1] if organism not in ["animal", "plant", "fungi"]: stop_err("Organism argument %s is not one of animal, plant, fungi" % organism) num_threads = thread_count(sys.argv[2], default=4) fasta_file = sys.argv[3] tabular_file = sys.argv[4] def clean_tabular(raw_handle, out_handle): """Clean up WoLF PSORT output to make it tabular.""" for line in raw_handle: if not line or line.startswith("#"): continue name, data = line.rstrip("\r\n").split(None,1)
with a # character as used elsewhere in Galaxy. """ import sys import os import tempfile from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if "-v" in sys.argv or "--version" in sys.argv: """Return underlying PSORTb's version""" sys.exit(os.system("psort --version")) if len(sys.argv) != 8: stop_err( "Require 7 arguments, number of threads (int), type (e.g. archaea), " "output (e.g. terse/normal/long), cutoff, divergent, input protein " "FASTA file & output tabular file") num_threads = thread_count(sys.argv[1], default=4) org_type = sys.argv[2] out_type = sys.argv[3] cutoff = sys.argv[4] if cutoff.strip() and float(cutoff.strip()) != 0.0: cutoff = "-c %s" % cutoff else: cutoff = "" divergent = sys.argv[5] if divergent.strip() and float(divergent.strip()) != 0.0: divergent = "-d %s" % divergent else: divergent = ""
Note that this is somewhat redundant with job-splitting available in Galaxy
itself (see the SignalP XML file for settings).

Also tmhmm2 can fail without returning an error code, for example if run on
a 64 bit machine with only the 32 bit binaries installed. This script will
spot when there is no output from tmhmm2, and raise an error.
"""
import sys
import os
import tempfile
from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count

#Number of sequences per split FASTA file handed to each tmhmm2 job
FASTA_CHUNK = 500

#Command line: threads, input protein FASTA, output tabular
if len(sys.argv) != 4:
    stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file")
num_threads = thread_count(sys.argv[1], default=4)
fasta_file = sys.argv[2]
tabular_file = sys.argv[3]

#Working directory for the split FASTA files and per-chunk output
tmp_dir = tempfile.mkdtemp()

def clean_tabular(raw_handle, out_handle):
    """Clean up tabular TMHMM output, returns output line count."""
    #NOTE(review): this definition continues beyond this chunk of the file
    count = 0
    for line in raw_handle:
        if not line.strip() or line.startswith("#"):
            #Ignore any blank lines or comment lines
            continue
        parts = line.rstrip("\r\n").split("\t")
Finally, you can opt to have a GFF3 file produced which will describe the predicted signal peptide and mature peptide for each protein (using one of the predictors which gives a cleavage site). *WORK IN PROGRESS* """ import sys import os import tempfile from seq_analysis_utils import stop_err, split_fasta, fasta_iterator from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 MAX_LEN = 6000 #Found by trial and error if len(sys.argv) not in [6, 8]: stop_err("Require five (or 7) arguments, organism, truncate, threads, " "input protein FASTA file & output tabular file (plus " "optionally cut method and GFF3 output file). " "Got %i arguments." % (len(sys.argv) - 1)) organism = sys.argv[1] if organism not in ["euk", "gram+", "gram-"]: stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) try: truncate = int(sys.argv[2]) except: truncate = 0 if truncate < 0: stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
Whisson et al. (2007) used SignalP v3.0 anyway.

Whisson et al. (2007) used HMMER 2.3.2, and althought their HMM model can
still be used with hmmsearch from HMMER 3 this this does give slightly
different results. We expect the hmmsearch from HMMER 2.3.2 (the last
stable release of HMMER 2) to be present on the path under the name
hmmsearch2 (allowing it to co-exist with HMMER 3).
"""
import os
import sys
import re
import subprocess
from seq_analysis_utils import stop_err, fasta_iterator

#Command line: input protein FASTA, threads, model name, output tabular
if len(sys.argv) != 5:
    stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename")

fasta_file, threads, model, tabular_file = sys.argv[1:]
#Temporary intermediate filenames, derived from the final output filename
hmm_output_file = tabular_file + ".hmm.tmp"
signalp_input_file = tabular_file + ".fasta.tmp"
signalp_output_file = tabular_file + ".tabular.tmp"
#Threshold on the SignalP HMM score (presumably the minimum accepted
#signal peptide probability - TODO confirm against where it is used)
min_signalp_hmm = 0.9
#HMMER 2.3.2 binary, expected on the path (see docstring above)
hmmer_search = "hmmsearch2"

if model == "Bhattacharjee2006":
    #Parameter set for the Bhattacharjee et al. (2006) RXLR definition
    signalp_trunc = 70
    re_rxlr = re.compile("R.LR")
    min_sp = 10
    max_sp = 40
    max_sp_rxlr = 100
    min_rxlr_start = 1
#!/usr/bin/env python #Wrapper script to call WoLF PSORT from its own directory. import os import sys import subprocess saved_dir = os.path.abspath(os.curdir) os.chdir("/opt/WoLFPSORT_package_v0.2/bin") args = ["./runWolfPsortSummary"] + sys.argv[1:] return_code = subprocess.call(args) os.chdir(saved_dir) sys.exit(return_code) """ if len(sys.argv) != 5: stop_err( "Require four arguments, organism, threads, input protein FASTA file & output tabular file" ) organism = sys.argv[1] if organism not in ["animal", "plant", "fungi"]: stop_err("Organism argument %s is not one of animal, plant, fungi" % organism) num_threads = thread_count(sys.argv[2], default=4) fasta_file = sys.argv[3] tabular_file = sys.argv[4] def clean_tabular(raw_handle, out_handle): """Clean up WoLF PSORT output to make it tabular.""" for line in raw_handle:
itself (see the SignalP XML file for settings).

Also tmhmm2 can fail without returning an error code, for example if run on
a 64 bit machine with only the 32 bit binaries installed. This script will
spot when there is no output from tmhmm2, and raise an error.
"""
import sys
import os
import tempfile
from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count

#Number of sequences per split FASTA file handed to each tmhmm2 job
FASTA_CHUNK = 500

#Command line: threads, input protein FASTA, output tabular
if len(sys.argv) != 4:
    stop_err(
        "Require three arguments, number of threads (int), input protein FASTA file & output tabular file"
    )
num_threads = thread_count(sys.argv[1], default=4)
fasta_file = sys.argv[2]
tabular_file = sys.argv[3]

#Working directory for the split FASTA files and per-chunk output
tmp_dir = tempfile.mkdtemp()

def clean_tabular(raw_handle, out_handle):
    """Clean up tabular TMHMM output, returns output line count."""
    #NOTE(review): this definition continues beyond this chunk of the file
    count = 0
    for line in raw_handle:
        if not line.strip() or line.startswith("#"):
            #Ignore any blank lines or comment lines
Finally, you can opt to have a GFF3 file produced which will describe the predicted signal peptide and mature peptide for each protein (using one of the predictors which gives a cleavage site). *WORK IN PROGRESS* """ import sys import os import tempfile from seq_analysis_utils import stop_err, split_fasta, fasta_iterator from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 MAX_LEN = 6000 #Found by trial and error if len(sys.argv) not in [6,8]: stop_err("Require five (or 7) arguments, organism, truncate, threads, " "input protein FASTA file & output tabular file (plus " "optionally cut method and GFF3 output file). " "Got %i arguments." % (len(sys.argv)-1)) organism = sys.argv[1] if organism not in ["euk", "gram+", "gram-"]: stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) try: truncate = int(sys.argv[2]) except: truncate = 0 if truncate < 0: stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) num_threads = thread_count(sys.argv[3], default=4) fasta_file = sys.argv[4]