Exemplo n.º 1
0
def make_tabular(raw_handle, out_handle):
    """Parse text output into tabular, return query count.

    Arguments:
    raw_handle - iterable giving the lines of the raw text output.
    out_handle - object with a write method, receives the tabular rows.

    Every prediction line is written to out_handle as four tab separated
    columns: identifier, position, score and likelihood.

    A prediction line which does not split into three fields is reported
    as a warning on stdout and skipped. An unrecognised likelihood value
    is passed to stop_err.

    Returns the number of identifier lines seen (i.e. the query count).
    """
    # Built once here rather than on every prediction line
    likelihoods = ("ignored",
                   "Marginal prediction",
                   "Medium likely prediction",
                   "Highly likely prediction")
    identifier = None
    queries = 0
    for line in raw_handle:
        if not line.strip() or line == "Promoter prediction:\n":
            # Blank line or section title, nothing to record
            continue
        if line[0] != " ":
            # A line not starting with a space begins a new record,
            # and its first word is used as the identifier
            identifier = line.strip().replace("\t", " ").split(None, 1)[0]
            queries += 1
        elif line == "  No promoter predicted\n":
            # End of a record
            identifier = None
        elif line == "  Position  Score  Likelihood\n":
            # Column header, only expected once an identifier has been seen
            assert identifier
        else:
            try:
                position, score, likelihood = line.strip().split(None, 2)
            except ValueError:
                # The single argument call form prints the same text under
                # both Python 2 and Python 3 (the print statement form is
                # a syntax error on Python 3).
                print("WARNING: Problem with line: %r" % line)
                continue
            if likelihood not in likelihoods:
                stop_err("ERROR: Problem with line: %r" % line)
            out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood))
    return queries
Exemplo n.º 2
0
def clean_tabular(raw_handle, out_handle):
    """Clean up tabular TMHMM output, returns output line count.

    Arguments:
    raw_handle - iterable giving the lines of the raw TMHMM output.
    out_handle - object with a write method, receives the cleaned rows.

    Blank lines and lines starting with # are ignored. Every other line
    must have six tab separated columns: an identifier followed by
    fields starting len=, ExpAA=, First60=, PredHel= and Topology=
    (in that order). Those prefixes are removed and the six values are
    written to out_handle as one tab separated line.

    A line with the wrong number of columns, or a field missing its
    expected prefix, is reported via stop_err (assumed not to return,
    as the original code also relied on).
    """
    # Expected prefix for columns two to six, in column order
    prefixes = ("len=", "ExpAA=", "First60=", "PredHel=", "Topology=")
    count = 0
    for line in raw_handle:
        if not line.strip() or line.startswith("#"):
            # Ignore any blank lines or comment lines
            continue
        parts = line.rstrip("\r\n").split("\t")
        if len(parts) != 1 + len(prefixes):
            stop_err("Bad line: %r" % line)
        values = [parts[0]]
        for field, prefix in zip(parts[1:], prefixes):
            # Explicit check rather than assert, so the validation is
            # not lost when Python runs with -O
            if not field.startswith(prefix):
                stop_err("Bad line (expected %s field): %r" % (prefix, line))
            values.append(field[len(prefix):])
        out_handle.write("\t".join(values) + "\n")
        count += 1
    return count
Exemplo n.º 3
0
def clean_tabular(raw_handle, out_handle):
    """Clean up tabular TMHMM output, returns output line count.

    Arguments:
    raw_handle - iterable giving the lines of the raw TMHMM output.
    out_handle - object with a write method, receives the cleaned rows.

    Blank lines and lines starting with # are skipped. Each remaining
    line is split on tabs into six columns (identifier, len=, ExpAA=,
    First60=, PredHel= and Topology= fields), the key= prefixes are
    stripped, and the bare values are written out tab separated.

    Malformed lines (wrong column count, or a field without its expected
    prefix) are reported via stop_err, which is assumed not to return
    (the original code relied on that as well).
    """
    # Expected prefix for each column after the identifier
    expected = ("len=", "ExpAA=", "First60=", "PredHel=", "Topology=")
    count = 0
    for line in raw_handle:
        if not line.strip() or line.startswith("#"):
            # Ignore any blank lines or comment lines
            continue
        parts = line.rstrip("\r\n").split("\t")
        if len(parts) != len(expected) + 1:
            stop_err("Bad line: %r" % line)
        cleaned = [parts[0]]
        for prefix, field in zip(expected, parts[1:]):
            # Checked explicitly (not with assert) so that running with
            # -O does not silently skip the validation
            if not field.startswith(prefix):
                stop_err("Bad line (expected %s field): %r" % (prefix, line))
            cleaned.append(field[len(prefix):])
        out_handle.write("\t".join(cleaned) + "\n")
        count += 1
    return count
Exemplo n.º 4
0
def get_path_and_binary():
    """Locate the promoter install directory and platform binary.

    Runs "which promoter" to find the wrapper shell script, reads the
    PROM value from its setenv line(s), and combines that directory with
    the output of "uname" to name the platform specific binary.

    Returns a tuple of (path, binary filename). Calls stop_err if the
    shell script is missing, no PROM setting is found, the PROM value is
    not a directory, or the binary file does not exist.
    """
    platform = commands.getoutput("uname")  # e.g. Linux
    shell_script = commands.getoutput("which promoter")
    if not os.path.isfile(shell_script):
        stop_err("ERROR: Missing promoter executable shell script")
    path = None
    handle = open(shell_script)
    try:
        for line in handle:
            if not line.startswith("setenv"):
                continue
            # setenv could be followed by a tab or a space, so split on
            # any whitespace. The length check skips truncated setenv
            # lines instead of failing with an IndexError.
            parts = line.rstrip().split(None, 2)
            if len(parts) == 3 and parts[0] == "setenv" and parts[1] == "PROM":
                # No break: as before, the last PROM setting wins
                path = parts[2]
    finally:
        # Make sure the script file is closed even if reading fails
        handle.close()
    if not path:
        stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script)
    if not os.path.isdir(path):
        stop_err("ERROR: %r is not a directory" % path)
    binary = "%s/bin/promoter_%s" % (path, platform)
    if not os.path.isfile(binary):
        stop_err("ERROR: Missing promoter binary %r" % binary)
    return path, binary
Exemplo n.º 5
0
promoter 2 with long descriptions in the FASTA header line (over 200
characters) which produces stray fragments of the description in the
output file, making parsing non-trivial.

TODO - Automatically extract the sequence containing a promoter prediction?
"""
import sys
import os
import commands
import tempfile
from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count

FASTA_CHUNK = 500

# Expect the script name plus exactly three arguments
if len(sys.argv) != 4:
    stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. "
             "Got %i arguments." % (len(sys.argv)-1))

# The thread count is the first argument; the third argument is the output
# tabular filename (see below), so it must not be parsed as the thread count.
num_threads = thread_count(sys.argv[1], default=4)
fasta_file = os.path.abspath(sys.argv[2])
tabular_file = os.path.abspath(sys.argv[3])

# Scratch directory created up front for this run
tmp_dir = tempfile.mkdtemp()

def get_path_and_binary():
    platform = commands.getoutput("uname") #e.g. Linux
    shell_script = commands.getoutput("which promoter")
    if not os.path.isfile(shell_script):
        stop_err("ERROR: Missing promoter executable shell script")
    path = None
    for line in open(shell_script):
        if line.startswith("setenv"): #could then be tab or space!
Exemplo n.º 6
0
with a # character as used elsewhere in Galaxy.
"""
import sys
import os
import tempfile
from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count

FASTA_CHUNK = 500

if "-v" in sys.argv or "--version" in sys.argv:
    # Report the underlying PSORTb version and stop.
    # On Unix os.system returns an encoded wait status (the exit code is
    # in the high byte), and the process exit status only keeps the low
    # eight bits, so passing it straight to sys.exit can turn a failure
    # into exit status zero. Collapse it to plain success/failure.
    if os.system("psort --version"):
        sys.exit(1)
    sys.exit(0)

# Expect the script name plus exactly seven arguments
if len(sys.argv) != 8:
    stop_err("Require 7 arguments, number of threads (int), type (e.g. archaea), "
             "output (e.g. terse/normal/long), cutoff, divergent, input protein "
             "FASTA file & output tabular file")

num_threads = thread_count(sys.argv[1], default=4)
org_type = sys.argv[2]
out_type = sys.argv[3]
# A blank or zero cutoff means no -c switch is built
cutoff = sys.argv[4]
if cutoff.strip() and float(cutoff.strip()) != 0.0:
    cutoff = "-c %s" % cutoff
else:
    cutoff = ""
# Likewise a blank or zero divergent value means no -d switch is built
divergent = sys.argv[5]
if divergent.strip() and float(divergent.strip()) != 0.0:
    divergent = "-d %s" % divergent
else:
    divergent = ""
Exemplo n.º 7
0
#!/usr/bin/env python
#Wrapper script to call WoLF PSORT from its own directory.
import os
import sys
import subprocess
saved_dir = os.path.abspath(os.curdir)
os.chdir("/opt/WoLFPSORT_package_v0.2/bin")
args = ["./runWolfPsortSummary"] + sys.argv[1:]
return_code = subprocess.call(args)
os.chdir(saved_dir)
sys.exit(return_code)
"""

# Expect the script name plus exactly four arguments
if not len(sys.argv) == 5:
    stop_err("Require four arguments, organism, threads, "
             "input protein FASTA file & output tabular file")

# Only these three organism types are accepted
organism = sys.argv[1]
if organism not in ("animal", "plant", "fungi"):
    stop_err("Organism argument %s is not one of animal, plant, fungi"
             % organism)

num_threads = thread_count(sys.argv[2], default=4)
fasta_file, tabular_file = sys.argv[3], sys.argv[4]

def clean_tabular(raw_handle, out_handle):
    """Clean up WoLF PSORT output to make it tabular."""
    for line in raw_handle:
        if not line or line.startswith("#"):
            continue
        name, data = line.rstrip("\r\n").split(None,1)
Exemplo n.º 8
0
with a # character as used elsewhere in Galaxy.
"""
import sys
import os
import tempfile
from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count

FASTA_CHUNK = 500

if "-v" in sys.argv or "--version" in sys.argv:
    # Report the underlying PSORTb version and stop.
    # On Unix os.system returns an encoded wait status (the exit code is
    # in the high byte), and the process exit status only keeps the low
    # eight bits, so passing it straight to sys.exit can turn a failure
    # into exit status zero. Collapse it to plain success/failure.
    if os.system("psort --version"):
        sys.exit(1)
    sys.exit(0)

# Expect the script name plus exactly seven arguments
if len(sys.argv) != 8:
    stop_err(
        "Require 7 arguments, number of threads (int), type (e.g. archaea), "
        "output (e.g. terse/normal/long), cutoff, divergent, input protein "
        "FASTA file & output tabular file")

num_threads = thread_count(sys.argv[1], default=4)
org_type = sys.argv[2]
out_type = sys.argv[3]
# A blank or zero cutoff means no -c switch is built
cutoff = sys.argv[4]
if cutoff.strip() and float(cutoff.strip()) != 0.0:
    cutoff = "-c %s" % cutoff
else:
    cutoff = ""
# Likewise a blank or zero divergent value means no -d switch is built
divergent = sys.argv[5]
if divergent.strip() and float(divergent.strip()) != 0.0:
    divergent = "-d %s" % divergent
else:
    divergent = ""
Exemplo n.º 9
0
Note that this is somewhat redundant with job-splitting available in Galaxy
itself (see the SignalP XML file for settings).

Also tmhmm2 can fail without returning an error code, for example if run on a
64 bit machine with only the 32 bit binaries installed. This script will spot
when there is no output from tmhmm2, and raise an error.
"""
import sys
import os
import tempfile
from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count

FASTA_CHUNK = 500

# Expect the script name plus exactly three arguments
if not len(sys.argv) == 4:
    stop_err("Require three arguments, number of threads (int), "
             "input protein FASTA file & output tabular file")

num_threads = thread_count(sys.argv[1], default=4)
fasta_file, tabular_file = sys.argv[2], sys.argv[3]

# Scratch directory created up front for this run
tmp_dir = tempfile.mkdtemp()

def clean_tabular(raw_handle, out_handle):
    """Clean up tabular TMHMM output, returns output line count."""
    count = 0
    for line in raw_handle:
        if not line.strip() or line.startswith("#"):
            #Ignore any blank lines or comment lines
            continue
        parts = line.rstrip("\r\n").split("\t")
Exemplo n.º 10
0
Finally, you can opt to have a GFF3 file produced which will describe the
predicted signal peptide and mature peptide for each protein (using one of
the predictors which gives a cleavage site). *WORK IN PROGRESS*
"""
import sys
import os
import tempfile
from seq_analysis_utils import stop_err, split_fasta, fasta_iterator
from seq_analysis_utils import run_jobs, thread_count

FASTA_CHUNK = 500
MAX_LEN = 6000  #Found by trial and error

# Expect the script name plus either five or seven arguments
if len(sys.argv) not in [6, 8]:
    stop_err("Require five (or 7) arguments, organism, truncate, threads, "
             "input protein FASTA file & output tabular file (plus "
             "optionally cut method and GFF3 output file). "
             "Got %i arguments." % (len(sys.argv) - 1))

organism = sys.argv[1]
if organism not in ["euk", "gram+", "gram-"]:
    stop_err("Organism argument %s is not one of euk, gram+ or gram-" %
             organism)

try:
    truncate = int(sys.argv[2])
except ValueError:
    # A value which is not an integer (e.g. an empty string) is treated
    # as zero. Only ValueError is caught, so that things like a keyboard
    # interrupt are no longer swallowed here.
    truncate = 0
if truncate < 0:
    stop_err("Truncate argument %s is not a positive integer (or zero)" %
             sys.argv[2])
Exemplo n.º 11
0
Whisson et al. (2007) used SignalP v3.0 anyway.

Whisson et al. (2007) used HMMER 2.3.2, and although their HMM model
can still be used with hmmsearch from HMMER 3 this does give
slightly different results. We expect the hmmsearch from HMMER 2.3.2
(the last stable release of HMMER 2) to be present on the path under
the name hmmsearch2 (allowing it to co-exist with HMMER 3).
"""
import os
import sys
import re
import subprocess
from seq_analysis_utils import stop_err, fasta_iterator

# Expect the script name plus exactly four arguments
if not len(sys.argv) == 5:
    stop_err("Requires four arguments: protein FASTA filename, "
             "threads, model, and output filename")

fasta_file, threads, model, tabular_file = sys.argv[1:]

# Intermediate filenames are all derived from the output filename
hmm_output_file = "%s.hmm.tmp" % tabular_file
signalp_input_file = "%s.fasta.tmp" % tabular_file
signalp_output_file = "%s.tabular.tmp" % tabular_file

hmmer_search = "hmmsearch2"
min_signalp_hmm = 0.9

if model == "Bhattacharjee2006":
   signalp_trunc = 70
   re_rxlr = re.compile("R.LR")
   min_sp = 10
   max_sp = 40
   max_sp_rxlr = 100
   min_rxlr_start = 1
Exemplo n.º 12
0
#!/usr/bin/env python
#Wrapper script to call WoLF PSORT from its own directory.
import os
import sys
import subprocess
saved_dir = os.path.abspath(os.curdir)
os.chdir("/opt/WoLFPSORT_package_v0.2/bin")
args = ["./runWolfPsortSummary"] + sys.argv[1:]
return_code = subprocess.call(args)
os.chdir(saved_dir)
sys.exit(return_code)
"""

# Expect the script name plus exactly four arguments
if not len(sys.argv) == 5:
    stop_err("Require four arguments, organism, threads, "
             "input protein FASTA file & output tabular file")

# Only these three organism types are accepted
organism = sys.argv[1]
if organism not in ("animal", "plant", "fungi"):
    stop_err("Organism argument %s is not one of animal, plant, fungi"
             % organism)

num_threads = thread_count(sys.argv[2], default=4)
fasta_file, tabular_file = sys.argv[3], sys.argv[4]


def clean_tabular(raw_handle, out_handle):
    """Clean up WoLF PSORT output to make it tabular."""
    for line in raw_handle:
Exemplo n.º 13
0
itself (see the SignalP XML file for settings).

Also tmhmm2 can fail without returning an error code, for example if run on a
64 bit machine with only the 32 bit binaries installed. This script will spot
when there is no output from tmhmm2, and raise an error.
"""
import sys
import os
import tempfile
from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count

FASTA_CHUNK = 500

# Expect the script name plus exactly three arguments
if not len(sys.argv) == 4:
    stop_err("Require three arguments, number of threads (int), "
             "input protein FASTA file & output tabular file")

num_threads = thread_count(sys.argv[1], default=4)
fasta_file, tabular_file = sys.argv[2], sys.argv[3]

# Scratch directory created up front for this run
tmp_dir = tempfile.mkdtemp()


def clean_tabular(raw_handle, out_handle):
    """Clean up tabular TMHMM output, returns output line count."""
    count = 0
    for line in raw_handle:
        if not line.strip() or line.startswith("#"):
            #Ignore any blank lines or comment lines
Exemplo n.º 14
0
Finally, you can opt to have a GFF3 file produced which will describe the
predicted signal peptide and mature peptide for each protein (using one of
the predictors which gives a cleavage site). *WORK IN PROGRESS*
"""
import sys
import os
import tempfile
from seq_analysis_utils import stop_err, split_fasta, fasta_iterator
from seq_analysis_utils import run_jobs, thread_count

FASTA_CHUNK = 500
MAX_LEN = 6000 #Found by trial and error

# Expect the script name plus either five or seven arguments
if len(sys.argv) not in  [6,8]:
    stop_err("Require five (or 7) arguments, organism, truncate, threads, "
             "input protein FASTA file & output tabular file (plus "
             "optionally cut method and GFF3 output file). "
             "Got %i arguments." % (len(sys.argv)-1))

organism = sys.argv[1]
if organism not in ["euk", "gram+", "gram-"]:
    stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism)

try:
    truncate = int(sys.argv[2])
except ValueError:
    # A value which is not an integer (e.g. an empty string) is treated
    # as zero. Only ValueError is caught, so that things like a keyboard
    # interrupt are no longer swallowed here.
    truncate = 0
if truncate < 0:
    stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])

num_threads = thread_count(sys.argv[3], default=4)
fasta_file = sys.argv[4]