def set_sff_trimpoints_with_sfftools(
        sff_dir, technical_lengths, sffinfo_path='sffinfo',
        sfffile_path='sfffile', debug=False):
    """Set trimpoints to end of technical read for all SFF files in directory.

    This function essentially provides the reference implementation.
    It uses the official sfftools from Roche to process the SFF files.
    """
    if not (exists(sffinfo_path) or which(sffinfo_path)):
        raise ApplicationNotFoundError(
            'sffinfo executable not found. Is it installed and in your '
            '$PATH?')
    if not (exists(sfffile_path) or which(sfffile_path)):
        raise ApplicationNotFoundError(
            'sfffile executable not found. Is it installed and in your '
            '$PATH?')

    for lib_id, sff_fp in get_per_lib_sff_fps(sff_dir):
        try:
            readlength = technical_lengths[lib_id]
        except KeyError:
            continue

        sffinfo_args = [sffinfo_path, '-s', sff_fp]
        if debug:
            print "Running sffinfo command %s" % sffinfo_args
        sffinfo_output_file = TemporaryFile()
        check_call(sffinfo_args, stdout=sffinfo_output_file)
        sffinfo_output_file.seek(0)

        seqlengths = {}
        for line in sffinfo_output_file:
            if line.startswith('>'):
                fields = line[1:].split()
                seq_len = fields[1].split('=')[1]
                seqlengths[fields[0]] = seq_len

        trim_fp = sff_fp + '.trim'
        trim_file = open(trim_fp, 'w')
        for id_, length in seqlengths.items():
            curr_length = int(length)
            # Sfftools use 1-based index
            left_trim = readlength + 1
            # Key sequence not included in FASTA length
            right_trim = curr_length + 4
            if curr_length > left_trim:
                trim_file.write(
                    "%s\t%s\t%s\n" % (id_, left_trim, right_trim))
            else:
                stderr.write(
                    'Rejected read %s with trim points %s and %s (orig '
                    'length %s)\n' % (id_, left_trim, curr_length, length))
        trim_file.close()

        trimmed_sff_fp = sff_fp + '.trimmed'
        sfffile_args = [
            sfffile_path, '-t', trim_fp, '-o', trimmed_sff_fp, sff_fp]
        if debug:
            print "Running sfffile command:", sfffile_args
        check_call(sfffile_args, stdout=open(devnull, 'w'))
        remove(sff_fp)
        rename(trimmed_sff_fp, sff_fp)
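# A minimal usage sketch for set_sff_trimpoints_with_sfftools (not from the
# original module): the directory name and per-library technical-read
# lengths below are hypothetical, chosen only to illustrate the expected
# arguments.
if __name__ == '__main__':
    # Each key must match a library ID yielded by get_per_lib_sff_fps; the
    # value is the length of the technical read (e.g., barcode plus primer)
    # to trim from the left of every read in that library's SFF file.
    technical_lengths = {'lib1': 12, 'lib2': 12}
    set_sff_trimpoints_with_sfftools(
        'per_lib_sffs/', technical_lengths, debug=True)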
def _error_on_missing_application(self, params):
    """Raise an ApplicationNotFoundError if the app is not accessible."""
    if not app_path('RNAforester'):
        raise ApplicationNotFoundError(
            "Cannot find RNAforester. Is it installed? Is it in your path?")
    if not app_path('RNAshapes'):
        raise ApplicationNotFoundError(
            "Cannot find RNAshapes. Is it installed? Is it in your path?")
def _error_on_missing_application(self, params):
    """Raise an ApplicationNotFoundError if the app is not accessible."""
    command = self._get_jar_fp()
    if not exists(command):
        raise ApplicationNotFoundError(
            "Cannot find jar file. Is it installed? Is $RDP_JAR_PATH "
            "set correctly?")
def raise_gdata_not_found_error(*args, **kwargs):
    raise ApplicationNotFoundError(
        "gdata cannot be found.\nIs it installed? "
        "Is it in your $PYTHONPATH?\nThis is an optional QIIME "
        "dependency, but is required if you plan to use QIIME's remote "
        "mapping file features. For more information, please see "
        "http://qiime.org/install/install.html.")
def check_flowgram_ali_exe():
    """Check if we have a working FlowgramAligner"""
    ali_exe = get_flowgram_ali_exe()

    if which(ali_exe) is None:
        raise ApplicationNotFoundError(
            "The alignment program %s is not accessible via the PATH "
            "environment variable." % ali_exe)

    # test if it's callable and actually works
    command = "%s -h" % ali_exe
    proc = Popen(command, shell=True, universal_newlines=True,
                 stdout=PIPE, stderr=STDOUT)
    if proc.wait() != 0:
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact "
            "an executable." % ali_exe)

    result = proc.stdout.read()
    # check that the help string looks correct
    if not result.startswith("Usage"):
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact "
            "an executable." % ali_exe)
    return True
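# A minimal sketch of how check_flowgram_ali_exe might be used as a startup
# guard (assumed usage, not from the original module): fail early with an
# informative message rather than deep inside a denoising run.
from sys import stderr, exit
from cogent.app.util import ApplicationError, ApplicationNotFoundError

try:
    check_flowgram_ali_exe()
except (ApplicationNotFoundError, ApplicationError), e:
    stderr.write("FlowgramAligner check failed: %s\n" % e)
    exit(1)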
def _error_on_missing_application(self, params):
    """Raise an ApplicationNotFoundError if the app is not accessible

    In this case, checks for the java runtime and the RDP jar file.
    """
    if not (os.path.exists('java') or app_path('java')):
        raise ApplicationNotFoundError(
            "Cannot find java runtime. Is it installed? Is it in your "
            "path?")
    jar_fp = self._get_jar_fp()
    if jar_fp is None:
        raise ApplicationNotFoundError(
            "JAR file not found in current directory and the RDP_JAR_PATH "
            "environment variable is not set. Please set RDP_JAR_PATH to "
            "the full pathname of the JAR file.")
    if not os.path.exists(jar_fp):
        raise ApplicationNotFoundError(
            "JAR file %s does not exist." % jar_fp)
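# A minimal sketch of how the RDP_JAR_PATH lookup in _get_jar_fp is
# typically satisfied before the check above runs (assumed environment;
# the jar location and filename are hypothetical):
import os
os.environ['RDP_JAR_PATH'] = '/opt/rdp_classifier/rdp_classifier-2.2.jar'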
def test_blastall_fp(self):
    """blastall_fp is set to a valid path"""
    blastall = self.config["blastall_fp"]
    if not blastall.startswith("/"):
        # path is relative, figure out absolute path
        blast_all = app_path(blastall)
        if not blast_all:
            raise ApplicationNotFoundError(
                "blastall_fp set to %s, but is not in your PATH. Either "
                "use an absolute path to it or put it in your PATH."
                % blastall)
        self.config["blastall_fp"] = blast_all

    test_qiime_config_variable("blastall_fp", self.config, self, X_OK)
def submit_jobs(commands, prefix):
    """Submit jobs using the exe pointed to by cluster_jobs_fp.

    commands: List of commands (strings) that should be executed

    prefix: A unique prefix used to name the submit script
    """
    qiime_config = load_qiime_config()
    CLUSTER_JOBS_SCRIPT = qiime_config['cluster_jobs_fp']

    if not CLUSTER_JOBS_SCRIPT:
        raise ApplicationNotFoundError(
            "cluster_jobs_fp not set in config file!")
    if not (exists(CLUSTER_JOBS_SCRIPT) or which(CLUSTER_JOBS_SCRIPT)):
        raise ApplicationNotFoundError(
            "cluster_jobs_fp not in $PATH or provided as full path!")

    outfilename = join(get_qiime_temp_dir(), "%s_commands.txt" % prefix)
    fh = open(outfilename, "w")
    fh.write("\n".join(commands))
    fh.close()

    cmd = '%s -ms %s %s' % (CLUSTER_JOBS_SCRIPT, outfilename, prefix)
    system(cmd)
    remove(outfilename)
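# A minimal usage sketch for submit_jobs (assumed usage; the command strings
# and prefix are hypothetical). Each string is one shell command line; the
# prefix both names the temporary command file and tags the cluster jobs.
commands = ['pick_otus.py -i seqs1.fna -o otus1/',
            'pick_otus.py -i seqs2.fna -o otus2/']
submit_jobs(commands, prefix='OTU')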
def wait_for_cluster_ids(ids, interval=10):
    """Puts process to sleep until jobs with ids are done.

    ids: list of ids to wait for

    interval: time to sleep in seconds

    NOT USED ANYMORE
    """
    if which("qstat"):
        for id in ids:
            while getoutput("qstat %s" % id).startswith("Job"):
                sleep(interval)
    else:
        raise ApplicationNotFoundError(
            "qstat not available. Is it installed?\n"
            "This test may fail if not run on a cluster.")
def submit_jobs(filenames, verbose=False):
    """Submit jobs in filenames.

    filenames: list of prepared qsub job scripts, ready to be submitted

    verbose: a binary verbose flag
    """
    if not which("qsub"):
        raise ApplicationNotFoundError("qsub not found. Can't submit jobs.")

    for file in filenames:
        command = 'qsub %s' % file
        result = Popen(command, shell=True, universal_newlines=True,
                       stdout=PIPE, stderr=STDOUT).stdout.read()
        if verbose:
            print result
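# A minimal usage sketch for the qsub-based submit_jobs (assumed script
# names): each file is a prepared qsub job script, e.g. written out
# beforehand by a make_jobs-style helper.
submit_jobs(['jobs/job_0.sh', 'jobs/job_1.sh'], verbose=True)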
def run_pyfeast(data, labels, features, method='MIM', n_select=15):
    """run_pyfeast(data, labels, features, method, n_select)

    @data - numpy data (dense)
    @labels - vector of class labels (discrete)
    @features - list of feature names
    @method - feature selection method
    @n_select - number of features to select

    The feature selection method is based on the FEAST C variable
    selection toolbox.

    Reference:
    Gavin Brown, Adam Pocock, Ming-Jie Zhao, and Mikel Lujan, "Conditional
    Likelihood Maximisation: A Unifying Framework for Information Theoretic
    Feature Selection," Journal of Machine Learning Research, vol. 13,
    pp. 27--66, 2012. (http://jmlr.csail.mit.edu/papers/v13/brown12a.html)
    """
    try:
        import feast
    except ImportError:
        raise ApplicationNotFoundError(
            "Error loading the PyFeast module. It is likely that you do "
            "not have PyFeast installed locally.")

    try:
        fs_method = getattr(feast, method)
    except AttributeError:
        raise AttributeError(
            "Unknown feature selection method specified for PyFeast. Make "
            "sure the requested feature selection method is a valid one.")

    if len(data.transpose()) < n_select:
        raise ValueError(
            "n_select must be less than the number of observations.")
    if n_select <= 0:
        raise ValueError("n_select cannot be less than or equal to zero.")

    sf = fs_method(data, labels, n_select)
    reduced_set = []
    for k in range(len(sf)):
        reduced_set.append(features[int(sf[k])])
    return reduced_set
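# A minimal usage sketch for run_pyfeast (assumed random data; requires
# numpy and the optional PyFeast dependency). Rows are observations, columns
# are features; labels are discrete class assignments per observation.
import numpy as np

data = np.random.rand(50, 20)         # 50 samples x 20 features
labels = np.random.randint(0, 2, 50)  # binary class labels
features = ['feat_%d' % i for i in range(20)]
selected = run_pyfeast(data, labels, features, method='MIM', n_select=5)
print selected  # names of the 5 top-ranked features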
def get_clusters_from_fasta_filepath(fasta_filepath,
                                     original_fasta_path,
                                     percent_ID=0.97,
                                     max_accepts=1,
                                     max_rejects=8,
                                     stepwords=8,
                                     word_length=8,
                                     optimal=False,
                                     exact=False,
                                     suppress_sort=False,
                                     output_dir=None,
                                     enable_rev_strand_matching=False,
                                     subject_fasta_filepath=None,
                                     suppress_new_clusters=False,
                                     return_cluster_maps=False,
                                     stable_sort=False,
                                     save_uc_files=True,
                                     HALT_EXEC=False):
    """Main convenience wrapper for using uclust to generate cluster files

    A source fasta file is required for the fasta_filepath. This will be
    sorted to be in order of longest to shortest length sequences. Following
    this, the sorted fasta file is used to generate a cluster file in the
    uclust (.uc) format. Next the .uc file is converted to cd-hit format
    (.clstr). Finally this file is parsed and returned as a list of lists,
    where each sublist is a cluster of sequences. If an output_dir is
    specified, the intermediate files will be preserved, otherwise all files
    created are temporary and will be deleted at the end of this function.

    The percent_ID parameter specifies the percent identity for a cluster,
    i.e., if 99% were the parameter, all sequences that were 99% identical
    would be grouped as a cluster.
    """
    # Create readable intermediate filenames if they are to be kept
    fasta_output_filepath = None
    uc_output_filepath = None
    cd_hit_filepath = None

    if output_dir and not output_dir.endswith('/'):
        output_dir += '/'

    if save_uc_files:
        uc_save_filepath = get_output_filepaths(
            output_dir, original_fasta_path)
    else:
        uc_save_filepath = None

    sorted_fasta_filepath = ""
    uc_filepath = ""
    clstr_filepath = ""

    # Error check in case any app controller fails
    files_to_remove = []
    try:
        if not suppress_sort:
            # Sort fasta input file from largest to smallest sequence
            sort_fasta = uclust_fasta_sort_from_filepath(
                fasta_filepath, output_filepath=fasta_output_filepath)
            # Get sorted fasta name from application wrapper
            sorted_fasta_filepath = sort_fasta['Output'].name
            files_to_remove.append(sorted_fasta_filepath)
        else:
            sort_fasta = None
            sorted_fasta_filepath = fasta_filepath

        # Generate uclust cluster file (.uc format)
        uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
            sorted_fasta_filepath,
            uc_save_filepath,
            percent_ID=percent_ID,
            max_accepts=max_accepts,
            max_rejects=max_rejects,
            stepwords=stepwords,
            word_length=word_length,
            optimal=optimal,
            exact=exact,
            suppress_sort=suppress_sort,
            enable_rev_strand_matching=enable_rev_strand_matching,
            subject_fasta_filepath=subject_fasta_filepath,
            suppress_new_clusters=suppress_new_clusters,
            stable_sort=stable_sort,
            HALT_EXEC=HALT_EXEC)
        # Get cluster file name from application wrapper
        remove_files(files_to_remove)
    except ApplicationError:
        remove_files(files_to_remove)
        raise ApplicationError(
            'Error running uclust. Possible causes are an unsupported '
            'version (current supported version is v1.2.22) or an '
            'improperly formatted input file.')
    except ApplicationNotFoundError:
        remove_files(files_to_remove)
        raise ApplicationNotFoundError(
            'uclust not found, is it properly installed?')

    # Get list of lists for each cluster
    clusters, failures, seeds = clusters_from_uc_file(
        uclust_cluster['ClusterFile'])

    # Remove temp files unless user specifies output filepath
    if not save_uc_files:
        uclust_cluster.cleanUp()

    if return_cluster_maps:
        return clusters, failures, seeds
    else:
        return clusters.values(), failures, seeds
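# A minimal usage sketch for get_clusters_from_fasta_filepath (assumed file
# names). With return_cluster_maps=False the first return value is a list of
# clusters, each a list of sequence IDs.
clusters, failures, seeds = get_clusters_from_fasta_filepath(
    'seqs.fasta',               # hypothetical input fasta
    'seqs.fasta',               # original fasta, used to name .uc output
    percent_ID=0.97,
    output_dir='uclust_results/')
for i, cluster in enumerate(clusters):
    print "cluster %d: %d sequences" % (i, len(cluster))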
def assign_taxonomy(dataPath, reference_sequences_fp, id_to_taxonomy_fp,
                    read_1_seqs_fp, read_2_seqs_fp, single_ok=False,
                    no_single_ok_generic=False, header_id_regex=None,
                    read_id_regex="\S+\s+(\S+)",
                    amplicon_id_regex="(\S+)\s+(\S+?)\/",
                    output_fp=None, log_path=None, HALT_EXEC=False,
                    base_tmp_dir='/tmp'):
    """Assign taxonomy to each sequence in data with the RTAX classifier

    dataPath: path to a fasta file

    output_fp: path to write output; if not provided, result will be
        returned in a dict of {seq_id: (taxonomy_assignment, confidence)}
    """
    usearch_command = "usearch"
    if not (exists(usearch_command) or app_path(usearch_command)):
        raise ApplicationNotFoundError(
            "Cannot find %s. Is it installed? Is it in your path?"
            % usearch_command)

    my_tmp_dir = get_tmp_filename(tmp_dir=base_tmp_dir, prefix='rtax_',
                                  suffix='', result_constructor=str)
    os.makedirs(my_tmp_dir)

    try:
        # The RTAX classifier doesn't necessarily preserve identifiers: it
        # reports back only the id extracted as $1 using header_id_regex.
        # Since rtax takes the original unclustered sequence files as input,
        # the usual case is that the regex extracts the amplicon ID from the
        # second field. Use a lookup table to restore the original IDs.
        read_1_id_to_orig_id = {}
        readIdExtractor = re.compile(read_id_regex)
        # OTU clustering produces ">clusterID read_1_id"
        data = open(dataPath, 'r')
        for seq_id, seq in MinimalFastaParser(data):
            # apply the regex
            extract = readIdExtractor.match(seq_id)
            if extract is None:
                stderr.write("Matched no ID with read_id_regex " +
                             read_id_regex + " in '" + seq_id +
                             "' from file " + dataPath + "\n")
            else:
                read_1_id_to_orig_id[extract.group(1)] = seq_id
        data.close()

        # make list of amplicon IDs to pass to RTAX
        id_list_fp = open(my_tmp_dir + "/ampliconIdsToClassify", "w")

        # Establish mapping of amplicon IDs to read_1 IDs, and
        # simultaneously write the amplicon ID file for those IDs found in
        # the input mapping above
        amplicon_to_read_1_id = {}
        ampliconIdExtractor = re.compile(amplicon_id_regex)
        # split_libraries produces ">read_1_id ampliconID/1 ..."
        read_1_data = open(read_1_seqs_fp, 'r')
        for seq_id, seq in MinimalFastaParser(read_1_data):
            # apply the regex
            extract = ampliconIdExtractor.match(seq_id)
            if extract is None:
                stderr.write("Matched no ID with amplicon_id_regex " +
                             amplicon_id_regex + " in '" + seq_id +
                             "' from file " + read_1_seqs_fp + "\n")
            else:
                read_1_id = extract.group(1)
                amplicon_id = extract.group(2)
                try:
                    amplicon_to_read_1_id[amplicon_id] = read_1_id
                    # verify that the id is valid
                    bogus = read_1_id_to_orig_id[read_1_id]
                    id_list_fp.write('%s\n' % amplicon_id)
                except KeyError:
                    pass
        # close the file we just read (the original called data.close()
        # here, re-closing the already-closed dataPath handle)
        read_1_data.close()
        id_list_fp.close()

        app = Rtax(HALT_EXEC=HALT_EXEC)

        temp_output_file = tempfile.NamedTemporaryFile(
            prefix='RtaxAssignments_', suffix='.txt')
        app.Parameters['-o'].on(temp_output_file.name)
        app.Parameters['-r'].on(reference_sequences_fp)
        app.Parameters['-t'].on(id_to_taxonomy_fp)
        # app.Parameters['-d'].on(delimiter)
        app.Parameters['-l'].on(id_list_fp.name)  # these are amplicon IDs
        app.Parameters['-a'].on(read_1_seqs_fp)
        if read_2_seqs_fp is not None:
            app.Parameters['-b'].on(read_2_seqs_fp)
        app.Parameters['-i'].on(header_id_regex)
        app.Parameters['-m'].on(my_tmp_dir)
        if single_ok:
            app.Parameters['-f'].on()
        if no_single_ok_generic:
            app.Parameters['-g'].on()
        # app.Parameters['-v'].on()

        app_result = app()

        if log_path:
            f = open(log_path, 'a')
            errString = ''.join(app_result['StdErr'].readlines()) + '\n'
            f.write(errString)
            f.close()

        assignments = {}

        # restore original sequence IDs with spaces
        for line in app_result['Assignments']:
            toks = line.strip().split('\t')
            rtax_id = toks.pop(0)
            if len(toks):
                bestpcid = toks.pop(0)  # ignored
            lineage = toks

            # RTAX does not provide a measure of confidence. We could pass
            # one in, based on the choice of primers, or even look it up on
            # the fly in the tables from the "optimal primers" paper; but it
            # would be the same for every query sequence anyway. We could
            # also return bestpcid, but that's not the same thing as
            # confidence.
            confidence = 1.0

            read_1_id = amplicon_to_read_1_id[rtax_id]
            orig_id = read_1_id_to_orig_id[read_1_id]
            if lineage:
                assignments[orig_id] = (';'.join(lineage), confidence)
            else:
                assignments[orig_id] = ('Unclassified', 1.0)

        if output_fp:
            try:
                output_file = open(output_fp, 'w')
            except OSError:
                raise OSError("Can't open output file for writing: %s"
                              % output_fp)
            for seq_id, assignment in list(assignments.items()):
                lineage, confidence = assignment
                output_file.write(
                    '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
            output_file.close()
            return None
        else:
            return assignments
    finally:
        try:
            rmtree(my_tmp_dir)
        except OSError:
            pass
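# A minimal usage sketch for assign_taxonomy with RTAX (assumed file paths,
# all hypothetical). When output_fp is omitted, the returned dict maps each
# original sequence ID to a (lineage, confidence) tuple.
assignments = assign_taxonomy(
    'rep_set.fasta',             # clustered representative sequences
    'ref_seqs.fasta',            # reference sequences
    'id_to_taxonomy.txt',        # tab-separated id -> lineage map
    'reads1.fna', 'reads2.fna',  # paired reads from split_libraries
    single_ok=True,
    log_path='rtax.log')
for seq_id, (lineage, confidence) in assignments.items():
    print '%s\t%s\t%.3f' % (seq_id, lineage, confidence)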
def raise_tax2tree_not_found_error(*args, **kwargs):
    raise ApplicationNotFoundError(
        "Tax2Tree cannot be found.\nIs Tax2Tree installed? Is it in your "
        "$PYTHONPATH?\nYou can obtain Tax2Tree from "
        "http://sourceforge.net/projects/tax2tree/.")
def check_sfffile():
    """Raise error if sfffile is not in $PATH"""
    if not app_path('sfffile'):
        raise ApplicationNotFoundError(_MISSING_APP_MESSAGE % 'sfffile')
def check_sffinfo():
    """Raise error if sffinfo is not in $PATH"""
    if not which('sffinfo'):
        raise ApplicationNotFoundError(_MISSING_APP_MESSAGE % 'sffinfo')
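# check_sfffile and check_sffinfo both interpolate a module-level
# _MISSING_APP_MESSAGE constant that is not shown in these snippets. A
# plausible definition (an assumption, not the original wording) would be:
_MISSING_APP_MESSAGE = (
    "%s is not in $PATH. Is it installed? Have you added it to $PATH?")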
def raise_pynast_not_found_error(*args, **kwargs):
    raise ApplicationNotFoundError(
        "PyNAST cannot be found.\nIs PyNAST installed? Is it in your "
        "$PYTHONPATH?\nYou can obtain PyNAST from "
        "http://qiime.org/pynast/.")
__credits__ = ["Kyle Patnode", "Jai Ram Rideout", "Antonio Gonzalez Pena"]
__license__ = "GPL"
__version__ = "1.5.3-dev"
__maintainer__ = "Kyle Patnode"
__email__ = "*****@*****.**"

"""Test suite for the generate_taxa_compare_table.py module.

Tests each function in the tax2tree controller module. It should be noted
that these tests are fairly sparse, since tax2tree implements quite a few
of its own tests."""

from cogent.app.util import ApplicationNotFoundError

try:
    from t2t.nlevel import (load_tree, load_consensus_map,
                            determine_rank_order)
except ImportError:
    raise ApplicationNotFoundError(
        "Cannot find tax2tree. Is it installed? Is it in your path?")

from os import makedirs, getcwd, chdir
from os.path import exists
from shutil import rmtree
from tempfile import mkdtemp

from cogent.util.unit_test import TestCase, main
from cogent.util.misc import remove_files

from qiime.test import initiate_timeout, disable_timeout
from qiime.util import get_qiime_temp_dir
from qiime.pycogent_backports.tax2tree import *


class GenerateTaxaCompareTableTests(TestCase):
    """Tests for the tax2tree_controller.py module."""

    def setUp(self):
        """Set up files/environment that will be used by the tests."""
def _error_on_missing_application(self, params):
    """Raise an ApplicationNotFoundError if the app is not accessible."""
    if not app_path('blastall'):
        raise ApplicationNotFoundError(
            "Cannot find blastall. Is it installed? Is it in your path?")