def test_fasta_sorting(self):
    """Should sort fasta seqs from largest to smallest in outfile.

    The Uclust app controller is run with ``--mergesort`` on
    ``self.tmp_unsorted_fasta_filepath`` and writes its output to
    ``self.tmp_sorted_fasta_filepath`` (both paths are prepared outside
    this method).  The stripped lines of the resulting file are compared
    with the non-empty entries of the module-level ``sorted_dna_seqs``
    fixture to ensure proper function of uclust as called by this app
    controller.
    """
    test_app = Uclust({'--tmpdir': self.tmpdir})
    test_app_res = test_app(
        data={
            '--mergesort': self.tmp_unsorted_fasta_filepath,
            '--output': self.tmp_sorted_fasta_filepath
        })

    try:
        # Re-open the output by name and close the handle deterministically
        # instead of leaking it from inside a list comprehension.
        # NOTE(review): mode "U" is kept for behaviour parity; it was
        # removed in Python 3.11 -- confirm the target interpreter.
        with open(test_app_res['Output'].name, "U") as sorted_fasta:
            sorted_fasta_actual = [l.strip() for l in sorted_fasta]
        sorted_fasta_expected = [l.strip() for l in sorted_dna_seqs if l]

        self.assertEqual(sorted_fasta_actual, sorted_fasta_expected)
    finally:
        # Clean up even when the assertion fails so that temporary
        # result files do not accumulate between test runs.
        test_app_res.cleanUp()
def test_clustering_fasta_filepath(self):
    """Should create clusters in uclust format from sorted fasta file.

    The Uclust app controller is run with ``--id 0.9`` on
    ``self.tmp_sorted_fasta_filepath`` and writes its cluster output to
    ``self.tmp_uc_filepath`` (both paths are prepared outside this
    method).  The resulting .uc file is compared with the module-level
    ``uc_dna_clusters`` fixture, ignoring comment lines, to ensure proper
    function of uclust as called by this app controller.
    """
    test_app = Uclust({'--id': 0.9}, HALT_EXEC=False)
    test_app_res = test_app(
        data={
            '--input': self.tmp_sorted_fasta_filepath,
            '--uc': self.tmp_uc_filepath
        })

    try:
        # compare the actual and expected uc files, ignoring comment lines.
        # The handle is closed deterministically rather than leaked.
        # NOTE(review): mode "U" is kept for behaviour parity; it was
        # removed in Python 3.11 -- confirm the target interpreter.
        with open(test_app_res['ClusterFile'].name, "U") as uc_file:
            uc_file_actual = [
                l.strip() for l in uc_file if not l.startswith('#')
            ]
        uc_file_expected = [
            l.strip() for l in uc_dna_clusters if not l.startswith('#')
        ]

        self.assertEqual(uc_file_actual, uc_file_expected)
    finally:
        # Clean up even when the assertion fails so that temporary
        # result files do not accumulate between test runs.
        test_app_res.cleanUp()
def seqSort(infasta, outfasta, tmpdir):
    """Run uclust ``--mergesort`` on *infasta*, writing to *outfasta*.

    *tmpdir* is passed to uclust as ``--tmpdir`` and to the app controller
    as ``TmpDir``.  When uclust reports a non-zero exit status, its
    captured stderr is echoed to ``sys.stderr`` and the process exits with
    that status; otherwise the controller's ``Output`` handle is closed.
    """
    sorter = Uclust({'--tmpdir': tmpdir}, HALT_EXEC=False, TmpDir=tmpdir)
    outcome = sorter(data={'--mergesort': infasta, '--output': outfasta})

    if outcome['ExitStatus'] == 0:
        outcome['Output'].close()
        return

    # Failure path: surface uclust's own diagnostics, then propagate
    # its exit status to the caller's shell.
    sys.stderr.write(outcome['StdErr'].read())
    sys.exit(outcome['ExitStatus'])
def search(infasta, inlib, outclust, pid, rev, tmpdir):
    """Search *infasta* against the library *inlib* with uclust.

    uclust is run with ``--id pid``, ``--lib inlib`` and ``--libonly``;
    ``--rev`` is added when *rev* is truthy.  Cluster output goes to
    *outclust*, and *tmpdir* is handed to the app controller as ``TmpDir``.
    When uclust reports a non-zero exit status, its captured stderr is
    echoed to ``sys.stderr`` and the process exits with that status;
    otherwise the controller's ``ClusterFile`` handle is closed.
    """
    options = {'--id': pid, '--lib': inlib, '--libonly': True}
    if rev:
        options.update({'--rev': True})

    searcher = Uclust(options, HALT_EXEC=False, TmpDir=tmpdir)
    outcome = searcher({'--input': infasta, '--uc': outclust})

    if outcome['ExitStatus'] == 0:
        outcome['ClusterFile'].close()
        return

    # Failure path: surface uclust's own diagnostics, then propagate
    # its exit status to the caller's shell.
    sys.stderr.write(outcome['StdErr'].read())
    sys.exit(outcome['ExitStatus'])
def test_parameter_availability(self):
    """Often used parameters are accessible.

    This is just some basic sanity checking: each listed parameter is
    switched on, in order, with the given arguments.
    """
    controller = Uclust()
    # (flag, arguments passed to .on()) -- if a parameter is not
    # accessible, trying to turn it on will raise a KeyError
    switches = (
        ('--allhits', ()),
        ('--libonly', ()),
        ('--maxaccepts', (42,)),
        ('--maxrejects', (42,)),
        ('--rev', ()),
    )
    for flag, on_args in switches:
        controller.Parameters[flag].on(*on_args)
def __call__(self,
             seq_path,
             result_path=None,
             uc_path=None,
             log_path=None,
             HALT_EXEC=False):
    """Returns mapping of each seq to (tax, consensus fraction, n)

    Results:
    If result_path is specified, the results will be written to file
     as tab-separated lines of:
      query_id <tab> tax <tab> consensus fraction <tab> n
     and None is returned.
    If result_path is None (default), the results will be returned
     as a dict of:
      {'query_id': (tax, consensus fraction, n)}
    In both cases, the values are:
     tax: the consensus taxonomy assignment
     consensus fraction: the fraction of the assignments for the
      query that contained the lowest level tax assignment that is
      included in tax (e.g., if the assignment goes to genus level,
      this will be the fraction of assignments that had the consensus
      genus assignment)
     n: the number of assignments that were considered when constructing
      the consensus

    Parameters:
    seq_path: path to file of query sequences
    result_path: path where results should be written. If None (default),
     returns results as a dict
    uc_path: path where .uc file should be saved. If None (default), a
     temporary .uc file is used and its contents are written to the
     log.
    log_path: path where run log should be written; it is passed to
     self._get_logger.
    HALT_EXEC: debugging parameter passed through to the Uclust
     application controller. If passed, will exit just before the
     uclust command is issued, and will print the command that would
     have been called to stdout.
    """
    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    # set the user-defined parameters
    params = {
        '--id': self.Params['similarity'],
        '--maxaccepts': self.Params['max_accepts']
    }

    # initialize the application controller object
    app = Uclust(params, HALT_EXEC=HALT_EXEC)

    # Configure for consensus taxonomy assignment
    app.Parameters['--rev'].on()
    app.Parameters['--lib'].on(self.Params['reference_sequences_fp'])
    app.Parameters['--libonly'].on()
    app.Parameters['--allhits'].on()

    # Only set when this call creates its own temporary .uc file.
    uc = None
    if uc_path is None:
        uc = NamedTemporaryFile(prefix='UclustConsensusTaxonAssigner_',
                                suffix='.uc',
                                dir=get_qiime_temp_dir())
        uc_path = uc.name
        store_uc_in_log = True
    else:
        store_uc_in_log = False

    try:
        app_result = app({'--input': seq_path, '--uc': uc_path})
        result = self._uc_to_assignment(app_result['ClusterFile'])
        if result_path is not None:
            # if the user provided a result_path, write the
            # results to file; the context manager closes the file even
            # if formatting or writing a row raises.
            with open(result_path, 'w') as of:
                for seq_id, (assignment, consensus_fraction, n) in \
                        result.items():
                    assignment_str = ';'.join(assignment)
                    of.write('%s\t%s\t%1.2f\t%d\n' %
                             (seq_id, assignment_str,
                              consensus_fraction, n))
            result = None
            logger.info('Result path: %s' % result_path)
        else:
            # If no result_path was provided, the result dict is
            # returned as-is.
            logger.info('Result path: None, returned as dict.')

        if store_uc_in_log:
            # This is a little hackish, but we don't have a good way
            # to pass the uc_path value right now through the
            # assign_taxonomy.py script, so writing the contents to the
            # user-specified log file (since this is being stored for
            # logging purposes).
            app_result['ClusterFile'].seek(0)
            logger.info('\n.uc file contents:\n')
            for line in app_result['ClusterFile']:
                logger.info(line.strip())
    finally:
        # Release the temporary .uc file deterministically (closing a
        # NamedTemporaryFile also removes it) rather than relying on
        # garbage collection, including when an error occurred above.
        if uc is not None:
            uc.close()

    return result