def _train_with_rdp_files(self, training_seqs_file, taxonomy_file,
                          model_output_dir, remove_tmp=True):
    """Create a set of training data for the RDP Classifier.

    training_seqs_file: A pre-classified set of training
        sequences, in fasta-like format.  Each sequence must be
        labelled with an identifier (no spaces) and an assigned
        lineage (taxa separated by ';'). Example of a valid label:
        ">seq1 ROOT;Ph1;Fam1;G1;"

    taxonomy_file: A File-like object that specifies a taxonomic
        hierarchy. Each line in the file must contain a
        '*'-separated list of the following items: Taxon ID, Taxon
        Name, Parent Taxon ID, Depth, and Rank.  IDs should have an
        integer format.  Example of a valid line:
        "1*Bacteria*0*0*domain"

    model_output_dir: Directory in which to store training data.

    remove_tmp: if True, removes the two input files produced by the
        input handler once the run has succeeded.

    Returns a CommandLineAppResult built from the captured stdout and
    stderr (None for a suppressed stream), the exit status, and the
    result paths reported for model_output_dir.

    Raises AssertionError when self.HaltExec is set (the message
    carries the command that would have run), and ApplicationError
    when the exit status is rejected by self._accept_exit_status.

    To use the resulting model with the RdpClassifier, set
    '-training-data' to the properties file written here, i.e.
    path.join(model_output_dir, self.PropertiesFile).
    """
    # Three extra pieces of information are required to create
    # training data.  Unless we want built-in support for
    # versioned training sets, these may be set to sensible
    # defaults.
    training_set_id = '1'
    taxonomy_version = 'version1'
    modification_info = 'cogent'

    # The properties file specifies the names of the files in the
    # training directory.  We use the example properties file
    # directly from the rdp_classifier distribution, which lists
    # the default set of files created by the application.  We
    # must write this file explicitly after generating the
    # training data.
    properties = (
        "# Sample ResourceBundle properties file\n"
        "bergeyTree=bergeyTrainingTree.xml\n"
        "probabilityList=genus_wordConditionalProbList.txt\n"
        "probabilityIndex=wordConditionalProbIndexArr.txt\n"
        "wordPrior=logWordPrior.txt\n"
        "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, November 2003\n"
    )

    input_handler = self.InputHandler
    suppress_stdout = self.SuppressStdout
    suppress_stderr = self.SuppressStderr

    # Both redirect targets are wrapped in FilePath so that stdout is
    # handled exactly like stderr when the command line is assembled.
    if suppress_stdout:
        outfile = FilePath('/dev/null')
    else:
        outfile = FilePath(self.getTmpFilename(self.TmpDir))
    if suppress_stderr:
        errfile = FilePath('/dev/null')
    else:
        errfile = FilePath(self.getTmpFilename(self.TmpDir))

    input_handler_function = getattr(self, input_handler)
    taxonomy_filename = input_handler_function(taxonomy_file)
    training_seqs_filename = input_handler_function(training_seqs_file)

    # Build up the command, consisting of a BaseCommand followed
    # by input and output (file) specifications
    # Example from rdp_classifier/sampledata/README:
    # java -Xmx400m -cp rdp_classifier-2.0.jar
    # edu/msu/cme/rdp/classifier/train/ClassifierTraineeMaker
    # mydata/mytaxon.txt mydata/mytrainseq.fasta 1 version1 test
    # mydata
    command = self._commandline_join([
        self.BaseCommand, taxonomy_filename, training_seqs_filename,
        training_set_id, taxonomy_version, modification_info,
        model_output_dir, '>', outfile, '2>', errfile,
    ])

    if self.HaltExec:
        raise AssertionError("Halted exec with command:\n" + command)

    # The return value of system is a 16-bit number containing the signal
    # number that killed the process, and then the exit status.
    # We only want to keep the exit status so do a right bitwise shift to
    # get rid of the signal number byte
    exit_status = system(command) >> 8

    # Determine if error should be raised due to exit status of
    # application
    if not self._accept_exit_status(exit_status):
        raise ApplicationError(
            'Unacceptable application exit status: %s, command: %s'
            % (str(exit_status), command))

    # must write properties file to output directory manually;
    # try/finally guarantees the handle is closed even if write fails
    properties_fp = path.join(model_output_dir, self.PropertiesFile)
    properties_file = open(properties_fp, 'w')
    try:
        properties_file.write(properties)
    finally:
        properties_file.close()

    # open the stdout and stderr if not being suppressed
    out = None
    if not suppress_stdout:
        out = open(outfile, "r")
    err = None
    if not suppress_stderr:
        err = open(errfile, "r")

    result = CommandLineAppResult(
        out, err, exit_status,
        result_paths=self._get_result_paths(model_output_dir))

    # Clean up the input files
    if remove_tmp:
        remove(taxonomy_filename)
        remove(training_seqs_filename)

    return result
def __call__(self, data=None, remove_tmp=True):
    """Execute the application on data and collect its results.

    data: anything that can be cast into a string or written out to
        a file.  Usually either a list of things or a single string
        or number.  The configured input handler is applied to this
        value before it becomes part of the command line, so a
        custom input handler controls what kind of data is accepted.
        When data is None no input argument is passed.

    remove_tmp: if True, removes the temporary input file (if one
        was recorded in self._input_filename).

    Returns a CommandLineAppResult whose result paths include an
    'Assignments' entry pointing at the assignment output file.
    """
    handler_name = self.InputHandler
    quiet_out = self.SuppressStdout
    quiet_err = self.SuppressStderr

    # Destination for the assignments, plus redirect targets for the
    # two standard streams (discarded via /dev/null when suppressed).
    assignment_fp = FilePath(self.getTmpFilename(self.TmpDir))
    outfile = FilePath(
        '/dev/null' if quiet_out else self.getTmpFilename(self.TmpDir))
    errfile = FilePath(
        '/dev/null' if quiet_err else self.getTmpFilename(self.TmpDir))

    input_arg = ''
    if data is not None:
        input_arg = getattr(self, handler_name)(data)

    training_data = self.PositionalParameters['-training-data']

    # The command is the BaseCommand followed by the input, the
    # output locations and the stream redirections.
    tokens = [
        self.BaseCommand, input_arg, assignment_fp, training_data,
        '>', outfile, '2>', errfile,
    ]
    command = self._commandline_join(tokens)

    if self.HaltExec:
        raise AssertionError("Halted exec with command:\n" + command)

    # system() packs the killing signal number and the exit status
    # into one 16-bit value; shifting right by 8 drops the signal
    # byte and leaves only the exit status.
    exit_status = system(command) >> 8

    # Refuse to continue when the exit status is not acceptable.
    if not self._accept_exit_status(exit_status):
        raise ApplicationError(
            'Unacceptable application exit status: %s, command: %s'
            % (str(exit_status), command))

    # Only streams that were captured get opened for the result.
    out = None if quiet_out else open(outfile, "r")
    err = None if quiet_err else open(errfile, "r")

    result_paths = self._get_result_paths(data)
    result_paths['Assignments'] = ResultPath(assignment_fp)
    result = CommandLineAppResult(out, err, exit_status,
                                  result_paths=result_paths)

    # Drop the temporary input file, if one was created.
    if remove_tmp and self._input_filename:
        remove(self._input_filename)
        self._input_filename = None

    return result
def __call__(self, data=None, remove_tmp=True):
    """Run the classifier command line on data and wrap the outcome.

    data: anything that can be cast into a string or written out to
        a file, typically a list of things or a single string or
        number.  It is passed through the input handler named by
        self.InputHandler before being placed on the command line;
        supplying a different input handler changes which kinds of
        data are accepted.  None means "no input argument".

    remove_tmp: if True, removes tmp files

    Returns a CommandLineAppResult holding the opened stdout/stderr
    captures (None when suppressed), the exit status and the result
    paths, including the 'Assignments' output file.
    """
    def redirect_target(suppressed):
        # A suppressed stream goes to /dev/null; otherwise it is
        # captured in a fresh temporary file.
        if suppressed:
            return FilePath('/dev/null')
        return FilePath(self.getTmpFilename(self.TmpDir))

    input_handler = self.InputHandler
    hide_stdout = self.SuppressStdout
    hide_stderr = self.SuppressStderr

    assignment_fp = FilePath(self.getTmpFilename(self.TmpDir))
    outfile = redirect_target(hide_stdout)
    errfile = redirect_target(hide_stderr)

    if data is not None:
        input_arg = getattr(self, input_handler)(data)
    else:
        input_arg = ''

    training_data = self.PositionalParameters['-training-data']

    # BaseCommand first, then the input and output (file)
    # specifications and the shell redirections.
    command = self._commandline_join([
        self.BaseCommand,
        input_arg,
        assignment_fp,
        training_data,
        '>', outfile,
        '2>', errfile,
    ])

    if self.HaltExec:
        raise AssertionError("Halted exec with command:\n" + command)

    # system() returns a 16-bit number: the signal number that killed
    # the process followed by the exit status.  Shift the signal
    # byte away so that only the exit status remains.
    raw_status = system(command)
    exit_status = raw_status >> 8

    # Raise if the application's exit status is not acceptable.
    status_ok = self._accept_exit_status(exit_status)
    if not status_ok:
        message = ('Unacceptable application exit status: %s, command: %s'
                   % (str(exit_status), command))
        raise ApplicationError(message)

    # Open whichever of stdout / stderr was not suppressed.
    out = None
    err = None
    if not hide_stdout:
        out = open(outfile, "r")
    if not hide_stderr:
        err = open(errfile, "r")

    result_paths = self._get_result_paths(data)
    result_paths['Assignments'] = ResultPath(assignment_fp)
    result = CommandLineAppResult(
        out, err, exit_status, result_paths=result_paths)

    # Remove the input file if the handler created one.
    if not remove_tmp:
        return result
    if self._input_filename:
        remove(self._input_filename)
        self._input_filename = None
    return result
def _train_with_rdp_files(self, training_seqs_file, taxonomy_file,
                          model_output_dir, remove_tmp=True):
    """Create a set of training data for the RDP Classifier.

    training_seqs_file: A pre-classified set of training
        sequences, in fasta-like format.  Each sequence must be
        labelled with an identifier (no spaces) and an assigned
        lineage (taxa separated by ';'). Example of a valid label:
        ">seq1 ROOT;Ph1;Fam1;G1;"

    taxonomy_file: A File-like object that specifies a taxonomic
        hierarchy. Each line in the file must contain a
        '*'-separated list of the following items: Taxon ID, Taxon
        Name, Parent Taxon ID, Depth, and Rank.  IDs should have an
        integer format.  Example of a valid line:
        "1*Bacteria*0*0*domain"

    model_output_dir: Directory in which to store training data.

    remove_tmp: if True, removes the two input files returned by the
        input handler after a successful run.

    Returns a CommandLineAppResult carrying the opened stdout and
    stderr captures (None when the stream is suppressed), the exit
    status, and the result paths for model_output_dir.

    Raises AssertionError if self.HaltExec is set, reporting the
    command instead of running it, and ApplicationError if
    self._accept_exit_status rejects the exit status.

    To use the resulting model with the RdpClassifier, set
    '-training-data' to the properties file written by this method:
    path.join(model_output_dir, self.PropertiesFile).
    """
    # Three extra pieces of information are required to create
    # training data.  Unless we want built-in support for
    # versioned training sets, these may be set to sensible
    # defaults.
    training_set_id = '1'
    taxonomy_version = 'version1'
    modification_info = 'cogent'

    # The properties file specifies the names of the files in the
    # training directory.  We use the example properties file
    # directly from the rdp_classifier distribution, which lists
    # the default set of files created by the application.  We
    # must write this file explicitly after generating the
    # training data.
    properties = (
        "# Sample ResourceBundle properties file\n"
        "bergeyTree=bergeyTrainingTree.xml\n"
        "probabilityList=genus_wordConditionalProbList.txt\n"
        "probabilityIndex=wordConditionalProbIndexArr.txt\n"
        "wordPrior=logWordPrior.txt\n"
        "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, November 2003\n"
    )

    input_handler = self.InputHandler
    suppress_stdout = self.SuppressStdout
    suppress_stderr = self.SuppressStderr

    # stdout and stderr targets are both FilePath objects, so the two
    # redirections are treated identically when the command is joined.
    if suppress_stdout:
        outfile = FilePath('/dev/null')
    else:
        outfile = FilePath(self.getTmpFilename(self.TmpDir))
    if suppress_stderr:
        errfile = FilePath('/dev/null')
    else:
        errfile = FilePath(self.getTmpFilename(self.TmpDir))

    input_handler_function = getattr(self, input_handler)
    taxonomy_filename = input_handler_function(taxonomy_file)
    training_seqs_filename = input_handler_function(training_seqs_file)

    # Build up the command, consisting of a BaseCommand followed
    # by input and output (file) specifications
    # Example from rdp_classifier/sampledata/README:
    # java -Xmx400m -cp rdp_classifier-2.0.jar
    # edu/msu/cme/rdp/classifier/train/ClassifierTraineeMaker
    # mydata/mytaxon.txt mydata/mytrainseq.fasta 1 version1 test
    # mydata
    command = self._commandline_join(
        [self.BaseCommand, taxonomy_filename, training_seqs_filename,
         training_set_id, taxonomy_version, modification_info,
         model_output_dir, '>', outfile, '2>', errfile]
    )

    if self.HaltExec:
        raise AssertionError("Halted exec with command:\n" + command)

    # The return value of system is a 16-bit number containing the signal
    # number that killed the process, and then the exit status.
    # We only want to keep the exit status so do a right bitwise shift to
    # get rid of the signal number byte
    exit_status = system(command) >> 8

    # Determine if error should be raised due to exit status of
    # application
    if not self._accept_exit_status(exit_status):
        raise ApplicationError(
            'Unacceptable application exit status: %s, command: %s'
            % (str(exit_status), command))

    # must write properties file to output directory manually; the
    # handle is closed in a finally clause so a failed write cannot
    # leave it open
    properties_fp = path.join(model_output_dir, self.PropertiesFile)
    properties_file = open(properties_fp, 'w')
    try:
        properties_file.write(properties)
    finally:
        properties_file.close()

    # open the stdout and stderr if not being suppressed
    out = None
    if not suppress_stdout:
        out = open(outfile, "r")
    err = None
    if not suppress_stderr:
        err = open(errfile, "r")

    result = CommandLineAppResult(
        out, err, exit_status,
        result_paths=self._get_result_paths(model_output_dir))

    # Clean up the input files
    if remove_tmp:
        remove(taxonomy_filename)
        remove(training_seqs_filename)

    return result
def __call__(self, data=None, remove_tmp=True):
    """Execute the application on data, capturing redirected output.

    data: anything that can be cast into a string or written out to
        a file.  Usually either a list of things or a single string
        or number.  The input handler named by self.InputHandler is
        applied to it before it is placed on the command line, so a
        custom input handler determines what kind of data is
        accepted.  None means no input argument is passed.

    remove_tmp: if True, removes tmp files

    Returns a CommandLineAppResult built from the opened stdout
    capture, the stderr capture (None when suppressed), the exit
    status and the result paths for data.

    NOTE: Override of the base class to handle redirected output;
    stdout is always written to a temporary file whose name is also
    stored on self._outfile.
    """
    handler_name = self.InputHandler
    quiet_err = self.SuppressStderr

    # stdout is always captured; remember where it went.
    outfile = self.getTmpFilename(self.TmpDir)
    self._outfile = outfile

    errfile = FilePath(
        "/dev/null" if quiet_err else self.getTmpFilename(self.TmpDir))

    input_arg = ""
    if data is not None:
        input_arg = getattr(self, handler_name)(data)

    # BaseCommand followed by the input and the output redirections;
    # empty pieces are left out of the final command string.
    delimiter = self._command_delimiter
    pieces = [
        self.BaseCommand,
        str(input_arg),
        ">",
        str(outfile),
        "2>",
        str(errfile),
    ]
    command = delimiter.join([piece for piece in pieces if piece])

    if self.HaltExec:
        raise AssertionError("Halted exec with command:\n" + command)

    # system() returns a 16-bit number holding the signal number that
    # killed the process and then the exit status; the right shift
    # discards the signal byte and keeps the exit status.
    exit_status = system(command) >> 8

    # Raise when the application's exit status is not acceptable.
    if not self._accept_exit_status(exit_status):
        raise ApplicationError(
            "Unacceptable application exit status: %s, command: %s"
            % (str(exit_status), command))

    out = open(outfile, "r")
    err = None if quiet_err else open(errfile, "r")

    result = CommandLineAppResult(
        out, err, exit_status, result_paths=self._get_result_paths(data))

    # Remove the temporary input file if one was created.
    if remove_tmp and self._input_filename:
        remove(self._input_filename)
        self._input_filename = None

    return result