Пример #1
0
    def _train_with_rdp_files(self,
                              training_seqs_file,
                              taxonomy_file,
                              model_output_dir,
                              remove_tmp=True):
        """Creates a set of training data for the RDP Classifier

            training_seqs_file: A pre-classified set of training
                sequences, in fasta-like format.  Each sequence must
                be labelled with an identifier (no spaces) and an
                assigned lineage (taxa separated by ';'). Example of
                a valid label: ">seq1 ROOT;Ph1;Fam1;G1;"

            taxonomy_file: A File-like object that specifies a
                taxonomic hierarchy. Each line in the file must
                contain a '*'-separated list of the following items:
                Taxon ID, Taxon Name, Parent Taxon ID, Depth, and
                Rank.  IDs should have an integer format.  Example of
                a valid line: "1*Bacteria*0*0*domain"

            model_output_dir: Directory in which to store training data.

            remove_tmp: if True, removes the two input files produced
                by the input handler, whether the run succeeds or fails

        Returns a CommandLineAppResult built from the captured stdout
        and stderr (None when suppressed), the exit status, and the
        result paths for model_output_dir.

        Raises AssertionError (carrying the command line) when
        self.HaltExec is set, and ApplicationError when the exit status
        is rejected by self._accept_exit_status.

        To use the resulting model with the RdpClassifier, set
        '-training-data' to the properties file written by this method:
        model_output_dir joined with self.PropertiesFile.
        """
        # Three extra pieces of information are required to create
        # training data.  Unless we want built-in support for
        # versioned training sets, these may be set to sensible
        # defaults.
        training_set_id = '1'
        taxonomy_version = 'version1'
        modification_info = 'cogent'

        # The properties file specifies the names of the files in the
        # training directory.  We use the example properties file
        # directly from the rdp_classifier distribution, which lists
        # the default set of files created by the application.  We
        # must write this file explicitly after generating the
        # training data.
        properties = (
            "# Sample ResourceBundle properties file\n"
            "bergeyTree=bergeyTrainingTree.xml\n"
            "probabilityList=genus_wordConditionalProbList.txt\n"
            "probabilityIndex=wordConditionalProbIndexArr.txt\n"
            "wordPrior=logWordPrior.txt\n"
            "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, November 2003\n"
        )

        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        if suppress_stdout:
            outfile = FilePath('/dev/null')
        else:
            # Wrapped in FilePath so stdout is handled exactly like stderr.
            outfile = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))

        input_handler_function = getattr(self, input_handler)
        taxonomy_filename = input_handler_function(taxonomy_file)
        training_seqs_filename = input_handler_function(training_seqs_file)

        # Everything from here on runs under try/finally so the input
        # files are cleaned up even when the run is halted or fails.
        try:
            # Build up the command, consisting of a BaseCommand followed
            # by input and output (file) specifications

            # Example from rdp_classifier/sampledata/README:
            # java -Xmx400m -cp rdp_classifier-2.0.jar
            # edu/msu/cme/rdp/classifier/train/ClassifierTraineeMaker
            # mydata/mytaxon.txt mydata/mytrainseq.fasta 1 version1 test
            # mydata
            command = self._commandline_join([
                self.BaseCommand, taxonomy_filename, training_seqs_filename,
                training_set_id, taxonomy_version, modification_info,
                model_output_dir, '>', outfile, '2>', errfile
            ])

            if self.HaltExec:
                raise AssertionError(
                    "Halted exec with command:\n" + command)
            # The return value of system is a 16-bit number containing the
            # signal number that killed the process, and then the exit
            # status.  We only want to keep the exit status so do a right
            # bitwise shift to get rid of the signal number byte
            exit_status = system(command) >> 8

            # Determine if error should be raised due to exit status of
            # application
            if not self._accept_exit_status(exit_status):
                raise ApplicationError(
                    'Unacceptable application exit status: %s, command: %s'
                    % (str(exit_status), command))

            # must write properties file to output directory manually;
            # try/finally (rather than `with`) keeps this valid on old
            # Python 2 interpreters while still closing the handle if
            # the write fails
            properties_fp = path.join(model_output_dir, self.PropertiesFile)
            properties_file = open(properties_fp, 'w')
            try:
                properties_file.write(properties)
            finally:
                properties_file.close()

            # open the stdout and stderr if not being suppressed
            out = None
            if not suppress_stdout:
                out = open(outfile, "r")
            err = None
            if not suppress_stderr:
                err = open(errfile, "r")

            result = CommandLineAppResult(
                out,
                err,
                exit_status,
                result_paths=self._get_result_paths(model_output_dir))
        finally:
            # Clean up the input files
            if remove_tmp:
                remove(taxonomy_filename)
                remove(training_seqs_filename)

        return result
Пример #2
0
    def __call__(self, data=None, remove_tmp=True):
        """Run the application with the specified kwargs on data

            data: anything that can be cast into a string or written out to
                a file. Usually either a list of things or a single string or
                number. input_handler will be called on this data before it
                is passed as part of the command-line argument, so by creating
                your own input handlers you can customize what kind of data
                you want your application to accept

            remove_tmp: if True, removes the tmp input file recorded in
                self._input_filename, whether the run succeeds or fails

        Returns a CommandLineAppResult built from stdout and stderr
        (None when suppressed), the exit status, and the result paths
        for data plus an 'Assignments' entry pointing at a temporary
        file passed to the application on the command line.

        Raises AssertionError (carrying the command line) when
        self.HaltExec is set, and ApplicationError when the exit status
        is rejected by self._accept_exit_status.
        """
        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        assignment_fp = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stdout:
            outfile = FilePath('/dev/null')
        else:
            outfile = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))
        if data is None:
            input_arg = ''
        else:
            input_arg = getattr(self, input_handler)(data)

        # From here on, any exit path (halt, bad exit status, I/O error)
        # must still clean up the temporary input file.
        try:
            training_data = self.PositionalParameters['-training-data']

            # Build up the command, consisting of a BaseCommand followed by
            # input and output (file) specifications
            command = self._commandline_join([
                self.BaseCommand,
                input_arg,
                assignment_fp,
                training_data,
                '>',
                outfile,
                '2>',
                errfile,
            ])

            if self.HaltExec:
                raise AssertionError(
                    "Halted exec with command:\n" + command)
            # The return value of system is a 16-bit number containing the
            # signal number that killed the process, and then the exit
            # status.  We only want to keep the exit status so do a right
            # bitwise shift to get rid of the signal number byte
            exit_status = system(command) >> 8

            # Determine if error should be raised due to exit status of
            # application
            if not self._accept_exit_status(exit_status):
                raise ApplicationError(
                    'Unacceptable application exit status: %s, command: %s'
                    % (str(exit_status), command))

            # open the stdout and stderr if not being suppressed
            out = None
            if not suppress_stdout:
                out = open(outfile, "r")
            err = None
            if not suppress_stderr:
                err = open(errfile, "r")

            result_paths = self._get_result_paths(data)
            result_paths['Assignments'] = ResultPath(assignment_fp)
            result = CommandLineAppResult(out,
                                          err,
                                          exit_status,
                                          result_paths=result_paths)
        finally:
            # Clean up the input file if one was created
            if remove_tmp and self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result
Пример #3
0
    def __call__(self, data=None, remove_tmp=True):
        """Run the application with the specified kwargs on data

            data: anything that can be cast into a string or written out to
                a file. Usually either a list of things or a single string or
                number. input_handler will be called on this data before it
                is passed as part of the command-line argument, so by creating
                your own input handlers you can customize what kind of data
                you want your application to accept

            remove_tmp: if True, removes the tmp input file recorded in
                self._input_filename, even when the run is halted or fails

        Returns a CommandLineAppResult holding stdout and stderr handles
        (None when suppressed), the exit status, and the result paths
        for data extended with an 'Assignments' entry for the temporary
        assignment file named on the command line.

        Raises AssertionError with the command line when self.HaltExec
        is set; raises ApplicationError when self._accept_exit_status
        rejects the exit status.
        """
        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        assignment_fp = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stdout:
            outfile = FilePath('/dev/null')
        else:
            outfile = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))
        if data is None:
            input_arg = ''
        else:
            input_arg = getattr(self, input_handler)(data)

        # The input handler may have written a temporary file; guard the
        # rest of the run so that file is removed on every exit path.
        try:
            training_data = self.PositionalParameters['-training-data']

            # Build up the command, consisting of a BaseCommand followed by
            # input and output (file) specifications
            command = self._commandline_join(
                [self.BaseCommand, input_arg, assignment_fp, training_data,
                 '>', outfile, '2>', errfile]
                )

            if self.HaltExec:
                raise AssertionError(
                    "Halted exec with command:\n" + command)
            # The return value of system is a 16-bit number containing the
            # signal number that killed the process, and then the exit
            # status.  We only want to keep the exit status so do a right
            # bitwise shift to get rid of the signal number byte
            exit_status = system(command) >> 8

            # Determine if error should be raised due to exit status of
            # application
            if not self._accept_exit_status(exit_status):
                raise ApplicationError(
                    'Unacceptable application exit status: %s, command: %s'
                    % (str(exit_status), command))

            # open the stdout and stderr if not being suppressed
            out = None
            if not suppress_stdout:
                out = open(outfile, "r")
            err = None
            if not suppress_stderr:
                err = open(errfile, "r")

            result_paths = self._get_result_paths(data)
            result_paths['Assignments'] = ResultPath(assignment_fp)
            result = CommandLineAppResult(
                out, err, exit_status, result_paths=result_paths)
        finally:
            # Clean up the input file if one was created
            if remove_tmp and self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result
Пример #4
0
    def _train_with_rdp_files(self, training_seqs_file, taxonomy_file,
        model_output_dir, remove_tmp=True):
        """Creates a set of training data for the RDP Classifier

            training_seqs_file: A pre-classified set of training
                sequences, in fasta-like format.  Each sequence must
                be labelled with an identifier (no spaces) and an
                assigned lineage (taxa separated by ';'). Example of
                a valid label: ">seq1 ROOT;Ph1;Fam1;G1;"

            taxonomy_file: A File-like object that specifies a
                taxonomic hierarchy. Each line in the file must
                contain a '*'-separated list of the following items:
                Taxon ID, Taxon Name, Parent Taxon ID, Depth, and
                Rank.  IDs should have an integer format.  Example of
                a valid line: "1*Bacteria*0*0*domain"

            model_output_dir: Directory in which to store training data.

            remove_tmp: if True, removes the two input files produced
                by the input handler, even when the run is halted or
                fails

        Returns a CommandLineAppResult holding stdout and stderr handles
        (None when suppressed), the exit status, and the result paths
        for model_output_dir.

        Raises AssertionError with the command line when self.HaltExec
        is set; raises ApplicationError when self._accept_exit_status
        rejects the exit status.

        To use the resulting model with the RdpClassifier, set
        '-training-data' to the properties file written by this method:
        model_output_dir joined with self.PropertiesFile.
        """
        # Three extra pieces of information are required to create
        # training data.  Unless we want built-in support for
        # versioned training sets, these may be set to sensible
        # defaults.
        training_set_id = '1'
        taxonomy_version = 'version1'
        modification_info = 'cogent'

        # The properties file specifies the names of the files in the
        # training directory.  We use the example properties file
        # directly from the rdp_classifier distribution, which lists
        # the default set of files created by the application.  We
        # must write this file explicitly after generating the
        # training data.
        properties = (
            "# Sample ResourceBundle properties file\n"
            "bergeyTree=bergeyTrainingTree.xml\n"
            "probabilityList=genus_wordConditionalProbList.txt\n"
            "probabilityIndex=wordConditionalProbIndexArr.txt\n"
            "wordPrior=logWordPrior.txt\n"
            "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, November 2003\n"
            )

        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        if suppress_stdout:
            outfile = FilePath('/dev/null')
        else:
            # Wrapped in FilePath so stdout is handled exactly like stderr.
            outfile = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))

        input_handler_function = getattr(self, input_handler)
        taxonomy_filename = input_handler_function(taxonomy_file)
        training_seqs_filename = input_handler_function(training_seqs_file)

        # The input files now exist; guard the rest of the run so they
        # are removed on every exit path when remove_tmp is set.
        try:
            # Build up the command, consisting of a BaseCommand followed
            # by input and output (file) specifications

            # Example from rdp_classifier/sampledata/README:
            # java -Xmx400m -cp rdp_classifier-2.0.jar
            # edu/msu/cme/rdp/classifier/train/ClassifierTraineeMaker
            # mydata/mytaxon.txt mydata/mytrainseq.fasta 1 version1 test
            # mydata
            command = self._commandline_join(
                [self.BaseCommand, taxonomy_filename, training_seqs_filename,
                 training_set_id, taxonomy_version, modification_info,
                 model_output_dir, '>', outfile, '2>', errfile]
                )

            if self.HaltExec:
                raise AssertionError(
                    "Halted exec with command:\n" + command)
            # The return value of system is a 16-bit number containing the
            # signal number that killed the process, and then the exit
            # status.  We only want to keep the exit status so do a right
            # bitwise shift to get rid of the signal number byte
            exit_status = system(command) >> 8

            # Determine if error should be raised due to exit status of
            # application
            if not self._accept_exit_status(exit_status):
                raise ApplicationError(
                    'Unacceptable application exit status: %s, command: %s'
                    % (str(exit_status), command))

            # must write properties file to output directory manually;
            # the inner try/finally closes the handle even if the write
            # fails, without requiring the `with` statement
            properties_fp = path.join(model_output_dir, self.PropertiesFile)
            properties_file = open(properties_fp, 'w')
            try:
                properties_file.write(properties)
            finally:
                properties_file.close()

            # open the stdout and stderr if not being suppressed
            out = None
            if not suppress_stdout:
                out = open(outfile, "r")
            err = None
            if not suppress_stderr:
                err = open(errfile, "r")

            result = CommandLineAppResult(out, err, exit_status,
                result_paths=self._get_result_paths(model_output_dir))
        finally:
            # Clean up the input files
            if remove_tmp:
                remove(taxonomy_filename)
                remove(training_seqs_filename)

        return result
Пример #5
0
    def __call__(self, data=None, remove_tmp=True):
        """Run the application with the specified kwargs on data

            data: anything that can be cast into a string or written out to
                a file. Usually either a list of things or a single string or
                number. input_handler will be called on this data before it
                is passed as part of the command-line argument, so by creating
                your own input handlers you can customize what kind of data
                you want your application to accept

            remove_tmp: if True, removes the tmp input file recorded in
                self._input_filename, whether the run succeeds or fails

            NOTE: Override of the base class to handle redirected output

        Stdout is always redirected to a temporary file, whose name is
        also stored in self._outfile.  Returns a CommandLineAppResult
        built from a handle on that file, the stderr handle (None when
        suppressed), the exit status, and the result paths for data.

        Raises AssertionError (carrying the command line) when
        self.HaltExec is set, and ApplicationError when the exit status
        is rejected by self._accept_exit_status.
        """
        input_handler = self.InputHandler
        suppress_stderr = self.SuppressStderr

        outfile = self.getTmpFilename(self.TmpDir)
        self._outfile = outfile

        if suppress_stderr:
            errfile = FilePath("/dev/null")
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))
        if data is None:
            input_arg = ""
        else:
            input_arg = getattr(self, input_handler)(data)

        # From here on, any exit path (halt, bad exit status, I/O error)
        # must still clean up the temporary input file.
        try:
            # Build up the command, consisting of a BaseCommand followed by
            # input and output (file) specifications; empty pieces (e.g. no
            # input argument) are dropped before joining
            pieces = [self.BaseCommand, str(input_arg),
                      ">", str(outfile), "2>", str(errfile)]
            command = self._command_delimiter.join(
                [piece for piece in pieces if piece])
            if self.HaltExec:
                raise AssertionError(
                    "Halted exec with command:\n" + command)
            # The return value of system is a 16-bit number containing the
            # signal number that killed the process, and then the exit
            # status.  We only want to keep the exit status so do a right
            # bitwise shift to get rid of the signal number byte
            exit_status = system(command) >> 8

            # Determine if error should be raised due to exit status of
            # application
            if not self._accept_exit_status(exit_status):
                raise ApplicationError(
                    "Unacceptable application exit status: %s, command: %s"
                    % (str(exit_status), command))

            out = open(outfile, "r")
            try:
                err = None
                if not suppress_stderr:
                    err = open(errfile, "r")

                result = CommandLineAppResult(
                    out, err, exit_status,
                    result_paths=self._get_result_paths(data))
            except Exception:
                # Do not leak the stdout handle if the result could not
                # be assembled.
                out.close()
                raise
        finally:
            # Clean up the input file if one was created
            if remove_tmp and self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result