Exemplo n.º 1
0
    def _input_as_lines(self, data):
        """Write a sequence of lines to a temp file and return its filename.

        This method has been overridden for RdpTrainer so that the
        _input_filename attribute is not assigned.

            data: a sequence to be written to a file, each element of the
                sequence will compose a line in the file

           * Note: the result will be the filename as a FilePath object
            (which is a string subclass).

           * Note: newline characters are stripped from both ends of each
                element (after conversion with str) before it is written,
                so that no blank lines are accidentally written to the
                file.
        """
        filename = FilePath(self.getTmpFilename(self.TmpDir))
        # Parent method does not take advantage of laziness, due to a
        # temporary variable that contains the entire file contents --
        # better to loop explicitly over the data source, holding only
        # one line at a time.  The with-block guarantees that the handle
        # is closed even if converting or writing an element fails.
        with open(filename, 'w') as data_file:
            for line in data:
                data_file.write(str(line).strip('\n'))
                data_file.write('\n')
        return filename
Exemplo n.º 2
0
 def append(self, val):
     """Add val to the list of values held by this parameter.

     The list is created on first use (when self.Value is None).  When
     self.IsPath is true, val is wrapped in a FilePath before it is
     stored.
     """
     if self.Value is None:
         self.Value = []
     entry = FilePath(val) if self.IsPath else val
     self.Value.append(entry)
Exemplo n.º 3
0
    def _input_as_multiline_string(self, data):
        """Write a multiline string to a temp file and return the filename.

            data: a multiline string to be written to a file.

           * Note: the result will be the filename as a FilePath object
            (which is a string subclass).  The same filename is also
            stored in self._input_filename.

        """
        filename = self._input_filename = \
            FilePath(self.getTmpFilename(self.WorkingDir))
        # The with-block closes the handle even if the write fails.
        with open(filename, 'w') as data_file:
            data_file.write(data)
        return filename
Exemplo n.º 4
0
 def ModelDir(self):
     """Return the training output directory as an absolute FilePath.

     The directory is read from the 'model_output_dir' parameter and
     made absolute with os.path.abspath.
     """
     return FilePath(
         os.path.abspath(self.Parameters['model_output_dir'].Value))
Exemplo n.º 5
0
    def _train_with_rdp_files(self,
                              training_seqs_file,
                              taxonomy_file,
                              model_output_dir,
                              remove_tmp=True):
        """Create a set of training data for the RDP Classifier.

        Both inputs are passed through the configured input handler, the
        training command is run through the shell, and the properties
        file is then written into model_output_dir.

            training_seqs_file: A pre-classified set of training
                sequences, in fasta-like format.  Each sequence must
                be labelled with an identifier (no spaces) and an
                assigned lineage (taxa separated by ';'). Example of
                a valid label: ">seq1 ROOT;Ph1;Fam1;G1;"

            taxonomy_file: A File-like object that specifies a
                taxonomic hierarchy. Each line in the file must
                contain a '*'-separated list of the following items:
                Taxon ID, Taxon Name, Parent Taxon ID, Depth, and
                Rank.  IDs should have an integer format.  Example of
                a valid line: "1*Bacteria*0*0*domain"

            model_output_dir: Directory in which to store training data.

            remove_tmp: if True, removes the two input files returned by
                the input handler

        Returns a CommandLineAppResult holding the stdout and stderr
        handles (None for a suppressed stream), the exit status and the
        result paths for model_output_dir.

        Raises AssertionError if self.HaltExec is set, and
        ApplicationError if the exit status is not accepted by
        self._accept_exit_status.

        To use the resulting model with the RdpClassifier, set
        '-training_data' to the following path: model_output_dir +
        RdpClassifier.PropertiesFile
        """
        # Three extra pieces of information are required to create
        # training data.  Unless we want built-in support for
        # versioned training sets, these may be set to sensible
        # defaults.
        training_set_id = '1'
        taxonomy_version = 'version1'
        modification_info = 'cogent'

        # The properties file specifies the names of the files in the
        # training directory.  We use the example properties file
        # directly from the rdp_classifier distribution, which lists
        # the default set of files created by the application.  We
        # must write this file explicitly after generating the
        # training data.
        properties = (
            "# Sample ResourceBundle properties file\n"
            "bergeyTree=bergeyTrainingTree.xml\n"
            "probabilityList=genus_wordConditionalProbList.txt\n"
            "probabilityIndex=wordConditionalProbIndexArr.txt\n"
            "wordPrior=logWordPrior.txt\n"
            "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, November 2003\n"
        )

        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        if suppress_stdout:
            outfile = FilePath('/dev/null')
        else:
            outfile = self.getTmpFilename(self.TmpDir)
        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))

        input_handler_function = getattr(self, input_handler)
        taxonomy_filename = input_handler_function(taxonomy_file)
        training_seqs_filename = input_handler_function(training_seqs_file)

        # Build up the command, consisting of a BaseCommand followed
        # by input and output (file) specifications

        # Example from rdp_classifier/sampledata/README:
        # java -Xmx400m -cp rdp_classifier-2.0.jar
        # edu/msu/cme/rdp/classifier/train/ClassifierTraineeMaker
        # mydata/mytaxon.txt mydata/mytrainseq.fasta 1 version1 test
        # mydata
        command = self._commandline_join([
            self.BaseCommand, taxonomy_filename, training_seqs_filename,
            training_set_id, taxonomy_version, modification_info,
            model_output_dir, '>', outfile, '2>', errfile
        ])

        if self.HaltExec:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise AssertionError("Halted exec with command:\n" + command)
        # The return value of system is a 16-bit number containing the signal
        # number that killed the process, and then the exit status.
        # We only want to keep the exit status so do a right bitwise shift to
        # get rid of the signal number byte
        exit_status = system(command) >> 8

        # Determine if error should be raised due to exit status of
        # application
        if not self._accept_exit_status(exit_status):
            raise ApplicationError(
                'Unacceptable application exit status: %s, command: %s'
                % (str(exit_status), command))

        # must write properties file to output directory manually; the
        # with-block closes the handle even if the write fails
        properties_fp = path.join(model_output_dir, self.PropertiesFile)
        with open(properties_fp, 'w') as properties_file:
            properties_file.write(properties)

        # open the stdout and stderr if not being suppressed
        out = None
        if not suppress_stdout:
            out = open(outfile, "r")
        err = None
        if not suppress_stderr:
            err = open(errfile, "r")

        result = CommandLineAppResult(
            out,
            err,
            exit_status,
            result_paths=self._get_result_paths(model_output_dir))

        # Clean up the input files
        if remove_tmp:
            remove(taxonomy_filename)
            remove(training_seqs_filename)

        return result
Exemplo n.º 6
0
    def __call__(self, data=None, remove_tmp=True):
        """Run the application on data and return a CommandLineAppResult.

            data: anything that can be cast into a string or written out to
                a file. Usually either a list of things or a single string or
                number. input_handler will be called on this data before it
                is passed as part of the command-line argument, so by creating
                your own input handlers you can customize what kind of data
                you want your application to accept

            remove_tmp: if True, removes the temporary input file recorded
                in self._input_filename (if any)

        The command line is made of the base command, the handled input,
        a temporary assignment file path and the '-training-data'
        positional parameter, with stdout and stderr redirected to files.
        The assignment file is exposed in the result under the
        'Assignments' key.

        Raises AssertionError if self.HaltExec is set, and
        ApplicationError if the exit status is not accepted by
        self._accept_exit_status.
        """
        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        assignment_fp = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stdout:
            outfile = FilePath('/dev/null')
        else:
            outfile = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))
        if data is None:
            input_arg = ''
        else:
            input_arg = getattr(self, input_handler)(data)

        training_data = self.PositionalParameters['-training-data']

        # Build up the command, consisting of a BaseCommand followed by
        # input and output (file) specifications
        command = self._commandline_join([
            self.BaseCommand,
            input_arg,
            assignment_fp,
            training_data,
            '>',
            outfile,
            '2>',
            errfile,
        ])

        if self.HaltExec:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise AssertionError("Halted exec with command:\n" + command)
        # The return value of system is a 16-bit number containing the signal
        # number that killed the process, and then the exit status.
        # We only want to keep the exit status so do a right bitwise shift to
        # get rid of the signal number byte
        exit_status = system(command) >> 8

        # Determine if error should be raised due to exit status of
        # application
        if not self._accept_exit_status(exit_status):
            raise ApplicationError(
                'Unacceptable application exit status: %s, command: %s'
                % (str(exit_status), command))

        # open the stdout and stderr if not being suppressed
        out = None
        if not suppress_stdout:
            out = open(outfile, "r")
        err = None
        if not suppress_stderr:
            err = open(errfile, "r")

        result_paths = self._get_result_paths(data)
        result_paths['Assignments'] = ResultPath(assignment_fp)
        result = CommandLineAppResult(out,
                                      err,
                                      exit_status,
                                      result_paths=result_paths)

        # Clean up the input file if one was created
        if remove_tmp:
            if self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result
Exemplo n.º 7
0
    def __call__(self,
                 predictor_fp,
                 response_fp,
                 response_name,
                 model_names,
                 output_dir=None,
                 remove_tmp=True,
                 param_file=None):
        """Run the R script once and return one result per model name.

            predictor_fp, response_fp: inputs that are passed through the
                configured input handler; the handler's return values are
                placed on the command line

            response_name: passed on the command line unchanged

            model_names: sequence of model names; they are joined with ','
                on the command line, and each name is also used as a
                subdirectory of output_dir when collecting result paths

            output_dir: output directory passed to the script; if None, a
                fresh directory is created with mkdtemp(prefix='R_output_')

            remove_tmp: if True, removes the temporary input file recorded
                in self._input_filename (if any)

            param_file: optional extra command-line argument; an empty
                string is used when None

            returns a dict of CommandLineAppResult objects, one for each
            machine learning model, keyed by the model name.  Only the
            first result carries the stdout/stderr handles.

        Raises AssertionError if self.HaltExec is set, and
        ApplicationError if the exit status is not accepted by
        self._accept_exit_status.
        """
        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        # Only the paths are chosen here; the files are opened right
        # before the process is started so that an early exit (e.g.
        # HaltExec) cannot leak open handles.
        outfilepath = errfilepath = None
        if not suppress_stdout:
            outfilepath = FilePath(self.getTmpFilename(self.TmpDir))
        if not suppress_stderr:
            errfilepath = FilePath(self.getTmpFilename(self.TmpDir))
        predictor_fp = getattr(self, input_handler)(predictor_fp)
        response_fp = getattr(self, input_handler)(response_fp)
        # create random output dir if needed
        if output_dir is None:
            output_dir = mkdtemp(prefix='R_output_')

        rflags = self.RParameters['flags']
        rscript = self._get_R_script_path()
        base_command = self._get_base_command()
        # The base command is expected to be "<cd command>;<R command>"
        cd_command, base_command = base_command.split(';')
        cd_command += ';'
        R_source_dir = self._get_R_script_dir()

        # Build up the command, consisting of a BaseCommand followed by
        # input and output (file) specifications
        pre_command = 'cat'

        if param_file is None:
            param_file = ''
        command = self._commandline_join(
            [   cd_command, pre_command, '%s |' %(rscript), base_command,
                '--args', R_source_dir, predictor_fp, response_fp, response_name,
                output_dir, ','.join(model_names), param_file
            ]
            )

        if self.HaltExec:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise AssertionError("Halted exec with command:\n" + command)

        # run command, wait for output, get exit status
        opened = []
        try:
            if suppress_stdout:
                outfile = devnull
            else:
                outfile = open(outfilepath, 'w')
                opened.append(outfile)
            if suppress_stderr:
                errfile = devnull
            else:
                errfile = open(errfilepath, 'w')
                opened.append(errfile)
            proc = subprocess.Popen(command,
                                    shell=True,
                                    stdout=outfile,
                                    stderr=errfile)
            proc.wait()
        finally:
            # Close only the handles opened here; devnull comes from
            # outside this method and is left untouched.
            for handle in opened:
                handle.close()
        exit_status = proc.returncode

        # Determine if error should be raised due to exit status of
        # application
        if not self._accept_exit_status(exit_status):
            # The captured stderr is only available when it was not
            # suppressed; otherwise report the failure without it.
            if suppress_stderr:
                error_output = ''
            else:
                with open(errfilepath, 'r') as error_file:
                    error_output = error_file.read()
            if exit_status == 2:
                raise ApplicationError(
                    'R library not installed: \n' + error_output + '\n')
            else:
                raise ApplicationError(
                    'Unacceptable application exit status: %s, command: %s'
                    % (str(exit_status), command) +
                    ' Program output: \n\n%s\n' % (error_output,))

        # open the stdout and stderr if not being suppressed
        out = None
        if not suppress_stdout:
            out = open(outfilepath, "r")
        err = None
        if not suppress_stderr:
            err = open(errfilepath, "r")

        result = {}
        for i, model in enumerate(model_names):
            subdir = join(output_dir, model)
            # don't attempt to open the out/err files more than once:
            # from the second model on, the handles stay None
            if i == 1:
                out = err = None
            result[model] = CommandLineAppResult(
                out,
                err,
                exit_status,
                result_paths=self._get_result_paths(subdir))

        # Clean up the input file if one was created
        if remove_tmp:
            if self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result
Exemplo n.º 8
0
    def __call__(self, data=None, remove_tmp=True):
        """Run the application on data and return a CommandLineAppResult.

            data: anything that can be cast into a string or written out to
                a file. Usually either a list of things or a single string or
                number. input_handler will be called on this data before it
                is passed as part of the command-line argument, so by creating
                your own input handlers you can customize what kind of data
                you want your application to accept

            remove_tmp: if True, removes the temporary input file recorded
                in self._input_filename (if any)

        The two nupack data files are copied into self.WorkingDir before
        the command is run and removed again afterwards.

        Raises AssertionError if self.HaltExec is set, and
        ApplicationError if the exit status is not accepted by
        self._accept_exit_status.
        """
        # Local import: only os.path.join is needed, for the clean-up paths.
        import os

        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        if suppress_stdout:
            outfile = FilePath('/dev/null')
        else:
            outfile = self.getTmpFilename(self.TmpDir)
        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))
        if data is None:
            input_arg = ''
        else:
            input_arg = getattr(self, input_handler)(data)

        # Build up the command, consisting of a BaseCommand followed by
        # input and output (file) specifications; empty parts are dropped
        command_parts = [self.BaseCommand, str(input_arg), '>', str(outfile),
                         '2>', str(errfile)]
        command = self._command_delimiter.join(
            [_f for _f in command_parts if _f])
        if self.HaltExec:
            raise AssertionError("Halted exec with command:\n" + command)

        # copy over data files
        nupack_data_dna_src = '/'.join([nupack_data_dir, nupack_data_dna])
        nupack_data_rna_src = '/'.join([nupack_data_dir, nupack_data_rna])
        shutil.copy(nupack_data_dna_src, self.WorkingDir)
        shutil.copy(nupack_data_rna_src, self.WorkingDir)

        # The return value of system is a 16-bit number containing the signal
        # number that killed the process, and then the exit status.
        # We only want to keep the exit status so do a right bitwise shift to
        # get rid of the signal number byte
        # NOTE: we copy the data files to the working directory first
        exit_status = system(command) >> 8

        # remove data files; os.path.join yields the right path whether or
        # not WorkingDir ends with a path separator
        nupack_data_dna_dst = os.path.join(self.WorkingDir, nupack_data_dna)
        nupack_data_rna_dst = os.path.join(self.WorkingDir, nupack_data_rna)
        remove(nupack_data_dna_dst)
        remove(nupack_data_rna_dst)

        # Determine if error should be raised due to exit status of
        # application
        if not self._accept_exit_status(exit_status):
            raise ApplicationError('Unacceptable application exit status: %s, command: %s'\
                % (str(exit_status),command))

        # open the stdout and stderr if not being suppressed
        out = None
        if not suppress_stdout:
            out = open(outfile, "r")
        err = None
        if not suppress_stderr:
            err = open(errfile, "r")

        result = CommandLineAppResult(out, err, exit_status,
            result_paths=self._get_result_paths(data))

        # Clean up the input file if one was created
        if remove_tmp:
            if self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result