Пример #1
0
 def _runWekaCommand(self, command, verbose=False):
     """Run a shell `command` with the Weka path appended to CLASSPATH.

     The command line is prefixed with an ``export CLASSPATH=...`` statement
     built from ``settings.WEKA_PATH[self.WEKA_VERSION]`` and executed
     through the shell. The command's output is not returned.

     :param command: Shell command line to run after the CLASSPATH export.
     :param verbose: When true, also log the full command at INFO level
         (it is always logged at DEBUG level).
     :raises ImproperlyConfigured: If no Weka path is configured for
         ``self.WEKA_VERSION`` (the entry is missing or empty).
     :raises subprocess.CalledProcessError: If the command exits non-zero.
     """
     try:
         weka_path = settings.WEKA_PATH[self.WEKA_VERSION]
     except (KeyError, IndexError):
         # A version with no entry at all is the same misconfiguration as
         # an empty entry, so report it the same way instead of leaking a
         # bare lookup error to the caller.
         weka_path = None
     if not weka_path:
         raise ImproperlyConfigured(
             "'WEKA_PATH' is not set in settings.py!")

     set_path = "export CLASSPATH=$CLASSPATH:{}; ".format(weka_path)
     command = set_path + command
     logger.debug("Running in Shell:\n{}".format(command))
     if verbose:
         logger.info("Running in Shell:\n{}".format(command))
     # shell=True is required: the export prefix (and any redirection in
     # `command`) only works when the line is interpreted by a shell.
     subprocess.check_output(command, shell=True)
Пример #2
0
 def _prepareArff(self, reactions, whitelistHeaders, verbose=False):
     """Dump `reactions` into a new, uniquely named *.arff file.

     The file is created under ``settings.TMP_DIR`` and named after the
     stats model's primary key plus a random UUID.

     :param reactions: Object providing ``toArff``; it is asked to write
         itself in expanded form, restricted to `whitelistHeaders`.
     :param whitelistHeaders: Forwarded to ``reactions.toArff``.
     :param verbose: When true, log the destination path at INFO level.
     :returns: Path of the file that was written.
     """
     logger.debug("Preparing ARFF file...")
     # Keep drawing random names until one is not already taken on disk.
     while True:
         arff_name = "{}_{}.arff".format(self.statsModel.pk, uuid.uuid4())
         filepath = os.path.join(settings.TMP_DIR, arff_name)
         if not os.path.isfile(filepath):
             break
     if verbose:
         logger.info("Writing arff to {}".format(filepath))
     with open(filepath, "w") as arff_handle:
         reactions.toArff(arff_handle,
                          expanded=True,
                          whitelistHeaders=whitelistHeaders)
     return filepath
Пример #3
0
    def train(self, verbose=False):
        """Train the weka model.

        Makes sure the training ARFF file exists, then builds and runs the
        Weka training command:

        * no input file recorded yet: write one and save it on the model;
        * input file recorded but missing on disk: fail if ``self.invalid``
          is truthy, otherwise warn, rewrite the file and save the model;
        * input file present: reuse it.

        When ``self.BCR`` is truthy the classifier is wrapped in Weka's
        ``CostSensitiveClassifier`` using the matrix returned by
        ``self.BCR_cost_matrix``.

        :param verbose: Forwarded to the helpers; also enables the INFO
            message emitted when an existing ARFF file is reused.
        :raises RuntimeError: If the recorded ARFF file is missing on disk
            and ``self.invalid`` is truthy.
        """
        # Local import: only part of the file is visible from this block,
        # so the module-level import list cannot be relied on.
        import warnings

        reactions = self.statsModel.trainingSet.reactions.all()
        descriptorHeaders = [
            d.csvHeader
            for d in chain(self.statsModel.container.descriptors,
                           self.statsModel.container.outcomeDescriptors)
        ]
        filePath = self.statsModel.outputFile.name
        if not self.statsModel.inputFile.name:
            self.statsModel.inputFile = self._prepareArff(
                reactions, descriptorHeaders, verbose)
            self.statsModel.save(update_fields=['inputFile'])
        elif not os.path.isfile(self.statsModel.inputFile.name):
            if self.invalid:
                raise RuntimeError(
                    'Could not find statsModel arff file and model is invalid')
            # Warn and carry on. Raising the result of warn() (which is
            # None) would abort here and leave the recreation below
            # unreachable.
            warnings.warn(
                'Could not find statsModel arff file, but model is valid, so recreating'
            )
            self.statsModel.inputFile.name = self._prepareArff(
                reactions, descriptorHeaders, verbose)
            self.statsModel.save(update_fields=['inputFile'])
        elif verbose:
            logger.info("Using existing arff file.")

        arff_file = self.statsModel.inputFile.name

        # Currently, we support only one "response" variable.
        response = list(self.statsModel.container.outcomeDescriptors)[0]
        headers = [
            h for h in reactions.expandedCsvHeaders() if h in descriptorHeaders
        ]
        # Weka's -c option takes a 1-based attribute index.
        response_index = headers.index(response.csvHeader) + 1

        if self.BCR:
            cost_matrix_string = self.BCR_cost_matrix(reactions, response)
            command = "java weka.classifiers.meta.CostSensitiveClassifier -cost-matrix {} -W {} -t {} -d {} -p 0 -c {} -- {}".format(
                cost_matrix_string, self.wekaCommand, arff_file, filePath,
                response_index, self.wekaTrainOptions)
        else:
            command = "java {} -t {} -d {} -p 0 -c {} {}".format(
                self.wekaCommand, arff_file, filePath, response_index,
                self.wekaTrainOptions)
        self._runWekaCommand(command, verbose=verbose)
Пример #4
0
    def predict(self, reactions, verbose=False):
        """Run the stored model over `reactions` and collect its predictions.

        Writes the reactions to a temporary ARFF file, runs the Weka
        command with the saved model file, and parses the results file the
        command writes.

        :param reactions: Reactions to predict; must provide
            ``expandedCsvHeaders`` and be iterable.
        :param verbose: Forwarded to the helpers; also logs the results
            path at INFO level.
        :returns: A one-entry dict mapping the response descriptor to a
            tuple of ``(reaction, predicted value)`` pairs.
        :raises TypeError: If the response descriptor is not one of the
            Bool/Ord/Num/Cat reaction descriptor types.
        """
        whitelist = [
            descriptor.csvHeader
            for descriptor in chain(
                self.statsModel.container.descriptors,
                self.statsModel.container.outcomeDescriptors)
        ]

        arff_path = self._prepareArff(reactions, whitelist, verbose=verbose)
        model_path = self.statsModel.outputFile.name

        results_path = os.path.join(
            settings.TMP_DIR,
            "{}_{}.out".format(self.statsModel.pk, uuid.uuid4()))

        # Only a single "response" variable is supported at the moment.
        kept_headers = []
        for header in reactions.expandedCsvHeaders():
            if header in whitelist:
                kept_headers.append(header)
        response = list(self.statsModel.container.outcomeDescriptors)[0]
        # Weka's -c option takes a 1-based attribute index.
        response_index = kept_headers.index(response.csvHeader) + 1

        weka_call = "java {} -T {} -l {} -p 0 -c {} 1> {}".format(
            self.wekaCommand, arff_path, model_path, response_index,
            results_path)
        if verbose:
            logger.info("Writing results to {}".format(results_path))
        self._runWekaCommand(weka_call, verbose=verbose)

        # Ordered dispatch: the first matching descriptor class wins.
        converters = (
            (rxnDescriptors.BoolRxnDescriptor, booleanConversion),
            (rxnDescriptors.OrdRxnDescriptor, ordConversion),
            (rxnDescriptors.NumRxnDescriptor, numConversion),
            (rxnDescriptors.CatRxnDescriptor, str),
        )
        for descriptor_class, convert in converters:
            if isinstance(response, descriptor_class):
                break
        else:
            raise TypeError("Response descriptor is of invalid type {}".format(
                type(response)))

        predictions = tuple(
            zip(reactions, self._readWekaOutputFile(results_path, convert)))
        return {response: predictions}