def _runWekaCommand(self, command, verbose=False):
    """Run a shell `command` with Weka's jar appended to the CLASSPATH.

    The Weka location is read from ``settings.WEKA_PATH`` using
    ``self.WEKA_VERSION`` as the key. The command is executed through the
    shell, so shell syntax in `command` (e.g. redirection) is honoured.

    Raises:
        ImproperlyConfigured: if the configured Weka path is empty/falsy.
        subprocess.CalledProcessError: if the command exits non-zero
            (raised by ``subprocess.check_output``).
    """
    weka_path = settings.WEKA_PATH[self.WEKA_VERSION]
    if not weka_path:
        raise ImproperlyConfigured(
            "'WEKA_PATH' is not set in settings.py!")

    # Prefix the command with the CLASSPATH export so both run in the
    # same shell invocation.
    shell_line = "export CLASSPATH=$CLASSPATH:{}; ".format(weka_path) + command

    message = "Running in Shell:\n{}".format(shell_line)
    logger.debug(message)
    if verbose:
        logger.info(message)

    # The captured output is intentionally discarded; only the exit
    # status check matters here.
    subprocess.check_output(shell_line, shell=True)
def _prepareArff(self, reactions, whitelistHeaders, verbose=False):
    """Dump `reactions` to a freshly named *.arff file and return its path.

    The file is created in ``settings.TMP_DIR`` and named
    ``<statsModel.pk>_<uuid4>.arff``. The content is produced by
    ``reactions.toArff`` with ``expanded=True`` and the given
    `whitelistHeaders`.
    """
    logger.debug("Preparing ARFF file...")

    # Extra paranoia: keep drawing names until one is not already taken
    # on disk.
    while True:
        candidate = "{}_{}.arff".format(self.statsModel.pk, uuid.uuid4())
        arff_path = os.path.join(settings.TMP_DIR, candidate)
        if not os.path.isfile(arff_path):
            break

    if verbose:
        logger.info("Writing arff to {}".format(arff_path))

    with open(arff_path, "w") as handle:
        reactions.toArff(handle,
                         expanded=True,
                         whitelistHeaders=whitelistHeaders)
    return arff_path
def train(self, verbose=False):
    """Train the weka model.

    Ensures an ARFF input file exists for the training set, then builds
    and runs the Weka training command, writing the trained model to
    ``self.statsModel.outputFile.name``.

    ARFF handling:
      * no input file recorded -> generate one and save it on the model;
      * input file recorded but missing on disk -> raise if
        ``self.invalid`` is truthy, otherwise emit a warning and
        regenerate the file;
      * input file present -> reuse it.

    Raises:
        RuntimeError: if the recorded ARFF file is missing on disk and
            ``self.invalid`` is truthy.
    """
    # Local import: only this method needs it, and the file's import
    # block is not guaranteed to provide it.
    import warnings

    reactions = self.statsModel.trainingSet.reactions.all()
    descriptorHeaders = [
        d.csvHeader
        for d in chain(self.statsModel.container.descriptors,
                       self.statsModel.container.outcomeDescriptors)
    ]
    filePath = self.statsModel.outputFile.name

    if not self.statsModel.inputFile.name:
        self.statsModel.inputFile = self._prepareArff(
            reactions, descriptorHeaders, verbose)
        self.statsModel.save(update_fields=['inputFile'])
    elif not os.path.isfile(self.statsModel.inputFile.name):
        if self.invalid:
            raise RuntimeError(
                'Could not find statsModel arff file and model is invalid')
        # Previously this was `raise warning.warn(...)`, which always
        # crashed (undefined name / raising None) and left the
        # regeneration below unreachable. Warn, then actually recreate.
        warnings.warn(
            'Could not find statsModel arff file, but model is valid, so recreating'
        )
        self.statsModel.inputFile.name = self._prepareArff(
            reactions, descriptorHeaders, verbose)
        self.statsModel.save(update_fields=['inputFile'])
    elif verbose:
        logger.info("Using existing arff file.")

    arff_file = self.statsModel.inputFile.name

    # Currently, we support only one "response" variable.
    response = list(self.statsModel.container.outcomeDescriptors)[0]
    headers = [
        h for h in reactions.expandedCsvHeaders() if h in descriptorHeaders
    ]
    # Weka's -c option takes a 1-based attribute index.
    response_index = headers.index(response.csvHeader) + 1

    if self.BCR:
        # Wrap the classifier in a cost-sensitive meta classifier using
        # the matrix computed by BCR_cost_matrix.
        cost_matrix_string = self.BCR_cost_matrix(reactions, response)
        command = (
            "java weka.classifiers.meta.CostSensitiveClassifier "
            "-cost-matrix {} -W {} -t {} -d {} -p 0 -c {} -- {}"
        ).format(cost_matrix_string, self.wekaCommand, arff_file, filePath,
                 response_index, self.wekaTrainOptions)
    else:
        command = "java {} -t {} -d {} -p 0 -c {} {}".format(
            self.wekaCommand, arff_file, filePath, response_index,
            self.wekaTrainOptions)

    self._runWekaCommand(command, verbose=verbose)
def predict(self, reactions, verbose=False):
    """Run the trained model over `reactions` and return its predictions.

    An ARFF file is written for `reactions`, Weka is invoked with the
    saved model file, and its standard output is redirected to a uniquely
    named ``.out`` file in ``settings.TMP_DIR``, which is then parsed.

    Returns:
        dict: ``{response: results}`` where ``response`` is the first
        outcome descriptor and ``results`` is a tuple of
        ``(reaction, predicted_value)`` pairs.

    Raises:
        TypeError: if the response descriptor is not one of the Bool, Ord,
            Num or Cat reaction descriptor types.
    """
    container = self.statsModel.container
    descriptorHeaders = [
        descriptor.csvHeader
        for descriptor in chain(container.descriptors,
                                container.outcomeDescriptors)
    ]

    arff_file = self._prepareArff(reactions,
                                  descriptorHeaders,
                                  verbose=verbose)
    model_file = self.statsModel.outputFile.name

    results_path = os.path.join(
        settings.TMP_DIR,
        "{}_{}.out".format(self.statsModel.pk, uuid.uuid4()))

    # Currently, we support only one "response" variable.
    headers = [
        header for header in reactions.expandedCsvHeaders()
        if header in descriptorHeaders
    ]
    response = list(self.statsModel.container.outcomeDescriptors)[0]
    # Weka's -c option takes a 1-based attribute index.
    response_index = 1 + headers.index(response.csvHeader)

    # Weka prints predictions on stdout; redirect them into results_path.
    command = "java {} -T {} -l {} -p 0 -c {} 1> {}".format(
        self.wekaCommand, arff_file, model_file, response_index,
        results_path)
    if verbose:
        logger.info("Writing results to {}".format(results_path))
    self._runWekaCommand(command, verbose=verbose)

    # Choose how raw Weka output is converted, based on the kind of
    # descriptor being predicted. Checked in this fixed order.
    if isinstance(response, rxnDescriptors.BoolRxnDescriptor):
        typeConversionFunction = booleanConversion
    elif isinstance(response, rxnDescriptors.OrdRxnDescriptor):
        typeConversionFunction = ordConversion
    elif isinstance(response, rxnDescriptors.NumRxnDescriptor):
        typeConversionFunction = numConversion
    elif isinstance(response, rxnDescriptors.CatRxnDescriptor):
        typeConversionFunction = str
    else:
        raise TypeError("Response descriptor is of invalid type {}".format(
            type(response)))

    predictions = self._readWekaOutputFile(results_path,
                                           typeConversionFunction)
    # Pair each reaction with its prediction, in order.
    results = tuple(zip(reactions, predictions))
    return {response: results}