def runMonteCarloSelection(self, feature_set, trainer, input_folder, num_combos):
    """Run every outer Monte Carlo permutation for one feature set and persist the summary.

    Each permutation re-formats ``self.inputs``, trims the training and testing
    matrices down to ``feature_set``, picks hyperparameters through
    ``determineOptimalHyperparameters`` and scores the resulting model through
    ``fetchOuterPermutationModelScore``. Per-permutation scores, accuracies,
    feature importances and (for Random Subset Elastic Net only) phrases are
    collected, then averaged and written out.

    :param feature_set: the feature/gene-list combination being evaluated.
    :param trainer: trainer object; its ``algorithm`` attribute selects the
        permutation count and is used when writing output.
    :param input_folder: folder handed to the logging and output writers.
    :param num_combos: forwarded unchanged to ``writeToCSVInLock``.
    :return: None. Results go to ``writeToCSVInLock`` and ``saveOutputToTxtFile``.
    """
    feature_set_as_string = self.generateFeatureSetString(feature_set)
    outer_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, True)

    scores, accuracies, scores_and_hyperparams = [], [], []
    importances, important_rsen_phrases = {}, {}

    for perm_number in range(1, outer_perms + 1):
        self.log.info("Computing outer Monte Carlo Permutation %s for %s.\n", perm_number, feature_set_as_string)
        formatted_data = self.formatData(self.inputs, True, True)
        if self.inputs.analysisType() is AnalysisType.NO_GENE_LISTS:
            self.logKeptFeatures(formatted_data, perm_number, input_folder, trainer)

        self.log.info("Creating train and test matrices by feature set: %s.", feature_set_as_string)
        training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TRAINING_MATRIX, feature_set, formatted_data, self.inputs.analysisType())
        testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TESTING_MATRIX, feature_set, formatted_data, self.inputs.analysisType())

        optimal_hyperparams = self.determineOptimalHyperparameters(feature_set, formatted_data, trainer)
        record_diagnostics = self.inputs.record_diagnostics
        trainer.logIfBestHyperparamsOnRangeThreshold(optimal_hyperparams, record_diagnostics, input_folder)
        trainer.logOptimalHyperParams(optimal_hyperparams, self.generateFeatureSetString(feature_set),
                                      record_diagnostics, input_folder)

        # outcome is indexed as: [0] score, [1] accuracy, [2] importances, optional [3] phrases.
        outcome = self.fetchOuterPermutationModelScore(feature_set, trainer, optimal_hyperparams,
                                                       testing_matrix, training_matrix)
        scores.append(outcome[0])
        accuracies.append(outcome[1])

        # Group this permutation's importance values under their feature name.
        for feature_name, importance in outcome[2].items():
            importances.setdefault(feature_name, []).append(importance)

        # The fourth element is only consumed for Random Subset Elastic Net.
        if (len(outcome) == 4
                and trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET):
            for phrase, phrase_value in outcome[3].items():
                important_rsen_phrases.setdefault(phrase, []).append(phrase_value)

        scores_and_hyperparams.append(self.generateScoreAndHyperParam(outcome[0], optimal_hyperparams))
        GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)

    average_score = numpy.mean(scores)
    average_accuracy = numpy.mean(accuracies)
    self.log.debug("Average score and accuracy of all Monte Carlo runs for %s: %s, %s",
                   feature_set_as_string, average_score, average_accuracy)

    ordered_importances = self.averageAndSortImportances(importances, outer_perms)
    ordered_phrases = self.averageAndSortImportantRSENPhrases(important_rsen_phrases, trainer)
    line = self.generateLine(average_accuracy, feature_set_as_string, ordered_importances, ordered_phrases,
                             average_score, scores_and_hyperparams)
    self.writeToCSVInLock(line, input_folder, trainer.algorithm, num_combos, outer_perms)
    self.saveOutputToTxtFile(scores, accuracies, feature_set_as_string, input_folder, trainer.algorithm)
def analyzeIndividualGeneListCombo(self, gene_list_combos, input_folder, is_classifier):
    """Train and score one configured combo/algorithm pair, once per outer permutation.

    Walks ``gene_list_combos`` until one whose string form equals the combo
    named in ``self.inputs.individual_train_config``. For that combo a model is
    built with the configured hyperparameters on freshly formatted data for
    each of ``self.inputs.outer_monte_carlo_permutations`` runs, and one CSV
    line is written per run. If no combo matches, an info message is logged.

    :param gene_list_combos: iterable of candidate feature/gene-list combos.
    :param input_folder: folder forwarded to ``writeToCSVInLock``.
    :param is_classifier: forwarded to the trainer factory.
    :return: None.
    """
    config = self.inputs.individual_train_config
    target_combo = config.combo
    target_algorithm = config.algorithm
    rsen_config = self.inputs.rsen_config
    outer_monte_carlo_loops = self.inputs.outer_monte_carlo_permutations

    for gene_list_combo in gene_list_combos:
        if self.generateFeatureSetString(gene_list_combo) != target_combo:
            continue

        trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier, target_algorithm,
                                                                       rsen_config)
        hyperparams = self.fetchAndCastHyperparams(config, trainer)

        for permutation in range(outer_monte_carlo_loops):
            results = self.inputs.results
            formatted_data = self.formatData(self.inputs, True, True)
            training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
                DataFormattingService.TRAINING_MATRIX, gene_list_combo, formatted_data,
                self.inputs.analysisType())
            testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
                DataFormattingService.TESTING_MATRIX, gene_list_combo, formatted_data,
                self.inputs.analysisType())

            features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(training_matrix, results)
            feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES)
            model = trainer.buildModel(relevant_results, features, hyperparams, feature_names)

            model_score = trainer.fetchPredictionsAndScore(model, testing_matrix, results)
            score, accuracy = model_score[0], model_score[1]

            # averageAndSortImportances is handed lists of values, so wrap each single value in place.
            importances = trainer.fetchFeatureImportances(model, feature_names)
            for feature_name in importances:
                importances[feature_name] = [importances[feature_name]]
            ordered_importances = self.averageAndSortImportances(importances, 1)

            ordered_phrases = self.averageAndSortImportantRSENPhrases(
                trainer.fetchModelPhrases(model, gene_list_combo), trainer)

            numbered_combo = target_combo + " RUN " + SafeCastUtil.safeCast(permutation, str)
            self.log.debug("Final score and accuracy of individual analysis for feature gene combo %s "
                           "using algorithm %s: %s, %s", numbered_combo, target_algorithm, score, accuracy)

            score_and_hyperparam = [self.generateScoreAndHyperParam(score, hyperparams)]
            line = self.generateLine(accuracy, numbered_combo, ordered_importances, ordered_phrases, score,
                                     score_and_hyperparam)
            self.writeToCSVInLock(line, input_folder, target_algorithm, outer_monte_carlo_loops, 1)

        # Only the first matching combo is analyzed.
        return

    self.log.info("Gene list feature file %s combo not found in current dataset.", target_combo)
def generateSinglePrediction(self, best_model, best_combo, cell_line, all_features, formatted_inputs):
    """Predict a value for a single cell line using an already-built model.

    The cell line's entry is pulled from the training matrix in
    ``formatted_inputs``, wrapped in a one-entry matrix together with
    ``all_features``, trimmed to ``best_combo`` and passed to
    ``best_model.predict``.

    :param best_model: object exposing ``predict``.
    :param best_combo: feature combo used to trim the matrix.
    :param cell_line: key of the cell line to predict for.
    :param all_features: value stored under the feature-names key of the wrapper.
    :param formatted_inputs: mapping that holds the training matrix.
    :return: the first element of the model's prediction output.
    """
    matrix_key = DataFormattingService.TRAINING_MATRIX
    cell_line_entry = formatted_inputs.get(matrix_key).get(cell_line)

    # Build a matrix that contains nothing but the requested cell line.
    single_entry_matrix = OrderedDict()
    single_entry_matrix[cell_line] = cell_line_entry

    input_wrapper = OrderedDict()
    input_wrapper[matrix_key] = single_entry_matrix
    input_wrapper[ArgumentProcessingService.FEATURE_NAMES] = all_features

    trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
        matrix_key, best_combo, input_wrapper, AnalysisType.RECOMMENDATIONS)
    predictions = best_model.predict([trimmed_matrix.get(cell_line)])
    return predictions[0]
def determineInnerHyperparameters(self, feature_set, formatted_data, trainer):
    """Collect hyperparameterization results across the inner Monte Carlo permutations.

    For every inner permutation the outer training matrix is re-wrapped as
    inputs, formatted again, trimmed to ``feature_set`` and passed to
    ``trainer.hyperparameterize``. The values returned for each key are
    accumulated in a list.

    :param feature_set: feature combo used to trim the inner matrices.
    :param formatted_data: outer-permutation data holding the training matrix
        and feature names.
    :param trainer: trainer exposing ``algorithm`` and ``hyperparameterize``.
    :return: dict mapping each key returned by ``hyperparameterize`` to the
        list of values seen for it, one per permutation that produced the key.
    """
    results_by_hyperparam = {}
    inner_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, False)

    for perm_number in range(1, inner_perms + 1):
        self.log.info("Computing inner Monte Carlo Permutation %s for %s.\n", perm_number,
                      self.generateFeatureSetString(feature_set))
        GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)

        formatted_inputs = self.reformatInputsByTrainingMatrix(
            formatted_data.get(DataFormattingService.TRAINING_MATRIX),
            formatted_data.get(ArgumentProcessingService.FEATURE_NAMES))
        further_formatted_data = self.formatData(formatted_inputs, False, False)

        # The inner "testing" split is the validation set for hyperparameter search.
        inner_validation_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TESTING_MATRIX, feature_set, further_formatted_data,
            formatted_inputs.analysisType())
        inner_train_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TRAINING_MATRIX, feature_set, further_formatted_data,
            formatted_inputs.analysisType())

        model_data = trainer.hyperparameterize(inner_train_matrix, inner_validation_matrix,
                                               self.inputs.results)
        for hyperparam_key, value in model_data.items():
            results_by_hyperparam.setdefault(hyperparam_key, []).append(value)

    return results_by_hyperparam
def trainBestModelWithCombo(self, best_scoring_algo, best_scoring_combo, optimal_hyperparams,
                            trimmed_cell_lines, trimmed_results, processed_arguments):
    """Build a model for the given algorithm/combo pair with the supplied hyperparameters.

    :param best_scoring_algo: algorithm identifier passed to the trainer factory.
    :param best_scoring_combo: feature combo used to trim the training matrix.
    :param optimal_hyperparams: hyperparameters, converted with ``DictionaryUtility.toDict``.
    :param trimmed_cell_lines: formatted input from which the training matrix is trimmed.
    :param trimmed_results: results passed to ``populateFeaturesAndResultsByCellLine``.
    :param processed_arguments: supplies ``is_classifier`` and ``rsen_config``.
    :return: tuple ``(model, trainer)``.
    """
    is_classifier, rsen_config = processed_arguments.is_classifier, processed_arguments.rsen_config

    training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
        DataFormattingService.TRAINING_MATRIX, best_scoring_combo, trimmed_cell_lines,
        AnalysisType.RECOMMENDATIONS)
    trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier, best_scoring_algo,
                                                                   rsen_config)

    features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(training_matrix,
                                                                              trimmed_results)
    hyperparams_as_dict = DictionaryUtility.toDict(optimal_hyperparams)
    feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES)

    model = trainer.buildModel(relevant_results, features, hyperparams_as_dict, feature_names)
    return (model, trainer)
def generateMultiplePredictions(self, recs_model_info, formatted_inputs, results, cell_line_predictions_by_drug):
    """Append one drug's predictions to the per-cell-line prediction table.

    ``cell_line_predictions_by_drug`` is mutated in place. Every cell line in
    the trimmed matrix gets this model's prediction appended to its row; cell
    lines seen for the first time get a new row that starts with the cell line
    name. The standard deviation, mean and median of all predictions are
    appended to the rows keyed by ``self.STD_DEVIATION``, ``self.MEAN`` and
    ``self.MEDIAN``, which must already exist.

    Rows are aligned to the table width measured *before* this call modifies
    anything: a new row, or an existing row that is shorter than that width,
    is padded with ``MachineLearningService.DELIMITER`` so the new prediction
    lands in the same column for every cell line. The row keyed by
    ``self.HEADER`` is ignored when measuring the width.

    :param recs_model_info: object exposing ``combo``, ``trainer`` and ``model``.
    :param formatted_inputs: formatted input from which the matrix is trimmed.
    :param results: results passed to ``populateFeaturesAndResultsByCellLine``.
    :param cell_line_predictions_by_drug: dict of row key -> list, updated in place.
    :return: None.
    """
    trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
        DataFormattingService.TRAINING_MATRIX, recs_model_info.combo, formatted_inputs,
        AnalysisType.RECOMMENDATIONS)
    features, _ = recs_model_info.trainer.populateFeaturesAndResultsByCellLine(trimmed_matrix, results)

    # Compare by value, not identity: `is not` would let an equal-but-distinct
    # key object through and treat the feature-name entry as a cell line.
    cell_lines_in_order = [key for key in trimmed_matrix.keys()
                           if key != ArgumentProcessingService.FEATURE_NAMES]
    predictions = recs_model_info.model.predict(features)

    # Measure the table width once, before any row grows. Measuring inside the
    # loop made the padding depend on whether a known cell line had already
    # been appended to, leaving a new row one column short if it came first.
    columns_so_far = max([1] + [len(row) for key, row in cell_line_predictions_by_drug.items()
                                if key != self.HEADER])

    for i, cell_line in enumerate(cell_lines_in_order):
        row = cell_line_predictions_by_drug.get(cell_line)
        if row is None:
            row = [cell_line]
            cell_line_predictions_by_drug[cell_line] = row
        # Fill columns for drugs this cell line had no prediction for.
        missing_columns = columns_so_far - len(row)
        if missing_columns > 0:
            row.extend([MachineLearningService.DELIMITER] * missing_columns)
        row.append(predictions[i])

    cell_line_predictions_by_drug[self.STD_DEVIATION].append(numpy.std(predictions))
    cell_line_predictions_by_drug[self.MEAN].append(numpy.mean(predictions))
    cell_line_predictions_by_drug[self.MEDIAN].append(numpy.median(predictions))