def runMonteCarloSelection(self, feature_set, trainer, input_folder, num_combos):
    """
    Run every outer Monte Carlo permutation for one feature set and persist the aggregated outcome.

    Each permutation re-formats the inputs, trims the training and testing matrices down to
    ``feature_set``, determines optimal hyperparameters and scores the resulting model. Scores,
    accuracies, feature importances and (Random Subset Elastic Net only) important phrases are
    collected across permutations, averaged, and handed to the CSV and TXT writers.

    :param feature_set: feature set (combo) being evaluated.
    :param trainer: trainer whose ``algorithm`` selects the number of outer permutations and is
        passed on to the output writers.
    :param input_folder: folder handed to the diagnostic loggers and the output writers.
    :param num_combos: forwarded unchanged to ``writeToCSVInLock``.
    :return: None; results are emitted through ``writeToCSVInLock`` and ``saveOutputToTxtFile``.
    """
    scores = []
    accuracies = []
    importances = {}
    # The string form of the feature set never changes between permutations, so build it once and
    # reuse it for every log line and for the hyperparameter diagnostics below.
    feature_set_as_string = self.generateFeatureSetString(feature_set)
    outer_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, True)
    important_rsen_phrases = {}
    scores_and_hyperparams = []

    for i in range(1, outer_perms + 1):
        self.log.info("Computing outer Monte Carlo Permutation %s for %s.\n", i, feature_set_as_string)
        formatted_data = self.formatData(self.inputs, True, True)
        analysis_type = self.inputs.analysisType()
        if analysis_type is AnalysisType.NO_GENE_LISTS:
            self.logKeptFeatures(formatted_data, i, input_folder, trainer)

        self.log.info("Creating train and test matrices by feature set: %s.", feature_set_as_string)
        training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TRAINING_MATRIX, feature_set, formatted_data, analysis_type)
        testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TESTING_MATRIX, feature_set, formatted_data, analysis_type)

        optimal_hyperparams = self.determineOptimalHyperparameters(feature_set, formatted_data, trainer)
        record_diagnostics = self.inputs.record_diagnostics
        trainer.logIfBestHyperparamsOnRangeThreshold(optimal_hyperparams, record_diagnostics, input_folder)
        trainer.logOptimalHyperParams(optimal_hyperparams, feature_set_as_string, record_diagnostics,
                                      input_folder)

        # prediction_data is indexed rather than unpacked because it may carry either three or
        # four entries (the fourth, when present, holds the RSEN phrases).
        prediction_data = self.fetchOuterPermutationModelScore(feature_set, trainer, optimal_hyperparams,
                                                               testing_matrix, training_matrix)
        scores.append(prediction_data[0])
        accuracies.append(prediction_data[1])
        for feature, importance in prediction_data[2].items():
            importances.setdefault(feature, []).append(importance)

        if len(prediction_data) == 4 and \
                trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET:
            for phrase, phrase_importance in prediction_data[3].items():
                important_rsen_phrases.setdefault(phrase, []).append(phrase_importance)

        scores_and_hyperparams.append(self.generateScoreAndHyperParam(prediction_data[0], optimal_hyperparams))

        GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)

    average_score = numpy.mean(scores)
    average_accuracy = numpy.mean(accuracies)
    self.log.debug("Average score and accuracy of all Monte Carlo runs for %s: %s, %s",
                   feature_set_as_string, average_score, average_accuracy)
    ordered_importances = self.averageAndSortImportances(importances, outer_perms)
    ordered_phrases = self.averageAndSortImportantRSENPhrases(important_rsen_phrases, trainer)

    line = self.generateLine(average_accuracy, feature_set_as_string, ordered_importances, ordered_phrases,
                             average_score, scores_and_hyperparams)
    self.writeToCSVInLock(line, input_folder, trainer.algorithm, num_combos, outer_perms)
    self.saveOutputToTxtFile(scores, accuracies, feature_set_as_string, input_folder, trainer.algorithm)
def determineBestComboFromString(self, best_combo_string, combos, processed_arguments):
    """
    Find the combo whose generated feature set string is equivalent to ``best_combo_string``.

    :param best_combo_string: textual description of the combo being looked for.
    :param combos: candidate combos, examined in iteration order.
    :param processed_arguments: supplies the gene lists, the RSEN ``combine_gene_lists`` flag, the
        analysis type and the static features used to generate each candidate's string.
    :return: the first candidate that ``GeneListComboUtility.combosAreEquivalent`` accepts.
    :raises ValueError: if no candidate is accepted.
    """
    # Read everything needed from the arguments once, before the search starts.
    known_gene_lists = processed_arguments.gene_lists
    merge_gene_lists = processed_arguments.rsen_config.combine_gene_lists
    kind_of_analysis = processed_arguments.analysisType()
    fixed_features = processed_arguments.static_features

    def describes_best_combo(candidate):
        candidate_string = GeneListComboUtility.generateFeatureSetString(
            candidate, known_gene_lists, merge_gene_lists, kind_of_analysis, fixed_features)
        return GeneListComboUtility.combosAreEquivalent(candidate_string, best_combo_string)

    # A private sentinel distinguishes "nothing matched" from any value a combo could take.
    not_found = object()
    match = next((candidate for candidate in combos if describes_best_combo(candidate)), not_found)
    if match is not_found:
        raise ValueError(
            "Unable to determine feature set from given combo gene list and feature file combo: " +
            best_combo_string +
            ".\n Please make sure all gene lists and feature files in the combo are present in the drug folder.")
    return match
def analyzeIndividualGeneListCombo(self, gene_list_combos, input_folder, is_classifier):
    """
    Train and score one user-specified combo/algorithm pair, once per outer Monte Carlo permutation.

    The target combo and algorithm come from ``self.inputs.individual_train_config``. The first
    combo whose feature set string equals the target is used; each permutation builds a model with
    the configured hyperparameters and writes one CSV line labelled ``"<combo> RUN <n>"``. If no
    combo matches, an info message is logged and nothing is written.

    :param gene_list_combos: combos to search for the target combo.
    :param input_folder: folder handed to ``writeToCSVInLock``.
    :param is_classifier: passed to the trainer factory.
    :return: None.
    """
    individual_config = self.inputs.individual_train_config
    wanted_combo = individual_config.combo
    wanted_algorithm = individual_config.algorithm
    rsen_settings = self.inputs.rsen_config
    run_count = self.inputs.outer_monte_carlo_permutations

    for candidate_combo in gene_list_combos:
        if self.generateFeatureSetString(candidate_combo) != wanted_combo:
            continue

        trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier, wanted_algorithm,
                                                                       rsen_settings)
        hyperparams = self.fetchAndCastHyperparams(individual_config, trainer)

        for run_index in range(run_count):
            results = self.inputs.results
            formatted_data = self.formatData(self.inputs, True, True)
            train_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
                DataFormattingService.TRAINING_MATRIX, candidate_combo, formatted_data,
                self.inputs.analysisType())
            test_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
                DataFormattingService.TESTING_MATRIX, candidate_combo, formatted_data,
                self.inputs.analysisType())

            features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(train_matrix, results)
            feature_names = train_matrix.get(ArgumentProcessingService.FEATURE_NAMES)
            model = trainer.buildModel(relevant_results, features, hyperparams, feature_names)

            scored = trainer.fetchPredictionsAndScore(model, test_matrix, results)
            score, accuracy = scored[0], scored[1]

            # Wrap every importance in a single-element list, in place, which is the shape
            # averageAndSortImportances is given here (with a permutation count of 1).
            importances = trainer.fetchFeatureImportances(model, feature_names)
            for feature in importances.keys():
                importances[feature] = [importances[feature]]
            ordered_importances = self.averageAndSortImportances(importances, 1)
            ordered_phrases = self.averageAndSortImportantRSENPhrases(
                trainer.fetchModelPhrases(model, candidate_combo), trainer)

            run_label = wanted_combo + " RUN " + SafeCastUtil.safeCast(run_index, str)
            self.log.debug("Final score and accuracy of individual analysis for feature gene combo %s "
                           "using algorithm %s: %s, %s", run_label, wanted_algorithm, score, accuracy)

            line = self.generateLine(accuracy, run_label, ordered_importances, ordered_phrases, score,
                                     [self.generateScoreAndHyperParam(score, hyperparams)])
            self.writeToCSVInLock(line, input_folder, wanted_algorithm, run_count, 1)

        # Only the first matching combo is analysed.
        return

    self.log.info("Gene list feature file %s combo not found in current dataset.", wanted_combo)
def determineGeneListCombos(self):
    """
    Build the list of feature set combos to analyse from ``self.inputs``.

    When the analysis type is ``AnalysisType.NO_GENE_LISTS`` the result is a single combo made up
    of all feature names. Otherwise the combos come from ``GeneListComboUtility.determineCombos``,
    and a warning is logged if their number differs from the expected length it reports.

    :return: list of combos.
    """
    inputs = self.inputs
    all_feature_names = inputs.features.get(ArgumentProcessingService.FEATURE_NAMES)
    if inputs.analysisType() is AnalysisType.NO_GENE_LISTS:
        return [[all_feature_names]]

    generated, expected_count = GeneListComboUtility.determineCombos(
        inputs.gene_lists, all_feature_names, inputs.static_features)
    actual_count = len(generated)
    if actual_count != expected_count:
        self.log.warning("Unexpected number of combos detected, should be %s but instead created %s.\n%s",
                         expected_count, actual_count, generated)
    return generated
def generateSinglePrediction(self, best_model, best_combo, cell_line, all_features, formatted_inputs):
    """
    Predict a value for one cell line with an already trained model.

    The cell line's row is taken from the training matrix in ``formatted_inputs``, wrapped in a
    minimal single-row input structure, trimmed to ``best_combo`` and passed to the model.

    :param best_model: object exposing ``predict``.
    :param best_combo: feature set used to trim the row.
    :param cell_line: key of the row to predict for.
    :param all_features: stored in the wrapper under ``ArgumentProcessingService.FEATURE_NAMES``.
    :param formatted_inputs: mapping holding the training matrix.
    :return: the first element of the model's prediction for the trimmed row.
    """
    matrix_key = DataFormattingService.TRAINING_MATRIX
    cell_line_row = formatted_inputs.get(matrix_key).get(cell_line)

    # Single-row stand-in for the full formatted inputs, in the layout the trimming utility reads.
    single_row_inputs = OrderedDict([
        (matrix_key, OrderedDict([(cell_line, cell_line_row)])),
        (ArgumentProcessingService.FEATURE_NAMES, all_features),
    ])

    trimmed = GeneListComboUtility.trimMatrixByFeatureSet(
        matrix_key, best_combo, single_row_inputs, AnalysisType.RECOMMENDATIONS)
    prediction = best_model.predict([trimmed.get(cell_line)])
    return prediction[0]
def determineGeneListCombos(self, processed_arguments):
    """
    Build the feature set combos for the given processed arguments.

    Combos come from ``GeneListComboUtility.determineCombos``; a warning is logged if their number
    differs from the expected length that call reports.

    :param processed_arguments: supplies ``gene_lists``, ``features`` and ``static_features``.
    :return: list of combos.
    """
    all_feature_names = processed_arguments.features.get(ArgumentProcessingService.FEATURE_NAMES)
    generated, expected_count = GeneListComboUtility.determineCombos(
        processed_arguments.gene_lists, all_feature_names, processed_arguments.static_features)

    actual_count = len(generated)
    if actual_count != expected_count:
        self.log.warning("Unexpected number of combos detected, should be %s but instead created %s.\n%s",
                         expected_count, actual_count, generated)
    return generated
def determineInnerHyperparameters(self, feature_set, formatted_data, trainer):
    """
    Run the inner Monte Carlo loop and collect hyperparameterization results per key.

    For each inner permutation the outer training matrix is re-formatted into its own
    training/testing split, trimmed to ``feature_set`` and passed to ``trainer.hyperparameterize``.
    The value returned for each key is appended to that key's list.

    :param feature_set: feature set used to trim the inner matrices.
    :param formatted_data: outer-permutation data holding the training matrix and feature names.
    :param trainer: trainer whose ``algorithm`` selects the number of inner permutations.
    :return: dict mapping each key returned by ``hyperparameterize`` to a list holding one entry
        per permutation that returned that key.
    """
    collected = {}
    permutation_count = self.monteCarloPermsByAlgorithm(trainer.algorithm, False)

    for permutation_number in range(1, permutation_count + 1):
        self.log.info("Computing inner Monte Carlo Permutation %s for %s.\n", permutation_number,
                      self.generateFeatureSetString(feature_set))
        GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)

        # Treat the outer training matrix as a fresh input set and split it again.
        inner_inputs = self.reformatInputsByTrainingMatrix(
            formatted_data.get(DataFormattingService.TRAINING_MATRIX),
            formatted_data.get(ArgumentProcessingService.FEATURE_NAMES))
        inner_data = self.formatData(inner_inputs, False, False)

        validation_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TESTING_MATRIX, feature_set, inner_data, inner_inputs.analysisType())
        train_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TRAINING_MATRIX, feature_set, inner_data, inner_inputs.analysisType())

        permutation_results = trainer.hyperparameterize(train_matrix, validation_matrix, self.inputs.results)
        for key in permutation_results.keys():
            collected.setdefault(key, []).append(permutation_results[key])

    return collected
def trainBestModelWithCombo(self, best_scoring_algo, best_scoring_combo, optimal_hyperparams, trimmed_cell_lines,
                            trimmed_results, processed_arguments):
    """
    Build a model for the given algorithm, combo and hyperparameters.

    :param best_scoring_algo: algorithm identifier passed to the trainer factory.
    :param best_scoring_combo: feature set used to trim the training matrix.
    :param optimal_hyperparams: converted with ``DictionaryUtility.toDict`` before building.
    :param trimmed_cell_lines: formatted input data the training matrix is trimmed from.
    :param trimmed_results: results handed to ``populateFeaturesAndResultsByCellLine``.
    :param processed_arguments: supplies ``is_classifier`` and ``rsen_config``.
    :return: tuple ``(model, trainer)``.
    """
    classifier_flag = processed_arguments.is_classifier
    rsen_settings = processed_arguments.rsen_config

    combo_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
        DataFormattingService.TRAINING_MATRIX, best_scoring_combo, trimmed_cell_lines,
        AnalysisType.RECOMMENDATIONS)
    trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(classifier_flag, best_scoring_algo,
                                                                   rsen_settings)
    features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(combo_matrix, trimmed_results)

    model = trainer.buildModel(
        relevant_results,
        features,
        DictionaryUtility.toDict(optimal_hyperparams),
        combo_matrix.get(ArgumentProcessingService.FEATURE_NAMES))
    return model, trainer
def determineSpecificCombos(self, all_combos):
    """
    Restrict ``all_combos`` to those requested in ``self.inputs.specific_combos``.

    A combo is selected when its feature set string equals a requested string, or when
    ``GeneListComboUtility.combosAreEquivalent`` accepts the pair. Each combo is selected at most
    once, keyed by its feature set string. A warning is logged when fewer combos were selected
    than were requested; otherwise the selected names are logged at info level.

    :param all_combos: every available combo.
    :return: list of the selected combos, in the order they were first matched.
    """
    specific_combos = self.inputs.specific_combos

    # A combo's string form does not depend on which requested combo it is compared against,
    # so generate it once per combo rather than once per (requested combo, combo) pair.
    named_combos = [(self.generateFeatureSetString(combo), combo) for combo in all_combos]

    selected_combos = {}
    for specific_combo in specific_combos:
        for combo_string, combo in named_combos:
            if combo_string in selected_combos:
                continue
            if specific_combo == combo_string or \
                    GeneListComboUtility.combosAreEquivalent(combo_string, specific_combo):
                selected_combos[combo_string] = combo

    selected_combo_names = SafeCastUtil.safeCast(selected_combos.keys(), list)
    if len(selected_combo_names) < len(specific_combos):
        self.log.warning("Not all specified combos were available in this data folder.\n"
                         "Specified combos: %s\n Selected combos: %s", specific_combos, selected_combo_names)
    else:
        self.log.info("Only running analysis on following combos:\n %s", selected_combo_names)
    return SafeCastUtil.safeCast(selected_combos.values(), list)
def generateMultiplePredictions(self, recs_model_info, formatted_inputs, results, cell_line_predictions_by_drug):
    """
    Predict for every cell line in the trimmed matrix and append the values to the running table.

    ``cell_line_predictions_by_drug`` is mutated in place: each known cell line gets its prediction
    appended to its row, and each unseen cell line gets a new row padded with
    ``MachineLearningService.DELIMITER``. The standard deviation, mean and median of all
    predictions are then appended to the ``STD_DEVIATION``, ``MEAN`` and ``MEDIAN`` rows.

    :param recs_model_info: carries the ``combo``, ``trainer`` and ``model`` to predict with.
    :param formatted_inputs: formatted input data the matrix is trimmed from.
    :param results: results handed to ``populateFeaturesAndResultsByCellLine``.
    :param cell_line_predictions_by_drug: dict of row lists keyed by cell line (plus the header and
        statistics rows); must already contain the three statistics rows.
    :return: None.
    """
    trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
        DataFormattingService.TRAINING_MATRIX, recs_model_info.combo, formatted_inputs,
        AnalysisType.RECOMMENDATIONS)
    features, _ = recs_model_info.trainer.populateFeaturesAndResultsByCellLine(trimmed_matrix, results)

    # Compare keys by value: an identity check ("is not") only works when the key happens to be
    # the very same string object as the constant.
    cell_lines_in_order = [key for key in trimmed_matrix.keys()
                           if key != ArgumentProcessingService.FEATURE_NAMES]

    predictions = recs_model_info.model.predict(features)
    for i, cell_line in enumerate(cell_lines_in_order):
        if cell_line_predictions_by_drug.get(cell_line) is not None:
            cell_line_predictions_by_drug[cell_line].append(predictions[i])
            continue

        # First prediction for this cell line: pad the new row out to the longest non-header row.
        # NOTE(review): that length is read from rows as they stand mid-loop, so the padding can
        # depend on the order cell lines are visited in - confirm this is intended.
        max_dict_length = max(
            [2] + [len(row) for key, row in cell_line_predictions_by_drug.items() if key != self.HEADER])
        padding = [MachineLearningService.DELIMITER] * (max_dict_length - 2)
        cell_line_predictions_by_drug[cell_line] = [cell_line] + padding + [predictions[i]]

    cell_line_predictions_by_drug[self.STD_DEVIATION].append(numpy.std(predictions))
    cell_line_predictions_by_drug[self.MEAN].append(numpy.mean(predictions))
    cell_line_predictions_by_drug[self.MEDIAN].append(numpy.median(predictions))
def fetchValidGeneListCombos(self, input_folder, gene_list_combos, trainer):
    """
    Filter the combos down to those the trainer should process and that still need analysing.

    Combos rejected by ``trainer.shouldProcessFeatureSet`` are dropped. For Random Subset Elastic
    Net with ``combine_gene_lists`` enabled, every feature set whose first entry contains the
    binary categorical matrix name is replaced by one merged list of all deduped gene list genes
    (each prefixed with ``"<matrix name>."``), and combos that become identical are kept once.
    The outcome is passed through ``trimAnalyzedCombos``.

    :param input_folder: forwarded to ``trimAnalyzedCombos``.
    :param gene_list_combos: candidate combos.
    :param trainer: trainer deciding which feature sets to process.
    :return: whatever ``trimAnalyzedCombos`` returns for the resulting combos.
    """
    combos_to_run = [candidate for candidate in gene_list_combos if trainer.shouldProcessFeatureSet(candidate)]
    rsen_settings = self.inputs.rsen_config

    merging_gene_lists = (trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET
                          and rsen_settings.combine_gene_lists)
    if not merging_gene_lists:
        return self.trimAnalyzedCombos(input_folder, combos_to_run, trainer)

    deduped_genes = GeneListComboUtility.fetchAllGeneListGenesDeduped(self.inputs.gene_lists)
    # TODO: Can fail if "." in feature name.
    first_binary_feature = rsen_settings.binary_cat_matrix.get(ArgumentProcessingService.FEATURE_NAMES)[0]
    matrix_prefix = first_binary_feature.split(".")[0]
    merged_gene_list = [matrix_prefix + "." + gene for gene in deduped_genes if len(gene.strip()) > 0]

    merged_combos = []
    for combo in combos_to_run:
        merged_combo = [merged_gene_list if matrix_prefix in feature_set[0] else feature_set
                        for feature_set in combo]
        # Several original combos can collapse to the same merged combo; keep each only once.
        if merged_combo not in merged_combos:
            merged_combos.append(merged_combo)
    return self.trimAnalyzedCombos(input_folder, merged_combos, trainer)
def generateFeatureSetString(self, feature_set):
    """
    Return the string form of ``feature_set`` as produced by ``GeneListComboUtility``.

    The gene lists, RSEN ``combine_gene_lists`` flag, analysis type and static features are all
    taken from ``self.inputs``.

    :param feature_set: feature set (combo) to describe.
    :return: result of ``GeneListComboUtility.generateFeatureSetString``.
    """
    inputs = self.inputs
    return GeneListComboUtility.generateFeatureSetString(
        feature_set,
        inputs.gene_lists,
        inputs.rsen_config.combine_gene_lists,
        inputs.analysisType(),
        inputs.static_features,
    )