def runMonteCarloSelection(self, feature_set, trainer, input_folder, num_combos):
        """Evaluate one feature set across every outer Monte Carlo permutation and record the averages.

        Each permutation re-formats ``self.inputs``, trims the training and testing matrices to
        ``feature_set``, picks hyperparameters with ``determineOptimalHyperparameters`` and scores
        the outcome with ``fetchOuterPermutationModelScore``. Per-permutation scores, accuracies,
        feature importances and (Random Subset Elastic Net only) phrases are collected, averaged
        after the loop, written as one CSV line via ``writeToCSVInLock`` and finally handed to
        ``saveOutputToTxtFile``.

        Args:
            feature_set: The feature set to evaluate; also rendered to a string for log messages.
            trainer: Trainer whose ``algorithm`` determines the number of outer permutations.
            input_folder: Forwarded unchanged to the diagnostic and output helpers.
            num_combos: Forwarded unchanged to ``writeToCSVInLock``.
        """
        feature_set_as_string = self.generateFeatureSetString(feature_set)
        outer_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, True)

        # Accumulators that receive one entry (or one entry per key) for every outer permutation.
        scores, accuracies, scores_and_hyperparams = [], [], []
        importances_by_feature = {}
        rsen_phrase_values = {}

        for perm_number in range(1, outer_perms + 1):
            self.log.info("Computing outer Monte Carlo Permutation %s for %s.\n", perm_number, feature_set_as_string)
            formatted_data = self.formatData(self.inputs, True, True)
            if self.inputs.analysisType() is AnalysisType.NO_GENE_LISTS:
                self.logKeptFeatures(formatted_data, perm_number, input_folder, trainer)

            self.log.info("Creating train and test matrices by feature set: %s.", feature_set_as_string)
            training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
                DataFormattingService.TRAINING_MATRIX, feature_set, formatted_data, self.inputs.analysisType())
            testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
                DataFormattingService.TESTING_MATRIX, feature_set, formatted_data, self.inputs.analysisType())

            optimal_hyperparams = self.determineOptimalHyperparameters(feature_set, formatted_data, trainer)
            record_diagnostics = self.inputs.record_diagnostics
            trainer.logIfBestHyperparamsOnRangeThreshold(optimal_hyperparams, record_diagnostics, input_folder)
            trainer.logOptimalHyperParams(optimal_hyperparams, self.generateFeatureSetString(feature_set),
                                          record_diagnostics, input_folder)

            # prediction_data is indexed positionally: [0] score, [1] accuracy, [2] importances,
            # and an optional [3] holding phrases.
            prediction_data = self.fetchOuterPermutationModelScore(feature_set, trainer, optimal_hyperparams,
                                                                   testing_matrix, training_matrix)
            permutation_score = prediction_data[0]
            scores.append(permutation_score)
            accuracies.append(prediction_data[1])

            for feature, importance in prediction_data[2].items():
                importances_by_feature.setdefault(feature, []).append(importance)

            if len(prediction_data) == 4 and \
                    trainer.algorithm == SupportedMachineLearningAlgorithms.RANDOM_SUBSET_ELASTIC_NET:
                for phrase, phrase_value in prediction_data[3].items():
                    rsen_phrase_values.setdefault(phrase, []).append(phrase_value)

            scores_and_hyperparams.append(self.generateScoreAndHyperParam(permutation_score, optimal_hyperparams))

            GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)

        average_score = numpy.mean(scores)
        average_accuracy = numpy.mean(accuracies)
        self.log.debug("Average score and accuracy of all Monte Carlo runs for %s: %s, %s",
                       feature_set_as_string, average_score, average_accuracy)

        ordered_importances = self.averageAndSortImportances(importances_by_feature, outer_perms)
        ordered_phrases = self.averageAndSortImportantRSENPhrases(rsen_phrase_values, trainer)

        csv_line = self.generateLine(average_accuracy, feature_set_as_string, ordered_importances,
                                     ordered_phrases, average_score, scores_and_hyperparams)
        self.writeToCSVInLock(csv_line, input_folder, trainer.algorithm, num_combos, outer_perms)
        self.saveOutputToTxtFile(scores, accuracies, feature_set_as_string, input_folder, trainer.algorithm)
    def analyzeIndividualGeneListCombo(self, gene_list_combos, input_folder, is_classifier):
        """Train and score the single combo/algorithm pair named in ``self.inputs.individual_train_config``.

        Scans ``gene_list_combos`` for the first combo whose string form equals the configured
        target combo. When found, a trainer is created for the configured algorithm and, for each
        outer Monte Carlo permutation, a model is built with the hyperparameters returned by
        ``fetchAndCastHyperparams``, scored, and written as its own CSV line. If no combo matches,
        an info message is logged instead.

        Args:
            gene_list_combos: Iterable of candidate feature sets.
            input_folder: Forwarded to ``writeToCSVInLock``.
            is_classifier: Forwarded to ``ModelTrainerFactory.createTrainerFromTargetAlgorithm``.
        """
        config = self.inputs.individual_train_config
        target_combo, target_algorithm = config.combo, config.algorithm
        rsen_config = self.inputs.rsen_config
        outer_monte_carlo_loops = self.inputs.outer_monte_carlo_permutations

        for gene_list_combo in gene_list_combos:
            if self.generateFeatureSetString(gene_list_combo) != target_combo:
                continue

            trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier, target_algorithm,
                                                                           rsen_config)
            hyperparams = self.fetchAndCastHyperparams(config, trainer)

            for permutation in range(outer_monte_carlo_loops):
                results = self.inputs.results
                formatted_data = self.formatData(self.inputs, True, True)
                training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
                    DataFormattingService.TRAINING_MATRIX, gene_list_combo, formatted_data,
                    self.inputs.analysisType())
                testing_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
                    DataFormattingService.TESTING_MATRIX, gene_list_combo, formatted_data,
                    self.inputs.analysisType())

                features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(training_matrix, results)
                feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES)
                model = trainer.buildModel(relevant_results, features, hyperparams, feature_names)

                model_score = trainer.fetchPredictionsAndScore(model, testing_matrix, results)
                score, accuracy = model_score[0], model_score[1]

                # averageAndSortImportances is fed lists of values, so wrap each single
                # importance in a one-element list (in place) and average over a count of 1.
                importances = trainer.fetchFeatureImportances(model, feature_names)
                for feature in importances:
                    importances[feature] = [importances[feature]]
                ordered_importances = self.averageAndSortImportances(importances, 1)

                model_phrases = trainer.fetchModelPhrases(model, gene_list_combo)
                ordered_phrases = self.averageAndSortImportantRSENPhrases(model_phrases, trainer)

                numbered_combo = target_combo + " RUN " + SafeCastUtil.safeCast(permutation, str)
                self.log.debug("Final score and accuracy of individual analysis for feature gene combo %s "
                               "using algorithm %s: %s, %s", numbered_combo, target_algorithm, score, accuracy)

                score_and_hyperparam = [self.generateScoreAndHyperParam(score, hyperparams)]
                csv_line = self.generateLine(accuracy, numbered_combo, ordered_importances, ordered_phrases,
                                             score, score_and_hyperparam)
                self.writeToCSVInLock(csv_line, input_folder, target_algorithm, outer_monte_carlo_loops, 1)

            # Only the first matching combo is analyzed.
            return

        self.log.info("Gene list feature file %s combo not found in current dataset.", target_combo)
 def generateSinglePrediction(self, best_model, best_combo, cell_line,
                              all_features, formatted_inputs):
     """Predict one cell line with an already-built model.

     The cell line's entry is looked up in the training matrix of ``formatted_inputs``,
     wrapped in a one-entry input structure together with ``all_features`` as the feature
     names, trimmed to ``best_combo`` and passed to ``best_model.predict``.

     Args:
         best_model: Object exposing ``predict``; called with a single-sample list.
         best_combo: Feature set used to trim the wrapped matrix.
         cell_line: Key of the cell line to predict.
         all_features: Stored under the feature-names key of the wrapper.
         formatted_inputs: Mapping that holds the training matrix.

     Returns:
         The first element of what ``best_model.predict`` returns.
     """
     matrix_key = DataFormattingService.TRAINING_MATRIX
     held_out_features = formatted_inputs.get(matrix_key).get(cell_line)

     # Minimal stand-in for the formatted inputs: just this cell line plus the feature names.
     single_line_inputs = OrderedDict()
     single_line_inputs[matrix_key] = OrderedDict([(cell_line, held_out_features)])
     single_line_inputs[ArgumentProcessingService.FEATURE_NAMES] = all_features

     trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
         matrix_key, best_combo, single_line_inputs, AnalysisType.RECOMMENDATIONS)
     predictions = best_model.predict([trimmed_matrix.get(cell_line)])
     return predictions[0]
 def determineInnerHyperparameters(self, feature_set, formatted_data, trainer):
     """Gather ``trainer.hyperparameterize`` results across the inner Monte Carlo permutations.

     For each inner permutation the outer training matrix is turned back into an inputs
     object via ``reformatInputsByTrainingMatrix``, re-formatted with ``formatData``, trimmed
     to ``feature_set`` into an inner validation matrix and an inner training matrix, and
     handed to ``trainer.hyperparameterize``.

     Args:
         feature_set: Feature set used to trim the inner matrices.
         formatted_data: Mapping holding the outer training matrix and the feature names.
         trainer: Trainer whose ``algorithm`` determines the number of inner permutations.

     Returns:
         dict: Each key returned by ``trainer.hyperparameterize`` mapped to the list of values
         seen for that key, in permutation order.
     """
     hyperparams_by_key = {}
     inner_perms = self.monteCarloPermsByAlgorithm(trainer.algorithm, False)

     for perm_number in range(1, inner_perms + 1):
         self.log.info("Computing inner Monte Carlo Permutation %s for %s.\n", perm_number,
                        self.generateFeatureSetString(feature_set))
         GarbageCollectionUtility.logMemoryUsageAndGarbageCollect(self.log)

         outer_training_matrix = formatted_data.get(DataFormattingService.TRAINING_MATRIX)
         outer_feature_names = formatted_data.get(ArgumentProcessingService.FEATURE_NAMES)
         formatted_inputs = self.reformatInputsByTrainingMatrix(outer_training_matrix, outer_feature_names)
         further_formatted_data = self.formatData(formatted_inputs, False, False)

         inner_validation_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
             DataFormattingService.TESTING_MATRIX, feature_set, further_formatted_data,
             formatted_inputs.analysisType())
         inner_train_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
             DataFormattingService.TRAINING_MATRIX, feature_set, further_formatted_data,
             formatted_inputs.analysisType())

         model_data = trainer.hyperparameterize(inner_train_matrix, inner_validation_matrix,
                                                self.inputs.results)
         for key, value in model_data.items():
             hyperparams_by_key.setdefault(key, []).append(value)

     return hyperparams_by_key
    def trainBestModelWithCombo(self, best_scoring_algo, best_scoring_combo,
                                optimal_hyperparams, trimmed_cell_lines,
                                trimmed_results, processed_arguments):
        """Build a model for the given algorithm/combo pair using the supplied hyperparameters.

        Args:
            best_scoring_algo: Algorithm identifier handed to ``ModelTrainerFactory``.
            best_scoring_combo: Feature set used to trim the training matrix.
            optimal_hyperparams: Hyperparameters; converted with ``DictionaryUtility.toDict``
                before being passed to ``buildModel``.
            trimmed_cell_lines: Input structure the training matrix is trimmed from.
            trimmed_results: Results passed to ``populateFeaturesAndResultsByCellLine``.
            processed_arguments: Provides ``is_classifier`` and ``rsen_config``.

        Returns:
            tuple: ``(model, trainer)`` - the built model and the trainer that built it.
        """
        is_classifier, rsen_config = processed_arguments.is_classifier, processed_arguments.rsen_config

        training_matrix = GeneListComboUtility.trimMatrixByFeatureSet(DataFormattingService.TRAINING_MATRIX,
                                                                      best_scoring_combo,
                                                                      trimmed_cell_lines,
                                                                      AnalysisType.RECOMMENDATIONS)
        trainer = ModelTrainerFactory.createTrainerFromTargetAlgorithm(is_classifier,
                                                                       best_scoring_algo,
                                                                       rsen_config)

        features, relevant_results = trainer.populateFeaturesAndResultsByCellLine(training_matrix,
                                                                                  trimmed_results)
        hyperparams_as_dict = DictionaryUtility.toDict(optimal_hyperparams)
        feature_names = training_matrix.get(ArgumentProcessingService.FEATURE_NAMES)

        built_model = trainer.buildModel(relevant_results, features, hyperparams_as_dict, feature_names)
        return built_model, trainer
    def generateMultiplePredictions(self, recs_model_info, formatted_inputs,
                                    results, cell_line_predictions_by_drug):
        """Predict every cell line with one model and append the values to the per-cell-line rows.

        The training matrix in ``formatted_inputs`` is trimmed to ``recs_model_info.combo``,
        converted to a feature list by the trainer and passed to ``recs_model_info.model.predict``.
        Each prediction is appended to that cell line's row in ``cell_line_predictions_by_drug``;
        a cell line without a row gets a new one padded with ``MachineLearningService.DELIMITER``
        up to the current longest non-header row. Finally the standard deviation, mean and median
        of all predictions are appended to the ``self.STD_DEVIATION``, ``self.MEAN`` and
        ``self.MEDIAN`` rows.

        Args:
            recs_model_info: Provides ``combo``, ``trainer`` and ``model``.
            formatted_inputs: Input structure the training matrix is trimmed from.
            results: Passed to ``populateFeaturesAndResultsByCellLine``.
            cell_line_predictions_by_drug: dict of row lists, mutated in place. Must already
                contain the ``self.STD_DEVIATION``, ``self.MEAN`` and ``self.MEDIAN`` rows.

        Returns:
            None.

        Raises:
            KeyError: If one of the three statistics rows is missing.
        """
        trimmed_matrix = GeneListComboUtility.trimMatrixByFeatureSet(
            DataFormattingService.TRAINING_MATRIX, recs_model_info.combo,
            formatted_inputs, AnalysisType.RECOMMENDATIONS)

        # Only the feature list is needed here; the trainer's second return value is unused.
        features, _ = recs_model_info.trainer.populateFeaturesAndResultsByCellLine(
            trimmed_matrix, results)

        # Compare by equality, not identity: an equal key held in a different object must
        # still be recognised as the feature-names entry rather than as a cell line.
        cell_lines_in_order = [
            key for key in trimmed_matrix.keys()
            if key != ArgumentProcessingService.FEATURE_NAMES
        ]
        # NOTE(review): assumes predict() returns values in the same order as the trimmed
        # matrix's cell-line keys - confirm against populateFeaturesAndResultsByCellLine.
        predictions = recs_model_info.model.predict(features)

        for index, cell_line in enumerate(cell_lines_in_order):
            prediction = predictions[index]
            existing_row = cell_line_predictions_by_drug.get(cell_line)
            if existing_row is not None:
                existing_row.append(prediction)
                continue

            # First time this cell line is seen: start its row and pad it to the longest
            # non-header row as of this moment (rows appended to earlier in this loop count).
            longest_row = max(
                (len(row) for key, row in cell_line_predictions_by_drug.items()
                 if key != self.HEADER),
                default=2)
            new_row = [cell_line]
            # A non-positive multiplier yields an empty list, matching range(2, n) for n <= 2.
            new_row.extend([MachineLearningService.DELIMITER] * (longest_row - 2))
            new_row.append(prediction)
            cell_line_predictions_by_drug[cell_line] = new_row

        cell_line_predictions_by_drug[self.STD_DEVIATION].append(numpy.std(predictions))
        cell_line_predictions_by_drug[self.MEAN].append(numpy.mean(predictions))
        cell_line_predictions_by_drug[self.MEDIAN].append(numpy.median(predictions))