Example #1
    def train(self):
        """Train the mini-models

        :return: the map of the trained models
        """

        self.model_map = {}

        # Create the results files for the paper
        header = ["OpUnit", "Method"] + [
            target.name for target in data_info.MINI_MODEL_TARGET_LIST
        ]
        summary_file = "{}/mini_runner.csv".format(self.model_metrics_path)
        io_util.create_csv_file(summary_file, header)

        # First get the data for all mini runners
        for filename in sorted(
                glob.glob(os.path.join(self.input_path, '*.csv'))):
            print(filename)
            data_list = opunit_data.get_mini_runner_data(
                filename, self.model_metrics_path, self.txn_sample_interval,
                self.model_map, self.stats_map, self.trim)
            for data in data_list:
                best_y_transformer, best_method = self._train_data(
                    data, summary_file)
                if self.expose_all:
                    self._train_specific_model(data, best_y_transformer,
                                               best_method)

        return self.model_map
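A minimal driver sketch for this variant; the class name MiniTrainer and its constructor arguments are assumptions (only train() and model_map appear in the example itself):

    # Hypothetical usage; MiniTrainer and its constructor arguments are assumed,
    # not shown in the example above.
    trainer = MiniTrainer(input_path="mini_runner_input",
                          model_metrics_path="mini_runner_metrics")
    model_map = trainer.train()  # returns {opunit: best trained regressor}
    for opunit, regressor in model_map.items():
        print(opunit.name, regressor)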
Example #2
    def train(self):
        """Train the mini-models

        :return: the map of the trained models
        """

        self.model_map = {}

        # First get the data for all mini runners
        for filename in sorted(glob.glob(os.path.join(self.input_path, '*.csv'))):
            print(filename)
            data_list = opunit_data.get_mini_runner_data(filename, self.model_map, self.stats_map)
            for data in data_list:
                self._train_data(data)

        return self.model_map
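Example #3 below reads data.x, data.y, and data.opunit from the objects that opunit_data.get_mini_runner_data returns, so the container presumably looks something like this sketch (the class name OpUnitData and the field types are assumptions inferred from that usage, not taken from the library):

    # Assumed shape of the per-operating-unit data container; only the
    # attributes opunit, x, and y are actually touched by the examples.
    class OpUnitData:
        def __init__(self, opunit, x, y):
            self.opunit = opunit  # the operating unit the data belongs to
            self.x = x            # feature matrix (e.g., a numpy array)
            self.y = y            # target values, one column per target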
Example #3
    def train(self):
        """Train the mini-models

        :return: the map of the trained models
        """

        data_list = []

        # First get the data for all mini runners
        for filename in glob.glob(os.path.join(self.input_path, '*.csv')):
            print(filename)
            data_list += opunit_data.get_mini_runner_data(filename)

        model_map = {}
        # train the models for all the operating units
        for data in data_list:
            x_train, x_test, y_train, y_test = model_selection.train_test_split(data.x, data.y,
                                                                                test_size=self.test_ratio,
                                                                                random_state=0)

            # Write the header row to the result file
            metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
            prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
            result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

            methods = self.ml_models
            # Only use linear regression for the arithmetic operating units
            if data.opunit in data_info.ARITHMETIC_OPUNITS:
                methods = ["lr"]

            # Also test the prediction with the target transformer (if specified for the operating unit)
            transformers = [None]
            modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
            if modeling_transformer is not None:
                transformers.append(modeling_transformer)

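            # Start at 1 (i.e., 100% error): a model is recorded only if it beats this bound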
            min_percentage_error = 1
            pred_results = None
            elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]

            for transformer in transformers:
                for method in methods:
                    # Train the model
                    logging.info("{} {}".format(data.opunit.name, method))
                    regressor = model.Model(method, modeling_transformer=transformer)
                    regressor.train(x_train, y_train)

                    # Evaluate on both the training and test set
                    results = []
                    evaluate_data = [(x_train, y_train), (x_test, y_test)]
                    train_test_label = ["Train", "Test"]
                    for i, d in enumerate(evaluate_data):
                        evaluate_x = d[0]
                        evaluate_y = d[1]

                        y_pred = regressor.predict(evaluate_x)
                        logging.debug("x shape: {}".format(evaluate_x.shape))
                        logging.debug("y shape: {}".format(y_pred.shape))
                        percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
                        results += list(percentage_error) + [""]

                        logging.info('{} Percentage Error: {}'.format(train_test_label[i], percentage_error))

                        # Record the model with the lowest test error on the elapsed-time target
                        # (likely the most important prediction); only models trained with the
                        # last transformer in the list are eligible
                        if (i == 1 and transformer == transformers[-1]
                                and percentage_error[elapsed_us_index] < min_percentage_error):
                            min_percentage_error = percentage_error[elapsed_us_index]
                            model_map[data.opunit] = regressor
                            pred_results = (evaluate_x, y_pred, evaluate_y)

                    # Dump the prediction results
                    transform = " "
                    if transformer is not None:
                        transform = " transform"
                    io_util.write_csv_result(metrics_path, method + transform, results)

                    logging.info("")

                io_util.write_csv_result(metrics_path, "", [])

            # Record the best prediction results on the test data
            result_writing_util.record_predictions(pred_results, prediction_path)

        return model_map
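The percentage error used above divides by evaluate_y + 1 rather than evaluate_y, so zero-valued targets do not cause division by zero. A self-contained toy version of the same split/train/evaluate loop, with scikit-learn's LinearRegression standing in for the models behind self.ml_models and synthetic data in place of the runner CSVs:

    import numpy as np
    from sklearn import model_selection
    from sklearn.linear_model import LinearRegression

    # Synthetic stand-in for one operating unit's data: 3 features, 1 target
    rng = np.random.default_rng(0)
    x = rng.uniform(0.0, 10.0, size=(200, 3))
    y = x @ np.array([[2.0], [0.5], [1.5]]) + 1.0

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.2, random_state=0)

    regressor = LinearRegression().fit(x_train, y_train)
    for label, eval_x, eval_y in [("Train", x_train, y_train), ("Test", x_test, y_test)]:
        y_pred = regressor.predict(eval_x)
        # Same metric as the examples: mean absolute error relative to (truth + 1)
        percentage_error = np.average(np.abs(eval_y - y_pred) / (eval_y + 1), axis=0)
        print("{} Percentage Error: {}".format(label, percentage_error))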