Example #1
def _global_model_training_process(x, y, methods, test_ratio, metrics_path, prediction_path):
    """Training process for the global models

    :param x: input feature
    :param y: labels
    :param methods: ML models to enumerate
    :param test_ratio: train-test split ratio
    :param metrics_path: to store the prediction metrics
    :param prediction_path: to store the raw prediction results
    :return: (the best model, the indices for the test data for additional metric calculation)
    """
    global_model = None
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, False)
    n_samples = x.shape[0]
    indices = np.arange(n_samples)

    x_train, x_test, y_train, y_test, indices_train, indices_test = model_selection.train_test_split(
        x, y, indices, test_size=test_ratio, random_state=0)

    min_percentage_error = 1
    pred_results = None
    elapsed_us_index = data_info.instance.target_csv_index[Target.ELAPSED_US]

    for method in methods:
        # Train the model
        logging.info("Training the global model with {}".format(method))
        regressor = model.Model(method)
        regressor.train(x_train, y_train)

        # Evaluate on both the training and test set
        results = []
        evaluate_data = [(x_train, y_train), (x_test, y_test)]
        train_test_label = ["Train", "Test"]
        for i, d in enumerate(evaluate_data):
            evaluate_x = d[0]
            evaluate_y = d[1]

            y_pred = regressor.predict(evaluate_x)
            logging.debug("x shape: {}".format(evaluate_x.shape))
            logging.debug("y shape: {}".format(y_pred.shape))
            percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
            results += list(percentage_error) + [""]

            logging.info('{} Ratio Error: {}'.format(train_test_label[i], percentage_error))

            # Record the model with the lowest elapsed time prediction error (since that might be the most
            # important prediction)
            if i == 1 and percentage_error[elapsed_us_index] < min_percentage_error:
                min_percentage_error = percentage_error[elapsed_us_index]
                global_model = regressor
                pred_results = (evaluate_x, y_pred, evaluate_y)

        io_util.write_csv_result(metrics_path, method, results)

        logging.info("")

    # Record the best prediction results on the test data
    result_writing_util.record_predictions(pred_results, prediction_path)

    return global_model, indices_test
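A minimal usage sketch follows, assuming synthetic inputs, that model.Model accepts method names such as "lr" (which appears in the other examples) and a hypothetical "rf", and that the data_info globals referenced inside the function are already initialized; the output paths are also made up.

import numpy as np

# Hypothetical toy invocation; the "rf" method name and both output paths are assumptions.
x = np.random.rand(200, 8)   # 200 samples, 8 input features
y = np.random.rand(200, 3)   # 3 resource targets per sample
best_model, test_indices = _global_model_training_process(
    x, y,
    methods=["lr", "rf"],
    test_ratio=0.2,
    metrics_path="results/global_metrics.csv",
    prediction_path="results/global_prediction.csv")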
Example #2
def write_extended_data(output_path, symbol, index_value_list, data_map):
    # clear the content of the file
    open(output_path, 'w').close()

    io_util.write_csv_result(output_path, symbol, index_value_list)
    for key, value in data_map.items():
        io_util.write_csv_result(output_path, key, value)
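io_util itself is not part of this listing. A minimal sketch of what create_csv_file and write_csv_result could look like, assuming they simply recreate a CSV file with a header row and append one labeled row per call (both signatures inferred from the call sites above):

import csv


def create_csv_file(path, header):
    # Hypothetical sketch: (re)create the file and write the header row.
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerow(header)


def write_csv_result(path, label, values):
    # Hypothetical sketch: append one row, prefixed with a label cell.
    with open(path, 'a', newline='') as f:
        csv.writer(f).writerow([label] + list(values))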
Example #3
    def _record_results(self, x, y, y_pred, label):
        """Record the prediction results

        :param x: the input data
        :param y: the actual output
        :param y_pred: the predicted output
        :param label: the result label ("resource" or "impact")
        """
        # Result files
        metrics_path = "{}/global_{}_model_metrics.csv".format(
            self.model_results_path, label)
        prediction_path = "{}/global_{}_model_prediction.csv".format(
            self.model_results_path, label)
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path)

        # Log the prediction results
        ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
        io_util.write_csv_result(metrics_path, "Ratio Error", ratio_error)
        result_writing_util.record_predictions((x, y_pred, y), prediction_path)

        # Print Error summary to command line
        if label == "resource":
            original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) /
                                              (y + 1e-6),
                                              axis=0)
        else:
            original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1),
                                              axis=0)
        logging.info('Model Original Ratio Error ({}): {}'.format(
            label, original_ratio_error))
        logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
        logging.info('')
Example #4
def _txn_get_mini_runner_data(filename, model_results_path,
                              txn_sample_interval):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    # prepending a column of ones as the base transaction data feature
    base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
    df = pd.concat([base_x, df], axis=1)
    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values

    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_txn_converted_data.csv".format(
        model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.CONTENDING_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_id_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
            interval_id_map[rounded_time] = set()
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_id_map[rounded_time].add(cpu_ids[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Concatenate the number of different threads
        x_new = np.concatenate((x_new, [len(interval_id_map[rounded_time])]))
        x_new *= txn_sample_interval + 1
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time,
                                 np.concatenate((x_list[-1], y_list[-1])))

    return [
        OpUnitData(OpUnit[file_name.upper()], np.array(x_list),
                   np.array(y_list))
    ]
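The grouping above depends on data_util.round_to_interval, which is not shown in this listing; a plausible one-liner, assuming it floors a timestamp to the start of its interval, is:

def round_to_interval(timestamp, interval):
    # Hypothetical sketch: floor the timestamp to the start of its interval.
    return timestamp - timestamp % interval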
Example #5
def _predict_grouped_opunit_data(data_list, mini_model_map,
                                 model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData, and record the prediction
    result in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(
        model_results_path)
    io_util.create_csv_file(
        prediction_path,
        ["Pipeline", "Actual Us", "Predicted Us", "", "Ratio Error"])

    # We have to use a prediction cache when there is a lot of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(
            tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))
        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                # subtract scan from certain double-counted opunits
                if opunit in data_info.SCAN_SUBSTRACT_UNITS:
                    scan_y_pred = mini_model_map[OpUnit.SEQ_SCAN].predict(x)
                    y_pred -= scan_y_pred
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug(
                "Predicted {} elapsed time with feature {}: {}".format(
                    opunit_feature[0].name, x[0], y_pred[0, -1]))
            pipeline_y_pred += y_pred[0]

        # Record the prediction results
        data.y_pred = pipeline_y_pred
        logging.debug("{} pipeline predicted time: {}".format(
            data.name, pipeline_y_pred[-1]))
        ratio_error = abs(y - pipeline_y_pred) / (y + 1e-6)
        logging.debug("|Actual - Predict| / Actual: {}".format(
            ratio_error[-1]))

        io_util.write_csv_result(prediction_path,
                                 data.name + " " + str(x[0][-1]),
                                 [y[-1], pipeline_y_pred[-1], "", ratio_error])

        logging.debug("")
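The prediction cache is keyed by (opunit, x.tobytes()) because numpy arrays are not hashable while their raw byte representation is. A tiny standalone illustration of that pattern (the predict_fn parameter is made up for the demo):

import numpy as np

prediction_cache = {}

def cached_predict(opunit, x, predict_fn):
    # Memoize predictions keyed by the opunit and the raw feature bytes.
    key = (opunit, x.tobytes())
    if key not in prediction_cache:
        prediction_cache[key] = predict_fn(x)
    return prediction_cache[key]

# Two arrays with identical contents produce the same cache key.
a = np.array([[1.0, 2.0, 3.0]])
b = np.array([[1.0, 2.0, 3.0]])
assert a.tobytes() == b.tobytes()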
Example #6
def _get_global_resource_data(start_time, concurrent_data_list, log_path):
    """Get the input feature and the target output for the global resource utilization metrics during an interval

    The calculation is adjusted by the overlapping ratio between the opunit groups and the time range.

    :param start_time: of the interval
    :param concurrent_data_list: the concurrent running opunit groups
    :param log_path: the file path to log the data construction results
    :return: (the input feature, the resource utilization on the other logical core of the same physical core,
    the output resource targets)
    """
    # Define a secondary_counting_mode corresponding to the concurrent_counting_mode to derive the concurrent operations
    # in different scenarios
    end_time = start_time + global_model_config.INTERVAL_SIZE - 1
    elapsed_us = global_model_config.INTERVAL_SIZE

    # The adjusted resource metrics per logical core.
    # TODO: Assuming each physical core has two logical cores via hyper-threading for now; can extend to other scenarios later
    physical_core_num = hardware_info.PHYSICAL_CORE_NUM
    adjusted_x_list = [0] * 2 * physical_core_num
    adjusted_y = 0
    logging.debug(concurrent_data_list)
    logging.debug("{} {}".format(start_time, end_time))

    for data in concurrent_data_list:
        data_start_time = data.get_start_time(ConcurrentCountingMode.ESTIMATED)
        data_end_time = data.get_end_time(ConcurrentCountingMode.ESTIMATED)
        ratio = _calculate_range_overlap(start_time, end_time, data_start_time, data_end_time) / (data_end_time -
                                                                                                  data_start_time + 1)
        logging.debug("{} {} {}".format(data_start_time, data_end_time, ratio))
        logging.debug("{} {}".format(data.y, data.y_pred))
        adjusted_y += data.y * ratio
        cpu_id = data.cpu_id
        if cpu_id > physical_core_num:
            cpu_id -= physical_core_num
        adjusted_x_list[cpu_id] += data.y_pred * ratio

    # Change the numbers to per-time-unit (us) utilization
    adjusted_x_list = [x / elapsed_us for x in adjusted_x_list]
    adjusted_y /= elapsed_us

    sum_adjusted_x = np.sum(adjusted_x_list, axis=0)
    std_adjusted_x = np.std(adjusted_x_list, axis=0)

    ratio_error = abs(adjusted_y - sum_adjusted_x) / (adjusted_y + 1e-6)

    logging.debug(sum_adjusted_x)
    logging.debug(adjusted_y)
    logging.debug("")

    io_util.write_csv_result(log_path, elapsed_us, [len(concurrent_data_list)] + list(sum_adjusted_x) + [""] +
                             list(adjusted_y) + [""] + list(ratio_error))

    adjusted_x = np.concatenate((sum_adjusted_x, std_adjusted_x))

    return global_model_data.GlobalResourceData(start_time, adjusted_x_list, adjusted_x, adjusted_y)
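_calculate_range_overlap is referenced but not listed; assuming both time ranges are inclusive (consistent with the `data_end_time - data_start_time + 1` denominator above), a minimal sketch is:

def _calculate_range_overlap(start1, end1, start2, end2):
    # Hypothetical sketch: length of the intersection of two inclusive ranges,
    # clamped to zero when they do not overlap.
    return max(0, min(end1, end2) - max(start1, start2) + 1)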
Example #7
    def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label):
        """Record the prediction results

        :param x: the input data
        :param y: the actual output
        :param y_pred: the predicted output
        :param raw_y: the raw actual output used for the accumulated error calculation
        :param mini_model_y_pred: the mini-model predictions used for the accumulated error calculation
        :param label: the result label ("resource" or "impact")
        """
        # Result files
        metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
        prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
        result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, True)

        # Log the prediction results
        ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
        io_util.write_csv_result(metrics_path, "Model Ratio Error", ratio_error)
        result_writing_util.record_predictions((x, y_pred, y), prediction_path)

        # Print Error summary to command line
        if label == "resource":
            original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
        else:
            original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
        logging.info('Model Original Ratio Error ({}): {}'.format(label, original_ratio_error))
        logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
        logging.info('')

        if label != "resource":
            # Calculate the accumulated ratio error
            epsilon = global_model_config.RATIO_DIVISION_EPSILON
            mini_model_y_pred = np.array(mini_model_y_pred)
            raw_y = np.array(raw_y)
            raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
            accumulated_raw_y = np.sum(raw_y, axis=0)
            accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
            original_ratio_error = np.average(np.abs(raw_y - mini_model_y_pred) / (raw_y + epsilon), axis=0)
            ratio_error = np.average(np.abs(raw_y - raw_y_pred) / (raw_y + epsilon), axis=0)
            accumulated_percentage_error = np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (
                        accumulated_raw_y + epsilon)
            original_accumulated_percentage_error = np.abs(accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
                    accumulated_raw_y + epsilon)

            logging.info('Original Ratio Error: {}'.format(original_ratio_error))
            io_util.write_csv_result(metrics_path, "Original Ratio Error", original_ratio_error)
            logging.info('Ratio Error: {}'.format(ratio_error))
            io_util.write_csv_result(metrics_path, "Ratio Error", ratio_error)
            logging.info('Original Accumulated Ratio Error: {}'.format(original_accumulated_percentage_error))
            io_util.write_csv_result(metrics_path, "Original Accumulated Ratio Error",
                                     original_accumulated_percentage_error)
            logging.info('Accumulated Ratio Error: {}'.format(accumulated_percentage_error))
            io_util.write_csv_result(metrics_path, "Accumulated Ratio Error", accumulated_percentage_error)
            logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
            logging.info('Original Accumulated Predict: {}'.format(np.sum(mini_model_y_pred, axis=0)))
            logging.info('Accumulated Predict: {}'.format(accumulated_raw_y_pred))
Example #8
def record_predictions(pred_results, prediction_path):
    """Record the raw prediction results

    :param pred_results: the data
    :param prediction_path: the file path to store the prediction results
    :return:
    """
    num_data = pred_results[0].shape[0]
    for i in range(num_data):
        result_list = (list(pred_results[0][i]) + [""] + list(pred_results[1][i]) + [""]
                       + list(pred_results[2][i]))
        io_util.write_csv_result(prediction_path, "", result_list)
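A hedged usage sketch with toy arrays, showing the expected shape of pred_results: a (features, predictions, labels) triple whose elements share the same first dimension (the output path is an assumption):

import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0]])   # input features
y_pred = np.array([[0.9], [4.2]])        # predicted outputs
y = np.array([[1.0], [4.0]])             # actual outputs

# Writes one CSV row per data point: features, blank cell, prediction, blank cell, label.
record_predictions((x, y_pred, y), "results/example_prediction.csv")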
Example #9
def _interval_get_mini_runner_data(filename, model_results_path):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_interval_converted_data.csv".format(
        model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same
        # TODO: currently the interval parameter is always the last. Change the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time,
                                 np.concatenate((x_list[-1], y_list[-1])))

    return [
        OpUnitData(OpUnit[file_name.upper()], np.array(x_list),
                   np.array(y_list))
    ]
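As a side note, the parallel interval maps could also be written with collections.defaultdict, which removes the explicit membership check. A small self-contained sketch of the same grouping step, with toy timestamps and the interval flooring inlined:

from collections import defaultdict

# Toy timestamps and per-row features/targets for illustration.
start_times = [3, 7, 12, 18]
rows_x = [[1], [2], [3], [4]]
rows_y = [[10], [20], [30], [40]]
interval = 10

interval_x_map = defaultdict(list)
interval_y_map = defaultdict(list)
for t, xi, yi in zip(start_times, rows_x, rows_y):
    rounded_time = t - t % interval   # same flooring as round_to_interval
    interval_x_map[rounded_time].append(xi)
    interval_y_map[rounded_time].append(yi)

# interval_x_map == {0: [[1], [2]], 10: [[3], [4]]}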
Example #10
    def _train_data(self, data, summary_file):
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            data.x, data.y, test_size=self.test_ratio, random_state=0)

        # Write the first header row to the result file
        metrics_path = "{}/{}.csv".format(self.model_metrics_path,
                                          data.opunit.name.lower())
        prediction_path = "{}/{}_prediction.csv".format(
            self.model_metrics_path, data.opunit.name.lower())
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path, False)

        methods = self.ml_models

        # Test the prediction with/without the target transformer
        y_transformers = [
            None, data_transforming_util.OPUNIT_Y_TRANSFORMER_MAP[data.opunit]
        ]
        # modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
        # if modeling_transformer is not None:
        #    transformers.append(modeling_transformer)
        x_transformer = data_transforming_util.OPUNIT_X_TRANSFORMER_MAP[
            data.opunit]

        error_bias = 1
        min_percentage_error = 2
        pred_results = None
        elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]
        memory_b_index = data_info.TARGET_CSV_INDEX[Target.MEMORY_B]

        best_y_transformer = -1
        best_method = -1
        for i, y_transformer in enumerate(y_transformers):
            for m, method in enumerate(methods):
                # Train the model
                label = method if i == 0 else method + " transform"
                logging.info("{} {}".format(data.opunit.name, label))
                regressor = model.Model(method,
                                        y_transformer=y_transformer,
                                        x_transformer=x_transformer)
                regressor.train(x_train, y_train)

                # Evaluate on both the training and test set
                results = []
                evaluate_data = [(x_train, y_train), (x_test, y_test)]
                train_test_label = ["Train", "Test"]
                for j, d in enumerate(evaluate_data):
                    evaluate_x = d[0]
                    evaluate_y = d[1]

                    y_pred = regressor.predict(evaluate_x)
                    logging.debug("x shape: {}".format(evaluate_x.shape))
                    logging.debug("y shape: {}".format(y_pred.shape))
                    # To keep the percentage error from exploding when the actual label is very small,
                    # we effectively omit data points with an actual label <= 5 from the percentage error
                    # calculation (by giving those data points a very small weight)
                    evaluate_threshold = 5
                    weights = np.where(evaluate_y > evaluate_threshold,
                                       np.ones(evaluate_y.shape),
                                       np.full(evaluate_y.shape, 1e-6))
                    percentage_error = np.average(np.abs(evaluate_y - y_pred) /
                                                  (evaluate_y + error_bias),
                                                  axis=0,
                                                  weights=weights)
                    results += list(percentage_error) + [""]

                    logging.info('{} Percentage Error: {}'.format(
                        train_test_label[j], percentage_error))

                    # The default method of determining whether a model is better is by comparing the model error
                    # on the elapsed us. For any opunits in MEM_EVALUATE_OPUNITS, we evaluate by comparing the
                    # model error on memory_b.
                    eval_error = percentage_error[elapsed_us_index]
                    if data.opunit in data_info.MEM_EVALUATE_OPUNITS:
                        eval_error = percentage_error[memory_b_index]

                    # Record the model with the lowest elapsed time prediction error (since that might be the most
                    # important prediction)
                    # Only use linear regression for the arithmetic operating units
                    if (j == 1 and eval_error < min_percentage_error
                            and y_transformer == y_transformers[-1] and
                        (data.opunit not in data_info.ARITHMETIC_OPUNITS
                         or method == 'lr')):
                        min_percentage_error = eval_error
                        if self.expose_all:
                            best_y_transformer = i
                            best_method = m
                        else:
                            self.model_map[data.opunit] = regressor
                        pred_results = (evaluate_x, y_pred, evaluate_y)

                    if j == 1:
                        io_util.write_csv_result(summary_file,
                                                 data.opunit.name, [label] +
                                                 list(percentage_error))

                # Dump the prediction results
                io_util.write_csv_result(metrics_path, label, results)

                logging.info("")

            io_util.write_csv_result(metrics_path, "", [])

        # Record the best prediction results on the test data
        result_writing_util.record_predictions(pred_results, prediction_path)
        return best_y_transformer, best_method
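The np.where weights above effectively drop rows whose actual label is at or below the threshold from the averaged error; a small self-contained numpy illustration of that effect:

import numpy as np

evaluate_y = np.array([[0.0], [100.0], [200.0]])   # first label is below the threshold
y_pred = np.array([[10.0], [110.0], [180.0]])
evaluate_threshold = 5
error_bias = 1

weights = np.where(evaluate_y > evaluate_threshold,
                   np.ones(evaluate_y.shape),
                   np.full(evaluate_y.shape, 1e-6))
percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + error_bias),
                              axis=0, weights=weights)
# The first row (actual label 0) contributes almost nothing to percentage_error.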
Example #11
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData, and record the prediction
    result in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])

    # We have to use a prediction cache when there is a lot of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))
        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(opunit_feature[0].name,
                                                                                 x[0], y_pred[0, -1]))

            if opunit in data_info.MEM_ADJUST_OPUNITS:
                # Compute the number of "slots" (based on the row feature or the cardinality feature)
                num_tuple = opunit_feature[1][data_info.TUPLE_NUM_INDEX]
                if opunit == OpUnit.AGG_BUILD:
                    num_tuple = opunit_feature[1][data_info.CARDINALITY_INDEX]

                # SORT/AGG/HASHJOIN_BUILD all allocate a "pointer" buffer
                # that contains the first pow2 larger than num_tuple entries
                pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
                buffer_size = pow_high * data_info.POINTER_SIZE
                if opunit == OpUnit.AGG_BUILD and num_tuple <= 256:
                    # For AGG_BUILD, if slots <= AggregationHashTable::K_DEFAULT_INITIAL_TABLE_SIZE
                    # the buffer is not recorded as part of the pipeline
                    buffer_size = 0

                pred_mem = y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]]
                if pred_mem <= buffer_size:
                    logging.warning("{} feature {} {}: prediction {} does not exceed buffer size {}"
                            .format(data.name, opunit_feature, opunit_feature[1], y_pred[0], buffer_size))

                # Poorly encapsulated, but the memory scaling factor is the second-to-last element of the feature.
                # This is slightly inaccurate since it ignores the load factors of hash tables.
                adj_mem = (pred_mem - buffer_size) * opunit_feature[1][-2] + buffer_size

                # Don't modify prediction cache
                y_pred = copy.deepcopy(y_pred)
                y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]] = adj_mem

            pipeline_y_pred += y_pred[0]

        # Record the prediction results
        data.y_pred = pipeline_y_pred
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y_pred[-1]))
        ratio_error = abs(y - pipeline_y_pred) / (y + 1)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))

        io_util.write_csv_result(prediction_path, data.name, [""] + list(y) + [""] + list(pipeline_y_pred) + [""] +
                                 list(ratio_error))

        logging.debug("")
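The buffer-size step above computes the first power of two at or above num_tuple via math.log, which raises for num_tuple == 0 and can be affected by floating-point rounding near exact powers of two. A purely integer sketch of the same quantity, offered only as an illustration of the computation:

def next_pow2(n):
    # First power of two >= n, computed with integer bit tricks;
    # matches 2 ** math.ceil(math.log(n, 2)) for positive n.
    if n <= 1:
        return 1
    return 1 << (n - 1).bit_length()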
Example #12
    def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label,
                        data_list):
        """Record the prediction results

        :param x: the input data
        :param y: the actual output
        :param y_pred: the predicted output
        :param raw_y: the raw actual output used for the accumulated error calculation
        :param mini_model_y_pred: the mini-model predictions used for the accumulated error calculation
        :param label: the result label ("resource" or "impact")
        :param data_list: the list of GroupedOpUnitData objects used to log the per-pipeline predictions
        """
        # Result files
        metrics_path = "{}/global_{}_model_metrics.csv".format(
            self.model_results_path, label)
        prediction_path = "{}/global_{}_model_prediction.csv".format(
            self.model_results_path, label)
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path, True)

        # Log the prediction results
        ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
        io_util.write_csv_result(metrics_path, "Model Ratio Error",
                                 ratio_error)
        result_writing_util.record_predictions((x, y_pred, y), prediction_path)

        # Print Error summary to command line
        if label == "resource":
            avg_original_ratio_error = np.average(
                np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
        else:
            avg_original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1),
                                                  axis=0)
        logging.info('Model Original Ratio Error ({}): {}'.format(
            label, avg_original_ratio_error))
        logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
        logging.info('')

        if label != "resource":
            # Calculate the accumulated ratio error
            epsilon = global_model_config.RATIO_DIVISION_EPSILON
            mini_model_y_pred = np.array(mini_model_y_pred)
            raw_y = np.array(raw_y)
            raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
            accumulated_raw_y = np.sum(raw_y, axis=0)
            accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
            original_ratio_error = np.abs(raw_y - mini_model_y_pred) / (
                raw_y + epsilon)
            avg_original_ratio_error = np.average(original_ratio_error, axis=0)
            ratio_error = np.abs(raw_y - raw_y_pred) / (raw_y + epsilon)
            avg_ratio_error = np.average(ratio_error, axis=0)
            accumulated_percentage_error = np.abs(
                accumulated_raw_y -
                accumulated_raw_y_pred) / (accumulated_raw_y + epsilon)
            original_accumulated_percentage_error = np.abs(
                accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
                    accumulated_raw_y + epsilon)

            logging.info(
                'Original Ratio Error: {}'.format(avg_original_ratio_error))
            io_util.write_csv_result(metrics_path, "Original Ratio Error",
                                     avg_original_ratio_error)
            logging.info('Ratio Error: {}'.format(avg_ratio_error))
            io_util.write_csv_result(metrics_path, "Ratio Error",
                                     avg_ratio_error)
            logging.info('Original Accumulated Ratio Error: {}'.format(
                original_accumulated_percentage_error))
            io_util.write_csv_result(metrics_path,
                                     "Original Accumulated Ratio Error",
                                     original_accumulated_percentage_error)
            logging.info('Accumulated Ratio Error: {}'.format(
                accumulated_percentage_error))
            io_util.write_csv_result(metrics_path, "Accumulated Ratio Error",
                                     accumulated_percentage_error)
            logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
            logging.info('Original Accumulated Predict: {}'.format(
                np.sum(mini_model_y_pred, axis=0)))
            logging.info(
                'Accumulated Predict: {}'.format(accumulated_raw_y_pred))

            if label == 'direct':
                prediction_path = "{}/grouped_opunit_prediction.csv".format(
                    self.model_results_path)
                io_util.create_csv_file(prediction_path, [
                    "Pipeline", "", "Actual", "", "Predicted", "",
                    "Ratio Error"
                ])
                for i, data in enumerate(data_list):
                    io_util.write_csv_result(prediction_path, data.name,
                                             [""] + list(raw_y[i]) + [""] +
                                             list(raw_y_pred[i]) + [""] +
                                             list(ratio_error[i]))

                average_result_path = "{}/interval_average_prediction.csv".format(
                    self.model_results_path)
                io_util.create_csv_file(
                    average_result_path,
                    ["Timestamp", "Actual Average", "Predicted Average"])

                interval_y_map = {}
                interval_y_pred_map = {}
                mark_list = None
                #mark_list = _generate_mark_list(data_list)
                for i, data in enumerate(data_list):
                    # Don't count the create index OU
                    # TODO(lin): needs better way to evaluate... maybe add a id_query field to GroupedOpunitData
                    if data.concurrency > 0:
                        continue
                    if mark_list is not None and not mark_list[i]:
                        continue
                    interval_time = _round_to_interval(
                        data.start_time,
                        global_model_config.AVERAGING_INTERVAL)
                    if interval_time not in interval_y_map:
                        interval_y_map[interval_time] = []
                        interval_y_pred_map[interval_time] = []
                    interval_y_map[interval_time].append(raw_y[i][-5])
                    interval_y_pred_map[interval_time].append(
                        raw_y_pred[i][-5])

                for time in sorted(interval_y_map.keys()):
                    if mark_list is None:
                        io_util.write_csv_result(average_result_path, time, [
                            np.average(interval_y_map[time]),
                            np.average(interval_y_pred_map[time])
                        ])
                    else:
                        io_util.write_csv_result(average_result_path, time, [
                            np.sum(interval_y_map[time]),
                            np.sum(interval_y_pred_map[time])
                        ])
Example #13
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData, and record the prediction
    result in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    pipeline_path = "{}/grouped_pipeline.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])
    io_util.create_csv_file(pipeline_path, ["Number", "Percentage", "Pipeline", "Actual Us", "Predicted Us", "Us Error", "Absolute Us", "Absolute Us %"])

    # Track pipeline cumulative numbers
    num_pipelines = 0
    total_actual = None
    total_predicted = []
    actual_pipelines = {}
    predicted_pipelines = {}
    count_pipelines = {}

    query_prediction_path = "{}/grouped_query_prediction.csv".format(model_results_path)
    io_util.create_csv_file(query_prediction_path, ["Query", "", "Actual", "", "Predicted", "", "Ratio Error"])
    current_query_id = None
    query_y = None
    query_y_pred = None

    # We have to use a prediction cache when there is a lot of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))

        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(opunit_feature[0].name,
                                                                                 x[0], y_pred[0, -1]))

            if opunit in data_info.MEM_ADJUST_OPUNITS:
                # Compute the number of "slots" (based on the row feature or the cardinality feature)
                num_tuple = opunit_feature[1][data_info.TUPLE_NUM_INDEX]
                if opunit == OpUnit.AGG_BUILD:
                    num_tuple = opunit_feature[1][data_info.CARDINALITY_INDEX]

                # SORT/AGG/HASHJOIN_BUILD all allocate a "pointer" buffer
                # that contains the first pow2 larger than num_tuple entries
                pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
                buffer_size = pow_high * data_info.POINTER_SIZE
                if opunit == OpUnit.AGG_BUILD and num_tuple <= 256:
                    # For AGG_BUILD, if slots <= AggregationHashTable::K_DEFAULT_INITIAL_TABLE_SIZE
                    # the buffer is not recorded as part of the pipeline
                    buffer_size = 0

                pred_mem = y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]]
                if pred_mem <= buffer_size:
                    logging.warning("{} feature {} {}: prediction {} does not exceed buffer size {}"
                                    .format(data.name, opunit_feature, opunit_feature[1], y_pred[0], buffer_size))

                # Poorly encapsulated, but the memory scaling factor is taken from the feature vector (index -3 here).
                # This is slightly inaccurate since it ignores the load factors of hash tables.
                adj_mem = (pred_mem - buffer_size) * opunit_feature[1][-3] + buffer_size

                # Don't modify prediction cache
                y_pred = copy.deepcopy(y_pred)
                y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]] = adj_mem

            pipeline_y_pred += y_pred[0]

        pipeline_y = copy.deepcopy(pipeline_y_pred)

        # Grouping when we're predicting queries
        if data.name[0] == 'q':
            query_id = data.name[1:data.name.rfind(" p")]
            if query_id != current_query_id:
                if current_query_id is not None:
                    io_util.write_csv_result(query_prediction_path, current_query_id, [""] + list(query_y) + [""] +
                                             list(query_y_pred) + [""] +
                                             list(abs(query_y - query_y_pred) / (query_y + 1)))

                current_query_id = query_id
                query_y = y
                query_y_pred = pipeline_y_pred
            else:
                query_y += y
                query_y_pred += pipeline_y_pred

        data.y_pred = pipeline_y
        logging.debug("{} pipeline prediction: {}".format(data.name, pipeline_y))
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y[-1]))
        ratio_error = abs(y - pipeline_y) / (y + 1)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))

        io_util.write_csv_result(prediction_path, data.name, [""] + list(y) + [""] + list(pipeline_y) + [""] +
                                 list(ratio_error))

        logging.debug("")

        # Record cumulative numbers
        if data.name not in actual_pipelines:
            actual_pipelines[data.name] = copy.deepcopy(y)
            predicted_pipelines[data.name] = copy.deepcopy(pipeline_y)
            count_pipelines[data.name] = 1
        else:
            actual_pipelines[data.name] += y
            predicted_pipelines[data.name] += pipeline_y
            count_pipelines[data.name] += 1

        # Update totals
        if total_actual is None:
            total_actual = copy.deepcopy(y)
            total_predicted = copy.deepcopy(pipeline_y)
        else:
            total_actual += y
            total_predicted += pipeline_y

        num_pipelines += 1

    total_elapsed_err = 0
    for pipeline in actual_pipelines:
        actual = actual_pipelines[pipeline]
        predicted = predicted_pipelines[pipeline]
        total_elapsed_err = total_elapsed_err + (abs(actual - predicted))[-1]

    for pipeline in actual_pipelines:
        actual = actual_pipelines[pipeline]
        predicted = predicted_pipelines[pipeline]
        num = count_pipelines[pipeline]

        ratio_error = abs(actual - predicted) / (actual + 1)
        abs_error = abs(actual - predicted)[-1]
        pabs_error = abs_error / total_elapsed_err
        io_util.write_csv_result(pipeline_path, pipeline, [num, num*1.0/num_pipelines, actual[-1],
                                 predicted[-1], ratio_error[-1], abs_error, pabs_error] +
                                 [""] + list(actual) + [""] + list(predicted) + [""] + list(ratio_error))

    ratio_error = abs(total_actual - total_predicted) / (total_actual + 1)
    io_util.write_csv_result(pipeline_path, "Total Pipeline", [num_pipelines, 1, total_actual[-1],
                             total_predicted[-1], ratio_error[-1], total_elapsed_err, 1] +
                             [""] + list(total_actual) + [""] + list(total_predicted) + [""] + list(ratio_error))
Example #14
    def _train_data(self, data, summary_file):
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            data.x, data.y, test_size=self.test_ratio, random_state=0)

        # Write the first header row to the result file
        metrics_path = "{}/{}.csv".format(self.model_metrics_path,
                                          data.opunit.name.lower())
        prediction_path = "{}/{}_prediction.csv".format(
            self.model_metrics_path, data.opunit.name.lower())
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path)

        methods = self.ml_models

        # Test the prediction with/without the target transformer
        transformers = [
            None,
            data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
        ]
        # modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
        # if modeling_transformer is not None:
        #    transformers.append(modeling_transformer)

        error_bias = 1
        min_percentage_error = 2
        pred_results = None
        elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]

        for i, transformer in enumerate(transformers):
            for method in methods:
                # Train the model
                label = method if i == 0 else method + " transform"
                logging.info("{} {}".format(data.opunit.name, label))
                regressor = model.Model(method,
                                        modeling_transformer=transformer)
                regressor.train(x_train, y_train)

                # Evaluate on both the training and test set
                results = []
                evaluate_data = [(x_train, y_train), (x_test, y_test)]
                train_test_label = ["Train", "Test"]
                for j, d in enumerate(evaluate_data):
                    evaluate_x = d[0]
                    evaluate_y = d[1]

                    for x, y in zip(evaluate_x, evaluate_y):
                        stat_vec = [data.opunit]
                        stat_vec.extend(x)
                        self.stats_map[tuple(stat_vec)] = y

                    y_pred = regressor.predict(evaluate_x)
                    logging.debug("x shape: {}".format(evaluate_x.shape))
                    logging.debug("y shape: {}".format(y_pred.shape))
                    percentage_error = np.average(
                        np.abs(evaluate_y - y_pred) /
                        (evaluate_y + 1 + error_bias),
                        axis=0)
                    results += list(percentage_error) + [""]

                    logging.info('{} Percentage Error: {}'.format(
                        train_test_label[j], percentage_error))

                    # Record the model with the lowest elapsed time prediction error (since that might be the most
                    # important prediction)
                    # Only use linear regression for the arithmetic operating units
                    if (j == 1 and percentage_error[elapsed_us_index] <
                            min_percentage_error
                            and transformer == transformers[-1] and
                        (data.opunit not in data_info.ARITHMETIC_OPUNITS
                         or method == 'lr')):
                        min_percentage_error = percentage_error[
                            elapsed_us_index]
                        self.model_map[data.opunit] = regressor
                        pred_results = (evaluate_x, y_pred, evaluate_y)

                    if j == 1:
                        io_util.write_csv_result(summary_file,
                                                 data.opunit.name, [label] +
                                                 list(percentage_error))

                # Dump the prediction results
                io_util.write_csv_result(metrics_path, label, results)

                logging.info("")

            io_util.write_csv_result(metrics_path, "", [])

        # Record the best prediction results on the test data
        result_writing_util.record_predictions(pred_results, prediction_path)
Example #15
    def train(self):
        """Train the mini-models

        :return: the map of the trained models
        """

        data_list = []

        # First get the data for all mini runners
        for filename in glob.glob(os.path.join(self.input_path, '*.csv')):
            print(filename)
            data_list += opunit_data.get_mini_runner_data(filename)

        model_map = {}
        # train the models for all the operating units
        for data in data_list:
            x_train, x_test, y_train, y_test = model_selection.train_test_split(data.x, data.y,
                                                                                test_size=self.test_ratio,
                                                                                random_state=0)

            # Write the first header row to the result file
            metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
            prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
            result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

            methods = self.ml_models
            # Only use linear regression for the arithmetic operating units
            if data.opunit in data_info.ARITHMETIC_OPUNITS:
                methods = ["lr"]

            # Also test the prediction with the target transformer (if specified for the operating unit)
            transformers = [None]
            modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
            if modeling_transformer is not None:
                transformers.append(modeling_transformer)

            min_percentage_error = 1
            pred_results = None
            elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]

            for transformer in transformers:
                for method in methods:
                    # Train the model
                    logging.info("{} {}".format(data.opunit.name, method))
                    regressor = model.Model(method, modeling_transformer=transformer)
                    regressor.train(x_train, y_train)

                    # Evaluate on both the training and test set
                    results = []
                    evaluate_data = [(x_train, y_train), (x_test, y_test)]
                    train_test_label = ["Train", "Test"]
                    for i, d in enumerate(evaluate_data):
                        evaluate_x = d[0]
                        evaluate_y = d[1]

                        y_pred = regressor.predict(evaluate_x)
                        logging.debug("x shape: {}".format(evaluate_x.shape))
                        logging.debug("y shape: {}".format(y_pred.shape))
                        percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
                        results += list(percentage_error) + [""]

                        logging.info('{} Percentage Error: {}'.format(train_test_label[i], percentage_error))

                        # Record the model with the lowest elapsed time prediction error (since that might be the most
                        # important prediction)
                        if (i == 1 and percentage_error[elapsed_us_index] < min_percentage_error and transformer ==
                                transformers[-1]):
                            min_percentage_error = percentage_error[elapsed_us_index]
                            model_map[data.opunit] = regressor
                            pred_results = (evaluate_x, y_pred, evaluate_y)

                    # Dump the prediction results
                    transform = " "
                    if transformer is not None:
                        transform = " transform"
                    io_util.write_csv_result(metrics_path, method + transform, results)

                    logging.info("")

                io_util.write_csv_result(metrics_path, "", [])

            # Record the best prediction results on the test data
            result_writing_util.record_predictions(pred_results, prediction_path)

        return model_map