def _global_model_training_process(x, y, methods, test_ratio, metrics_path, prediction_path):
    """Training process for the global models

    :param x: input features
    :param y: labels
    :param methods: ML models to enumerate
    :param test_ratio: train-test split ratio
    :param metrics_path: to store the prediction metrics
    :param prediction_path: to store the raw prediction results
    :return: (the best model, the indices of the test data for additional metric calculation)
    """
    global_model = None
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, False)

    n_samples = x.shape[0]
    indices = np.arange(n_samples)
    x_train, x_test, y_train, y_test, indices_train, indices_test = model_selection.train_test_split(
        x, y, indices, test_size=test_ratio, random_state=0)

    min_percentage_error = 1
    pred_results = None
    elapsed_us_index = data_info.instance.target_csv_index[Target.ELAPSED_US]

    for method in methods:
        # Train the model
        logging.info("Training the global model with {}".format(method))
        regressor = model.Model(method)
        regressor.train(x_train, y_train)

        # Evaluate on both the training and test set
        results = []
        evaluate_data = [(x_train, y_train), (x_test, y_test)]
        train_test_label = ["Train", "Test"]
        for i, d in enumerate(evaluate_data):
            evaluate_x = d[0]
            evaluate_y = d[1]

            y_pred = regressor.predict(evaluate_x)
            logging.debug("x shape: {}".format(evaluate_x.shape))
            logging.debug("y shape: {}".format(y_pred.shape))
            percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
            results += list(percentage_error) + [""]

            logging.info('{} Ratio Error: {}'.format(train_test_label[i], percentage_error))

            # Record the model with the lowest elapsed time prediction error (since that might be the most
            # important prediction)
            if i == 1 and percentage_error[elapsed_us_index] < min_percentage_error:
                min_percentage_error = percentage_error[elapsed_us_index]
                global_model = regressor
                pred_results = (evaluate_x, y_pred, evaluate_y)

        io_util.write_csv_result(metrics_path, method, results)

        logging.info("")

    # Record the best prediction results on the test data
    result_writing_util.record_predictions(pred_results, prediction_path)

    return global_model, indices_test
def write_extended_data(output_path, symbol, index_value_list, data_map):
    """Write the extended data to a CSV file, overwriting any existing content

    :param output_path: the CSV file path
    :param symbol: the label for the header row
    :param index_value_list: the values for the header row
    :param data_map: map from a row label to the list of values for that row
    """
    # Clear the content of the file
    open(output_path, 'w').close()

    io_util.write_csv_result(output_path, symbol, index_value_list)

    for key, value in data_map.items():
        io_util.write_csv_result(output_path, key, value)
def _record_results(self, x, y, y_pred, label):
    """Record the prediction results

    :param x: the input data
    :param y: the actual output
    :param y_pred: the predicted output
    :param label: the result label ("resource" or "impact")
    """
    # Result files
    metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
    prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

    # Log the prediction results
    ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
    io_util.write_csv_result(metrics_path, "Ratio Error", ratio_error)
    result_writing_util.record_predictions((x, y_pred, y), prediction_path)

    # Print the error summary to the command line
    if label == "resource":
        original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
    else:
        original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
    logging.info('Model Original Ratio Error ({}): {}'.format(label, original_ratio_error))
    logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
    logging.info('')
def _txn_get_mini_runner_data(filename, model_results_path, txn_sample_interval):
    """Get the training data for the transaction operating units by grouping the samples into intervals

    :param filename: the input data file
    :param model_results_path: the directory to log the converted data
    :param txn_sample_interval: the sampling interval for the transaction metrics (the summed features are scaled
                                up by txn_sample_interval + 1)
    :return: a list of OpUnitData
    """
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    # Prepend a column of ones as the base transaction data feature
    base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
    df = pd.concat([base_x, df], axis=1)
    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # Change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_txn_converted_data.csv".format(model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.CONTENDING_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_id_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
            interval_id_map[rounded_time] = set()
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_id_map[rounded_time].add(cpu_ids[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Concatenate the number of different threads
        x_new = np.concatenate((x_new, [len(interval_id_map[rounded_time])]))
        x_new *= txn_sample_interval + 1
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time, np.concatenate((x_list[-1], y_list[-1])))

    return [OpUnitData(OpUnit[file_name.upper()], np.array(x_list), np.array(y_list))]
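# The interval grouping above (and in _interval_get_mini_runner_data below) relies on
# data_util.round_to_interval, which is not shown in this section. The following is an
# illustrative sketch only, under the ASSUMPTION that it floors a timestamp to the start
# of its enclosing interval; the project's actual implementation may differ.
def round_to_interval(time_value, interval):
    # e.g. round_to_interval(1_000_123, 500_000) -> 1_000_000
    return int(time_value) // interval * interval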
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini models to predict the resource consumption for all the GroupedOpUnitData, and record the
    prediction results in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "Actual Us", "Predicted Us", "", "Ratio Error"])

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))

        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                # Subtract the scan prediction from certain double-counted opunits
                if opunit in data_info.SCAN_SUBSTRACT_UNITS:
                    scan_y_pred = mini_model_map[OpUnit.SEQ_SCAN].predict(x)
                    y_pred -= scan_y_pred
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(opunit_feature[0].name, x[0],
                                                                                 y_pred[0, -1]))
            pipeline_y_pred += y_pred[0]

        # Record the predicted results
        data.y_pred = pipeline_y_pred
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y_pred[-1]))
        ratio_error = abs(y - pipeline_y_pred) / (y + 1e-6)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))

        io_util.write_csv_result(prediction_path, data.name + " " + str(x[0][-1]),
                                 [y[-1], pipeline_y_pred[-1], "", ratio_error])

        logging.debug("")
def _get_global_resource_data(start_time, concurrent_data_list, log_path):
    """Get the input feature and the target output for the global resource utilization metrics during an interval

    The calculation is adjusted by the overlapping ratio between the opunit groups and the time range.

    :param start_time: the start time of the interval
    :param concurrent_data_list: the concurrently running opunit groups
    :param log_path: the file path to log the data construction results
    :return: the GlobalResourceData for this interval (the per-logical-core adjusted metrics, the aggregated
             input feature, and the output resource targets)
    """
    end_time = start_time + global_model_config.INTERVAL_SIZE - 1
    elapsed_us = global_model_config.INTERVAL_SIZE

    # The adjusted resource metrics per logical core.
    # TODO: Assuming each physical core has two logical cores via hyper-threading for now.
    #  Can extend to other scenarios later.
    physical_core_num = hardware_info.PHYSICAL_CORE_NUM
    adjusted_x_list = [0] * 2 * physical_core_num
    adjusted_y = 0

    logging.debug(concurrent_data_list)
    logging.debug("{} {}".format(start_time, end_time))

    for data in concurrent_data_list:
        data_start_time = data.get_start_time(ConcurrentCountingMode.ESTIMATED)
        data_end_time = data.get_end_time(ConcurrentCountingMode.ESTIMATED)
        ratio = _calculate_range_overlap(start_time, end_time, data_start_time,
                                         data_end_time) / (data_end_time - data_start_time + 1)
        logging.debug("{} {} {}".format(data_start_time, data_end_time, ratio))
        logging.debug("{} {}".format(data.y, data.y_pred))
        adjusted_y += data.y * ratio

        cpu_id = data.cpu_id
        if cpu_id > physical_core_num:
            cpu_id -= physical_core_num
        adjusted_x_list[cpu_id] += data.y_pred * ratio

    # Change the numbers to per-time-unit (us) utilization
    for i in range(len(adjusted_x_list)):
        adjusted_x_list[i] /= elapsed_us
    adjusted_y /= elapsed_us

    sum_adjusted_x = np.sum(adjusted_x_list, axis=0)
    std_adjusted_x = np.std(adjusted_x_list, axis=0)

    ratio_error = abs(adjusted_y - sum_adjusted_x) / (adjusted_y + 1e-6)
    logging.debug(sum_adjusted_x)
    logging.debug(adjusted_y)
    logging.debug("")

    io_util.write_csv_result(log_path, elapsed_us, [len(concurrent_data_list)] + list(sum_adjusted_x) + [""] +
                             list(adjusted_y) + [""] + list(ratio_error))

    adjusted_x = np.concatenate((sum_adjusted_x, std_adjusted_x))

    return global_model_data.GlobalResourceData(start_time, adjusted_x_list, adjusted_x, adjusted_y)
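# _calculate_range_overlap is referenced above but not defined in this section. The sketch below is an
# ASSUMPTION only: it returns the inclusive length of the intersection of the interval
# [start_time, end_time] and the group's [data_start_time, data_end_time], clamped at zero, which is
# consistent with how the caller divides by (data_end_time - data_start_time + 1) to get an overlap ratio.
def _calculate_range_overlap(start_time, end_time, data_start_time, data_end_time):
    # e.g. _calculate_range_overlap(0, 99, 50, 149) -> 50
    return max(0, min(end_time, data_end_time) - max(start_time, data_start_time) + 1)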
def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label):
    """Record the prediction results

    :param x: the input data
    :param y: the actual output
    :param y_pred: the predicted output
    :param raw_y: the actual raw targets
    :param mini_model_y_pred: the mini-model predictions of the raw targets
    :param label: the result label ("resource" or "impact")
    """
    # Result files
    metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
    prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, True)

    # Log the prediction results
    ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
    io_util.write_csv_result(metrics_path, "Model Ratio Error", ratio_error)
    result_writing_util.record_predictions((x, y_pred, y), prediction_path)

    # Print the error summary to the command line
    if label == "resource":
        original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
    else:
        original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
    logging.info('Model Original Ratio Error ({}): {}'.format(label, original_ratio_error))
    logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
    logging.info('')

    if label != "resource":
        # Calculate the accumulated ratio error
        epsilon = global_model_config.RATIO_DIVISION_EPSILON
        mini_model_y_pred = np.array(mini_model_y_pred)
        raw_y = np.array(raw_y)
        raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
        accumulated_raw_y = np.sum(raw_y, axis=0)
        accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
        original_ratio_error = np.average(np.abs(raw_y - mini_model_y_pred) / (raw_y + epsilon), axis=0)
        ratio_error = np.average(np.abs(raw_y - raw_y_pred) / (raw_y + epsilon), axis=0)
        accumulated_percentage_error = np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (accumulated_raw_y +
                                                                                             epsilon)
        original_accumulated_percentage_error = np.abs(accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
            accumulated_raw_y + epsilon)

        logging.info('Original Ratio Error: {}'.format(original_ratio_error))
        io_util.write_csv_result(metrics_path, "Original Ratio Error", original_ratio_error)
        logging.info('Ratio Error: {}'.format(ratio_error))
        io_util.write_csv_result(metrics_path, "Ratio Error", ratio_error)
        logging.info('Original Accumulated Ratio Error: {}'.format(original_accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Original Accumulated Ratio Error",
                                 original_accumulated_percentage_error)
        logging.info('Accumulated Ratio Error: {}'.format(accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Accumulated Ratio Error", accumulated_percentage_error)
        logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
        logging.info('Original Accumulated Predict: {}'.format(np.sum(mini_model_y_pred, axis=0)))
        logging.info('Accumulated Predict: {}'.format(accumulated_raw_y_pred))
def record_predictions(pred_results, prediction_path):
    """Record the raw prediction results

    :param pred_results: the prediction results as a tuple (input x, predicted y, actual y)
    :param prediction_path: the file path to store the results
    """
    num_data = pred_results[0].shape[0]
    for i in range(num_data):
        result_list = (list(pred_results[0][i]) + [""] + list(pred_results[1][i]) + [""] +
                       list(pred_results[2][i]))
        io_util.write_csv_result(prediction_path, "", result_list)
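# Hedged usage sketch for record_predictions (the arrays and file name below are made up). Each output row
# is laid out as <x values>, blank, <predicted values>, blank, <actual values>; the exact CSV formatting is
# assumed to follow io_util.write_csv_result, which is not shown in this section.
def _record_predictions_example():
    import numpy as np

    # Two hypothetical data points with two features and one target each.
    x = np.array([[1.0, 2.0], [3.0, 4.0]])
    y_pred = np.array([[10.0], [20.0]])
    y = np.array([[11.0], [19.0]])

    # Expected to write roughly:
    #   1.0,2.0,,10.0,,11.0
    #   3.0,4.0,,20.0,,19.0
    record_predictions((x, y_pred, y), "example_prediction.csv")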
def _interval_get_mini_runner_data(filename, model_results_path):
    """Get the training data for the periodically invoked operating units by grouping the samples into intervals

    :param filename: the input data file
    :param model_results_path: the directory to log the converted data
    :return: a list of OpUnitData
    """
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # Change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_interval_converted_data.csv".format(model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same
        # TODO: currently the interval parameter is always the last feature. Change the hard-coding later.
        x_new[-1] /= len(interval_x_map[rounded_time])
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time, np.concatenate((x_list[-1], y_list[-1])))

    return [OpUnitData(OpUnit[file_name.upper()], np.array(x_list), np.array(y_list))]
def _train_data(self, data, summary_file):
    x_train, x_test, y_train, y_test = model_selection.train_test_split(data.x, data.y,
                                                                        test_size=self.test_ratio,
                                                                        random_state=0)

    # Write the first header row to the result file
    metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
    prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, False)

    methods = self.ml_models

    # Test the prediction with/without the target transformer
    y_transformers = [None, data_transforming_util.OPUNIT_Y_TRANSFORMER_MAP[data.opunit]]
    # modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
    # if modeling_transformer is not None:
    #     transformers.append(modeling_transformer)
    x_transformer = data_transforming_util.OPUNIT_X_TRANSFORMER_MAP[data.opunit]

    error_bias = 1

    min_percentage_error = 2
    pred_results = None
    elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]
    memory_b_index = data_info.TARGET_CSV_INDEX[Target.MEMORY_B]

    best_y_transformer = -1
    best_method = -1
    for i, y_transformer in enumerate(y_transformers):
        for m, method in enumerate(methods):
            # Train the model
            label = method if i == 0 else method + " transform"
            logging.info("{} {}".format(data.opunit.name, label))
            regressor = model.Model(method, y_transformer=y_transformer, x_transformer=x_transformer)
            regressor.train(x_train, y_train)

            # Evaluate on both the training and test set
            results = []
            evaluate_data = [(x_train, y_train), (x_test, y_test)]
            train_test_label = ["Train", "Test"]
            for j, d in enumerate(evaluate_data):
                evaluate_x = d[0]
                evaluate_y = d[1]

                y_pred = regressor.predict(evaluate_x)
                logging.debug("x shape: {}".format(evaluate_x.shape))
                logging.debug("y shape: {}".format(y_pred.shape))

                # To avoid the percentage error exploding when the actual label is very small, we omit the data
                # points with an actual label <= 5 when calculating the percentage error (by essentially giving
                # the data points with small labels a very small weight)
                evaluate_threshold = 5
                weights = np.where(evaluate_y > evaluate_threshold, np.ones(evaluate_y.shape),
                                   np.full(evaluate_y.shape, 1e-6))
                percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + error_bias), axis=0,
                                              weights=weights)
                results += list(percentage_error) + [""]

                logging.info('{} Percentage Error: {}'.format(train_test_label[j], percentage_error))

                # By default, a model is judged better by its error on elapsed_us. For opunits in
                # MEM_EVALUATE_OPUNITS, we compare the model error on memory_b instead.
                eval_error = percentage_error[elapsed_us_index]
                if data.opunit in data_info.MEM_EVALUATE_OPUNITS:
                    eval_error = percentage_error[memory_b_index]

                # Record the model with the lowest evaluation error (since that might be the most important
                # prediction). Only use linear regression for the arithmetic operating units.
                if (j == 1 and eval_error < min_percentage_error and y_transformer == y_transformers[-1]
                        and (data.opunit not in data_info.ARITHMETIC_OPUNITS or method == 'lr')):
                    min_percentage_error = eval_error
                    if self.expose_all:
                        best_y_transformer = i
                        best_method = m
                    else:
                        self.model_map[data.opunit] = regressor
                    pred_results = (evaluate_x, y_pred, evaluate_y)

                if j == 1:
                    io_util.write_csv_result(summary_file, data.opunit.name, [label] + list(percentage_error))

            # Dump the prediction results
            io_util.write_csv_result(metrics_path, label, results)

            logging.info("")

        io_util.write_csv_result(metrics_path, "", [])

    # Record the best prediction results on the test data
    result_writing_util.record_predictions(pred_results, prediction_path)

    return best_y_transformer, best_method
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini models to predict the resource consumption for all the GroupedOpUnitData, and record the
    prediction results in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))

        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(opunit_feature[0].name, x[0],
                                                                                 y_pred[0, -1]))

            if opunit in data_info.MEM_ADJUST_OPUNITS:
                # Compute the number of "slots" (based on the row feature or the cardinality feature)
                num_tuple = opunit_feature[1][data_info.TUPLE_NUM_INDEX]
                if opunit == OpUnit.AGG_BUILD:
                    num_tuple = opunit_feature[1][data_info.CARDINALITY_INDEX]

                # SORT/AGG/HASHJOIN_BUILD all allocate a "pointer" buffer that contains the first power of 2
                # larger than num_tuple entries
                pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
                buffer_size = pow_high * data_info.POINTER_SIZE
                if opunit == OpUnit.AGG_BUILD and num_tuple <= 256:
                    # For AGG_BUILD, if slots <= AggregationHashTable::K_DEFAULT_INITIAL_TABLE_SIZE,
                    # the buffer is not recorded as part of the pipeline
                    buffer_size = 0

                pred_mem = y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]]
                if pred_mem <= buffer_size:
                    logging.warning("{} feature {} {} with prediction {} does not exceed buffer {}"
                                    .format(data.name, opunit_feature, opunit_feature[1], y_pred[0], buffer_size))

                # Poorly encapsulated, but the memory scaling factor is the 2nd-to-last element of the feature.
                # Slightly inaccurate since it ignores load factors for hash tables.
                adj_mem = (pred_mem - buffer_size) * opunit_feature[1][-2] + buffer_size

                # Don't modify the prediction cache
                y_pred = copy.deepcopy(y_pred)
                y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]] = adj_mem

            pipeline_y_pred += y_pred[0]

        # Record the predicted results
        data.y_pred = pipeline_y_pred
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y_pred[-1]))
        ratio_error = abs(y - pipeline_y_pred) / (y + 1)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))

        io_util.write_csv_result(prediction_path, data.name, [""] + list(y) + [""] + list(pipeline_y_pred) + [""] +
                                 list(ratio_error))

        logging.debug("")
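# Worked sketch of the buffer-size memory adjustment above. The tuple count, pointer size, predicted
# memory, and scaling factor below are made-up values; data_info.POINTER_SIZE is assumed to be the
# pointer width in bytes.
def _mem_adjust_example():
    import math

    num_tuple = 1000
    pointer_size = 8  # stand-in for data_info.POINTER_SIZE

    # First power of two >= num_tuple: 1024 slots, so an 8192-byte pointer buffer.
    pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
    buffer_size = pow_high * pointer_size  # 1024 * 8 = 8192

    # If the mini model predicts 20000 bytes and the memory scaling factor is 2, only the portion
    # above the pointer buffer is scaled.
    pred_mem = 20000
    scaling_factor = 2
    adj_mem = (pred_mem - buffer_size) * scaling_factor + buffer_size  # (20000 - 8192) * 2 + 8192 = 31808
    return adj_mem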
def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label, data_list):
    """Record the prediction results

    :param x: the input data
    :param y: the actual output
    :param y_pred: the predicted output
    :param raw_y: the actual raw targets
    :param mini_model_y_pred: the mini-model predictions of the raw targets
    :param label: the result label ("resource" or "impact")
    :param data_list: the list of GroupedOpUnitData objects used for the per-pipeline and per-interval reports
    """
    # Result files
    metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
    prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, True)

    # Log the prediction results
    ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
    io_util.write_csv_result(metrics_path, "Model Ratio Error", ratio_error)
    result_writing_util.record_predictions((x, y_pred, y), prediction_path)

    # Print the error summary to the command line
    if label == "resource":
        avg_original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
    else:
        avg_original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
    logging.info('Model Original Ratio Error ({}): {}'.format(label, avg_original_ratio_error))
    logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
    logging.info('')

    if label != "resource":
        # Calculate the accumulated ratio error
        epsilon = global_model_config.RATIO_DIVISION_EPSILON
        mini_model_y_pred = np.array(mini_model_y_pred)
        raw_y = np.array(raw_y)
        raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
        accumulated_raw_y = np.sum(raw_y, axis=0)
        accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
        original_ratio_error = np.abs(raw_y - mini_model_y_pred) / (raw_y + epsilon)
        avg_original_ratio_error = np.average(original_ratio_error, axis=0)
        ratio_error = np.abs(raw_y - raw_y_pred) / (raw_y + epsilon)
        avg_ratio_error = np.average(ratio_error, axis=0)
        accumulated_percentage_error = np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (accumulated_raw_y +
                                                                                             epsilon)
        original_accumulated_percentage_error = np.abs(accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
            accumulated_raw_y + epsilon)

        logging.info('Original Ratio Error: {}'.format(avg_original_ratio_error))
        io_util.write_csv_result(metrics_path, "Original Ratio Error", avg_original_ratio_error)
        logging.info('Ratio Error: {}'.format(avg_ratio_error))
        io_util.write_csv_result(metrics_path, "Ratio Error", avg_ratio_error)
        logging.info('Original Accumulated Ratio Error: {}'.format(original_accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Original Accumulated Ratio Error",
                                 original_accumulated_percentage_error)
        logging.info('Accumulated Ratio Error: {}'.format(accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Accumulated Ratio Error", accumulated_percentage_error)
        logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
        logging.info('Original Accumulated Predict: {}'.format(np.sum(mini_model_y_pred, axis=0)))
        logging.info('Accumulated Predict: {}'.format(accumulated_raw_y_pred))

        if label == 'direct':
            prediction_path = "{}/grouped_opunit_prediction.csv".format(self.model_results_path)
            io_util.create_csv_file(prediction_path,
                                    ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])
            for i, data in enumerate(data_list):
                io_util.write_csv_result(prediction_path, data.name, [""] + list(raw_y[i]) + [""] +
                                         list(raw_y_pred[i]) + [""] + list(ratio_error[i]))

            average_result_path = "{}/interval_average_prediction.csv".format(self.model_results_path)
            io_util.create_csv_file(average_result_path, ["Timestamp", "Actual Average", "Predicted Average"])

            interval_y_map = {}
            interval_y_pred_map = {}
            mark_list = None
            # mark_list = _generate_mark_list(data_list)
            for i, data in enumerate(data_list):
                # Don't count the create index OU
                # TODO(lin): needs a better way to evaluate... maybe add an id_query field to GroupedOpUnitData
                if data.concurrency > 0:
                    continue
                if mark_list is not None and not mark_list[i]:
                    continue
                interval_time = _round_to_interval(data.start_time, global_model_config.AVERAGING_INTERVAL)
                if interval_time not in interval_y_map:
                    interval_y_map[interval_time] = []
                    interval_y_pred_map[interval_time] = []
                interval_y_map[interval_time].append(raw_y[i][-5])
                interval_y_pred_map[interval_time].append(raw_y_pred[i][-5])

            for time in sorted(interval_y_map.keys()):
                if mark_list is None:
                    io_util.write_csv_result(average_result_path, time,
                                             [np.average(interval_y_map[time]),
                                              np.average(interval_y_pred_map[time])])
                else:
                    io_util.write_csv_result(average_result_path, time,
                                             [np.sum(interval_y_map[time]),
                                              np.sum(interval_y_pred_map[time])])
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini models to predict the resource consumption for all the GroupedOpUnitData, and record the
    prediction results in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    pipeline_path = "{}/grouped_pipeline.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])
    io_util.create_csv_file(pipeline_path, ["Number", "Percentage", "Pipeline", "Actual Us", "Predicted Us",
                                            "Us Error", "Absolute Us", "Absolute Us %"])

    # Track pipeline cumulative numbers
    num_pipelines = 0
    total_actual = None
    total_predicted = []
    actual_pipelines = {}
    predicted_pipelines = {}
    count_pipelines = {}

    query_prediction_path = "{}/grouped_query_prediction.csv".format(model_results_path)
    io_util.create_csv_file(query_prediction_path, ["Query", "", "Actual", "", "Predicted", "", "Ratio Error"])
    current_query_id = None
    query_y = None
    query_y_pred = None

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))

        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(opunit_feature[0].name, x[0],
                                                                                 y_pred[0, -1]))

            if opunit in data_info.MEM_ADJUST_OPUNITS:
                # Compute the number of "slots" (based on the row feature or the cardinality feature)
                num_tuple = opunit_feature[1][data_info.TUPLE_NUM_INDEX]
                if opunit == OpUnit.AGG_BUILD:
                    num_tuple = opunit_feature[1][data_info.CARDINALITY_INDEX]

                # SORT/AGG/HASHJOIN_BUILD all allocate a "pointer" buffer that contains the first power of 2
                # larger than num_tuple entries
                pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
                buffer_size = pow_high * data_info.POINTER_SIZE
                if opunit == OpUnit.AGG_BUILD and num_tuple <= 256:
                    # For AGG_BUILD, if slots <= AggregationHashTable::K_DEFAULT_INITIAL_TABLE_SIZE,
                    # the buffer is not recorded as part of the pipeline
                    buffer_size = 0

                pred_mem = y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]]
                if pred_mem <= buffer_size:
                    logging.warning("{} feature {} {} with prediction {} does not exceed buffer {}"
                                    .format(data.name, opunit_feature, opunit_feature[1], y_pred[0], buffer_size))

                # Poorly encapsulated, but the memory scaling factor is the 3rd-to-last element of the feature.
                # Slightly inaccurate since it ignores load factors for hash tables.
                adj_mem = (pred_mem - buffer_size) * opunit_feature[1][-3] + buffer_size

                # Don't modify the prediction cache
                y_pred = copy.deepcopy(y_pred)
                y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]] = adj_mem

            pipeline_y_pred += y_pred[0]

        pipeline_y = copy.deepcopy(pipeline_y_pred)

        # Grouping when we're predicting queries
        if data.name[0] == 'q':
            query_id = data.name[1:data.name.rfind(" p")]
            if query_id != current_query_id:
                if current_query_id is not None:
                    io_util.write_csv_result(query_prediction_path, current_query_id,
                                             [""] + list(query_y) + [""] + list(query_y_pred) + [""] +
                                             list(abs(query_y - query_y_pred) / (query_y + 1)))
                current_query_id = query_id
                query_y = y
                query_y_pred = pipeline_y_pred
            else:
                query_y += y
                query_y_pred += pipeline_y_pred

        data.y_pred = pipeline_y
        logging.debug("{} pipeline prediction: {}".format(data.name, pipeline_y))
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y[-1]))
        ratio_error = abs(y - pipeline_y) / (y + 1)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))

        io_util.write_csv_result(prediction_path, data.name, [""] + list(y) + [""] + list(pipeline_y) + [""] +
                                 list(ratio_error))

        logging.debug("")

        # Record cumulative numbers
        if data.name not in actual_pipelines:
            actual_pipelines[data.name] = copy.deepcopy(y)
            predicted_pipelines[data.name] = copy.deepcopy(pipeline_y)
            count_pipelines[data.name] = 1
        else:
            actual_pipelines[data.name] += y
            predicted_pipelines[data.name] += pipeline_y
            count_pipelines[data.name] += 1

        # Update totals
        if total_actual is None:
            total_actual = copy.deepcopy(y)
            total_predicted = copy.deepcopy(pipeline_y)
        else:
            total_actual += y
            total_predicted += pipeline_y

        num_pipelines += 1

    total_elapsed_err = 0
    for pipeline in actual_pipelines:
        actual = actual_pipelines[pipeline]
        predicted = predicted_pipelines[pipeline]
        total_elapsed_err = total_elapsed_err + (abs(actual - predicted))[-1]

    for pipeline in actual_pipelines:
        actual = actual_pipelines[pipeline]
        predicted = predicted_pipelines[pipeline]
        num = count_pipelines[pipeline]

        ratio_error = abs(actual - predicted) / (actual + 1)
        abs_error = abs(actual - predicted)[-1]
        pabs_error = abs_error / total_elapsed_err
        io_util.write_csv_result(pipeline_path, pipeline,
                                 [num, num * 1.0 / num_pipelines, actual[-1], predicted[-1], ratio_error[-1],
                                  abs_error, pabs_error] + [""] + list(actual) + [""] + list(predicted) + [""] +
                                 list(ratio_error))

    ratio_error = abs(total_actual - total_predicted) / (total_actual + 1)
    io_util.write_csv_result(pipeline_path, "Total Pipeline",
                             [num_pipelines, 1, total_actual[-1], total_predicted[-1], ratio_error[-1],
                              total_elapsed_err, 1] + [""] + list(total_actual) + [""] + list(total_predicted) +
                             [""] + list(ratio_error))
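# Tiny sketch of the query-name parsing used above. The "q<id> p<pipeline>" naming convention is an
# assumption inferred from the slicing logic, not confirmed by this section.
def _query_id_parsing_example():
    # Mirrors data.name[1:data.name.rfind(" p")] for a hypothetical pipeline name.
    name = "q3 p1"
    query_id = name[1:name.rfind(" p")]  # -> "3"
    return query_id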
def _train_data(self, data, summary_file):
    x_train, x_test, y_train, y_test = model_selection.train_test_split(data.x, data.y,
                                                                        test_size=self.test_ratio,
                                                                        random_state=0)

    # Write the first header row to the result file
    metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
    prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

    methods = self.ml_models

    # Test the prediction with/without the target transformer
    transformers = [None, data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]]
    # modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
    # if modeling_transformer is not None:
    #     transformers.append(modeling_transformer)

    error_bias = 1

    min_percentage_error = 2
    pred_results = None
    elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]

    for i, transformer in enumerate(transformers):
        for method in methods:
            # Train the model
            label = method if i == 0 else method + " transform"
            logging.info("{} {}".format(data.opunit.name, label))
            regressor = model.Model(method, modeling_transformer=transformer)
            regressor.train(x_train, y_train)

            # Evaluate on both the training and test set
            results = []
            evaluate_data = [(x_train, y_train), (x_test, y_test)]
            train_test_label = ["Train", "Test"]
            for j, d in enumerate(evaluate_data):
                evaluate_x = d[0]
                evaluate_y = d[1]

                for x, y in zip(evaluate_x, evaluate_y):
                    stat_vec = [data.opunit]
                    stat_vec.extend(x)
                    self.stats_map[tuple(stat_vec)] = y

                y_pred = regressor.predict(evaluate_x)
                logging.debug("x shape: {}".format(evaluate_x.shape))
                logging.debug("y shape: {}".format(y_pred.shape))
                percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1 + error_bias), axis=0)
                results += list(percentage_error) + [""]

                logging.info('{} Percentage Error: {}'.format(train_test_label[j], percentage_error))

                # Record the model with the lowest elapsed time prediction error (since that might be the most
                # important prediction). Only use linear regression for the arithmetic operating units.
                if (j == 1 and percentage_error[elapsed_us_index] < min_percentage_error
                        and transformer == transformers[-1]
                        and (data.opunit not in data_info.ARITHMETIC_OPUNITS or method == 'lr')):
                    min_percentage_error = percentage_error[elapsed_us_index]
                    self.model_map[data.opunit] = regressor
                    pred_results = (evaluate_x, y_pred, evaluate_y)

                if j == 1:
                    io_util.write_csv_result(summary_file, data.opunit.name, [label] + list(percentage_error))

            # Dump the prediction results
            io_util.write_csv_result(metrics_path, label, results)

            logging.info("")

        io_util.write_csv_result(metrics_path, "", [])

    # Record the best prediction results on the test data
    result_writing_util.record_predictions(pred_results, prediction_path)
def train(self):
    """Train the mini-models

    :return: the map of the trained models
    """
    data_list = []

    # First get the data for all mini runners
    for filename in glob.glob(os.path.join(self.input_path, '*.csv')):
        print(filename)
        data_list += opunit_data.get_mini_runner_data(filename)

    model_map = {}
    # Train the models for all the operating units
    for data in data_list:
        x_train, x_test, y_train, y_test = model_selection.train_test_split(data.x, data.y,
                                                                            test_size=self.test_ratio,
                                                                            random_state=0)

        # Write the first header row to the result file
        metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
        prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
        result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

        methods = self.ml_models
        # Only use linear regression for the arithmetic operating units
        if data.opunit in data_info.ARITHMETIC_OPUNITS:
            methods = ["lr"]

        # Also test the prediction with the target transformer (if specified for the operating unit)
        transformers = [None]
        modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
        if modeling_transformer is not None:
            transformers.append(modeling_transformer)

        min_percentage_error = 1
        pred_results = None
        elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]

        for transformer in transformers:
            for method in methods:
                # Train the model
                logging.info("{} {}".format(data.opunit.name, method))
                regressor = model.Model(method, modeling_transformer=transformer)
                regressor.train(x_train, y_train)

                # Evaluate on both the training and test set
                results = []
                evaluate_data = [(x_train, y_train), (x_test, y_test)]
                train_test_label = ["Train", "Test"]
                for i, d in enumerate(evaluate_data):
                    evaluate_x = d[0]
                    evaluate_y = d[1]

                    y_pred = regressor.predict(evaluate_x)
                    logging.debug("x shape: {}".format(evaluate_x.shape))
                    logging.debug("y shape: {}".format(y_pred.shape))
                    percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
                    results += list(percentage_error) + [""]

                    logging.info('{} Percentage Error: {}'.format(train_test_label[i], percentage_error))

                    # Record the model with the lowest elapsed time prediction error (since that might be the
                    # most important prediction)
                    if (i == 1 and percentage_error[elapsed_us_index] < min_percentage_error
                            and transformer == transformers[-1]):
                        min_percentage_error = percentage_error[elapsed_us_index]
                        model_map[data.opunit] = regressor
                        pred_results = (evaluate_x, y_pred, evaluate_y)

                # Dump the prediction results
                transform = " "
                if transformer is not None:
                    transform = " transform"
                io_util.write_csv_result(metrics_path, method + transform, results)

                logging.info("")

            io_util.write_csv_result(metrics_path, "", [])

        # Record the best prediction results on the test data
        result_writing_util.record_predictions(pred_results, prediction_path)

    return model_map