import logging
import os

import numpy as np
import pandas as pd
import tqdm

# data_info, data_util, io_util, OpUnit, OpUnitData, GroupedOpUnitData, and Target
# are repo-internal names assumed to be imported elsewhere in this module.


def _txn_get_mini_runner_data(filename, model_results_path, txn_sample_interval):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    # Prepend a column of ones as the base transaction data feature
    base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
    df = pd.concat([base_x, df], axis=1)
    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # Change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_txn_converted_data.csv".format(model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.CONTENDING_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_id_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
            interval_id_map[rounded_time] = set()
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_id_map[rounded_time].add(cpu_ids[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Concatenate the number of different threads
        x_new = np.concatenate((x_new, [len(interval_id_map[rounded_time])]))
        x_new *= txn_sample_interval + 1
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time, np.concatenate((x_list[-1], y_list[-1])))

    return [OpUnitData(OpUnit[file_name.upper()], np.array(x_list), np.array(y_list))]
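# The grouping loop above keys every row on data_util.round_to_interval(). Its
# implementation lives outside this file; a minimal sketch of the assumed behavior
# (flooring a timestamp to its interval boundary) is:
def _round_to_interval_sketch(time, interval):
    """Hypothetical stand-in for data_util.round_to_interval (an assumption)."""
    return time // interval * interval


# e.g. with an interval of 1_000_000, timestamps 5_200_000 and 5_900_000 share one bucket
assert _round_to_interval_sketch(5_200_000, 1_000_000) == 5_000_000
assert _round_to_interval_sketch(5_900_000, 1_000_000) == 5_000_000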
def _interval_get_mini_runner_data(filename, model_results_path):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]
    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # Change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_interval_converted_data.csv".format(model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same by averaging it back out
        # TODO: currently the interval parameter is always the last feature; remove the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time, np.concatenate((x_list[-1], y_list[-1])))

    return [OpUnitData(OpUnit[file_name.upper()], np.array(x_list), np.array(y_list))]
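# A hedged sketch of how the two mini-runner loaders above might be routed; the real
# entry point is not part of this file, and the filename-based dispatch below is an
# assumption for illustration only.
def _get_mini_runner_data_sketch(filename, model_results_path, txn_sample_interval):
    if "txn" in os.path.basename(filename):
        return _txn_get_mini_runner_data(filename, model_results_path, txn_sample_interval)
    return _interval_get_mini_runner_data(filename, model_results_path)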
def _interval_get_grouped_op_unit_data(filename):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]
    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_cpu_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        # Keep the CPU id of the last row seen in this interval
        interval_cpu_map[rounded_time] = cpu_ids[i]

    # Construct the new data
    opunit = OpUnit[file_name.upper()]
    data_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same by averaging it back out
        # TODO: currently the interval parameter is always the last feature; remove the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
        # Change all the opunits in the group for this interval to be the new feature
        opunits = [(opunit, x_new)]
        # The prediction is the average behavior
        y_new = np.average(interval_y_map[rounded_time], axis=0)
        n = len(interval_x_map[rounded_time])
        for i in range(n):
            # Spread the n averaged records evenly across the interval
            metrics = np.concatenate(([rounded_time + i * interval // n],
                                      [interval_cpu_map[rounded_time]], y_new))
            data_list.append(GroupedOpUnitData("{} {}".format(file_name, opunits), opunits, metrics))

    return data_list
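# Worked example of the timestamp redistribution above (all values illustrative):
# n grouped records are re-emitted with synthetic start times spread evenly
# across their interval.
interval = 1_000_000
rounded_time = 5_000_000
n = 4
synthetic_starts = [rounded_time + i * interval // n for i in range(n)]
assert synthetic_starts == [5_000_000, 5_250_000, 5_500_000, 5_750_000]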
def _txn_get_grouped_op_unit_data(filename, txn_sample_interval):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    # Prepend a column of ones as the base transaction data feature
    base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
    df = pd.concat([base_x, df], axis=1)
    x = df.iloc[:, :-data_info.instance.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.instance.target_csv_index[data_info.instance.Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.instance.target_csv_index[data_info.instance.Target.CPU_ID]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    interval = data_info.instance.CONTENDING_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_cpu_id_map = {}
    interval_start_time_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
            interval_cpu_id_map[rounded_time] = []
            interval_start_time_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_cpu_id_map[rounded_time].append(cpu_ids[i])
        interval_start_time_map[rounded_time].append(start_times[i])

    # Construct the new data
    opunit = OpUnit[file_name.upper()]
    data_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Concatenate the number of different threads
        x_new = np.concatenate((x_new, [len(set(interval_cpu_id_map[rounded_time]))]))
        # Scale the summed features by the sampling factor
        x_new *= txn_sample_interval + 1
        # Change all the opunits in the group for this interval to be the new feature
        opunits = [(opunit, x_new)]
        # The prediction is the average behavior
        y_new = np.average(interval_y_map[rounded_time], axis=0)
        n = len(interval_x_map[rounded_time])
        for i in range(n):
            metrics = np.concatenate(([interval_start_time_map[rounded_time][i]],
                                      [interval_cpu_id_map[rounded_time][i]], y_new))
            data_list.append(GroupedOpUnitData(file_name, opunits, metrics, txn_sample_interval))

    return data_list
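# Worked example of the sampling compensation above. The factor assumes that only
# 1 in every (txn_sample_interval + 1) transactions is recorded, so the per-interval
# feature sums are scaled back up to estimate the full load; this interpretation of
# txn_sample_interval is an inference from the code, and the values are illustrative.
sampled_sum = np.array([1, 12, 3])  # hypothetical feature sums built from sampled rows
txn_sample_interval = 9             # i.e. 1 of every 10 transactions recorded
estimated_full_sum = sampled_sum * (txn_sample_interval + 1)
assert (estimated_full_sum == np.array([10, 120, 30])).all()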