Example #1
def _txn_get_mini_runner_data(filename, model_results_path,
                              txn_sample_interval):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    # prepending a column of ones as the base transaction data feature
    base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
    df = pd.concat([base_x, df], axis=1)
    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[
        :, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
    cpu_ids = df.iloc[
        :, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values

    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_txn_converted_data.csv".format(
        model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.CONTENDING_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_id_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
            interval_id_map[rounded_time] = set()
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_id_map[rounded_time].add(cpu_ids[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Concatenate the number of different threads
        x_new = np.concatenate((x_new, [len(interval_id_map[rounded_time])]))
        x_new *= txn_sample_interval + 1
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time,
                                 np.concatenate((x_list[-1], y_list[-1])))

    return [
        OpUnitData(OpUnit[file_name.upper()], np.array(x_list),
                   np.array(y_list))
    ]
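A minimal usage sketch for the loader above (not from the original project): the CSV path, results directory, and sampling interval are placeholders, and the file name is assumed to match an OpUnit enum member, since the loader derives the opunit from it.

data_list = _txn_get_mini_runner_data("query_trace/txn_begin.csv",
                                      "model_results", 9)
# Returns a single-element list of OpUnitData holding the per-interval
# aggregated features and averaged targets for that opunit.
print(len(data_list))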
Example #2
def _interval_get_mini_runner_data(filename, model_results_path):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[
        :, data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_interval_converted_data.csv".format(
        model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same
        # TODO: currently the interval parameter is always the last. Change the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time,
                                 np.concatenate((x_list[-1], y_list[-1])))

    return [
        OpUnitData(OpUnit[file_name.upper()], np.array(x_list),
                   np.array(y_list))
    ]
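Both loaders bucket rows with data_util.round_to_interval before aggregating. That helper is not shown here; the sketch below is a hypothetical stand-in, assuming it floors a start timestamp to the beginning of its interval.

def round_to_interval(start_time, interval):
    # Hypothetical re-implementation: floor the timestamp to the start of
    # the interval bucket it falls into (an assumption about the real helper).
    return start_time // interval * interval

# e.g. with interval = 1_000_000, both 1_234_567 and 1_999_999 map to 1_000_000.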
Example #3
def _interval_get_grouped_op_unit_data(filename):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[
        :, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
    cpu_ids = df.iloc[
        :, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_cpu_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_cpu_map[rounded_time] = cpu_ids[i]

    # Construct the new data
    opunit = OpUnit[file_name.upper()]
    data_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same
        # TODO: currently the interval parameter is always the last. Change the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
        # Change all the opunits in the group for this interval to be the new feature
        opunits = [(opunit, x_new)]
        # The prediction is the average behavior
        y_new = np.average(interval_y_map[rounded_time], axis=0)
        n = len(interval_x_map[rounded_time])
        for i in range(n):
            metrics = np.concatenate(([rounded_time + i * interval // n],
                                      [interval_cpu_map[rounded_time]], y_new))
            data_list.append(
                GroupedOpUnitData("{} {}".format(file_name, opunits), opunits,
                                  metrics))

    return data_list
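The manual "if rounded_time not in interval_x_map" initialization used in these examples could equivalently be written with collections.defaultdict; a behavior-preserving sketch of just the grouping step, reusing the local names from the function above for illustration.

from collections import defaultdict

interval_x_map = defaultdict(list)
interval_y_map = defaultdict(list)
for i in range(n):
    rounded_time = data_util.round_to_interval(start_times[i], interval)
    interval_x_map[rounded_time].append(x[i])
    interval_y_map[rounded_time].append(y[i])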
Example #4
def _txn_get_mini_runner_data(filename, txn_sample_interval):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    # prepending a column of ones as the base transaction data feature
    base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
    df = pd.concat([base_x, df], axis=1)
    x = df.iloc[:, :-data_info.instance.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.instance.target_csv_index[
        data_info.instance.Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.instance.target_csv_index[
        data_info.instance.Target.CPU_ID]].values

    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    interval = data_info.instance.CONTENDING_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_cpu_id_map = {}
    interval_start_time_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
            interval_cpu_id_map[rounded_time] = []
            interval_start_time_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_cpu_id_map[rounded_time].append(cpu_ids[i])
        interval_start_time_map[rounded_time].append(start_times[i])

    # Construct the new data
    opunit = OpUnit[file_name.upper()]
    data_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Concatenate the number of different threads
        x_new = np.concatenate(
            (x_new, [len(set(interval_cpu_id_map[rounded_time]))]))
        x_new *= txn_sample_interval + 1
        # Change all the opunits in the group for this interval to be the new feature
        opunits = [(opunit, x_new)]
        # The prediction is the average behavior
        y_new = np.average(interval_y_map[rounded_time], axis=0)
        n = len(interval_x_map[rounded_time])
        for i in range(n):
            metrics = np.concatenate(
                ([interval_start_time_map[rounded_time][i]],
                 [interval_cpu_id_map[rounded_time][i]], y_new))
            data_list.append(
                GroupedOpUnitData("{}".format(file_name), opunits, metrics,
                                  txn_sample_interval))

    return data_list
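In both transaction loaders the summed interval features are multiplied by txn_sample_interval + 1, which presumably scales sampled counts back up to the full workload (one recorded transaction per txn_sample_interval skipped ones, if that is how sampling is configured). A made-up arithmetic illustration:

txn_sample_interval = 9
sampled_feature_sum = 120   # made-up sum of one feature over an interval
# If 1 in every 10 transactions was recorded, the unsampled total is
# estimated by scaling up by (txn_sample_interval + 1).
estimated_total = sampled_feature_sum * (txn_sample_interval + 1)   # 1200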