def main():
    # only train GPRs on the offline workload data
    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])
    # load the pruned metric headers
    pruned_metrics = Dataset.load_pruned_metrics()
    # keep only the pruned metric columns, the workload id, and the tuning knobs
    dataset = dataset.prune_columns(pruned_metrics + ['workload id'] +
                                    dataset.get_tuning_knob_headers())

    # build the GPRs
    start = time()
    gprs = WorkloadGPR(dataset=dataset)
    LOG.info(f"Finished building GPRs in {round(time() - start)} seconds.")

    # pickle 'em
    LOG.info("Pickling GPRs...")
    start = time()
    gprs.pickle_models()
    LOG.info(f"Finished pickling models in {round(time() - start)} seconds.")
Example #2
def split_online_b(online_b_data, test_idx):
    """Split online workload B into a primer set plus one held-out
    evaluation row (position test_idx) per workload."""
    primer_rows = []
    eval_rows = []
    for wl_id in online_b_data.get_workload_ids():
        curr_df = online_b_data.get_specific_workload(wl_id).get_dataframe()
        for idx in range(len(curr_df)):
            # DataFrame.append was removed in pandas 2.x; collect the
            # single-row slices and concatenate them once at the end
            if idx == test_idx:
                eval_rows.append(curr_df.iloc[idx:idx + 1])
            else:
                primer_rows.append(curr_df.iloc[idx:idx + 1])
    primer = Dataset(dataframe=pd.concat(primer_rows, ignore_index=True))
    # eval_ds rather than eval, to avoid shadowing the built-in
    eval_ds = Dataset(dataframe=pd.concat(eval_rows, ignore_index=True))
    latency_gt = eval_ds.get_column_values('latency')
    eval_ds = eval_ds.prune_columns(['workload id'] +
                                    eval_ds.get_tuning_knob_headers())

    return primer, eval_ds, latency_gt
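
A sketch of how split_online_b could drive a leave-one-out loop over the config rows; n_configs and the evaluate(primer, eval_ds) prediction step are assumptions, not part of the source:

import numpy as np

def leave_one_out_mape(online_b_data, n_configs, evaluate):
    """Hold out each config position in turn and average the MAPE."""
    mapes = []
    for test_idx in range(n_configs):
        primer, eval_ds, latency_gt = split_online_b(online_b_data, test_idx)
        y_hat = np.asarray(evaluate(primer, eval_ds), dtype=float)  # hypothetical step
        y = np.asarray(latency_gt, dtype=float)
        mapes.append(np.mean(np.abs((y - y_hat) / y)) * 100)
    return float(np.mean(mapes))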
Example #3
    def _build_models_from_dataset(self, dataset: Dataset, scaler=None):
        """
        Build all of the GPR models from scratch
        """
        df = dataset.get_dataframe()
        metrics = dataset.get_metric_headers()
        workload_ids = dataset.get_workload_ids()
        knob_headers = dataset.get_tuning_knob_headers()
        total_gprs = len(workload_ids) * len(metrics)

        with tqdm(total=total_gprs) as pbar:
            for w in workload_ids:
                workloads = df[df['workload id'] == w]
                for m in metrics:
                    X = workloads[knob_headers].values

                    if scaler is not None:
                        X = scaler.transform(X)

                    y = workloads[m].values
                    # sanitize the metric name so it is safe in a file name
                    m_file_name = m \
                        .replace('_', '-') \
                        .replace('/', '-') \
                        .replace('%', '-')

                    # hyperparameter choices follow
                    # krasserm.github.io/2018/03/19/gaussian-processes#effect-of-kernel-parameters-and-noise-parameter
                    restarts = 10
                    # ConstantKernel supplies the signal variance (sigma_f);
                    # the RBF length scale (l) is seeded with the std of y
                    kernel = ConstantKernel(10.0) * RBF(y.std())
                    # alpha acts as the noise variance (sigma_y), added to the
                    # diagonal of the kernel matrix during fitting
                    alpha = 0.1
                    model = GaussianProcessRegressor(
                        kernel=kernel,
                        n_restarts_optimizer=restarts,
                        alpha=alpha,
                        normalize_y=True)
                    model.fit(X, y)
                    self.models[f"wl_{w}_{m_file_name}.pickle"] = model
                    pbar.update(1)
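
The models dict is keyed by the sanitized file name, so any lookup helper has to re-apply the same substitutions. A minimal sketch of the get_model used by the validation script in Example #4, assuming it lives on the same class:

    def get_model(self, workload_id, metric):
        """Look up a fitted GPR by workload id and raw metric name."""
        file_name = metric \
            .replace('_', '-') \
            .replace('/', '-') \
            .replace('%', '-')
        return self.models[f"wl_{workload_id}_{file_name}.pickle"]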
Example #4
def main():
    LOG.debug('Clearing out all of the workload models.')
    clear_wl_models()

    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])
    pruned_metrics = Dataset.load_pruned_metrics()
    pruned_dataset = dataset.prune_columns(pruned_metrics + ['workload id'] +
                                           dataset.get_tuning_knob_headers())
    df = pruned_dataset.get_dataframe()

    # hold out the odd row positions (1, 3, ..., 19) of each workload as
    # the validation split
    i = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    workload_ids = pruned_dataset.get_workload_ids()
    validation_df = pd.concat(
        [df[df['workload id'] == wid].iloc[i] for wid in workload_ids])
    valid_dataset = Dataset(dataframe=validation_df)

    # difference() returns index labels, so select with .loc rather than .iloc
    diff_idx = df.index.difference(validation_df.index)

    train_df = df.loc[diff_idx]
    train_dataset = Dataset(dataframe=train_df)

    #  LOG.info("Fitting input scaler...")
    #  scaler = StandardScaler()
    #  scaler.fit(train_df[dataset.get_tuning_knob_headers()].values)
    scaler = None

    LOG.info("Training workload GPRs...")
    gprs = WorkloadGPR(dataset=train_dataset, scaler=scaler)

    LOG.info("Validating GPRs...")
    train = {}
    result = {}
    for pm in pruned_metrics:
        for wid in workload_ids:
            name = f"{pm}|{wid}"
            model = gprs.get_model(wid, pm)

            # train
            #  X = train_df[dataset.get_tuning_knob_headers()].values
            #  X = scaler.transform(X)
            #  y = train_df[pm].values
            #  y_hat = model.predict(X)
            #  mape = np.mean(np.abs((y - y_hat) / y)) * 100
            #  train[name] = mape

            # validation
            X = validation_df[dataset.get_tuning_knob_headers()].values
            if scaler is not None:
                X = scaler.transform(X)

            y = validation_df[pm].values
            y_hat = model.predict(X)
            mape = np.mean(np.abs((y - y_hat) / y)) * 100
            result[name] = mape
            #  LOG.info('%s: %s', name, mape)

    #  LOG.info('Training average MAPE: %s',
    #  np.array(list(train.values())).mean())
    LOG.info('Validation average MAPE: %s',
             np.array(list(result.values())).mean())
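
Because result is keyed as "{metric}|{workload id}", a per-metric breakdown of the same numbers is a small extension (no new assumptions beyond the names above):

    per_metric = {}
    for name, mape in result.items():
        pm, _ = name.rsplit('|', 1)
        per_metric.setdefault(pm, []).append(mape)
    for pm, scores in sorted(per_metric.items()):
        LOG.info('%s average MAPE: %s', pm, np.mean(scores))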