Example #1
    def _build_models_from_dataset(self, dataset: Dataset, scaler=None):
        """
        Build all of the GPR models from scratch.
        """
        df = dataset.get_dataframe()
        metrics = dataset.get_metric_headers()
        workload_ids = dataset.get_workload_ids()
        knob_headers = dataset.get_tuning_knob_headers()
        total_gprs = len(workload_ids) * len(metrics)

        with tqdm(total=total_gprs) as pbar:
            for w in workload_ids:
                workloads = df[df['workload id'] == w]
                for m in metrics:
                    X = workloads[knob_headers].values

                    if scaler is not None:
                        X = scaler.transform(X)

                    y = workloads[m].values
                    m_file_name = m \
                        .replace('_', '-') \
                        .replace('/', '-') \
                        .replace('%', '-')

                    # krasserm.github.io/2018/03/19/gaussian-processes#effect-of-kernel-parameters-and-noise-parameter
                    restarts = 10
                    # sigma_f, l
                    kernel = ConstantKernel(10.0) * RBF(y.std())
                    # sigma_y
                    alpha = 0.1
                    model = GaussianProcessRegressor(
                        kernel=kernel,
                        n_restarts_optimizer=restarts,
                        alpha=alpha,
                        normalize_y=True)
                    model.fit(X, y)
                    self.models[f"wl_{w}_{m_file_name}.pickle"] = model
                    pbar.update(1)
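
The snippet assumes the caller passes an already-fitted scaler and that a surrounding class owns the models dict. Below is a minimal usage sketch, assuming a StandardScaler and a hypothetical owning class; the ModelRepository name, dataset path, and dictionary key are illustrative, not part of the original example:

from sklearn.preprocessing import StandardScaler

dataset = Dataset(file_path='data/workloads.csv')   # hypothetical path
df = dataset.get_dataframe()
knob_headers = dataset.get_tuning_knob_headers()

# Fit one scaler on the full knob matrix so every per-workload GPR
# sees inputs on the same scale.
scaler = StandardScaler().fit(df[knob_headers].values)

repo = ModelRepository()   # hypothetical class defining _build_models_from_dataset
repo._build_models_from_dataset(dataset, scaler=scaler)

# Query one of the stored models for new knob settings.
X_new = df[knob_headers].values[:5]
model = repo.models['wl_1_throughput.pickle']        # hypothetical workload id / metric
y_mean, y_std = model.predict(scaler.transform(X_new), return_std=True)
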
Example #2
def main():
    """
    Main method for the script.
    """
    dataset = Dataset(file_path=DATASET_PATHS[CONFIG.dataset])
    df = dataset.get_dataframe()

    # remove columns that are constant values
    metric_headers = dataset.get_metric_headers()
    constant_headers = []
    variable_headers = []
    for header in metric_headers:
        if np.unique(df[header].values).size > 1:
            variable_headers.append(header)
        else:
            constant_headers.append(header)

    metric_headers = variable_headers
    dataset = Dataset(dataframe=df.drop(constant_headers, axis=1))
    raw_metrics = dataset.get_metrics()
    metrics = raw_metrics.T

    # factor analysis
    LOG.info('Starting factor analysis with %s factors...', CONFIG.num_factors)
    start = time()
    # model = FactorAnalysis(n_components=CONFIG.num_factors)
    # factors = model.fit_transform(metrics)  # num_metrics * num_factors
    rng = np.random.RandomState(74)
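    # With n_components left at its default of 'auto', GaussianRandomProjection
    # derives the output dimensionality from eps and the number of samples via
    # the Johnson-Lindenstrauss lemma rather than from CONFIG.num_factors.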
    model = GaussianRandomProjection(eps=0.999, random_state=rng)
    factors = model.fit_transform(metrics)
    LOG.debug('Dimension before factor analysis: %s', metrics.shape)
    LOG.debug('Dimension after factor analysis: %s', factors.shape)
    LOG.info('Finished factor analysis in %s seconds.', round(time() - start))

    # clustering
    if CONFIG.model == 'kmeans':
        model = build_k_means(factors)
    elif CONFIG.model == 'kmedoids':
        model = build_k_medoids(factors)
    else:
        raise ValueError(f'Unrecognized model: {CONFIG.model}')

    # find cluster center
    labels = model.labels_
    # each column of transformed_data holds the distance to one cluster
    # center, so column i is the distance to the center of cluster i.
    transformed_data = model.transform(factors)
    leftover_metrics = []
    for i in np.unique(labels):
        # index of the points for the ith cluster
        cluster_member_idx = np.argwhere(labels == i).squeeze(1)
        cluster_members = transformed_data[cluster_member_idx]
        # find the index of the minimum-distance point to the center
        closest_member = cluster_member_idx[np.argmin(cluster_members[:, i])]
        leftover_metrics.append(metric_headers[closest_member])

    # latency needs to be in the metrics
    if 'latency' not in leftover_metrics:
        leftover_metrics += ['latency']

    with open(CONFIG.output_path, 'w') as file:
        file.write('\n'.join(leftover_metrics))
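
The clustering helpers build_k_means and build_k_medoids are not shown in this example. A minimal sketch of what build_k_means could look like, assuming a plain scikit-learn KMeans and a cluster count read from the same CONFIG object (the num_clusters attribute is an assumption):

from sklearn.cluster import KMeans

def build_k_means(factors):
    """Fit a KMeans model on the factor matrix (hypothetical helper)."""
    # main() only relies on the fitted model exposing labels_ and transform(),
    # both of which scikit-learn's KMeans provides.
    model = KMeans(n_clusters=CONFIG.num_clusters, random_state=0)
    model.fit(factors)
    return model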