def _build_models_from_dataset(self, dataset: Dataset, scaler=None):
    """ Build all of the GPR models from scratch """
    df = dataset.get_dataframe()
    metrics = dataset.get_metric_headers()
    workload_ids = dataset.get_workload_ids()
    knob_headers = dataset.get_tuning_knob_headers()

    total_gprs = len(workload_ids) * len(metrics)
    with tqdm(total=total_gprs) as pbar:
        for w in workload_ids:
            workloads = df[df['workload id'] == w]
            for m in metrics:
                X = workloads[knob_headers].values
                if scaler is not None:
                    X = scaler.transform(X)
                y = workloads[m].values

                m_file_name = m \
                    .replace('_', '-') \
                    .replace('/', '-') \
                    .replace('%', '-')

                # krasserm.github.io/2018/03/19/gaussian-processes#effect-of-kernel-parameters-and-noise-parameter
                restarts = 10
                # sigma_f, l
                kernel = ConstantKernel(10.0) * RBF(y.std())
                # sigma_y
                alpha = 0.1
                model = GaussianProcessRegressor(
                    kernel=kernel,
                    n_restarts_optimizer=restarts,
                    alpha=alpha,
                    normalize_y=True)
                model.fit(X, y)
                self.models[f"wl_{w}_{m_file_name}.pickle"] = model
                pbar.update(1)
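# The helper below is a minimal, self-contained sketch of the same GPR setup
# used above (ConstantKernel * RBF kernel, alpha noise term, normalize_y),
# shown on synthetic data so it can run outside the class. The knob values,
# metric values, and query point are made up for illustration; only the
# scikit-learn calls mirror the method above.
def _gpr_usage_sketch():
    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import ConstantKernel, RBF

    # synthetic "knob config -> metric" training data (5 samples, 3 knobs)
    X = np.array([[0.1, 0.2, 0.3],
                  [0.4, 0.5, 0.6],
                  [0.7, 0.8, 0.9],
                  [0.2, 0.9, 0.1],
                  [0.5, 0.1, 0.8]])
    y = np.array([10.0, 12.5, 9.0, 11.0, 13.0])

    kernel = ConstantKernel(10.0) * RBF(y.std())
    model = GaussianProcessRegressor(
        kernel=kernel, n_restarts_optimizer=10, alpha=0.1, normalize_y=True)
    model.fit(X, y)

    # return_std=True gives the predictive mean and standard deviation for a
    # candidate knob configuration, which is what a tuner would use to trade
    # off exploration against exploitation.
    mean, std = model.predict(np.array([[0.3, 0.3, 0.3]]), return_std=True)
    return mean, std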
def main():
    """ Main method for the script. """
    dataset = Dataset(file_path=DATASET_PATHS[CONFIG.dataset])
    df = dataset.get_dataframe()

    # remove columns that are constant values
    metric_headers = dataset.get_metric_headers()
    constant_headers = []
    variable_headers = []
    for header in metric_headers:
        if np.unique(df[header].values).size > 1:
            variable_headers.append(header)
        else:
            constant_headers.append(header)
    metric_headers = variable_headers
    dataset = Dataset(dataframe=df.drop(constant_headers, axis=1))
    raw_metrics = dataset.get_metrics()
    metrics = raw_metrics.T

    # factor analysis
    LOG.info('Starting factor analysis with %s factors...', CONFIG.num_factors)
    start = time()
    # model = FactorAnalysis(n_components=CONFIG.num_factors)
    # factors = model.fit_transform(metrics)  # num_metrics * num_factors
    rng = np.random.RandomState(74)
    model = GaussianRandomProjection(eps=0.999, random_state=rng)
    factors = model.fit_transform(metrics)
    LOG.debug('Dimension before factor analysis: %s', metrics.shape)
    LOG.debug('Dimension after factor analysis: %s', factors.shape)
    LOG.info('Finished factor analysis in %s seconds.', round(time() - start))

    # clustering
    if CONFIG.model == 'kmeans':
        model = build_k_means(factors)
    elif CONFIG.model == 'kmedoids':
        model = build_k_medoids(factors)
    else:
        raise ValueError(f'Unrecognized model: {CONFIG.model}')

    # find cluster center
    labels = model.labels_
    # each dimension in transformed_data is the distance to the cluster
    # centers.
    transformed_data = model.transform(factors)
    leftover_metrics = []
    for i in np.unique(labels):
        # index of the points for the ith cluster
        cluster_member_idx = np.argwhere(labels == i).squeeze(1)
        cluster_members = transformed_data[cluster_member_idx]
        # find the index of the minimum-distance point to the center
        closest_member = cluster_member_idx[np.argmin(cluster_members[:, i])]
        leftover_metrics.append(metric_headers[closest_member])

    # latency needs to be in the metrics
    if 'latency' not in leftover_metrics:
        leftover_metrics += ['latency']

    with open(CONFIG.output_path, 'w') as file:
        file.writelines('\n'.join(leftover_metrics))
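# main() relies on a clustering helper that exposes both labels_ and a
# transform() returning per-sample distances to every cluster center. The
# sketch below shows what build_k_means might look like if it wraps
# scikit-learn's KMeans (which provides both of those); it is a hypothetical
# illustration, not the repository's actual helper, and the cluster count
# parameter is assumed.
def _build_k_means_sketch(factors, num_clusters=5):
    from sklearn.cluster import KMeans

    # after fit(), model.labels_ holds cluster assignments and
    # model.transform(factors) yields the distance from each metric's factor
    # vector to every cluster center, as used in main() above.
    model = KMeans(n_clusters=num_clusters, random_state=74)
    model.fit(factors)
    return model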