def map_workload(target_data):
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    dbms = newest_result.dbms.pk
    hardware = newest_result.application.hardware.pk
    workload_data = PipelineResult.get_latest(
        dbms, hardware, PipelineTaskType.WORKLOAD_MAPPING_DATA)
    if workload_data is None:
        target_data['scores'] = None
        return target_data

    # Load the precomputed workload mapping data: the scalers and decile
    # boundaries fitted offline, plus the column labels they cover.
    data_values = JSONUtil.loads(workload_data.value)
    X_scaler = np.load(data_values['X_scaler'])
    y_scaler = np.load(data_values['y_scaler'])
    y_deciles = np.load(data_values['y_deciles'])['deciles']
    X_columnlabels = data_values['X_columnlabels']
    y_columnlabels = data_values['y_columnlabels']

    # Keep only the target's knob & metric columns that appear in the
    # precomputed mapping data.
    X_idxs = [i for i in range(target_data['X_matrix'].shape[1])
              if target_data['X_columnlabels'][i] in X_columnlabels]
    y_idxs = [i for i in range(target_data['y_matrix'].shape[1])
              if target_data['y_columnlabels'][i] in y_columnlabels]
    X_target = target_data['X_matrix'][:, X_idxs]
    y_target = target_data['y_matrix'][:, y_idxs]

    # Standardize the target's data with the precomputed scalers, then bin
    # each metric column by its precomputed deciles.
    X_target = (X_target - X_scaler['mean']) / X_scaler['scale']
    y_target = (y_target - y_scaler['mean']) / y_scaler['scale']
    y_binned = np.empty_like(y_target)
    for i in range(y_target.shape[1]):
        y_binned[:, i] = bin_by_decile(y_target[:, i], y_deciles[i])

    scores = {}
    for wkld_id, wkld_entry_path in data_values['data'].items():
        wkld_entry = np.load(wkld_entry_path)
        preds = np.empty_like(y_target)
        X_wkld = wkld_entry['X_matrix']
        for j in range(y_target.shape[1]):
            # Train a Gaussian process model on this workload's data,
            # predict each metric for the target's knob configurations,
            # and bin the predictions by the same deciles.
            y_col = wkld_entry['y_matrix'][:, j].reshape(X_wkld.shape[0], 1)
            model = GPR()
            model.fit(X_wkld, y_col, ridge=0.01)
            preds[:, j] = bin_by_decile(
                model.predict(X_target).ypreds.ravel(), y_deciles[j])
        # Score this workload by the mean Euclidean distance between the
        # binned predictions and the target's binned metric data.
        dists = np.sqrt(np.sum(np.square(np.subtract(preds, y_binned)), axis=1))
        scores[wkld_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_wkld_id = None
    for wkld_id, similarity_score in scores.items():
        if similarity_score < best_score:
            best_score = similarity_score
            best_wkld_id = wkld_id
    target_data['mapped_workload'] = (best_wkld_id, best_score)
    target_data['scores'] = scores
    return target_data
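
# NOTE: bin_by_decile() is called above but not defined in this snippet.
# A minimal sketch of the behavior the calls assume -- each value is
# replaced by the label (starting at 1) of the decile bin it falls into,
# given that column's precomputed decile boundaries. The boundary
# convention (nine interior cut points per column) is an assumption; the
# actual helper may differ.
def bin_by_decile(values, deciles):
    # np.digitize counts how many boundaries each value exceeds, so nine
    # interior boundaries yield bin labels 1 through 10 after the shift.
    return np.digitize(values, deciles) + 1
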
def map_workload(target_data):
    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    assert latest_pipeline_run is not None
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target.
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked
    # knobs and pruned metrics, but for now we just use those computed for
    # the first workload.
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    # Compute workload mapping data for each unique workload.
    unique_workloads = pipeline_data.values_list('workload', flat=True).distinct()
    assert len(unique_workloads) > 0
    workload_data = {}
    for unique_workload in unique_workloads:
        # Load the knob & metric data for this workload.
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data['data'])
        y_matrix = np.array(metric_data['data'])
        rowlabels = np.array(knob_data['rowlabels'])
        assert np.array_equal(rowlabels, metric_data['rowlabels'])

        if not initialized:
            # For now, set the ranked knobs & pruned metrics to be those
            # computed for the first workload.
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:10]  # FIXME (dva)
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [i for i in range(X_matrix.shape[1])
                                if X_columnlabels[i] in global_ranked_knobs]
            pruned_metric_idxs = [i for i in range(y_matrix.shape[1])
                                  if y_columnlabels[i] in global_pruned_metrics]

            # Filter the X & y columnlabels by the top 10 ranked knobs and
            # the pruned metrics.
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter the X & y matrices by the top 10 ranked knobs and the
        # pruned metrics.
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with the same knob settings).
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    # Stack all X & y matrices for preprocessing.
    Xs = np.vstack([entry['X_matrix'] for entry in workload_data.values()])
    ys = np.vstack([entry['y_matrix'] for entry in workload_data.values()])

    # Scale the X & y values, then compute the deciles for each column in y.
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated.
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in workload_data.items():
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        y_workload = workload_entry['y_matrix']
        for j, y_col in enumerate(y_workload.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPR()
            model.fit(X_workload, y_col, ridge=0.01)
            predictions[:, j] = model.predict(X_target).ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads.
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(
            np.sum(np.square(np.subtract(predictions, y_target)), axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    for workload_id, similarity_score in scores.items():
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
    target_data['mapped_workload'] = (best_workload_id, best_score)
    target_data['scores'] = scores
    return target_data
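
# NOTE: Bin() is used above with the scikit-learn fit/transform convention
# but is not defined in this snippet. A minimal sketch consistent with how
# it is called (fit on the stacked y matrices, then used to discretize both
# the target's metrics and the GPR predictions into decile bins starting at
# `bin_start`); the actual class may differ.
class Bin(object):
    def __init__(self, bin_start=1, axis=0):
        self.bin_start_ = bin_start  # label assigned to the lowest bin
        self.axis_ = axis            # axis along which deciles are computed
        self.deciles_ = None

    def fit(self, matrix):
        # Compute the 10th-90th percentile boundaries for each column,
        # giving a (9, n_columns) array of interior cut points.
        self.deciles_ = np.percentile(
            matrix, np.arange(10, 100, 10), axis=self.axis_)
        return self

    def transform(self, matrix):
        # Replace each value with the label of the decile bin it falls
        # into, e.g. values below the 10th percentile map to bin_start.
        binned = np.empty_like(matrix)
        for j in range(matrix.shape[1]):
            binned[:, j] = np.digitize(
                matrix[:, j], self.deciles_[:, j]) + self.bin_start_
        return binned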