import logging

import numpy as np
from sklearn.preprocessing import StandardScaler

# NOTE: GPRNP and Bin (and GPRGD, used only by the commented-out TensorFlow
# path below) come from the OtterTune analysis utilities; these import paths
# are an assumption based on the upstream repository layout. MAPE and the
# MAX_TRAIN_SIZE / BATCH_SIZE / REG / DEFAULT_* constants are assumed to be
# defined elsewhere in this module.
from analysis.gp import GPRNP
from analysis.preprocessing import Bin


def train_gp_models(training_data, pruned_metrics_idxs, length_scale,
                    magnitude, rdg):
    """Train one GP model per pruned metric column for each training
    workload."""
    td = {}
    for job in training_data:
        td[job] = {}
        td[job]['X_matrix'] = training_data[job]['X_matrix'].copy()
        td[job]['y_matrix'] = training_data[job][
            'y_matrix'][:, pruned_metrics_idxs].copy()

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in list(td.values())])
    ys = np.vstack([entry['y_matrix'] for entry in list(td.values())])

    # Scale the X & y values, then compute the deciles for each column in y.
    # With copy=False, fit_transform() standardizes ys in place, so the
    # binner below is fit on the scaled metric values.
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    models = {}
    for workload_id, workload_entry in list(td.items()):
        # FIXME: this can be parallelized
        models[workload_id] = []
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        # [KZ]: loop over the metric columns
        for y_col in y_scaled.T:
            # Using this workload's data, train one Gaussian process model
            # per metric; the models are later used to predict each metric
            # for the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPRNP(length_scale=length_scale,
                          magnitude=magnitude,
                          max_train_size=MAX_TRAIN_SIZE,
                          batch_size=BATCH_SIZE)
            model.fit(X_scaled, y_col, ridge=rdg)
            models[workload_id].append(model)
    scalers = {
        'X_scaler': X_scaler,
        'y_scaler': y_scaler,
        'y_binner': y_binner,
    }
    return models, scalers
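
# A hypothetical usage sketch for train_gp_models(); `training_data`, `X_new`,
# `wid`, and `j` are placeholders, and the hyperparameter values are
# arbitrary. Each trained model predicts the standardized value of one metric
# column:
#
#     models, scalers = train_gp_models(training_data,
#                                       pruned_metrics_idxs=[0, 2, 5],
#                                       length_scale=1.0, magnitude=1.0,
#                                       rdg=1.0)
#     X_new_scaled = scalers['X_scaler'].transform(X_new)
#     y_pred_scaled = models[wid][j].predict(X_new_scaled).ypreds
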
def train_and_evaluate(job_id, proxy_id, training_data, observed_data,
                       test_data, obj_idx, length_scale, magnitude, rdg):
    """Train a model on mixed traces from the current (target) workload and
    the workload it is mapped to, then evaluate it on held-out target data.

    job_id: id of the job on which we'd like to evaluate the model.
    proxy_id: id of the training job to which the current test workload has
        been mapped.
    training_data: training data for the training jobs; used to fetch the
        proxy job's data.
    observed_data: observed data for the evaluation jobs.
    test_data: test data for the evaluation jobs.
    obj_idx: index of the objective in the metrics vector.
    """
    # Load mapped workload data
    X_workload = training_data[proxy_id]['X_matrix'].copy()
    y_workload = training_data[proxy_id]['y_matrix'][:, obj_idx].copy()

    # Target workload data (observed)
    X_target = observed_data[job_id]['X_matrix'].copy()
    y_target = observed_data[job_id]['y_matrix'][:, obj_idx].copy()

    # Target workload data on which we'll evaluate the model error
    X_target_eval = test_data[job_id]['X_matrix'].copy()
    y_target_eval = test_data[job_id]['y_matrix'][:, obj_idx].copy()

    if np.ndim(y_workload) == 1:
        y_workload = np.expand_dims(y_workload, axis=1)
    if np.ndim(y_target) == 1:
        y_target = np.expand_dims(y_target, axis=1)
    if np.ndim(y_target_eval) == 1:
        y_target_eval = np.expand_dims(y_target_eval, axis=1)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]

    # Combine target (observed) & workload (mapped) Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    X_target_eval_scaled = X_scaler.transform(X_target_eval)

    # (KZ) Fit the y scaler on both the observed data and the proxy job's data
    y_workload_scaler = StandardScaler()
    y_matrix = np.vstack([y_target, y_workload])
    y_scaled = y_workload_scaler.fit_transform(y_matrix)
    # NOTE: currently unused; the evaluation below compares in raw units
    y_target_eval_scaled = y_workload_scaler.transform(y_target_eval)
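
    # The scalers are fit on the concatenated target + proxy traces so that
    # both sources share the same standardized space during training, and
    # predictions are mapped back to raw metric units with
    # y_workload_scaler.inverse_transform(). Concretely (an sklearn identity,
    # not project code): a standardized prediction p recovers the raw value
    # as p * y_workload_scaler.scale_[0] + y_workload_scaler.mean_[0].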

    ###################### GP tensorflow training (fails) #####################
    # X_min = np.empty(X_scaled.shape[1])
    # X_max = np.empty(X_scaled.shape[1])
    # # Determine min/max for knob values
    # for i in range(X_scaled.shape[1]):
    #     col_min = X_scaled[:, i].min()
    #     col_max = X_scaled[:, i].max()
    #     X_min[i] = col_min
    #     X_max[i] = col_max
    # model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
    #               magnitude=DEFAULT_MAGNITUDE,
    #               max_train_size=MAX_TRAIN_SIZE,
    #               batch_size=BATCH_SIZE,
    #               num_threads=NUM_THREADS,
    #               learning_rate=DEFAULT_LEARNING_RATE,
    #               epsilon=DEFAULT_EPSILON,
    #               max_iter=MAX_ITER,
    #               sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
    #               mu_multiplier=DEFAULT_MU_MULTIPLIER)
    # model.fit(X_scaled, y_scaled, X_min, X_max, DEFAULT_RIDGE)
    # y_target_eval_pred = y_workload_scaler.inverse_transform(
    #     model.predict(X_target_eval_scaled).ypreds)
    ###########################################################################

    ###################### sklearn's RF as a regressor ########################
    if REG == "RF":
        raise NotImplementedError
        # from sklearn.ensemble import RandomForestRegressor
        # model = RandomForestRegressor(n_estimators=500)
        # model.fit(X_scaled, y_scaled)
        # y_target_eval_pred = y_workload_scaler.inverse_transform(
        #     model.predict(X_target_eval_scaled))
        # y_train_pred = y_workload_scaler.inverse_transform(
        #     model.predict(X_scaled))
        # training_mape = MAPE(y_matrix, y_train_pred)
    ###########################################################################

    # Numpy implementation of the GP:
    elif REG == "GPNP":
        model = GPRNP(length_scale=length_scale,
                      magnitude=magnitude,
                      max_train_size=MAX_TRAIN_SIZE,
                      batch_size=BATCH_SIZE)
        model.fit(X_scaled, y_scaled, ridge=rdg)
        # .ypreds is the mean of the GP's predictive distribution
        y_target_eval_pred = y_workload_scaler.inverse_transform(
            model.predict(X_target_eval_scaled).ypreds)
        logging.debug("job {}: y_target_eval_pred: {}".format(
            job_id, y_target_eval_pred))
        y_train_pred = y_workload_scaler.inverse_transform(
            model.predict(X_scaled).ypreds)
        training_mape = MAPE(y_matrix, y_train_pred)
    else:
        raise NotImplementedError("This regressor is not implemented...")

    if np.ndim(y_target_eval_pred) > 1:
        y_target_eval_pred = np.squeeze(y_target_eval_pred)
    y_target_eval = np.squeeze(y_target_eval)
    test_mape = MAPE(y_target_eval, y_target_eval_pred)
    logging.info("test workload: {} \t proxy: {} \t MAPE: {:.2f}%".format(
        job_id, proxy_id, test_mape))
    return test_mape, training_mape
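
# A hypothetical driver sketch for the evaluation loop above; `eval_jobs` and
# the `mapping` dict (evaluation job -> proxy training job) are placeholders,
# as are the hyperparameter values:
#
#     for job_id in eval_jobs:
#         test_mape, train_mape = train_and_evaluate(
#             job_id, mapping[job_id], training_data, observed_data,
#             test_data, obj_idx=0, length_scale=1.0, magnitude=1.0,
#             rdg=1.0)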