def get_full_data():
    df, features = data_loader.get_dataset("data/darshan_theta_2017_2020.csv", "POSIX")
    df.reset_index(inplace=True)
    df.drop(columns=['index', 'level_0'], inplace=True)
    X_train, X_test, y_train, y_test = test_set_utils.random_split(
        df, "POSIX_AGG_PERF_BY_SLOWEST_LOG10", keep_columns=features, test_size=0.3)
    # Custom Huber objective; the target column is log10 of the I/O throughput
    regressor = xgb.XGBRegressor(objective=huber_approx_obj,
                                 n_estimators=2**11,
                                 max_depth=7,
                                 colsample_bytree=0.8,
                                 subsample=1)
    regressor.fit(X_train, y_train, eval_metric=huber_approx_obj)
    y_pred_test = regressor.predict(X_test)
    df = pd.DataFrame({
        'POSIX_AGG_PERF_BY_SLOWEST_LOG10': y_test,
        'prediction': y_pred_test
    })
    return df
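
# A hedged usage sketch, not part of the original code: summarize the frame
# returned by get_full_data(). Because the target column is the log10
# throughput, 10**|residual| is a multiplicative error factor. The helper
# name summarize_predictions is hypothetical.
def summarize_predictions(results):
    residuals = np.abs(results['POSIX_AGG_PERF_BY_SLOWEST_LOG10'] - results['prediction'])
    return {
        'median_abs_log10_error': float(np.median(residuals)),
        'median_error_factor': float(np.median(10**residuals)),
    }
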
def predict(split_time):
    df, columns = data_loader.get_dataset('data/darshan_theta_2017_2020.csv', 'POSIX', min_job_volume=0)
    df_before = df[df.START_TIME <= split_time]
    df_after = df[df.START_TIME > split_time]
    X_train, X_test_before, y_train, y_test_before = \
        sklearn.model_selection.train_test_split(
            df_before[columns + ["START_TIME"]],
            df_before.POSIX_AGG_PERF_BY_SLOWEST_LOG10,
            test_size=0.3)
    timestamps_before = X_test_before.START_TIME.to_numpy()
    timestamps_after = df_after.START_TIME.to_numpy()
    X_train, X_test_before = X_train[columns], X_test_before[columns]
    X_test_after = df_after[columns]
    y_test_after = df_after.POSIX_AGG_PERF_BY_SLOWEST_LOG10
    X_train, X_test_before, X_test_after = \
        X_train.to_numpy(), X_test_before.to_numpy(), X_test_after.to_numpy()
    y_train, y_test_before, y_test_after = \
        y_train.to_numpy(), y_test_before.to_numpy(), y_test_after.to_numpy()
    len_before = len(y_test_before)
    y_pred_train, y_pred_test = prediction_results(
        X_train, y_train,
        np.concatenate((X_test_before, X_test_after)),
        np.concatenate((y_test_before, y_test_after)))
    y_pred_test_before = y_pred_test[:len_before]
    y_pred_test_after = y_pred_test[len_before:]
    return (timestamps_before, y_test_before, y_pred_test_before,
            timestamps_after, y_test_after, y_pred_test_after)
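
# A possible way to inspect the output of predict(), not in the original code:
# plot the absolute log10 error of each test job over time, with the split
# time marked. Assumes matplotlib is available; plot_split_predictions is a
# hypothetical name.
def plot_split_predictions(split_time, outputs):
    import matplotlib.pyplot as plt
    ts_before, y_before, pred_before, ts_after, y_after, pred_after = outputs
    fig, ax = plt.subplots()
    ax.scatter(ts_before, np.abs(y_before - pred_before), s=2, label='test jobs before split')
    ax.scatter(ts_after, np.abs(y_after - pred_after), s=2, label='jobs after split')
    ax.axvline(split_time, color='k', linestyle='--', label='split time')
    ax.set_xlabel('START_TIME')
    ax.set_ylabel('|log10 prediction error|')
    ax.legend()
    return fig
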
def load_dataset():
    df, features = data_loader.get_dataset('data/darshan_theta_2017_2020.csv', 'POSIX', min_job_volume=0)
    df = df[df.POSIX_TOTAL_BYTES >= 10 * 1024**2]
    df = df.sample(100000, random_state=0)
    return df, features
def load_data(self):
    dataset = get_dataset(self.args.data, normalize=self.args.normalize)
    self.args.num_features = dataset.num_features
    self.args.num_classes = dataset.num_classes
    self.args.avg_num_nodes = np.ceil(np.mean([data.num_nodes for data in dataset]))
    print('# %s: [FEATURES]-%d [NUM_CLASSES]-%d [AVG_NODES]-%d' %
          (dataset, self.args.num_features, self.args.num_classes, self.args.avg_num_nodes))
    return dataset
def load_dataset(module, remove_runtime):
    df, features = data_loader.get_dataset('data/darshan_theta_2017_2020.csv', module, min_job_volume=0)
    if module == "POSIX":
        features.remove("POSIX_FDSYNCS_LOG10")
    if remove_runtime:
        features.remove("RUNTIME_LOG10")
    return df, features
def get_duplicate_data():
    df, features = data_loader.get_dataset("data/darshan_theta_2017_2020.csv", "POSIX")
    # Keep only jobs whose feature vectors appear more than once
    df = df[df.duplicated(features, keep=False)]
    df['prediction'] = -1
    df['time_diff'] = -1
    df.reset_index(inplace=True)
    df.drop(columns=['index', 'level_0'], inplace=True)
    for f, duplicate_set in df.groupby(features):
        group_size = duplicate_set.shape[0]
        sum_throughput = duplicate_set.POSIX_AGG_PERF_BY_SLOWEST_LOG10.sum()
        sum_time = duplicate_set.START_TIME.sum()
        for idx, row in duplicate_set.iterrows():
            # 'prediction' is the leave-one-out mean throughput of the other duplicates
            df.iloc[idx, -2] = (sum_throughput - row.POSIX_AGG_PERF_BY_SLOWEST_LOG10) / (group_size - 1)
            # 'time_diff' is the distance from this job's start time to the mean
            # start time of the other duplicates
            df.iloc[idx, -1] = np.abs((sum_time - row.START_TIME) / (group_size - 1) - row.START_TIME)
    return df
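
# Worked check of the leave-one-out logic above, not part of the original
# code: for a duplicate group with log10 throughputs [1.0, 2.0, 4.0], the
# 'prediction' for the first job is the mean of the other two, (2.0 + 4.0) / 2 = 3.0.
# The helper name _leave_one_out_mean is hypothetical.
def _leave_one_out_mean(values, i):
    values = np.asarray(values, dtype=float)
    return (values.sum() - values[i]) / (len(values) - 1)

assert _leave_one_out_mean([1.0, 2.0, 4.0], 0) == 3.0
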
def grid_search(max_log2_trees, max_log2_depth):
    """
    Run a grid search over two parameters: tree depth and number of trees.
    For each configuration, train an XGBoost regressor and evaluate its
    performance on the test set. Return the results as a DataFrame that can
    be plotted as a matrix.
    """
    df, features = data_loader.get_dataset('data/darshan_theta_2017_2020.csv', 'POSIX', min_job_volume=0)
    # Don't include runtime in the set of input features
    features = [f for f in features if f != 'RUNTIME_LOG10']
    df_train, df_test = sklearn.model_selection.train_test_split(df, test_size=0.2)
    X_train, X_test = df_train[features], df_test[features]
    # POSIX_AGG_PERF_BY_SLOWEST_LOG10 is log10 of Darshan's I/O throughput estimate
    y_train = df_train["POSIX_AGG_PERF_BY_SLOWEST_LOG10"]
    y_test = df_test["POSIX_AGG_PERF_BY_SLOWEST_LOG10"]

    results = {"depth": [], "trees": [], "error": []}

    def evaluate_configuration(depth, trees):
        regressor = xgb.XGBRegressor(objective=huber_approx_obj,
                                     n_estimators=trees,
                                     max_depth=depth)
        regressor.fit(X_train, y_train, eval_metric=huber_approx_obj)
        y_pred_test = regressor.predict(X_test)
        # Median multiplicative error: targets are log10, so 10**|residual|
        # is the factor by which the prediction is off
        error = np.median(10**np.abs(y_test - y_pred_test))
        return error

    for trees in [2**x for x in range(1, max_log2_trees + 1)]:
        for depth in range(1, max_log2_depth + 1):
            error = evaluate_configuration(depth, trees)
            print(f"Trees: {trees}, depth: {depth}, error: {error}")
            results['depth'].append(depth)
            results['trees'].append(trees)
            results['error'].append(error)

    return pd.DataFrame(results)
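
# A possible follow-up, not in the original code: pivot the grid-search
# results into a trees x depth error matrix and plot it. Assumes matplotlib
# is available; plot_grid_search_results is a hypothetical name.
def plot_grid_search_results(results):
    import matplotlib.pyplot as plt
    # Rows are tree counts, columns are depths, values are median error factors
    matrix = results.pivot(index='trees', columns='depth', values='error')
    fig, ax = plt.subplots()
    im = ax.imshow(matrix.to_numpy(), aspect='auto')
    ax.set_xticks(range(len(matrix.columns)), matrix.columns)
    ax.set_yticks(range(len(matrix.index)), matrix.index)
    ax.set_xlabel('max_depth')
    ax.set_ylabel('n_estimators')
    fig.colorbar(im, ax=ax, label='median error factor')
    return fig
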
def calculate_duplicate_errors():
    """
    Fit a regressor on all duplicated jobs and predict their (in-sample) throughput.

    Returns: the target log10 throughputs of the duplicated jobs, the
    predicted log10 throughputs, and the short application names.
    """
    df, features = data_loader.get_dataset("data/darshan_theta_2017_2020.csv", "POSIX")
    duplicated = df.duplicated(features, keep=False)
    apps = df[duplicated]["apps_short"]
    regressor = xgb.XGBRegressor(n_estimators=4000, max_depth=8)
    regressor.fit(df[duplicated][features],
                  df[duplicated]['POSIX_AGG_PERF_BY_SLOWEST_LOG10'])
    y_pred = regressor.predict(df[duplicated][features])
    return df[duplicated]['POSIX_AGG_PERF_BY_SLOWEST_LOG10'], y_pred, apps
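
# A small helper, not in the original code, to turn the output of
# calculate_duplicate_errors() into relative errors. Targets and predictions
# are in log10 space, so they are de-logged before comparing. The helper name
# relative_duplicate_errors is hypothetical.
def relative_duplicate_errors(y_true_log10, y_pred_log10):
    y_true = 10**np.asarray(y_true_log10)
    y_pred = 10**np.asarray(y_pred_log10)
    return np.abs(y_pred - y_true) / y_true
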
def load_dataset():
    df, features = data_loader.get_dataset('data/darshan_theta_2017_2020.csv', 'POSIX', min_job_volume=0)
    return df, features