# Imports assumed by this snippet: numpy, pandas, and scikit-learn.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split


def run_log_reg(scaled_df):
    raw_train_arr = []
    raw_test_arr = []

    # Metrics scored during the grid search
    metrics = ['accuracy', 'f1', 'roc_auc_ovr']

    # Candidate C values (10^-8 through 10^4) and penalties
    C_vals = [10 ** val for val in range(-8, 5)]
    penalty = ['none', 'l1', 'l2']
    params = {'penalty': penalty, 'C': C_vals}

    # Repeat the experiment over five random train/test splits
    for i in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=5000)

        # 'saga' supports both l1 and l2; the default 'lbfgs' solver would fail
        # on the l1 candidates. On scikit-learn >= 1.2, use penalty=None instead
        # of 'none'.
        clf = LogisticRegression(solver='saga', max_iter=100000)

        # Grid search over all penalty/C combinations, scoring every metric
        search_results = GridSearchCV(clf, params, scoring=metrics, refit=False)
        search_results.fit(X_train, y_train)

        # Collect the mean cross-validated score for each parameter combination
        results = pd.DataFrame(search_results.cv_results_['params'])
        results['mean_accuracy'] = search_results.cv_results_[
            'mean_test_accuracy']
        results['mean_f1'] = search_results.cv_results_['mean_test_f1']
        results['mean_auc'] = search_results.cv_results_[
            'mean_test_roc_auc_ovr']

        # Best parameter combination for each metric
        opt_acc_inf = results.sort_values(by='mean_accuracy',
                                          ascending=False).iloc[0]
        opt_f1_inf = results.sort_values(by='mean_f1',
                                         ascending=False).iloc[0]
        opt_auc_inf = results.sort_values(by='mean_auc',
                                          ascending=False).iloc[0]

        # Refit one classifier per metric with its best parameters
        opt_acc_clf = LogisticRegression(C=opt_acc_inf.C, penalty=opt_acc_inf.penalty,
                                         solver='saga', max_iter=100000)
        opt_f1_clf = LogisticRegression(C=opt_f1_inf.C, penalty=opt_f1_inf.penalty,
                                        solver='saga', max_iter=100000)
        opt_auc_clf = LogisticRegression(C=opt_auc_inf.C, penalty=opt_auc_inf.penalty,
                                         solver='saga', max_iter=100000)

        opt_acc_clf.fit(X_train, y_train)
        opt_f1_clf.fit(X_train, y_train)
        opt_auc_clf.fit(X_train, y_train)

        # Train and test scores; AUC is computed from predicted probabilities,
        # matching the roc_auc scoring used during the grid search
        train_score_acc = opt_acc_clf.score(X_train, y_train)
        train_score_f1 = f1_score(y_train, opt_f1_clf.predict(X_train))
        train_score_auc = roc_auc_score(
            y_train, opt_auc_clf.predict_proba(X_train)[:, 1])
        test_score_acc = opt_acc_clf.score(X_test, y_test)
        test_score_f1 = f1_score(y_test, opt_f1_clf.predict(X_test))
        test_score_auc = roc_auc_score(
            y_test, opt_auc_clf.predict_proba(X_test)[:, 1])

        raw_train_arr.append(
            [train_score_acc, train_score_f1, train_score_auc])
        raw_test_arr.append([test_score_acc, test_score_f1, test_score_auc])

    # One row per trial, one column per metric
    raw_train_arr = np.array(raw_train_arr).reshape(5, 3)
    raw_test_arr = np.array(raw_test_arr).reshape(5, 3)
    raw_train_df = pd.DataFrame(data=raw_train_arr,
                                columns=['accuracy', 'f1', 'auc'])
    raw_test_df = pd.DataFrame(data=raw_test_arr,
                               columns=['accuracy', 'f1', 'auc'])

    return raw_train_df, raw_test_df
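# A minimal usage sketch (synthetic data, purely illustrative): run_log_reg expects
# a scaled DataFrame whose last column is the binary target and is named 'y', since
# the function accesses scaled_df.y. The column names below are assumptions.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    X = rng.normal(size=(10000, 5))
    y = (X[:, 0] + rng.normal(size=10000) > 0).astype(int)
    demo_df = pd.DataFrame(X, columns=[f'x{i}' for i in range(5)])
    demo_df['y'] = y
    train_scores, test_scores = run_log_reg(demo_df)
    print(train_scores.mean())
    print(test_scores.mean())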
# Imports assumed by this snippet: cudf, cuml, numpy, and hyperopt.
# FitterBase and LROpt are defined elsewhere in this project.
from copy import deepcopy
from dataclasses import asdict

import cudf
import numpy as np
from cuml.linear_model import LogisticRegression
from hyperopt import fmin, tpe


class CumlLRFitter(FitterBase):
    def __init__(self, label='label', metric='error', opt: LROpt = None, max_eval=10):
        super(CumlLRFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = LROpt()
        self.clf = None

    def train(self, train_df, eval_df, params=None):
        # Move both frames to the GPU and split features from the label column
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train, y_train = train_df.drop(columns=[self.label]), train_df[self.label]
        x_eval, y_eval = eval_df.drop(columns=[self.label]), eval_df[self.label]
        # Fall back to the parameters found by search()/search_k_fold() when none are given
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        self.clf = LogisticRegression(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        output = self.get_loss(y_pred=preds, y=y_eval)
        return output

    def search(self, train_df, eval_df):
        self.opt_params = dict()

        # Hyperopt objective: train on the given split and return the evaluation loss
        def train_impl(params):
            self.train(train_df, eval_df, params)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest,
                               max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        self.opt_params = dict()

        # Same objective as search(), but averaged across the k folds
        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label]))
                else:
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label])).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt), algo=tpe.suggest,
                               max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None, drop_test_y=True):
        # Produces out-of-fold predictions for the training data and
        # fold-averaged predictions for the test data
        acc_result = list()
        train_pred = cudf.Series(np.empty(train_data.shape[0]))
        # The test accumulator must start at zero (np.empty would add uninitialised values)
        test_pred = cudf.Series(np.zeros(test_data.shape[0]))
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id, :]
            eval_df = train_data.iloc[eval_id, :]
            self.train(train_df, eval_df, params)
            # Out-of-fold probabilities for the held-out rows of this fold
            train_pred[eval_id] = self.clf.predict_proba(
                eval_df.drop(columns=self.label)).iloc[:, 1].values
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            # Accumulate test predictions; averaged over folds after the loop
            test_pred += self.clf.predict_proba(dtest).iloc[:, 1]
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
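# A minimal usage sketch, assuming FitterBase and LROpt from this project are
# available and a GPU with cuML/cuDF is present. The synthetic pandas frames and
# column names below are assumptions; train() converts the inputs to cuDF internally.
import pandas as pd
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
features = rng.normal(size=(2000, 3))
demo = pd.DataFrame(features, columns=['f0', 'f1', 'f2'])
demo['label'] = (features[:, 0] > 0).astype(int)
train = demo.iloc[:1500].reset_index(drop=True)
test = demo.iloc[1500:].reset_index(drop=True)

fitter = CumlLRFitter(label='label', metric='auc', max_eval=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
fitter.search_k_fold(k_fold, train)  # tune hyperparameters with hyperopt/TPE
oof_pred, test_pred, fold_losses = fitter.train_k_fold(k_fold, train, test)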