def oneIteration():
    """Time one BoostARoota fit for every ``(att, obj)`` pair in ``combos``.

    Relies on the module-level names ``combos``, ``train`` and ``labels``.
    For each pair, ``obj`` rows are sampled from ``train`` and the rows of
    ``labels`` carrying the same index labels are looked up.  When ``att``
    is not 500, ``att - 500`` columns of random integers in ``[0, 100)`` are
    appended (500 is presumably the column count of ``train`` -- confirm).

    Returns:
        list: elapsed seconds of each ``BoostARoota.fit`` call (including
        construction of the selector), in the order of ``combos``.
    """
    timings = []
    for att, obj in combos:
        # Subsample the objects/observations.
        this_df = train.sample(n=obj).copy()
        this_y = labels.loc[this_df.index.values].copy()

        # Extend the attributes.
        if att != 500:
            extraAttLen = att - 500
            extraAttributes = pd.DataFrame(
                np.random.randint(0, 100, size=(obj, extraAttLen)),
                columns=['Ext' + str(x) for x in range(extraAttLen)],
                # Reuse the sampled rows' labels: concat(axis=1) aligns on
                # the index, so a fresh 0..obj-1 index would pad both frames
                # with NaN rows and leave `bound` out of step with `this_y`.
                index=this_df.index,
            )
            # Bind the dataframes together; ignore_index=True relabels the
            # resulting columns 0..n-1.
            bound = pd.concat([this_df, extraAttributes],
                              axis=1, ignore_index=True)
        else:
            bound = this_df.copy()

        start = time.time()
        br = BoostARoota(metric='logloss')
        br.fit(bound, this_y)
        timings.append(time.time() - start)
    return timings
def __init__(self, parameter_tuning=False, classifier_list=None):
    """Initialise the result slots and choose the models to run.

    Args:
        parameter_tuning: accepted for callers; not read in this method.
        classifier_list: models to store on ``self.classifiers``.  When it
            is ``None`` the built-in default list below is used instead.
    """
    self.results = None
    self.new_results = None

    # Caller supplied an explicit list: store it untouched and stop here.
    if classifier_list is not None:
        self.classifiers = classifier_list
        return

    # Default line-up, part 1: pipelines (feature step(s) + final estimator).
    pipelines = [
        make_pipeline(DiaPoly(), lgb.LGBMClassifier()),
        make_pipeline(DiaPoly(), LogisticRegression()),
        make_pipeline(BoostARoota(metric='logloss'), lgb.LGBMClassifier()),
        make_pipeline(BoostARoota(metric='logloss'), DiaPoly(),
                      lgb.LGBMClassifier()),
    ]
    # Default line-up, part 2: bare estimators.
    standalone = [
        ElasticNet(),
        LogisticRegression(),
        Lasso(),
        lgb.LGBMClassifier(),
        CatBoostClassifier(iterations=100, logging_level='Silent'),
        CatBoostClassifier(iterations=1000, logging_level='Silent'),
    ]
    self.classifiers = pipelines + standalone
def trainKFolds(X, Y, eval, folds=5):
    """Cross-validate predictions on all features vs. BoostARoota-selected ones.

    For every fold, predictions are produced with ``TrainGetPreds`` on the
    full feature set and again on the features kept by ``BoostARoota``.

    Args:
        X: feature frame; rows are selected with ``X.iloc``.
        Y: targets aligned row-for-row with ``X``.
        eval: ``"reg"`` selects the ``"rmse"`` metric, any other value
            selects ``"logloss"``.  Also forwarded to ``evalResults``.
        folds: number of ``KFold`` splits.

    Returns:
        list: mean BoostARoota fit time in seconds, followed by the values
        returned by ``evalResults``.
    """
    eval_metric = "rmse" if eval == "reg" else "logloss"

    # Drop any previously fixed NumPy seed.
    # NOTE(review): KFold below is created without shuffling, so this
    # presumably does not change fold membership -- confirm the intent.
    np.random.seed(None)

    # Per-fold accumulators.
    bar_times = []
    y_hat = []
    y_hat_BR = []
    y_actual = []
    fold = []

    # X is sliced by position, so Y must be as well: plain Y[idx] on an
    # integer-indexed Series is a label lookup and would pick different rows
    # whenever the index is not 0..n-1.  Objects without .iloc (e.g. arrays)
    # are already positional.
    y_is_pandas = hasattr(Y, "iloc")

    # Start the cross validation.
    kf = KFold(n_splits=folds)
    for fold_no, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        if y_is_pandas:
            y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]
        else:
            y_train, y_test = Y[train_idx], Y[test_idx]

        # Get predictions on all features.
        y_pred = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric)

        # BoostARoota - only the fit is timed.
        tmp = time.time()
        br = BoostARoota(eval_metric)
        br.fit(X_train, y_train)
        bar_times.append(time.time() - tmp)
        BR_X = br.transform(X_train)
        BR_test = br.transform(X_test)
        BR_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        # Collect predictions, actuals and the fold each row was scored in.
        y_hat.extend(y_pred)
        y_hat_BR.extend(BR_preds)
        y_actual.extend(y_test)
        fold.extend([fold_no] * len(y_pred))

    # First entry is the timing, then the evaluation results.
    values = [np.mean(bar_times)]
    results = pd.DataFrame(
        {"y_hat": y_hat, "Fold": fold, "y_hat_BR": y_hat_BR, "y_actual": y_actual}
    )
    values.extend(evalResults(results, eval=eval))
    return values
#################
# Madelon Dataset
train_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
# Download the feature file; the context manager closes the HTTP response
# once pandas has finished reading from it.
with urllib.request.urlopen(train_url) as raw_data:
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True.
    train = pd.read_csv(raw_data, sep=r"\s+", header=None)
train.columns = ["Var" + str(x) for x in range(len(train.columns))]

labels_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
with urllib.request.urlopen(labels_url) as raw_data:
    labels = pd.read_csv(raw_data, delimiter=",", header=None)
labels.columns = ["Y"]

########################################################################################################################
#
# Test that BoostARoota is working
#
########################################################################################################################
br = BoostARoota(metric='logloss')
br.fit(train, labels)
# These two expressions only display a value in an interactive session;
# when run as a script they have no visible effect.
len(train.columns)
len(br.keep_vars_)
new_train = br.transform(train)
# fit_transform() refits the selector before transforming.
new_train2 = br.fit_transform(train, labels)

# Dimension Reduction
print(
    "Original training set has " + str(train.shape) + " dimensions. \n"
    + "BoostARoota with .fit() and .transform() reduces to " + str(new_train.shape) + " dimensions. \n"
    + "BoostARoota with .fit_transform() reduces to " + str(new_train2.shape) + " dimensions.\n"
    + "The two methods may give a slightly different dimensions because of random variation as it is being refit"
)
target_entity='obs', agg_primitives=['min', 'max', 'mean', 'count', 'sum', 'std', 'trend'], trans_primitives=['divide_numeric_scalar', 'multiply_numeric'], max_depth=1, n_jobs=1, verbose=1) feature_matrix = feature_selection(feature_matrix, correlation_threshold=0.8) df_ = feature_matrix # make a copy of this df_ = df_.dropna(how='any', axis=1) y = df.pop('Class') X = df_ # and another copy. Might not need this br = BoostARoota(metric='logloss') br.fit(X, y) print() print('keep vars') print(list(br.keep_vars_)) chosen_features = br.keep_vars_ print() print(' Final chosen features:') print(' {}'.format(chosen_features)) # split these out # this isn't going to be shuffled because that's a mess. X_temp, X_holdout, y_temp, y_holdout = train_test_split( X[chosen_features].to_numpy(), y.to_numpy(), test_size=0.10)
df_.drop(inf_list, axis=1) # would it go faster if we can drop all the original columns? df_ = df_.drop(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28'], axis=1) df_ = df_.dropna(how='any', axis=1) X = df_ # and another copy. Might not need this print() print('Starting Boruta') br = BoostARoota(metric='logloss', silent=True) #cutoff=.5, delta=0.7, silent=True) br.fit(X, y) chosen_features = br.keep_vars_ if len(chosen_features) == 0: print(' No chosen features!') continue ind = range(len(chosen_features)) # let's do multiple rounds of this fitting and average avg_dict = {} for i in range(num_its): xgb_for_feat_imp = xgb.train(dtrain = xgb.DMatrix(X[chosen_features], label=y), params={})
train_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data" # download the file raw_data = urllib.request.urlopen(train_url) train = pd.read_csv(raw_data, delim_whitespace=True, header=None) train.columns = ["Var" + str(x) for x in range(len(train.columns))] labels_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels" raw_data = urllib.request.urlopen(labels_url) labels = pd.read_csv(raw_data, delimiter=",", header=None) labels.columns = ["Y"] ######################################################################################################################## # # Test that BoostARoota is working # ######################################################################################################################## br = BoostARoota(metric="logloss") br.fit(train, labels) len(train.columns) len(br.keep_vars_) new_train = br.transform(train) new_train2 = br.fit_transform(train, labels) # Dimension Reduction print( "Original training set has " + str(train.shape) + " dimensions. \n" + "BoostARoota with .fit() and .transform() reduces to " + str(new_train.shape) + " dimensions. \n" + "BoostARoota with .fit_transform() reduces to " + str(new_train2.shape) + " dimensions.\n" + "The two methods may give a slightly different dimensions because of random variation as it is being refit"
def trainKFolds(X, Y, eval, folds=5):
    """Cross-validate all features vs. BoostARoota vs. Boruta on two metrics.

    For every fold, ``TrainGetPreds`` is run with two metrics on (a) all
    features, (b) the features kept by a ``BoostARoota`` fitted with the
    corresponding metric, and (c) the features kept by ``BorutaPy``.

    Args:
        X: feature frame; rows are selected with ``X.iloc``.
        Y: targets aligned row-for-row with ``X``.  The Boruta step reads
            ``.values`` from the training slice.
        eval: ``"reg"`` selects ``"rmse"``/``"mae"``, any other value selects
            ``"logloss"``/``"auc"``.  Also forwarded to ``evalResults``.
        folds: number of ``KFold`` splits.

    Returns:
        list: mean Boruta fit time, mean BoostARoota fit time (over both
        metric runs), followed by the values returned by ``evalResults``.
    """
    if eval == "reg":
        eval_metric, eval_metric2 = "rmse", "mae"
    else:
        eval_metric, eval_metric2 = "logloss", "auc"

    # Drop any previously fixed NumPy seed.
    # NOTE(review): KFold below is created without shuffling, so this
    # presumably does not change fold membership -- confirm the intent.
    np.random.seed(None)

    # Per-fold accumulators.
    bar_times = []
    boruta_times = []
    y_hat = []
    y_hat2 = []
    y_hat_BR = []
    y_hat_BR2 = []
    y_hat_boruta = []
    y_hat_boruta2 = []
    y_actual = []
    fold = []

    # X is sliced by position, so Y must be as well: plain Y[idx] on an
    # integer-indexed Series is a label lookup and would pick different rows
    # whenever the index is not 0..n-1.
    y_is_pandas = hasattr(Y, "iloc")

    # Start the cross validation.
    kf = KFold(n_splits=folds)
    for fold_no, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        if y_is_pandas:
            y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]
        else:
            y_train, y_test = Y[train_idx], Y[test_idx]

        # Get predictions on all features.
        y_pred = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric)
        y_pred2 = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric2)

        # BoostARoota - tune to metric 1 (only fit + keep_vars_ is timed).
        tmp = time.time()
        br_one = BoostARoota(param={"eval_metric": eval_metric})
        br_one.fit(X_train, y_train)
        BR_vars = br_one.keep_vars_
        bar_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        # BoostARoota - tune to metric 2; its timing goes into the same list.
        tmp = time.time()
        br_two = BoostARoota(param={"eval_metric": eval_metric2})
        br_two.fit(X_train, y_train)
        BR_vars = br_two.keep_vars_
        bar_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR_preds2 = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric2)

        # Boruta - get predictions.
        # NOTE(review): a classifier is used even when eval == "reg" --
        # confirm this is intended for regression targets.
        tmp = time.time()
        rf = RandomForestClassifier(n_jobs=-1, class_weight="balanced", max_depth=5)
        feat_selector = BorutaPy(rf, n_estimators="auto", verbose=2, random_state=1)
        feat_selector.fit(X_train.values, y_train.values)
        boruta_times.append(time.time() - tmp)
        X_train_filter = feat_selector.transform(X_train.values)
        X_test_filter = feat_selector.transform(X_test.values)
        Boruta_preds = TrainGetPreds(X_train_filter, y_train, X_test_filter,
                                     metric=eval_metric)
        Boruta_preds2 = TrainGetPreds(X_train_filter, y_train, X_test_filter,
                                      metric=eval_metric2)

        # Collect predictions, actuals and the fold each row was scored in.
        y_hat.extend(y_pred)
        y_hat2.extend(y_pred2)
        y_hat_BR.extend(BR_preds)
        y_hat_BR2.extend(BR_preds2)
        y_hat_boruta.extend(Boruta_preds)
        y_hat_boruta2.extend(Boruta_preds2)
        y_actual.extend(y_test)
        fold.extend([fold_no] * len(y_pred))

    # First entries are the timings, then the evaluation results.
    values = [np.mean(boruta_times), np.mean(bar_times)]
    results = pd.DataFrame({
        "y_hat": y_hat,
        "y_hat2": y_hat2,
        "Fold": fold,
        "y_hat_BR": y_hat_BR,
        "y_hat_BR2": y_hat_BR2,
        "y_hat_boruta": y_hat_boruta,
        "y_hat_boruta2": y_hat_boruta2,
        "y_actual": y_actual,
    })
    values.extend(evalResults(results, eval=eval))
    return values
def testBARvSelf(X, Y, eval, folds=5):
    """Run BoostARoota twice per fold with identical settings and compare.

    For every fold, predictions are produced with ``TrainGetPreds`` on all
    features and on the features kept by two independently fitted
    ``BoostARoota`` instances that share the same parameters.

    Args:
        X: feature frame; rows are selected with ``X.iloc``.
        Y: targets aligned row-for-row with ``X``.
        eval: ``"reg"`` selects the ``"rmse"`` metric, any other value
            selects ``"logloss"``.  Also forwarded to ``evalBARBAR``.
        folds: number of ``KFold`` splits.

    Returns:
        pandas.DataFrame: one column of values indexed by ``BarTime1``,
        ``BarTime2`` and six metric labels; the two mean fit times come
        first, followed by the values returned by ``evalBARBAR``.
    """
    eval_metric = "rmse" if eval == "reg" else "logloss"
    br_params = {"eval_metric": eval_metric}

    # Drop any previously fixed NumPy seed.
    # NOTE(review): KFold below is created without shuffling, so this
    # presumably does not change fold membership -- confirm the intent.
    np.random.seed(None)

    # Per-fold accumulators.
    bar_times = []
    bar2_times = []
    y_hat = []
    y_hat_BR = []
    y_hat2_BR = []
    y_actual = []
    fold = []

    # X is sliced by position, so Y must be as well: plain Y[idx] on an
    # integer-indexed Series is a label lookup and would pick different rows
    # whenever the index is not 0..n-1.  Objects without .iloc (e.g. arrays)
    # are already positional.
    y_is_pandas = hasattr(Y, "iloc")

    # Start the cross validation.
    kf = KFold(n_splits=folds)
    for fold_no, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        if y_is_pandas:
            y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]
        else:
            y_train, y_test = Y[train_idx], Y[test_idx]

        # Get predictions on all features.
        y_pred = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric)

        # BAR1 (only fit + keep_vars_ is timed).
        tmp = time.time()
        br_one = BoostARoota(param=br_params)
        br_one.fit(X_train, y_train)
        BR_vars = br_one.keep_vars_
        bar_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        # BAR2 - same parameters, separate instance and separate timing list.
        tmp = time.time()
        br_two = BoostARoota(param=br_params)
        br_two.fit(X_train, y_train)
        BR_vars = br_two.keep_vars_
        bar2_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR2_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        # Collect predictions, actuals and the fold each row was scored in.
        y_hat.extend(y_pred)
        y_hat_BR.extend(BR_preds)
        y_hat2_BR.extend(BR2_preds)
        y_actual.extend(y_test)
        fold.extend([fold_no] * len(y_pred))

    # First entries are the timings, then the evaluation results.
    values = [np.mean(bar_times), np.mean(bar2_times)]
    results = pd.DataFrame({
        "y_hat": y_hat,
        "Fold": fold,
        "y_hat_BR": y_hat_BR,
        "y_hat2_BR": y_hat2_BR,
        "y_actual": y_actual,
    })
    values.extend(evalBARBAR(results, eval=eval))

    row_labels = [
        "BarTime1",
        "BarTime2",
        "BAR1_Metric1",
        "BAR2_Metric1",
        "AllMetric1",
        "BAR1_Metric2",
        "BAR2_Metric2",
        "AllMetric2",
    ]
    return pd.DataFrame(values, index=row_labels)