示例#1
0
def oneIteration():
    """Time one BoostARoota fit for every (attributes, observations) combo.

    For each ``(att, obj)`` pair in the module-level ``combos``, ``obj`` rows
    are sampled from ``train`` together with the matching rows of ``labels``.
    When ``att`` is not 500 (presumably the number of columns already in
    ``train`` -- confirm), ``att - 500`` columns of random integers in
    ``[0, 100)`` are appended before fitting.

    Returns:
        list[float]: wall-clock seconds spent in ``BoostARoota.fit`` for each
        combo, in ``combos`` order.
    """
    timings = []
    for att, obj in combos:
        # Subsample the objects/observations; labels are looked up by the
        # sampled row labels so they stay paired with their rows.
        this_df = train.sample(n=obj).copy()
        this_y = labels.loc[this_df.index.values].copy()
        # Extend the attributes.
        if att != 500:
            extraAttLen = att - 500
            # Build the extra columns on the sampled index. concat(axis=1)
            # aligns on row labels, so a default 0..obj-1 index would not line
            # up with this_df and the result would no longer match this_y
            # row-for-row.
            extraAttributes = pd.DataFrame(
                np.random.randint(0, 100, size=(obj, extraAttLen)),
                index=this_df.index,
                columns=['Ext' + str(x) for x in range(extraAttLen)])
            # Bind the dataframes together. ignore_index=True on axis=1
            # relabels the resulting columns 0..n-1.
            bound = pd.concat([this_df, extraAttributes],
                              axis=1,
                              ignore_index=True)

        else:
            bound = this_df.copy()
        # Only the fit itself is timed, not the data preparation above.
        start = time.time()
        br = BoostARoota(metric='logloss')
        br.fit(bound, this_y)
        timings.append(time.time() - start)
    return timings
示例#2
0
文件: diaml.py 项目: stenpiren/diaml
    def __init__(self, parameter_tuning=False, classifier_list=None):
        """Initialise the result slots and the list of models to evaluate.

        Args:
            parameter_tuning: Accepted for interface compatibility; this
                constructor does not read it.
            classifier_list: Models to store on ``self.classifiers``. When
                ``None``, a built-in default line-up is created instead.
        """
        self.results = None
        self.new_results = None

        # Caller supplied their own models: store them untouched and stop.
        if classifier_list is not None:
            self.classifiers = classifier_list
            return

        # Default line-up, part 1: pipelines that preprocess / select
        # features before a final estimator.
        pipelines = [
            make_pipeline(DiaPoly(), lgb.LGBMClassifier()),
            make_pipeline(DiaPoly(), LogisticRegression()),
            make_pipeline(BoostARoota(metric='logloss'),
                          lgb.LGBMClassifier()),
            make_pipeline(BoostARoota(metric='logloss'), DiaPoly(),
                          lgb.LGBMClassifier()),
        ]
        # Default line-up, part 2: stand-alone estimators.
        standalone = [
            ElasticNet(),
            LogisticRegression(),
            Lasso(),
            lgb.LGBMClassifier(),
            CatBoostClassifier(iterations=100, logging_level='Silent'),
            CatBoostClassifier(iterations=1000, logging_level='Silent'),
        ]
        self.classifiers = pipelines + standalone
示例#3
0
def trainKFolds(X, Y, eval, folds=5):
    """Cross-validate a model on all features vs. BoostARoota-selected ones.

    Args:
        X: Feature frame; rows are selected positionally via ``.iloc``.
        Y: Targets aligned row-for-row with ``X``. Anything exposing
            ``.iloc`` is indexed positionally; other containers are indexed
            with plain ``Y[indices]``.
        eval: ``"reg"`` selects the ``"rmse"`` metric, any other value
            selects ``"logloss"``. Also forwarded to ``evalResults``.
        folds: Number of ``KFold`` splits.

    Returns:
        list: the mean BoostARoota fit time in seconds, followed by the
        items returned by ``evalResults`` for the collected predictions.
    """
    if eval == "reg":
        eval_metric = "rmse"
    else:
        eval_metric = "logloss"
    # Reseed NumPy's global RNG from fresh entropy. Note that KFold below is
    # built without shuffle, so the fold boundaries themselves do not depend
    # on this seed.
    np.random.seed(None)

    # Per-fold accumulators.
    bar_times = []
    y_hat = []
    y_hat_BR = []
    y_actual = []
    fold = []

    # kf.split yields positional indices, so the targets must be selected by
    # position just like X.iloc. Plain Y[train] is label-based on a pandas
    # Series and picks the wrong rows (or raises) when the index is not
    # 0..n-1.
    y_by_position = Y.iloc if hasattr(Y, "iloc") else Y

    # Start the cross validation
    kf = KFold(n_splits=folds)
    i = 1
    for train, test in kf.split(X):
        X_train, X_test, y_train, y_test = (
            X.iloc[train],
            X.iloc[test],
            y_by_position[train],
            y_by_position[test],
        )

        # Get predictions on all features
        y_pred = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric)

        # BoostARoota - only the fit is timed, not the transforms
        tmp = time.time()
        br = BoostARoota(eval_metric)
        br.fit(X_train, y_train)
        bar_times.append(time.time() - tmp)
        BR_X = br.transform(X_train)
        BR_test = br.transform(X_test)
        BR_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        # Collect predictions and actuals for this fold
        y_hat.extend(y_pred)
        y_hat_BR.extend(BR_preds)
        y_actual.extend(y_test)
        # Record which fold each prediction came from (1-based)
        fold.extend([i] * len(y_pred))
        i += 1

    # Output layout: timing first, then the evaluation results
    values = [np.mean(bar_times)]
    # Build the dataframe to pass into the evaluation functions
    results = pd.DataFrame(
        {"y_hat": y_hat, "Fold": fold, "y_hat_BR": y_hat_BR, "y_actual": y_actual}
    )

    # then append the evaluation results to values
    values.extend(evalResults(results, eval=eval))

    return values
示例#4
0
#################
# Madelon Dataset
train_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
# Download the whitespace-separated feature file. The context manager closes
# the HTTP response once pandas has consumed it, and sep=r'\s+' is the
# documented replacement for the deprecated delim_whitespace=True.
with urllib.request.urlopen(train_url) as raw_data:
    train = pd.read_csv(raw_data, sep=r'\s+', header=None)
train.columns = ["Var" + str(x) for x in range(len(train.columns))]
labels_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
# Download the labels file and name its single column.
with urllib.request.urlopen(labels_url) as raw_data:
    labels = pd.read_csv(raw_data, delimiter=",", header=None)
labels.columns = ["Y"]

########################################################################################################################
#
#  Test that BoostARoota is working
#
########################################################################################################################
br = BoostARoota(metric='logloss')

br.fit(train, labels)
# The two bare len() expressions below have no effect when run as a script;
# they only display a value in an interactive session.
len(train.columns)
len(br.keep_vars_)
# Compare fit()+transform() against a fresh fit_transform().
new_train = br.transform(train)
new_train2 = br.fit_transform(train, labels)

# Dimension Reduction
print("Original training set has " + str(train.shape) + " dimensions. \n" +
      "BoostARoota with .fit() and .transform() reduces to " + str(new_train.shape) + " dimensions. \n" +
      "BoostARoota with .fit_transform() reduces to " + str(new_train2.shape) + " dimensions.\n" +
      "The two methods may give a slightly different dimensions because of random variation as it is being refit")
示例#5
0
    target_entity='obs',
    agg_primitives=['min', 'max', 'mean', 'count', 'sum', 'std', 'trend'],
    trans_primitives=['divide_numeric_scalar', 'multiply_numeric'],
    max_depth=1,
    n_jobs=1,
    verbose=1)

# Prune the feature matrix with the feature_selection helper, using a
# correlation threshold of 0.8.
feature_matrix = feature_selection(feature_matrix, correlation_threshold=0.8)

df_ = feature_matrix  # alias only: plain assignment does not copy the frame
# dropna is called without inplace, so df_ is rebound to the returned frame
# with every column containing a missing value removed.
df_ = df_.dropna(how='any', axis=1)
# NOTE(review): the target is popped from `df`, not from `df_` /
# `feature_matrix` -- confirm that `df` is the intended source and that
# 'Class' is not also present among the columns of X.
y = df.pop('Class')

X = df_  # second alias of the same frame, not a copy

# Fit BoostARoota on the cleaned features to choose which columns to keep.
br = BoostARoota(metric='logloss')
br.fit(X, y)

print()
print('keep vars')
print(list(br.keep_vars_))
chosen_features = br.keep_vars_

print()
print(' Final chosen features:')
print(' {}'.format(chosen_features))

# Hold out 10% of the rows, using only the chosen feature columns.
# NOTE(review): if this is scikit-learn's train_test_split, shuffle defaults
# to True, so the rows ARE shuffled here unless shuffle=False is passed --
# confirm which behaviour is intended.
X_temp, X_holdout, y_temp, y_holdout = train_test_split(
    X[chosen_features].to_numpy(), y.to_numpy(), test_size=0.10)
示例#6
0
		df_.drop(inf_list, axis=1)

	# would it go faster if we can drop all the original columns?
	df_ = df_.drop(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
					'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
					'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28'], axis=1)

	df_ = df_.dropna(how='any', axis=1)
	X = df_ # and another copy. Might not need this



	print()
	print('Starting Boruta')

	br = BoostARoota(metric='logloss', silent=True) #cutoff=.5, delta=0.7, silent=True)
	br.fit(X, y)
	chosen_features = br.keep_vars_

	if len(chosen_features) == 0:
		print(' No chosen features!')
		continue

	ind = range(len(chosen_features))

	# let's do multiple rounds of this fitting and average
	avg_dict = {}

	for i in range(num_its):

		xgb_for_feat_imp = xgb.train(dtrain = xgb.DMatrix(X[chosen_features], label=y), params={})
示例#7
0
train_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data"
# Download the whitespace-separated feature file. The context manager closes
# the HTTP response once pandas has consumed it, and sep=r"\s+" is the
# documented replacement for the deprecated delim_whitespace=True.
with urllib.request.urlopen(train_url) as raw_data:
    train = pd.read_csv(raw_data, sep=r"\s+", header=None)
train.columns = ["Var" + str(x) for x in range(len(train.columns))]
labels_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels"
# Download the labels file and name its single column.
with urllib.request.urlopen(labels_url) as raw_data:
    labels = pd.read_csv(raw_data, delimiter=",", header=None)
labels.columns = ["Y"]

########################################################################################################################
#
#  Test that BoostARoota is working
#
########################################################################################################################
br = BoostARoota(metric="logloss")

br.fit(train, labels)
# The two bare len() expressions below have no effect when run as a script;
# they only display a value in an interactive session.
len(train.columns)
len(br.keep_vars_)
# Compare fit()+transform() against a fresh fit_transform().
new_train = br.transform(train)
new_train2 = br.fit_transform(train, labels)

# Dimension Reduction
print(
    "Original training set has " + str(train.shape) + " dimensions. \n" +
    "BoostARoota with .fit() and .transform() reduces to " +
    str(new_train.shape) + " dimensions. \n" +
    "BoostARoota with .fit_transform() reduces to " + str(new_train2.shape) +
    " dimensions.\n" +
    "The two methods may give a slightly different dimensions because of random variation as it is being refit"
示例#8
0
def trainKFolds(X, Y, eval, folds=5):
    """Cross-validate all-features vs. BoostARoota vs. Boruta feature sets.

    Every fold trains and predicts under two metrics, for three feature
    sets: all columns, the columns kept by BoostARoota (fitted once per
    metric), and the columns kept by Boruta.

    Args:
        X: Feature frame; rows are selected positionally via ``.iloc``.
        Y: Targets aligned row-for-row with ``X``. Anything exposing
            ``.iloc`` is indexed positionally; other containers are indexed
            with plain ``Y[indices]``. The Boruta step reads
            ``y_train.values``.
        eval: ``"reg"`` selects ``"rmse"``/``"mae"``, any other value
            selects ``"logloss"``/``"auc"``. Also forwarded to
            ``evalResults``.
        folds: Number of ``KFold`` splits.

    Returns:
        list: mean Boruta fit time, mean BoostARoota fit time (averaged over
        both metric runs of every fold), then the items returned by
        ``evalResults``.
    """
    if eval == "reg":
        eval_metric = "rmse"
        eval_metric2 = "mae"
    else:
        eval_metric = "logloss"
        eval_metric2 = "auc"
    # Reseed NumPy's global RNG from fresh entropy. Note that KFold below is
    # built without shuffle, so the fold boundaries themselves do not depend
    # on this seed.
    np.random.seed(None)

    # Per-fold accumulators.
    bar_times = []
    boruta_times = []
    y_hat = []
    y_hat2 = []
    y_hat_BR = []
    y_hat_BR2 = []
    y_hat_boruta = []
    y_hat_boruta2 = []
    y_actual = []
    fold = []

    # kf.split yields positional indices, so the targets must be selected by
    # position just like X.iloc. Plain Y[train] is label-based on a pandas
    # Series and picks the wrong rows (or raises) when the index is not
    # 0..n-1.
    y_by_position = Y.iloc if hasattr(Y, "iloc") else Y

    # Start the cross validation
    kf = KFold(n_splits=folds)
    i = 1
    for train, test in kf.split(X):
        X_train, X_test, y_train, y_test = (
            X.iloc[train],
            X.iloc[test],
            y_by_position[train],
            y_by_position[test],
        )

        # Get predictions on all features
        y_pred = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric)
        y_pred2 = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric2)

        # BoostARoota - tune to metric 1
        tmp = time.time()
        br_one = BoostARoota(param={"eval_metric": eval_metric})
        br_one.fit(X_train, y_train)
        BR_vars = br_one.keep_vars_
        bar_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        # BoostARoota - tune to metric 2 (timed into the same list as metric 1)
        tmp = time.time()
        br_two = BoostARoota(param={"eval_metric": eval_metric2})
        br_two.fit(X_train, y_train)
        BR_vars = br_two.keep_vars_
        bar_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR_preds2 = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric2)

        # Boruta - get predictions
        # NOTE(review): a RandomForestClassifier is used even when
        # eval == "reg" -- confirm this path is only meant for
        # classification targets.
        tmp = time.time()
        rf = RandomForestClassifier(n_jobs=-1,
                                    class_weight="balanced",
                                    max_depth=5)
        feat_selector = BorutaPy(rf,
                                 n_estimators="auto",
                                 verbose=2,
                                 random_state=1)
        feat_selector.fit(X_train.values, y_train.values)
        boruta_times.append(time.time() - tmp)
        X_train_filter = feat_selector.transform(X_train.values)
        X_test_filter = feat_selector.transform(X_test.values)
        Boruta_preds = TrainGetPreds(X_train_filter,
                                     y_train,
                                     X_test_filter,
                                     metric=eval_metric)
        Boruta_preds2 = TrainGetPreds(X_train_filter,
                                      y_train,
                                      X_test_filter,
                                      metric=eval_metric2)

        # Collect predictions and actuals for this fold
        y_hat.extend(y_pred)
        y_hat2.extend(y_pred2)
        y_hat_BR.extend(BR_preds)
        y_hat_BR2.extend(BR_preds2)
        y_hat_boruta.extend(Boruta_preds)
        y_hat_boruta2.extend(Boruta_preds2)
        y_actual.extend(y_test)
        # Record which fold each prediction came from (1-based)
        fold.extend([i] * len(y_pred))
        i += 1

    # Output layout: timings first, then the evaluation results
    values = [np.mean(boruta_times), np.mean(bar_times)]
    # Build the dataframe to pass into the evaluation functions
    results = pd.DataFrame({
        "y_hat": y_hat,
        "y_hat2": y_hat2,
        "Fold": fold,
        "y_hat_BR": y_hat_BR,
        "y_hat_BR2": y_hat_BR2,
        "y_hat_boruta": y_hat_boruta,
        "y_hat_boruta2": y_hat_boruta2,
        "y_actual": y_actual,
    })

    # then append the evaluation results to values
    values.extend(evalResults(results, eval=eval))

    return values
示例#9
0
def testBARvSelf(X, Y, eval, folds=5):
    """Compare two independent BoostARoota runs against each other per fold.

    In every fold two separate ``BoostARoota`` instances are fitted with the
    same parameters, and a model is trained on each selection as well as on
    all features.

    Args:
        X: Feature frame; rows are selected positionally via ``.iloc``.
        Y: Targets aligned row-for-row with ``X``. Anything exposing
            ``.iloc`` is indexed positionally; other containers are indexed
            with plain ``Y[indices]``.
        eval: ``"reg"`` selects the ``"rmse"`` metric, any other value
            selects ``"logloss"``. Also forwarded to ``evalBARBAR``.
        folds: Number of ``KFold`` splits.

    Returns:
        pandas.DataFrame: a single column of values indexed by eight row
        labels: the two mean fit times followed by the items returned by
        ``evalBARBAR`` (presumably six, to match the labels -- confirm).
    """
    if eval == "reg":
        eval_metric = "rmse"
    else:
        eval_metric = "logloss"

    br_params = {"eval_metric": eval_metric}

    # Reseed NumPy's global RNG from fresh entropy. Note that KFold below is
    # built without shuffle, so the fold boundaries themselves do not depend
    # on this seed.
    np.random.seed(None)

    # Per-fold accumulators.
    bar_times = []
    bar2_times = []
    y_hat = []
    y_hat_BR = []
    y_hat2_BR = []
    y_actual = []
    fold = []

    # kf.split yields positional indices, so the targets must be selected by
    # position just like X.iloc. Plain Y[train] is label-based on a pandas
    # Series and picks the wrong rows (or raises) when the index is not
    # 0..n-1.
    y_by_position = Y.iloc if hasattr(Y, "iloc") else Y

    # Start the cross validation
    kf = KFold(n_splits=folds)
    i = 1
    for train, test in kf.split(X):
        X_train, X_test, y_train, y_test = (
            X.iloc[train],
            X.iloc[test],
            y_by_position[train],
            y_by_position[test],
        )

        # Get predictions on all features
        y_pred = TrainGetPreds(X_train, y_train, X_test, metric=eval_metric)

        # BAR1
        tmp = time.time()
        br_one = BoostARoota(param=br_params)
        br_one.fit(X_train, y_train)
        BR_vars = br_one.keep_vars_
        bar_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        # BAR2 - an independent second run with identical parameters
        tmp = time.time()
        br_two = BoostARoota(param=br_params)
        br_two.fit(X_train, y_train)
        BR_vars = br_two.keep_vars_
        bar2_times.append(time.time() - tmp)
        BR_X = X_train[BR_vars]
        BR_test = X_test[BR_vars]
        BR2_preds = TrainGetPreds(BR_X, y_train, BR_test, metric=eval_metric)

        # Collect predictions and actuals for this fold
        y_hat.extend(y_pred)
        y_hat_BR.extend(BR_preds)
        y_hat2_BR.extend(BR2_preds)
        y_actual.extend(y_test)
        # Record which fold each prediction came from (1-based)
        fold.extend([i] * len(y_pred))
        i += 1

    # Output layout: the two timings first, then the evaluation results
    values = [np.mean(bar_times), np.mean(bar2_times)]
    # Build the dataframe to pass into the evaluation functions
    results = pd.DataFrame({
        "y_hat": y_hat,
        "Fold": fold,
        "y_hat_BR": y_hat_BR,
        "y_hat2_BR": y_hat2_BR,
        "y_actual": y_actual,
    })
    values.extend(evalBARBAR(results, eval=eval))

    # The second positional argument of DataFrame is the index, so these
    # names become row labels for the single column of values.
    return pd.DataFrame(
        values,
        [
            "BarTime1",
            "BarTime2",
            "BAR1_Metric1",
            "BAR2_Metric1",
            "AllMetric1",
            "BAR1_Metric2",
            "BAR2_Metric2",
            "AllMetric2",
        ],
    )