def linear_regression(X_train, X_test, y_train, y_test, *, normalize=False, copy_X=True, n_jobs=None):
    """Fit an ordinary least-squares model, time fit/predict, and record errors.

    The learned coefficient vector is printed, and predictions are written
    next to the true targets in 'notebooks/export.csv' for inspection.
    Keyword arguments are forwarded to ``LinearRegression``.
    """
    regressor = LinearRegression(normalize=normalize, copy_X=copy_X, n_jobs=n_jobs)
    model = regressor

    # Time the training phase.
    t_fit = time.time()
    regressor.fit(X_train, y_train)
    fit_time = time.time() - t_fit

    # Time the prediction phase.
    t_pred = time.time()
    y_prediction = regressor.predict(X_test)
    pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)

    # Learned coefficients (one weight per feature).
    w = regressor.coef_
    print(w)

    # Persist predictions alongside ground truth for notebook analysis.
    export = pd.DataFrame(columns=['y_test', 'y_prediction'])
    export['y_test'] = y_test
    export['y_prediction'] = y_prediction
    export.to_csv('notebooks/export.csv', index=False)
def XGBoost(X_train, X_test, y_train, y_test, *, objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=10):
    """Fit an XGBoost regressor, timing the fit and predict phases,
    and hand the results to ``evaluation.save_errors``.

    NOTE(review): the default ``objective='reg:linear'`` is deprecated in
    newer xgboost releases in favor of 'reg:squarederror' — kept here to
    preserve the existing call contract.
    """
    booster = xgb.XGBRegressor(objective=objective,
                               colsample_bytree=colsample_bytree,
                               learning_rate=learning_rate,
                               max_depth=max_depth,
                               alpha=alpha,
                               n_estimators=n_estimators)
    # The model is recorded as its string representation (full parameter dump).
    model = str(booster)

    t_fit = time.time()
    booster.fit(X_train, y_train)
    fit_time = time.time() - t_fit

    t_pred = time.time()
    y_prediction = booster.predict(X_test)
    pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
def k_neighbor_regressor(X_train, X_test, y_train, y_test, *, n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=3, p=2, metric='minkowski', metric_params=None):
    """Fit a k-nearest-neighbors regressor and record timed errors.

    Returns:
        Tuple of (y_test, y_prediction, model, fit_time, pred_time) so the
        caller can run additional analysis on the same results.
    """
    estimator = KNeighborsRegressor(n_neighbors=n_neighbors,
                                    weights=weights,
                                    algorithm=algorithm,
                                    leaf_size=leaf_size,
                                    p=p,
                                    metric=metric,
                                    metric_params=metric_params)
    model = estimator

    t_fit = time.time()
    estimator.fit(X_train, y_train)
    fit_time = time.time() - t_fit

    t_pred = time.time()
    y_prediction = estimator.predict(X_test)
    pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
    return y_test, y_prediction, model, fit_time, pred_time
def decision_tree(X_train, X_test, y_train, y_test, *, max_depth=None, random_state=None):
    """Fit an unpruned regression tree and both save and print timed errors.

    The model label carries a 'without Pruning' suffix to distinguish this
    run from the pruned-tree variants in the error reports.
    """
    tree = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    model = str(tree) + '\n\nwithout Pruning'

    t_fit = time.time()
    tree.fit(X_train, y_train)
    fit_time = time.time() - t_fit

    t_pred = time.time()
    y_prediction = tree.predict(X_test)
    pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
    evaluation.print_errors(y_test, y_prediction, model, fit_time, pred_time)
def random_forest(X_train, X_test, y_train, y_test, *, n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=2, random_state=None, verbose=1, warm_start=False):
    """Fit a random-forest regressor, record timed errors, and dump
    predictions to 'notebooks/export_rf.csv'.

    All keyword arguments are passed straight through to
    ``RandomForestRegressor``.
    """
    # Collect the forwarded hyperparameters once instead of a 16-line call.
    forest_params = dict(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start,
    )
    forest = RandomForestRegressor(**forest_params)
    model = str(forest)

    t_fit = time.time()
    forest.fit(X_train, y_train)
    fit_time = time.time() - t_fit

    t_pred = time.time()
    y_prediction = forest.predict(X_test)
    pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)

    # Persist predictions alongside ground truth for notebook analysis.
    export = pd.DataFrame(columns=['y_test', 'y_prediction'])
    export['y_test'] = y_test
    export['y_prediction'] = y_prediction
    export.to_csv('notebooks/export_rf.csv', index=False)
def dtree_with_pruning(X_train, X_test, y_train, y_test, *, max_depth=None, random_state=None):
    """Train a regression tree and cost-complexity prune it (legacy path).

    Successively pruned copies of the fitted tree are generated until only
    the root remains; the copy with the lowest test-set MSE is selected,
    and its timed errors are saved and printed.
    """
    # Build and train the initial (unpruned) tree.
    base_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    model = str(base_tree) + '\n\nwith Pruning (Legacy)'

    t_fit = time.time()
    base_tree.fit(X_train, y_train)
    fit_time = time.time() - t_fit

    t_pred = time.time()

    # Generate the pruning sequence: each step deep-copies the previous
    # tree and removes its weakest link, until a single node remains.
    pruned_trees = [base_tree]
    remaining_nodes = base_tree.tree_.capacity
    step = 1
    while remaining_nodes > 1:
        pruned_trees.append(copy.deepcopy(pruned_trees[step - 1]))
        weakest_idx, _min_gk = models.dtree.prune.determine_alpha(
            pruned_trees[step].tree_)
        models.dtree.prune.prune(pruned_trees[step].tree_, weakest_idx)
        remaining_nodes = sum(1 * (pruned_trees[step].tree_.n_node_samples != 0))
        step += 1

    # Score every candidate on the test data and keep the best one.
    mse_per_tree = np.array(
        [mean_squared_error(y_test, t.predict(X_test)) for t in pruned_trees])
    best_idx = mse_per_tree.argmin()
    pred = pruned_trees[best_idx].predict(X_test)

    pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, pred, model, fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model, fit_time, pred_time)
def svm_regression(X_train, X_test, y_train, y_test, *, kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1):
    """Fit a support-vector regressor and save/print its timed errors.

    All keyword arguments are forwarded unchanged to ``SVR``.
    """
    svr_params = dict(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                      tol=tol, C=C, epsilon=epsilon, shrinking=shrinking,
                      cache_size=cache_size, verbose=verbose, max_iter=max_iter)
    svmr = SVR(**svr_params)
    # NOTE: the estimator object itself is passed as the model identifier.
    svmr_model = svmr

    t_fit = time.time()
    svmr.fit(X_train, y_train)
    svmr_fit_time = time.time() - t_fit

    t_pred = time.time()
    pred = svmr.predict(X_test)
    svmr_pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, pred, svmr_model, svmr_fit_time, svmr_pred_time)
    evaluation.print_errors(y_test, pred, svmr_model, svmr_fit_time, svmr_pred_time)
def dtree_with_pruning_faster(X_train, X_test, y_train, y_test, *, max_depth=None, random_state=None):
    """Train a regression tree and prune it via the faster ``TreePruner``.

    Every pruned candidate is scored by MSE on both the test and training
    data; the candidate with the lowest test MSE is selected and its timed
    errors are saved and printed.
    """
    # Build and fit the initial tree.
    tree = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    model = str(tree) + '\n\nwith Pruning (Faster) '

    t_fit = time.time()
    tree.fit(X_train, y_train)
    fit_time = time.time() - t_fit

    t_pred = time.time()

    # Produce the sequence of pruned candidate trees.
    pruner = models.dtree.prune_faster.TreePruner(tree)
    pruner.run()

    # MSE of every candidate on test and training data.
    test_errors = []
    train_errors = []
    for candidate in pruner.trees:
        test_errors.append(mean_squared_error(y_test, candidate.predict(X_test)))
        train_errors.append(mean_squared_error(y_train, candidate.predict(X_train)))

    # Select the candidate with the lowest test-set MSE.
    best_idx = np.array(test_errors).argmin()
    pred = pruner.trees[best_idx].predict(X_test)

    pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, pred, model, fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model, fit_time, pred_time)
# Prune the fitted tree and pick the best candidate on the validation set.
tree_pruner = trees.TreePruner(dTree)
tree_pruner.run()

# Per-tree error tables for validation and training data.
error_rates = pd.DataFrame(
    columns=["tree", "MSE", "RMSE", "R2", "RMSE % of mean", "Cali"])
error_rates_tr = pd.DataFrame(
    columns=["tree", "MSE", "RMSE", "R2", "RMSE % of mean", "Cali"])

val_rows = []
train_rows = []
# enumerate() yields the true position of each candidate; the previous
# trees.index(t) lookup was O(n) per iteration and returns the FIRST
# matching tree, i.e. the wrong index whenever two candidates compare equal.
for idx, t in enumerate(tree_pruner.trees):
    pred = t.predict(val_X)
    pred_training = t.predict(train_X)
    val_rows.append(pd.concat(
        [pd.Series(idx, name="tree"),
         evaluation.save_errors(val_y, pred)], axis=1))
    train_rows.append(pd.concat(
        [pd.Series(idx, name="tree"),
         evaluation.save_errors(train_y, pred_training)], axis=1))

# DataFrame.append was removed in pandas 2.0; assemble each table with a
# single pd.concat, which is also faster than row-by-row appends.
error_rates = pd.concat([error_rates] + val_rows, sort=False)
error_rates_tr = pd.concat([error_rates_tr] + train_rows, sort=False)

print("sorted error rates for val:\n")
# Best tree: highest R2, ties broken by lowest RMSE.
err_sorted = error_rates.sort_values(["R2", "RMSE"], ascending=[False, True])
print(err_sorted)
# Cast defensively: iloc can hand back a numpy/object scalar.
best_tree_nr = int(err_sorted.iloc[0, 0])
best_tree = tree_pruner.trees[best_tree_nr]
def GBR(X_train, X_test, y_train, y_test, *, loss='ls', learning_rate=0.2, n_estimators=30, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, alpha=0.8, verbose=1, max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):
    """Fit a gradient-boosting regressor and record its timed errors.

    All keyword arguments are forwarded unchanged to
    ``GradientBoostingRegressor``.
    """
    # Local named 'booster' (not 'GBR') to avoid shadowing this function.
    gbr_params = dict(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_depth=max_depth,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        init=init,
        random_state=random_state,
        max_features=max_features,
        alpha=alpha,
        verbose=verbose,
        max_leaf_nodes=max_leaf_nodes,
        warm_start=warm_start,
        presort=presort,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        tol=tol,
    )
    booster = GradientBoostingRegressor(**gbr_params)
    model = str(booster)

    t_fit = time.time()
    booster.fit(X_train, y_train)
    fit_time = time.time() - t_fit

    t_pred = time.time()
    y_prediction = booster.predict(X_test)
    pred_time = time.time() - t_pred

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)