Example #1
# Imports assumed by these snippets; `evaluation` is a project-local module
# providing save_errors/print_errors.
import time

import pandas as pd
from sklearn.linear_model import LinearRegression


def linear_regression(X_train,
                      X_test,
                      y_train,
                      y_test,
                      *,
                      normalize=False,
                      copy_X=True,
                      n_jobs=None):
    # Note: `normalize` was removed in scikit-learn 1.2; on newer versions,
    # scale the features beforehand instead.
    linReg = LinearRegression(normalize=normalize,
                              copy_X=copy_X,
                              n_jobs=n_jobs)

    model = linReg
    fit_start = time.time()
    linReg.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    y_prediction = linReg.predict(X_test)
    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
    # Inspect the learned coefficients.
    w = linReg.coef_
    print(w)

    # Persist the test targets and predictions for later analysis.
    export = pd.DataFrame(columns=['y_test', 'y_prediction'])
    export['y_test'] = y_test
    export['y_prediction'] = y_prediction
    export.to_csv('notebooks/export.csv', index=False)
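
A minimal call sketch, assuming a feature matrix `X` and target vector `y`
(both hypothetical here) and scikit-learn's train_test_split:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
linear_regression(X_train, X_test, y_train, y_test, n_jobs=-1)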
Example #2
# Assumes `import xgboost as xgb` at module level.
def XGBoost(X_train,
            X_test,
            y_train,
            y_test,
            *,
            objective='reg:linear',
            colsample_bytree=0.3,
            learning_rate=0.1,
            max_depth=5,
            alpha=10,
            n_estimators=10):

    # Note: 'reg:linear' was renamed 'reg:squarederror'; newer XGBoost
    # releases warn about the old objective name.
    xg_reg = xgb.XGBRegressor(objective=objective,
                              colsample_bytree=colsample_bytree,
                              learning_rate=learning_rate,
                              max_depth=max_depth,
                              alpha=alpha,
                              n_estimators=n_estimators)

    model = str(xg_reg)

    fit_start = time.time()
    xg_reg.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    y_prediction = xg_reg.predict(X_test)
    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
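
On current XGBoost builds the same wrapper can be called with the renamed
objective; a hedged sketch reusing the split from Example #1:

XGBoost(X_train, X_test, y_train, y_test,
        objective='reg:squarederror',
        learning_rate=0.1,
        n_estimators=100)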
Example #3
def k_neighbor_regressor(X_train,
                         X_test,
                         y_train,
                         y_test,
                         *,
                         n_neighbors=3,
                         weights='uniform',
                         algorithm='auto',
                         leaf_size=3,
                         p=2,
                         metric='minkowski',
                         metric_params=None):
    knr = KNeighborsRegressor(n_neighbors=n_neighbors,
                              weights=weights,
                              algorithm=algorithm,
                              leaf_size=leaf_size,
                              p=p,
                              metric=metric,
                              metric_params=metric_params)
    model = knr
    fit_start = time.time()
    knr.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    y_prediction = knr.predict(X_test)
    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)

    # Unlike the other wrappers, this one returns its results to the caller.
    return y_test, y_prediction, model, fit_time, pred_time
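
Because the wrapper returns its results, they can be consumed directly; a
small sketch under the same assumed split:

y_true, y_pred, model, fit_time, pred_time = k_neighbor_regressor(
    X_train, X_test, y_train, y_test, n_neighbors=5)
print(f'fit: {fit_time:.3f}s, predict: {pred_time:.3f}s')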
Example #4
def decision_tree(X_train,
                  X_test,
                  y_train,
                  y_test,
                  *,
                  max_depth=None,
                  random_state=None):

    dTree = DecisionTreeRegressor(max_depth=max_depth,
                                  random_state=random_state)

    model = str(dTree) + '\n\nwithout Pruning'

    fit_start = time.time()
    dTree.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    y_prediction = dTree.predict(X_test)
    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
    evaluation.print_errors(y_test, y_prediction, model, fit_time, pred_time)
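
The fitted estimator also exposes scikit-learn's per-feature importances; a
short inspection sketch, assuming the tree is fitted at top level and
X_train is a DataFrame:

tree = DecisionTreeRegressor(max_depth=5).fit(X_train, y_train)
print(dict(zip(X_train.columns, tree.feature_importances_)))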
Example #5
def random_forest(X_train,
                  X_test,
                  y_train,
                  y_test,
                  *,
                  n_estimators=10,
                  criterion='mse',
                  max_depth=None,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0,
                  max_features='auto',
                  max_leaf_nodes=None,
                  min_impurity_decrease=0.0,
                  min_impurity_split=None,
                  bootstrap=True,
                  oob_score=False,
                  n_jobs=2,
                  random_state=None,
                  verbose=1,
                  warm_start=False):

    # Note: on recent scikit-learn, criterion='mse' is now 'squared_error',
    # and `min_impurity_split` / max_features='auto' have been removed.
    regr = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start)

    model = str(regr)

    fit_start = time.time()
    regr.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    y_prediction = regr.predict(X_test)
    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)

    export = pd.DataFrame(columns=['y_test', 'y_prediction'])
    export['y_test'] = y_test
    export['y_prediction'] = y_prediction
    export.to_csv('notebooks/export_rf.csv', index=False)
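
With bootstrapping enabled, the forest can also report an out-of-bag R^2
estimate without touching the test set; a sketch under the same assumptions:

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, oob_score=True,
                           bootstrap=True, random_state=42)
rf.fit(X_train, y_train)
print('OOB R^2:', rf.oob_score_)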
Example #6
def dtree_with_pruning(X_train, X_test, y_train, y_test, *, max_depth=None,
                       random_state=None):

    # Build and train the initial tree

    dtree = DecisionTreeRegressor(max_depth=max_depth,
                                  random_state=random_state)

    model = str(dtree) + '\n\nwith Pruning (Legacy)'

    fit_start = time.time()
    dtree.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    # Create a list for storing the pruned trees
    tree_array = [dtree]
    num_nodes = dtree.tree_.capacity

    # Prune the trees and append them to the list
    k = 1

    while num_nodes > 1:
        tree_array.append(copy.deepcopy(tree_array[k - 1]))
        min_node_idx, min_gk = models.dtree.prune.determine_alpha(
            tree_array[k].tree_)
        models.dtree.prune.prune(tree_array[k].tree_, min_node_idx)
        num_nodes = sum(1 * (tree_array[k].tree_.n_node_samples != 0))
        k += 1

    # Find the best tree, based on the test data
    predictlist = []

    for i in range(len(tree_array)):
        pred = tree_array[i].predict(X_test)
        # predictlist.append(tree_array[i].score(X_test, y_test))
        predictlist.append(mean_squared_error(y_test, pred))

    tree_scores = np.array(predictlist)
    index = tree_scores.argmin()
    pred = tree_array[index].predict(X_test)

    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, pred, model,
                           fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model,
                            fit_time, pred_time)
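
Recent scikit-learn versions ship minimal cost-complexity pruning directly;
a hedged sketch of the equivalent built-in path, under the same assumed
split:

from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# One pruned tree per effective alpha; keep the one with the lowest test MSE.
path = DecisionTreeRegressor(random_state=0).cost_complexity_pruning_path(
    X_train, y_train)
pruned = [DecisionTreeRegressor(random_state=0, ccp_alpha=a).fit(X_train,
                                                                 y_train)
          for a in path.ccp_alphas]
best = min(pruned,
           key=lambda t: mean_squared_error(y_test, t.predict(X_test)))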
Example #7
def svm_regression(X_train,
                   X_test,
                   y_train,
                   y_test,
                   *,
                   kernel='rbf',
                   degree=3,
                   gamma='auto',
                   coef0=0.0,
                   tol=0.001,
                   C=1.0,
                   epsilon=0.1,
                   shrinking=True,
                   cache_size=200,
                   verbose=False,
                   max_iter=-1):
    # Note: since scikit-learn 0.22 the SVR default is gamma='scale'.
    svmr = SVR(kernel=kernel,
               degree=degree,
               gamma=gamma,
               coef0=coef0,
               tol=tol,
               C=C,
               epsilon=epsilon,
               shrinking=shrinking,
               cache_size=cache_size,
               verbose=verbose,
               max_iter=max_iter)

    svmr_model = svmr
    svmr_fit_start = time.time()
    svmr.fit(X_train, y_train)
    svmr_fit_end = time.time()
    svmr_fit_time = svmr_fit_end - svmr_fit_start

    svmr_pred_start = time.time()
    pred = svmr.predict(X_test)
    svmr_pred_end = time.time()
    svmr_pred_time = svmr_pred_end - svmr_pred_start

    evaluation.save_errors(y_test, pred, svmr_model, svmr_fit_time,
                           svmr_pred_time)
    evaluation.print_errors(y_test, pred, svmr_model, svmr_fit_time,
                            svmr_pred_time)
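
SVR is sensitive to feature scale, so it is commonly wrapped with a scaler;
a minimal sketch using standard scikit-learn pieces, same assumed split:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

pipe = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.1))
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)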
Example #8
def dtree_with_pruning_faster(X_train, X_test, y_train, y_test, *,
                              max_depth=None,
                              random_state=None):

    # Initiate model
    dtree = DecisionTreeRegressor(max_depth=max_depth,
                                  random_state=random_state)
    model = str(dtree) + '\n\nwith Pruning (Faster) '

    # Fit model
    fit_start = time.time()
    dtree.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    # Pruning trees
    tree_pruner = models.dtree.prune_faster.TreePruner(dtree)
    tree_pruner.run()

    # Calculate test and train errors (train errors are collected, but only
    # the test errors drive the selection below)
    test_errors = []
    train_errors = []

    for tree in tree_pruner.trees:
        y_pred_test = tree.predict(X_test)
        test_errors.append(mean_squared_error(y_test, y_pred_test))
        y_pred_train = tree.predict(X_train)
        train_errors.append(mean_squared_error(y_train, y_pred_train))

    # Find the best tree based on test data
    test_errors_np = np.array(test_errors)
    index = test_errors_np.argmin()
    pred = tree_pruner.trees[index].predict(X_test)

    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, pred, model,
                           fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model,
                            fit_time, pred_time)
Example #9
    # Fragment: assumes `trees` (a pruning module), `dTree`, and the
    # train_X/train_y and val_X/val_y splits are defined in the enclosing
    # scope.
    tree_pruner = trees.TreePruner(dTree)
    tree_pruner.run()

    error_rates = pd.DataFrame(
        columns=["tree", "MSE", "RMSE", "R2", "RMSE % of mean", "Cali"])
    error_rates_tr = pd.DataFrame(
        columns=["tree", "MSE", "RMSE", "R2", "RMSE % of mean", "Cali"])

    for t in tree_pruner.trees:
        pred = t.predict(val_X)
        pred_training = t.predict(train_X)
        idx = tree_pruner.trees.index(t)
        new_row = pd.concat(
            [pd.Series(idx, name="tree"),
             evaluation.save_errors(val_y, pred)],
            axis=1)
        new_row_tr = pd.concat([
            pd.Series(idx, name="tree"),
            evaluation.save_errors(train_y, pred_training)
        ],
                               axis=1)
        # Note: DataFrame.append was removed in pandas 2.0; see the pd.concat
        # sketch after this example.
        error_rates = error_rates.append(new_row, sort=False)
        error_rates_tr = error_rates_tr.append(new_row_tr, sort=False)

    print("sorted error rates for val:\n")
    err_sorted = error_rates.sort_values(["R2", "RMSE"],
                                         ascending=[False, True])
    print(err_sorted)
    best_tree_nr = err_sorted.iloc[0, 0]
    best_tree = tree_pruner.trees[best_tree_nr]
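
On pandas >= 2.0 the row accumulation above must use pd.concat instead of
the removed DataFrame.append; a hedged sketch, assuming
evaluation.save_errors returns a one-row DataFrame as implied here:

rows = []
for idx, t in enumerate(tree_pruner.trees):
    pred = t.predict(val_X)
    rows.append(pd.concat([pd.Series(idx, name="tree"),
                           evaluation.save_errors(val_y, pred)], axis=1))
error_rates = pd.concat(rows, ignore_index=True, sort=False)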
Example #10
def GBR(X_train,
        X_test,
        y_train,
        y_test,
        *,
        loss='ls',
        learning_rate=0.2,
        n_estimators=30,
        subsample=1.0,
        criterion='friedman_mse',
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=3,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        init=None,
        random_state=None,
        max_features=None,
        alpha=0.8,
        verbose=1,
        max_leaf_nodes=None,
        warm_start=False,
        presort='auto',
        validation_fraction=0.1,
        n_iter_no_change=None,
        tol=0.0001):

    # Note: on recent scikit-learn, loss='ls' is now 'squared_error', and
    # `presort` / `min_impurity_split` have been removed.
    GBR = GradientBoostingRegressor(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_depth=max_depth,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        init=init,
        random_state=random_state,
        max_features=max_features,
        alpha=alpha,
        verbose=verbose,
        max_leaf_nodes=max_leaf_nodes,
        warm_start=warm_start,
        presort=presort,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        tol=tol)

    model = str(GBR)

    fit_start = time.time()
    GBR.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    y_prediction = GBR.predict(X_test)
    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
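
The signature already exposes scikit-learn's early-stopping knobs; a short
sketch of using them through the wrapper (same assumed split, on a
scikit-learn version this code targets):

GBR(X_train, X_test, y_train, y_test,
    n_estimators=500,
    validation_fraction=0.1,
    n_iter_no_change=10,  # stop once 10 rounds bring no validation gain
    tol=1e-4)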