Example #1
import time

from sklearn.tree import DecisionTreeRegressor

import evaluation  # project-local reporting module (assumed import path)


def decision_tree(X_train,
                  X_test,
                  y_train,
                  y_test,
                  *,
                  max_depth=None,
                  random_state=None):

    dTree = DecisionTreeRegressor(max_depth=max_depth,
                                  random_state=random_state)

    model = str(dTree) + '\n\nwithout Pruning'

    fit_start = time.time()
    dTree.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    y_prediction = dTree.predict(X_test)
    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
    evaluation.print_errors(y_test, y_prediction, model, fit_time, pred_time)
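
A minimal usage sketch (not part of the original example), assuming scikit-learn's California-housing data as a stand-in for the project's real dataset:

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Hyperparameters after the bare `*` must be passed by keyword.
decision_tree(X_train, X_test, y_train, y_test, max_depth=8, random_state=0)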
Example #2
import time

from sklearn.ensemble import RandomForestRegressor

import evaluation  # project-local reporting module (assumed import path)


def random_forest(X_train,
                  X_test,
                  y_train,
                  y_test,
                  *,
                  n_estimators=10,
                  criterion='mse',
                  max_depth=None,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0,
                  max_features='auto',
                  max_leaf_nodes=None,
                  min_impurity_decrease=0.0,
                  min_impurity_split=None,
                  bootstrap=True,
                  oob_score=False,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
                  warm_start=False):

    regr = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start)

    model = str(regr)

    fit_start = time.time()
    regr.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    y_prediction = regr.predict(X_test)
    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)

    evaluation.print_errors(y_test, y_prediction, model, fit_time, pred_time)
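
Note that these defaults mirror an older scikit-learn API: criterion='mse' was renamed to 'squared_error', and max_features='auto' and min_impurity_split were later removed, so recent scikit-learn versions reject this signature. A hypothetical call under the same train/test split as in the sketch for Example #1, enabling the out-of-bag estimate:

random_forest(X_train, X_test, y_train, y_test,
              n_estimators=100,   # more trees than the old default of 10
              oob_score=True,     # out-of-bag R^2 (requires bootstrap=True)
              n_jobs=-1,          # use all available cores
              random_state=0)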
Example #3
import copy
import time

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

import evaluation  # project-local modules (assumed import paths)
import models.dtree.prune


def dtree_with_pruning(X_train, X_test, y_train, y_test, *, max_depth=None,
                       random_state=None):

    # Build and train the original, unpruned tree

    dtree = DecisionTreeRegressor(max_depth=max_depth,
                                  random_state=random_state)

    model = str(dtree) + '\n\nwith Pruning (Legacy)'

    fit_start = time.time()
    dtree.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    # List that collects the successively pruned trees
    tree_array = [dtree]
    num_nodes = dtree.tree_.capacity

    # Prune each tree in turn and append the result to the list
    k = 1

    while num_nodes > 1:
        tree_array.append(copy.deepcopy(tree_array[k - 1]))
        min_node_idx, min_gk = models.dtree.prune.determine_alpha(
            tree_array[k].tree_)
        models.dtree.prune.prune(tree_array[k].tree_, min_node_idx)
        num_nodes = sum(1 * (tree_array[k].tree_.n_node_samples != 0))
        k += 1

    # Find the best tree, based on the test data
    predictlist = []

    for tree in tree_array:
        pred = tree.predict(X_test)
        # predictlist.append(tree.score(X_test, y_test))
        predictlist.append(mean_squared_error(y_test, pred))

    tree_scores = np.array(predictlist)
    index = tree_scores.argmin()
    pred = tree_array[index].predict(X_test)

    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, pred, model,
                           fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model,
                            fit_time, pred_time)
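
models.dtree.prune.determine_alpha is project-local and not shown here. Judging by the loop above, it presumably implements CART's weakest-link pruning: for every internal node t it computes g(t) = (R(t) - R(T_t)) / (|leaves(T_t)| - 1) and returns the node minimizing g(t). A sketch of that computation over a fitted sklearn tree_ object (an assumption, not the project's actual code):

import numpy as np


def weakest_link(tree):
    left = tree.children_left
    # Normalized per-node resubstitution risk R(t)
    risk = tree.impurity * tree.weighted_n_node_samples \
        / tree.weighted_n_node_samples[0]

    def subtree(node):
        # (total leaf risk, leaf count) of the subtree rooted at `node`
        if left[node] == -1:           # leaf
            return risk[node], 1
        r_l, n_l = subtree(tree.children_left[node])
        r_r, n_r = subtree(tree.children_right[node])
        return r_l + r_r, n_l + n_r

    best_node, best_g = None, np.inf
    for t in range(tree.node_count):
        if left[t] == -1:              # skip leaves
            continue
        r_sub, n_leaves = subtree(t)
        g = (risk[t] - r_sub) / (n_leaves - 1)
        if g < best_g:
            best_node, best_g = t, g
    return best_node, best_g

Called as weakest_link(dtree.tree_), it would return the same (min_node_idx, min_gk) pair the loop above expects.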
Example #4
import time

from sklearn.svm import SVR

import evaluation  # project-local reporting module (assumed import path)


def svm_regression(X_train,
                   X_test,
                   y_train,
                   y_test,
                   *,
                   kernel='rbf',
                   degree=3,
                   gamma='auto',
                   coef0=0.0,
                   tol=0.001,
                   C=1.0,
                   epsilon=0.1,
                   shrinking=True,
                   cache_size=200,
                   verbose=False,
                   max_iter=-1):
    svmr = SVR(kernel=kernel,
               degree=degree,
               gamma=gamma,
               coef0=coef0,
               tol=tol,
               C=C,
               epsilon=epsilon,
               shrinking=shrinking,
               cache_size=cache_size,
               verbose=verbose,
               max_iter=max_iter)

    svmr_model = svmr
    svmr_fit_start = time.time()
    svmr.fit(X_train, y_train)
    svmr_fit_end = time.time()
    svmr_fit_time = svmr_fit_end - svmr_fit_start

    svmr_pred_start = time.time()
    pred = svmr.predict(X_test)
    svmr_pred_end = time.time()
    svmr_pred_time = svmr_pred_end - svmr_pred_start

    evaluation.save_errors(y_test, pred, svmr_model, svmr_fit_time,
                           svmr_pred_time)
    evaluation.print_errors(y_test, pred, svmr_model, svmr_fit_time,
                            svmr_pred_time)
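
SVR is sensitive to feature scales, so a realistic call would standardize the inputs first; a sketch using X_train, X_test from the earlier hypothetical split:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
svm_regression(scaler.transform(X_train), scaler.transform(X_test),
               y_train, y_test,
               kernel='rbf', C=10.0, epsilon=0.1)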
Example #5
import time

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

import evaluation  # project-local modules (assumed import paths)
import models.dtree.prune_faster


def dtree_with_pruning_faster(X_train, X_test, y_train, y_test, *,
                              max_depth=None,
                              random_state=None):

    # Initiate model
    dtree = DecisionTreeRegressor(max_depth=max_depth,
                                  random_state=random_state)
    model = str(dtree) + '\n\nwith Pruning (Faster)'

    # Fit model
    fit_start = time.time()
    dtree.fit(X_train, y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    pred_start = time.time()
    # Pruning trees
    tree_pruner = models.dtree.prune_faster.TreePruner(dtree)
    tree_pruner.run()

    # Calculating errors
    test_errors = []
    train_errors = []

    for tree in tree_pruner.trees:
        y_pred_test = tree.predict(X_test)
        test_errors.append(mean_squared_error(y_test, y_pred_test))
        y_pred_train = tree.predict(X_train)
        train_errors.append(mean_squared_error(y_train, y_pred_train))

    # Find the best tree based on test data
    test_errors_np = np.array(test_errors)
    index = test_errors_np.argmin()
    pred = tree_pruner.trees[index].predict(X_test)

    pred_end = time.time()
    pred_time = pred_end - pred_start

    evaluation.save_errors(y_test, pred, model,
                           fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model,
                            fit_time, pred_time)
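
Since scikit-learn 0.22 this search is available out of the box as minimal cost-complexity pruning. A sketch of an equivalent selection built on the library's own pruning path rather than the project's pruner:

from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor


def dtree_with_builtin_pruning(X_train, X_test, y_train, y_test, *,
                               random_state=None):
    # All alpha values at which the optimal subtree changes
    path = DecisionTreeRegressor(random_state=random_state) \
        .cost_complexity_pruning_path(X_train, y_train)

    best_tree, best_mse = None, float('inf')
    for alpha in path.ccp_alphas:
        tree = DecisionTreeRegressor(random_state=random_state,
                                     ccp_alpha=alpha)
        tree.fit(X_train, y_train)
        mse = mean_squared_error(y_test, tree.predict(X_test))
        if mse < best_mse:
            best_tree, best_mse = tree, mse
    return best_tree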
Example #6
    plt.tight_layout()
    plt.show()

    train_X, test_X, train_y, test_y = features.heart.split_train_test(
        cleaned_data, "AHD")
    train_X, test_X, train_y, test_y = features.heart.scale_to_train(
        [train_X, test_X, train_y, test_y], [0, 2, 3, 6, 8, 9, 10, 18],
        "minmax")
    print(train_X)

    # log reg with simple feature set
    print("Evaluating simple feature set")
    #log_reg = lm.SGDClassifier(n_jobs=10, loss="log", max_iter = 50)
    log_reg = lm.LogisticRegression()
    log_reg.fit(train_X, train_y)
    pred = log_reg.predict(test_X)
    pred_proba = log_reg.predict_proba(test_X)

    evaluation.print_errors(test_y, pred)
    print("")
    """
    # log reg with advanced feature set
    print("Evaluating modified feature set")
    log_reg2 = lm.SGDClassifier(n_jobs=1, loss="log", max_iter=50)

    classifier.fit(log_reg2, input_data2, targets)
    pred, pred_proba = classifier.predict(log_reg2, input_data2)

    evaluation.print_errors(targets, pred)
    """
Example #7

import pandas as pd
import sklearn.linear_model as lm

import classifier  # project-local modules (assumed import paths)
import evaluation
import features.zip_codes


if __name__ == '__main__':
    data_train = pd.read_csv("data/zip.train", header=None, sep=" ")
    cleaned_train_data = data_train.dropna(axis=1, thresh=2)

    input_data = cleaned_train_data.iloc[:, 1:].values
    targets = cleaned_train_data[0].values

    input_data2 = features.zip_codes.multires(input_data)

    # log reg with simple feature set
    print("Evaluating simple feature set")
    log_reg = lm.SGDClassifier(n_jobs=1, loss="log", max_iter=50)

    classifier.fit(log_reg, input_data, targets)
    pred, pred_proba = classifier.predict(log_reg, input_data)

    evaluation.print_errors(targets, pred)
    print("")

    # log reg with advanced feature set
    print("Evaluating modified feature set")
    log_reg2 = lm.SGDClassifier(n_jobs=1, loss="log", max_iter=50)

    classifier.fit(log_reg2, input_data2, targets)
    pred, pred_proba = classifier.predict(log_reg2, input_data2)

    evaluation.print_errors(targets, pred)
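
classifier.fit and classifier.predict are project-local and not shown. Reconstructed from the call sites alone, the wrapper is presumably little more than the following (a guess, not the actual module):

def fit(estimator, X, y):
    # Train in place and hand the estimator back.
    estimator.fit(X, y)
    return estimator


def predict(estimator, X):
    # Return hard labels together with class probabilities.
    return estimator.predict(X), estimator.predict_proba(X)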

Example #8
    clean_data = features.encode_binary(clean_data)
    clean_data = features.encode_category(clean_data, 'ChestPain')
    clean_data = features.encode_category(clean_data, 'Thal')

    data_train, data_test = features.split(clean_data, 0.2)

    X_train, y_train, X_test, y_test = features.set_target(
        data_train, data_test, 'AHD')

    logReg = lm.LogisticRegression()
    print(clean_data.head())
    classifier.fit(logReg, X_train, y_train)

    pred, pred_proba = classifier.predict(logReg, X_test)

    evaluation.print_errors(y_test, pred)

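
features.encode_binary and features.encode_category are likewise project-local. A hypothetical reconstruction of the categorical encoder, consistent with its use on the 'ChestPain' and 'Thal' columns (pandas one-hot encoding; the real helper may differ):

import pandas as pd


def encode_category(df, column):
    # One-hot encode a single categorical column and drop the original.
    dummies = pd.get_dummies(df[column], prefix=column)
    return pd.concat([df.drop(columns=column), dummies], axis=1)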