示例#1
0
def main():
    """Sanity-check the model tree, then fit it to synthetic 1D polynomial data.

    Steps, in order:
      1. Call ``run_tests`` on ``ModelTree`` with ``data/data_clf.csv``.
      2. Generate data from a quartic polynomial (``x_range=(0, 10)``,
         ``N=500``) and store it as ``data/data_poly4_regr.csv``.
      3. Reload that CSV in regression mode, check it has exactly one
         feature column, and call ``plot_model_tree_fit`` first with a mean
         regressor and then with a linear regressor.
    """
    # --------------------
    # Pre-training sanity checks on the model tree (using our own data):
    #   1) depth-0 model tree reproduces the bare model result
    #   2) mean regression + mse reproduces sklearn DecisionTreeRegressor
    #   3) modal class + gini loss reproduces sklearn DecisionTreeClassifier
    # --------------------
    run_tests(ModelTree, os.path.join("data", "data_clf.csv"))

    # --------------------
    # 1D polynomial data, fitted by model trees
    # --------------------

    # Quartic with roots at 1 and 4 and a double root at 8.
    def polynomial(x):
        return (x - 1) * (x - 4) * (x - 8) * (x - 8)

    # Write the generated samples to disk ...
    csv_path = os.path.join("data", "data_poly4_regr.csv")
    generate_csv_data(polynomial, csv_path, x_range=(0, 10), N=500)

    # ... and read them straight back.
    X, y, _ = load_csv_data(csv_path, mode="regr", verbose=True)
    assert X.shape[1] == 1  # the data must have a single feature column

    # Plot model tree fits, one model type at a time.
    from models.mean_regr import mean_regr
    plot_model_tree_fit(mean_regr(), X, y)
    from models.linear_regr import linear_regr
    plot_model_tree_fit(linear_regr(), X, y)
示例#2
0
def main():
    """Train a model tree on a CSV data set and save its outputs.

    Loads ``training_file_name`` from ``directory``, builds a ``ModelTree``
    around a ``linear_regr`` model, fits it, prints the training loss and
    exports a graphviz rendering. Depending on the settings at the top of
    the function it then pickles the trained tree and/or writes one CSV row
    per sample with the input, target, prediction and explanation. Every
    output file is written into ``directory``.
    """
    # ====================
    # Settings
    # ====================
    mode = "regr"  # "clf" / "regr"
    save_model_tree = True  # save model tree?
    save_model_tree_predictions = True  # save model tree predictions/explanations?
    # NOTE(review): this flag is not read anywhere in this function.
    cross_validation = True  # cross-validate model tree?

    # ====================
    # Load data
    # ====================

    # directory = '/Users/xiangyusun/Development/LearningX/advanced_ML/model_tree/data'
    # directory = '~/Desktop/LearningX/advanced_ML/model_tree/data'
    directory = '/cs/oschulte/xiangyus/LearningX'

    # training_file_name = 'data_clf.csv'
    # training_file_name = 'sportlogiq_data_pass_2019_07_22_08_54_17_small.csv'
    training_file_name = 'sportlogiq_data_pass_2019_07_22_08_54_17.csv'

    data_csv_data_filename = os.path.join(directory, training_file_name)

    X, y, header = load_csv_data(data_csv_data_filename,
                                 mode=mode,
                                 verbose=True)

    # *********************************************
    #
    # Insert your models here!
    #
    # All models must implement the following methods:
    #
    #   fit(X, y)
    #   predict(X)
    #   loss(X, y, y_pred)
    #
    # Below are some ready-for-use regression models:
    #
    #   mean regressor  (models/mean_regr.py)
    #   linear regressor  (models/linear_regr.py)
    #   logistic regressor  (models/logistic_regr.py)
    #   support vector machine regressor  (models/svm_regr.py)
    #   decision tree regressor (models/DT_sklearn_regr.py)
    #   neural network regressor (module path not visible here -- TODO confirm)
    #
    # as well as some classification models:
    #
    #   modal classifier (models/modal_clf.py)
    #   decision tree classifier (models/DT_sklearn_clf.py)
    #
    # *********************************************
    from models.mean_regr import mean_regr
    from models.linear_regr import linear_regr
    from models.logistic_regr import logistic_regr
    from models.svm_regr import svm_regr
    from models.DT_sklearn_regr import DT_sklearn_regr

    from models.modal_clf import modal_clf
    from models.DT_sklearn_clf import DT_sklearn_clf

    # Choose model
    model = linear_regr()

    # Build model tree
    model_tree = ModelTree(model,
                           max_depth=10,
                           min_samples_leaf=10,
                           search_type="greedy",
                           n_search_grid=100)

    # ====================
    # Train model tree
    # ====================
    print("Training model tree with '{}'...".format(model.__class__.__name__))
    model_tree.fit(X, y, verbose=True)
    y_pred = model_tree.predict(X)
    explanations = model_tree.explain(X, header)
    loss = model_tree.loss(X, y, y_pred)
    print(" -> loss_train={:.6f}\n".format(loss))
    model_tree.export_graphviz(os.path.join(directory, "model_tree"),
                               header,
                               export_png=True,
                               export_pdf=False)

    # ====================
    # Save model tree results
    # ====================
    if save_model_tree:
        model_tree_filename = os.path.join(directory, "model_tree.p")
        print("Saving model tree to '{}'...".format(model_tree_filename))
        # Persist the trained tree (what the flag, message and filename refer
        # to) rather than the base model handed to its constructor, and close
        # the file deterministically.
        with open(model_tree_filename, 'wb') as f:
            pickle.dump(model_tree, f)

    if save_model_tree_predictions:
        predictions_csv_filename = os.path.join(directory,
                                                "model_tree_pred.csv")
        print("Saving mode tree predictions to '{}'".format(
            predictions_csv_filename))
        # newline="" lets the csv module control line endings itself;
        # without it, extra blank rows appear on platforms that translate
        # "\n" on write.
        with open(predictions_csv_filename, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["x", "y", "y_pred", "explanation"])
            # One row per sample: input, target, prediction, explanation.
            writer.writerows(zip(X, y, y_pred, explanations))
示例#3
0
def _require_matching_losses(test_id, loss_a, loss_b, eps):
    """Abort with SystemExit unless the two losses differ by at most ``eps``."""
    if np.abs(loss_a - loss_b) > eps:
        # Raising SystemExit directly has the same effect as the interactive
        # ``exit()`` helper without depending on the ``site`` module, and
        # the message now reports the failure as a failure.
        raise SystemExit("err: failed test {}!".format(test_id))
    print("  -> passed test {}!".format(test_id))


def run_tests(ModelTree, data_csv_filename):
    """Run three consistency checks on a model tree implementation.

    Each check obtains two losses through ``experiment`` on the same data
    and requires them to agree to within 1e-6:

      1. a bare linear regressor vs. a depth-0 model tree wrapping it;
      2. a depth-20 model tree of mean regressors vs. ``DT_sklearn_regr``;
      3. a depth-20 model tree of modal classifiers vs. ``DT_sklearn_clf``.

    Args:
        ModelTree: model tree class under test; called as
            ``ModelTree(model, max_depth=..., min_samples_leaf=...,
            search_type=..., n_search_grid=...)``.
        data_csv_filename: path of the CSV file passed to ``load_csv_data``.

    Raises:
        SystemExit: as soon as one check fails; the message names the
            failing test.
    """
    print("Running model tree tests...")
    eps = 1E-6  # tolerance for test acceptance
    # NOTE(review): the data is loaded in "regr" mode for all three checks,
    # including the classifier comparison in test 3 -- confirm intended.
    X, y, header = load_csv_data(data_csv_filename, mode="regr")

    # Test 1
    print(" [1/3] Checking depth-0 model tree...")
    from models.linear_regr import linear_regr
    model = linear_regr()
    MTR_0 = ModelTree(model,
                      max_depth=0,
                      min_samples_leaf=20,
                      search_type="greedy",
                      n_search_grid=100)
    loss_model = experiment(model, X, y)
    loss_MTR_0 = experiment(MTR_0, X, y)
    print("  -> loss(linregr)={:.6f}, loss(MTR_0_linregr)={:.6f}...".format(
        loss_model, loss_MTR_0))
    _require_matching_losses(1, loss_model, loss_MTR_0, eps)

    # Test 2
    print(
        " [2/3] Reproducing DecisionTreeRegressor sklearn (depth=20) result..."
    )
    from models.mean_regr import mean_regr
    MTR = ModelTree(mean_regr(),
                    max_depth=20,
                    min_samples_leaf=10,
                    search_type="greedy",
                    n_search_grid=100)
    from models.DT_sklearn_regr import DT_sklearn_regr
    DTR_sklearn = DT_sklearn_regr(max_depth=20, min_samples_leaf=10)
    loss_MTR = experiment(MTR, X, y)
    loss_DTR_sklearn = experiment(DTR_sklearn, X, y)
    print("  -> loss(MTR)={:.6f}, loss(DTR_sklearn)={:.6f}...".format(
        loss_MTR, loss_DTR_sklearn))
    _require_matching_losses(2, loss_MTR, loss_DTR_sklearn, eps)

    # Test 3
    print(
        " [3/3] Reproducing DecisionTreeClassifier sklearn (depth=20) result..."
    )
    from models.modal_clf import modal_clf
    MTC = ModelTree(modal_clf(),
                    max_depth=20,
                    min_samples_leaf=10,
                    search_type="greedy",
                    n_search_grid=100)
    from models.DT_sklearn_clf import DT_sklearn_clf
    DTC_sklearn = DT_sklearn_clf(max_depth=20, min_samples_leaf=10)
    loss_MTC = experiment(MTC, X, y)
    loss_DTC_sklearn = experiment(DTC_sklearn, X, y)
    print("  -> loss(MTC)={:.6f}, loss(DTC_sklearn)={:.6f}...".format(
        loss_MTC, loss_DTC_sklearn))
    _require_matching_losses(3, loss_MTC, loss_DTC_sklearn, eps)
    print()
示例#4
0
def _prepare_test_features(data):
    """Return the test table with penalty encoded as l1/l2 ratios.

    Drops the ``id``, ``scale`` and ``random_state`` columns, adds an
    ``l2_ratio`` column, fills ``l1_ratio``/``l2_ratio`` from the
    ``penalty`` column, replaces ``n_jobs == -1`` with 16 and finally drops
    ``penalty``. The caller's frame is not modified.

    Raises:
        KeyError: if any of the columns named above is missing.
    """
    data = data.drop(columns=['id', 'scale', 'random_state'])
    penalty = data['penalty']

    # Start as float so the fractional elasticnet values below do not force
    # a dtype change on assignment.
    data['l2_ratio'] = 0.0

    # Fixed (l1_ratio, l2_ratio) pairs for the non-mixed penalties.
    # .loc assigns on the frame itself; chained ``data[col][mask] = v`` may
    # write to a temporary copy instead.
    fixed_ratios = {'none': (0, 0), 'l1': (1, 0), 'l2': (0, 1)}
    for name, (l1_value, l2_value) in fixed_ratios.items():
        rows = penalty == name
        data.loc[rows, 'l1_ratio'] = l1_value
        data.loc[rows, 'l2_ratio'] = l2_value

    # elasticnet keeps its own l1_ratio; l2 is the complement.
    rows = penalty == 'elasticnet'
    data.loc[rows, 'l2_ratio'] = 1 - data.loc[rows, 'l1_ratio']

    # NOTE(review): 16 presumably stands for the core count that -1 meant on
    # the machine that produced the data -- confirm.
    data.loc[data['n_jobs'] == -1, 'n_jobs'] = 16

    return data.drop(columns=['penalty'])


def main():
    """Train a model tree on ``train.csv`` and predict on ``test.csv``.

    Loads the training CSV, builds a ``ModelTree`` around a ``linear_regr``
    model, fits it, prints the training loss and exports a graphviz
    rendering under ``output``. Depending on the settings at the top of the
    function it then pickles the trained tree, writes per-sample
    predictions/explanations to CSV, and cross-validates. Finally it always
    reads the test CSV, prepares its features, predicts with the trained
    tree and writes the predictions to ``res_5.csv``.
    """
    # ====================
    # Settings
    # ====================
    mode = "regr"  # "clf" / "regr"
    save_model_tree = True  # save model tree?
    save_model_tree_predictions = False  # save model tree predictions/explanations?
    cross_validation = False  # cross-validate model tree?

    # ====================
    # Load data
    # ====================
    data_csv_data_filename = "/home/congee/Documents/Project_preparation/train.csv"
    X, y, header = load_csv_data(data_csv_data_filename, mode=mode, verbose=True)

    # *********************************************
    #
    # Insert your models here!
    #
    # All models must implement the following methods:
    #
    #   fit(X, y)
    #   predict(X)
    #   loss(X, y, y_pred)
    #
    # Below are some ready-for-use regression models:
    #
    #   mean regressor  (models/mean_regr.py)
    #   linear regressor  (models/linear_regr.py)
    #   logistic regressor  (models/logistic_regr.py)
    #   support vector machine regressor  (models/svm_regr.py)
    #   decision tree regressor (models/DT_sklearn_regr.py)
    #   neural network regressor (module path not visible here -- TODO confirm)
    #
    # as well as some classification models:
    #
    #   modal classifier (models/modal_clf.py)
    #   decision tree classifier (models/DT_sklearn_clf.py)
    #
    # *********************************************
    from models.mean_regr import mean_regr
    from models.linear_regr import linear_regr
    from models.logistic_regr import logistic_regr
    from models.svm_regr import svm_regr
    from models.DT_sklearn_regr import DT_sklearn_regr

    from models.modal_clf import modal_clf
    from models.DT_sklearn_clf import DT_sklearn_clf

    # Choose model
    model = linear_regr()

    # Build model tree
    model_tree = ModelTree(model, max_depth=5, min_samples_leaf=3,
                           search_type="greedy", n_search_grid=100)

    # ====================
    # Train model tree
    # ====================
    print("Training model tree with '{}'...".format(model.__class__.__name__))
    model_tree.fit(X, y, verbose=True)
    y_pred = model_tree.predict(X)
    explanations = model_tree.explain(X, header)
    loss = model_tree.loss(X, y, y_pred)
    print(" -> loss_train={:.6f}\n".format(loss))
    model_tree.export_graphviz(os.path.join("output", "model_tree"), header,
                               export_png=True, export_pdf=False)

    # ====================
    # Save model tree results
    # ====================
    if save_model_tree:
        model_tree_filename = os.path.join("output", "model_tree.p")
        print("Saving model tree to '{}'...".format(model_tree_filename))
        # Persist the trained tree (what the flag, message and filename refer
        # to) rather than the base model handed to its constructor, and close
        # the file deterministically.
        with open(model_tree_filename, 'wb') as f:
            pickle.dump(model_tree, f)

    if save_model_tree_predictions:
        predictions_csv_filename = os.path.join("output", "model_tree_pred.csv")
        print("Saving mode tree predictions to '{}'".format(predictions_csv_filename))
        # newline="" lets the csv module control line endings itself.
        with open(predictions_csv_filename, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["x", "y", "y_pred", "explanation"])
            # One row per sample: input, target, prediction, explanation.
            writer.writerows(zip(X, y, y_pred, explanations))

    # ====================
    # Cross-validate model tree
    # ====================
    if cross_validation:
        cross_validate(model_tree, X, y, kfold=5, seed=1)

    # ====================
    # Predict on the test set
    # ====================
    import pandas as pd
    import numpy as np
    file_name = "/home/congee/Documents/Project_preparation/test.csv"
    data = pd.read_csv(file_name, header=0, sep=',')
    X_test = np.array(_prepare_test_features(data))
    Y_test = model_tree.predict(X_test)

    ans = pd.DataFrame(Y_test)
    ans.to_csv('/home/congee/Documents/Project_preparation/res_5.csv', sep=',')