def main():
    """Run the model tree sanity checks, then plot fits on generated 1D data.

    First calls ``run_tests`` on ``data/data_clf.csv``. Then generates a CSV
    of a 4th-degree polynomial sampled on ``x_range=(0, 10)`` with ``N=500``,
    loads it back in regression mode, and passes a mean regressor and a linear
    regressor to ``plot_model_tree_fit``.
    """
    # ====================
    # Sanity checks on the model tree before training (using our own data):
    #  1) Reproduce model result on depth-0 model tree
    #  2) Reproduce sklearn DecisionTreeRegressor result using mean regression + mse
    #  3) Reproduce sklearn DecisionTreeClassifier result using modal class + gini loss
    # ====================
    run_tests(ModelTree, os.path.join("data", "data_clf.csv"))

    # ====================
    # 1D polynomial data, fitted by model trees with different leaf models
    # ====================
    poly_csv_path = os.path.join("data", "data_poly4_regr.csv")

    def quartic(x):
        # Roots at 1 and 4, plus a double root at 8.
        return (x - 1) * (x - 4) * (x - 8) * (x - 8)

    # Write the sampled polynomial to disk, then read it back the same way
    # any other data set would be loaded.
    generate_csv_data(quartic, poly_csv_path, x_range=(0, 10), N=500)
    X, y, _header = load_csv_data(poly_csv_path, mode="regr", verbose=True)
    assert X.shape[1] == 1  # the generated data has a single feature column

    # Train different depth model tree fits and plot results, one leaf model
    # at a time (each model is imported right before it is used).
    from models.mean_regr import mean_regr
    plot_model_tree_fit(mean_regr(), X, y)

    from models.linear_regr import linear_regr
    plot_model_tree_fit(linear_regr(), X, y)
def main():
    """Train a model tree on the configured CSV and save its results.

    Loads the training CSV from the hard-coded ``directory``, fits a
    ``ModelTree`` wrapping a linear regressor, prints the training loss and
    exports a graphviz rendering. Depending on the settings below it then
    pickles the fitted tree and writes a CSV with one row per sample holding
    the input, the target, the prediction and the explanation.
    """
    # ====================
    # Settings
    # ====================
    mode = "regr"  # "clf" / "regr"
    save_model_tree = True  # save model tree?
    save_model_tree_predictions = True  # save model tree predictions/explanations?
    # NOTE(review): this flag is never read below -- no cross-validation step
    # is wired up in this function.
    cross_validation = True  # cross-validate model tree?

    # ====================
    # Load data
    # ====================
    # Alternative locations used previously:
    # directory = '/Users/xiangyusun/Development/LearningX/advanced_ML/model_tree/data'
    # directory = '~/Desktop/LearningX/advanced_ML/model_tree/data'
    directory = '/cs/oschulte/xiangyus/LearningX'

    # Alternative training files used previously:
    # training_file_name = 'data_clf.csv'
    # training_file_name = 'sportlogiq_data_pass_2019_07_22_08_54_17_small.csv'
    training_file_name = 'sportlogiq_data_pass_2019_07_22_08_54_17.csv'

    data_csv_data_filename = os.path.join(directory, training_file_name)
    X, y, header = load_csv_data(data_csv_data_filename, mode=mode, verbose=True)

    # *********************************************
    #
    # Insert your models here!
    #
    # All models must implement the following methods:
    #
    #   fit(X, y)
    #   predict(X)
    #   loss(X, y, y_pred)
    #
    # Below are some ready-for-use regression models:
    #
    #   mean regressor  (models/mean_regr.py)
    #   linear regressor  (models/linear_regr.py)
    #   logistic regressor  (models/logistic_regr.py)
    #   support vector machine regressor  (models/svm_regr.py)
    #   decision tree regressor (models/DT_sklearn_regr.py)
    #   neural network regressor (the original note repeated
    #       models/DT_sklearn_regr.py here -- TODO confirm the real path)
    #
    # as well as some classification models:
    #
    #   modal classifier (models/modal_clf.py)
    #   decision tree classifier (models/DT_sklearn_clf.py)
    #
    # *********************************************
    # All candidates are imported so the model choice below can be swapped
    # by editing a single line.
    from models.mean_regr import mean_regr
    from models.linear_regr import linear_regr
    from models.logistic_regr import logistic_regr
    from models.svm_regr import svm_regr
    from models.DT_sklearn_regr import DT_sklearn_regr
    from models.modal_clf import modal_clf
    from models.DT_sklearn_clf import DT_sklearn_clf

    # Choose model
    model = linear_regr()

    # Build model tree
    model_tree = ModelTree(model, max_depth=10, min_samples_leaf=10,
                           search_type="greedy", n_search_grid=100)

    # ====================
    # Train model tree
    # ====================
    print("Training model tree with '{}'...".format(model.__class__.__name__))
    model_tree.fit(X, y, verbose=True)
    y_pred = model_tree.predict(X)
    explanations = model_tree.explain(X, header)
    loss = model_tree.loss(X, y, y_pred)
    print(" -> loss_train={:.6f}\n".format(loss))
    model_tree.export_graphviz(os.path.join(directory, "model_tree"), header,
                               export_png=True, export_pdf=False)

    # ====================
    # Save model tree results
    # ====================
    if save_model_tree:
        model_tree_filename = os.path.join(directory, "model_tree.p")
        print("Saving model tree to '{}'...".format(model_tree_filename))
        # Persist the trained tree itself (the original dumped the base
        # `model` object passed to ModelTree, not the tree), and close the
        # file deterministically.
        with open(model_tree_filename, "wb") as f:
            pickle.dump(model_tree, f)

    if save_model_tree_predictions:
        predictions_csv_filename = os.path.join(directory, "model_tree_pred.csv")
        print("Saving mode tree predictions to '{}'".format(
            predictions_csv_filename))
        # newline="" lets the csv module control line endings itself, which
        # avoids blank rows on platforms that translate "\n".
        with open(predictions_csv_filename, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["x", "y", "y_pred", "explanation"])
            writer.writerows(zip(X, y, y_pred, explanations))
def _report_test_outcome(test_number, loss_a, loss_b, eps):
    """Exit with a failure message if the losses differ by more than eps, else print a pass line."""
    if np.abs(loss_a - loss_b) > eps:
        # Raising SystemExit directly matches what exit(msg) did, without
        # depending on the `exit` helper that the `site` module injects.
        raise SystemExit("err: failed test {}!".format(test_number))
    print(" -> passed test {}!".format(test_number))


def run_tests(ModelTree, data_csv_filename):
    """Run three sanity checks comparing model tree losses against reference models.

    Each check computes two losses with ``experiment`` and requires them to
    agree within ``eps``:

      1. a depth-0 model tree wrapping a linear regressor vs. the bare
         linear regressor;
      2. a depth-20 model tree of mean regressors vs. ``DT_sklearn_regr``;
      3. a depth-20 model tree of modal classifiers vs. ``DT_sklearn_clf``.

    Args:
        ModelTree: the model tree class to test; it is instantiated with
            ``max_depth``, ``min_samples_leaf``, ``search_type`` and
            ``n_search_grid`` keyword arguments.
        data_csv_filename: path of the CSV file passed to ``load_csv_data``.

    Raises:
        SystemExit: if any pair of losses differs by more than ``eps``.
    """
    print("Running model tree tests...")
    eps = 1E-6  # tolerance for test acceptance

    # NOTE(review): all three checks, including the classifier comparison,
    # share the data loaded here with mode="regr" -- confirm this is intended.
    X, y, header = load_csv_data(data_csv_filename, mode="regr")

    # Test 1
    print(" [1/3] Checking depth-0 model tree...")
    from models.linear_regr import linear_regr
    model = linear_regr()
    MTR_0 = ModelTree(model, max_depth=0, min_samples_leaf=20,
                      search_type="greedy", n_search_grid=100)
    loss_model = experiment(model, X, y)
    loss_MTR_0 = experiment(MTR_0, X, y)
    print(" -> loss(linregr)={:.6f}, loss(MTR_0_linregr)={:.6f}...".format(
        loss_model, loss_MTR_0))
    _report_test_outcome(1, loss_model, loss_MTR_0, eps)

    # Test 2
    print(
        " [2/3] Reproducing DecisionTreeRegressor sklearn (depth=20) result..."
    )
    from models.mean_regr import mean_regr
    MTR = ModelTree(mean_regr(), max_depth=20, min_samples_leaf=10,
                    search_type="greedy", n_search_grid=100)
    from models.DT_sklearn_regr import DT_sklearn_regr
    DTR_sklearn = DT_sklearn_regr(max_depth=20, min_samples_leaf=10)
    loss_MTR = experiment(MTR, X, y)
    loss_DTR_sklearn = experiment(DTR_sklearn, X, y)
    print(" -> loss(MTR)={:.6f}, loss(DTR_sklearn)={:.6f}...".format(
        loss_MTR, loss_DTR_sklearn))
    _report_test_outcome(2, loss_MTR, loss_DTR_sklearn, eps)

    # Test 3
    print(
        " [3/3] Reproducing DecisionTreeClassifier sklearn (depth=20) result..."
    )
    from models.modal_clf import modal_clf
    MTC = ModelTree(modal_clf(), max_depth=20, min_samples_leaf=10,
                    search_type="greedy", n_search_grid=100)
    from models.DT_sklearn_clf import DT_sklearn_clf
    DTC_sklearn = DT_sklearn_clf(max_depth=20, min_samples_leaf=10)
    loss_MTC = experiment(MTC, X, y)
    loss_DTC_sklearn = experiment(DTC_sklearn, X, y)
    print(" -> loss(MTC)={:.6f}, loss(DTC_sklearn)={:.6f}...".format(
        loss_MTC, loss_DTC_sklearn))
    _report_test_outcome(3, loss_MTC, loss_DTC_sklearn, eps)

    print()
def _prepare_test_features(data):
    """Convert the raw test DataFrame into the feature table passed to the tree.

    Drops the ``id``, ``scale`` and ``random_state`` columns, replaces the
    categorical ``penalty`` column by an ``l1_ratio`` / ``l2_ratio`` pair,
    maps ``n_jobs == -1`` to 16, and returns the resulting DataFrame.
    """
    data = data.drop(columns=["id", "scale", "random_state"])
    penalty = data["penalty"]

    # Float column so the elasticnet complement (1 - l1_ratio) can be stored
    # without any dtype coercion.
    data["l2_ratio"] = 0.0

    # Fixed (l1_ratio, l2_ratio) pair for every non-elasticnet penalty.
    # `.loc[mask, column] = value` writes into `data` itself; the chained form
    # `data[column][mask] = value` may write to a temporary copy instead.
    fixed_ratios = {"none": (0, 0), "l1": (1, 0), "l2": (0, 1)}
    for penalty_name, (l1_ratio, l2_ratio) in fixed_ratios.items():
        mask = penalty == penalty_name
        data.loc[mask, "l1_ratio"] = l1_ratio
        data.loc[mask, "l2_ratio"] = l2_ratio

    # elasticnet keeps its own l1_ratio; l2_ratio is the complement.
    elastic = penalty == "elasticnet"
    data.loc[elastic, "l2_ratio"] = 1 - data.loc[elastic, "l1_ratio"]

    # NOTE(review): -1 presumably means "all cores" and 16 is the core count
    # of the machine the data came from -- confirm.
    data.loc[data["n_jobs"] == -1, "n_jobs"] = 16

    return data.drop(columns=["penalty"])


def main():
    """Train a model tree on the training CSV and write predictions for the test CSV.

    Loads the hard-coded training file, fits a ``ModelTree`` wrapping a linear
    regressor, prints the training loss and exports a graphviz rendering into
    ``output``. Depending on the settings below it pickles the fitted tree,
    writes the training predictions/explanations to CSV and cross-validates.
    Finally it reads the hard-coded test CSV, preprocesses it, predicts with
    the fitted tree and writes the predictions to ``res_5.csv``.
    """
    # ====================
    # Settings
    # ====================
    mode = "regr"  # "clf" / "regr"
    save_model_tree = True  # save model tree?
    save_model_tree_predictions = False  # save model tree predictions/explanations?
    cross_validation = False  # cross-validate model tree?

    # ====================
    # Load data
    # ====================
    # data_csv_data_filename = os.path.join("data", "/home/congee/Documents/Project_preparation/train.csv")
    data_csv_data_filename = "/home/congee/Documents/Project_preparation/train.csv"
    X, y, header = load_csv_data(data_csv_data_filename, mode=mode, verbose=True)

    # *********************************************
    #
    # Insert your models here!
    #
    # All models must implement the following methods:
    #
    #   fit(X, y)
    #   predict(X)
    #   loss(X, y, y_pred)
    #
    # Below are some ready-for-use regression models:
    #
    #   mean regressor  (models/mean_regr.py)
    #   linear regressor  (models/linear_regr.py)
    #   logistic regressor  (models/logistic_regr.py)
    #   support vector machine regressor  (models/svm_regr.py)
    #   decision tree regressor (models/DT_sklearn_regr.py)
    #   neural network regressor (the original note repeated
    #       models/DT_sklearn_regr.py here -- TODO confirm the real path)
    #
    # as well as some classification models:
    #
    #   modal classifier (models/modal_clf.py)
    #   decision tree classifier (models/DT_sklearn_clf.py)
    #
    # *********************************************
    # All candidates are imported so the model choice below can be swapped
    # by editing a single line.
    from models.mean_regr import mean_regr
    from models.linear_regr import linear_regr
    from models.logistic_regr import logistic_regr
    from models.svm_regr import svm_regr
    from models.DT_sklearn_regr import DT_sklearn_regr
    from models.modal_clf import modal_clf
    from models.DT_sklearn_clf import DT_sklearn_clf

    # Choose model
    model = linear_regr()

    # Build model tree
    model_tree = ModelTree(model, max_depth=5, min_samples_leaf=3,
                           search_type="greedy", n_search_grid=100)

    # ====================
    # Train model tree
    # ====================
    print("Training model tree with '{}'...".format(model.__class__.__name__))
    model_tree.fit(X, y, verbose=True)
    y_pred = model_tree.predict(X)
    explanations = model_tree.explain(X, header)
    loss = model_tree.loss(X, y, y_pred)
    print(" -> loss_train={:.6f}\n".format(loss))
    model_tree.export_graphviz(os.path.join("output", "model_tree"), header,
                               export_png=True, export_pdf=False)

    # ====================
    # Save model tree results
    # ====================
    if save_model_tree:
        model_tree_filename = os.path.join("output", "model_tree.p")
        print("Saving model tree to '{}'...".format(model_tree_filename))
        # Persist the trained tree itself (the original dumped the base
        # `model` object passed to ModelTree, not the tree), and close the
        # file deterministically.
        with open(model_tree_filename, "wb") as f:
            pickle.dump(model_tree, f)

    if save_model_tree_predictions:
        predictions_csv_filename = os.path.join("output", "model_tree_pred.csv")
        print("Saving mode tree predictions to '{}'".format(predictions_csv_filename))
        # newline="" lets the csv module control line endings itself, which
        # avoids blank rows on platforms that translate "\n".
        with open(predictions_csv_filename, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["x", "y", "y_pred", "explanation"])
            writer.writerows(zip(X, y, y_pred, explanations))

    # ====================
    # Cross-validate model tree
    # ====================
    if cross_validation:
        cross_validate(model_tree, X, y, kfold=5, seed=1)

    # ====================
    # Predict on the test set
    # ====================
    # Imported here, as in the original, so training does not depend on them.
    import pandas as pd
    import numpy as np

    file_name = "/home/congee/Documents/Project_preparation/test.csv"
    data = pd.read_csv(file_name, header=0, sep=',')
    X_test = np.array(_prepare_test_features(data))
    Y_test = model_tree.predict(X_test)

    ans = pd.DataFrame(Y_test)
    ans.to_csv('/home/congee/Documents/Project_preparation/res_5.csv', sep=',')