import time

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

import util  # local project module: plotting and grid-search CSV helpers


def train_svm(filename, X_train, X_test, y_train, y_test, solver='rbf', full_param=False, debug=False, numFolds=10,
              njobs=-1, scalar=1, make_graphs=False, pSVM={}):
    np.random.seed(1)
    algo = 'SVM'

    start = time.time()
    if len(pSVM) == 0:
        if full_param:
            param_grid = [{
                'kernel': [solver],  # 0.0001 - Finished for Linear
                # 'max_iter': [-1, 10000, 100000],
                # 'shrinking': [True, False],  # Seems to just make things faster/slower on larger iterations,
                #                              # I think cutting down 2x is better
                # 'probability': [True, False],
                'random_state': [1]
            }]
            if solver == 'rbf':
                param_grid[0]['C'] = [0.001]  # , 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000, 100000]
                param_grid[0]['gamma'] = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000, 100000]
            elif solver == 'sigmoid':
                param_grid[0]['gamma'] = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000]
                param_grid[0]['coef0'] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
                param_grid[0]['C'] = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000, 100000]
            elif solver == 'poly':
                param_grid[0]['gamma'] = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000, 100000]
                param_grid[0]['degree'] = [1, 2, 3, 4, 5, 6, 7, 8]
                param_grid[0]['coef0'] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
                param_grid[0]['C'] = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000, 100000]
            elif solver == 'linear':
                param_grid[0]['C'] = [1.0]
        else:
            param_grid = [{
                'kernel': [solver],
                'C': [0.01, 0.1, 1., 10., 100],
                'cache_size': [2000],
                'random_state': [1]
            }]
            if solver == 'poly' or solver == 'linear':
                param_grid = [{
                    'kernel': [solver],
                    'C': [0.001, 0.01, 0.1, 1., 10.],
                    'cache_size': [2000],
                    'random_state': [1]
                }]

        svm_classifier = svm.SVC(probability=True)
        grid_search = GridSearchCV(svm_classifier, param_grid, cv=numFolds, scoring='roc_auc_ovr_weighted',
                                   return_train_score=True, n_jobs=njobs, verbose=debug)
        grid_search.fit(X_train, y_train)
        cvres = grid_search.cv_results_
        best_params = grid_search.best_params_

        util.save_gridsearch_to_csv(cvres, algo, filename[:-4], scalar, solver)

        # probability=True is required for the predict_proba calls below
        svm_classifier = svm.SVC(probability=True)
        svm_classifier.set_params(**best_params)
    else:
        svm_classifier = svm.SVC(probability=True)
        svm_classifier.set_params(**pSVM)

    start = time.time()
    svm_classifier.fit(X_train, y_train)
    print('SVM Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = svm_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('SVM Train Score Time: ', time.time() - start)

    start = time.time()
    y_prob = svm_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('SVM Test Score Time: ', time.time() - start)

    test_class = svm.SVC()
    test_class.set_params(**pSVM)

    if make_graphs:
        util.plot_learning_curve(svm_classifier, algo, filename[:-4], X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)
        util.compute_vc(algo, 'kernel', ['rbf', 'sigmoid', 'poly', 'linear'], X_train, y_train, X_test, y_test,
                        svm_classifier, filename[:-4], test_class, pSVM, log=False, njobs=njobs, debug=debug,
                        smalllegend=True)

        util.svm_rbf_C_Gamma_viz(X_train, y_train, pSVM, njobs, filename[:-4], train_score)

        # compute Model Complexity/Validation curves
        util.compute_vc(algo, 'kernel', ['rbf', 'sigmoid', 'poly', 'linear'], X_train, y_train, X_test, y_test,
                        svm_classifier, filename[:-4], test_class, pSVM, log=False, njobs=njobs)
        util.compute_vc(algo, 'C', [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000], X_train, y_train, X_test,
                        y_test, svm_classifier, filename[:-4], test_class, pSVM, log=True, njobs=njobs, debug=debug)

        if solver == 'rbf':
            util.compute_vc(algo, 'gamma', [0.0001, 0.001, 0.01, 0.1, 1.0, 5.0, 10.0], X_train, y_train, X_test,
                            y_test, svm_classifier, filename[:-4], test_class, pSVM, log=True, njobs=njobs,
                            debug=debug)
        elif solver == 'sigmoid':
            util.compute_vc(algo, 'gamma', [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], X_train, y_train,
                            X_test, y_test, svm_classifier, filename[:-4], test_class, pSVM, log=True, njobs=njobs,
                            debug=debug)
            util.compute_vc(algo, 'coef0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], X_train, y_train, X_test, y_test,
                            svm_classifier, filename[:-4], test_class, pSVM, log=False, njobs=njobs, debug=debug)
        elif solver == 'poly':
            util.compute_vc(algo, 'gamma', [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], X_train, y_train,
                            X_test, y_test, svm_classifier, filename[:-4], test_class, pSVM, log=True, njobs=njobs,
                            debug=debug)
            util.compute_vc(algo, 'coef0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], X_train, y_train, X_test, y_test,
                            svm_classifier, filename[:-4], test_class, pSVM, log=False, njobs=njobs, debug=debug)
            util.compute_vc(algo, 'degree', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], X_train, y_train, X_test, y_test,
                            svm_classifier, filename[:-4], test_class, pSVM, log=False, njobs=njobs, debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
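
# Hedged usage sketch (illustrative, not part of the original module): exercises
# train_svm on a small synthetic multiclass problem with fixed parameters, so the
# grid search is skipped. The 'demo.csv' name is hypothetical and only used to
# derive output names; probability=True is passed so predict_proba works.
def _demo_train_svm():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=300, n_features=10, n_informative=6, n_classes=3, random_state=1)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
    return train_svm('demo.csv', X_tr, X_te, y_tr, y_te,
                     pSVM={'kernel': 'rbf', 'C': 1.0, 'gamma': 0.1, 'probability': True, 'random_state': 1})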
def train_kmeansNN(filename, X_train, X_test, y_train, y_test, debug=False, numFolds=10, njobs=-1, scalar=1,
                   make_graphs=False, pNN={}, nolegend=False, random_seed=1, num_clusts=4):
    np.random.seed(random_seed)
    algo = 'Neural Network'

    start = time.time()
    if num_clusts != 1:
        # Append the KMeans cluster assignment as a one-hot-encoded feature.
        # Note: if a cluster never appears in the test predictions, the train/test
        # dummy columns can misalign.
        KClusters = KMeans(init='k-means++', n_clusters=num_clusts, n_init=100, random_state=random_seed,
                           max_iter=100).fit(X_train)
        X_train.insert(0, 'Cluster', KClusters.predict(X_train))
        X_train['Cluster'] = X_train['Cluster'].apply(str)
        X_test.insert(0, 'Cluster', KClusters.predict(X_test))
        X_test['Cluster'] = X_test['Cluster'].apply(str)
        X_train = pd.get_dummies(X_train, prefix='Cluster')
        X_test = pd.get_dummies(X_test, prefix='Cluster')

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],  # 'identity',
        'solver': ['adam'],
        'alpha': [0.01],  # [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.01],  # [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    nn_classifier = MLPClassifier()
    grid_search = GridSearchCV(nn_classifier, param_grid, cv=numFolds, scoring='roc_auc_ovr_weighted',
                               return_train_score=True, n_jobs=njobs, verbose=debug)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_
    best_params = grid_search.best_params_
    util.save_gridsearch_to_csv(cvres, algo, filename[:-4] + '-' + str(num_clusts), scalar, '-kmeans')

    # Refit a classifier configured with the best parameters from the grid search
    nn_classifier = MLPClassifier()
    nn_classifier.set_params(**best_params)

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # compute Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier, 'K-Means', filename[:-4], X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
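
# Hedged usage sketch (illustrative, not part of the original module): runs the
# KMeans-augmented network on a toy problem. X must be a pandas DataFrame here
# because the function uses DataFrame.insert and pd.get_dummies. The grid search
# always runs (and writes a CSV via the local util module), so numFolds is
# lowered; even so, the (512, 512, 512, 512) network makes this slow.
def _demo_train_kmeansNN():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=300, n_features=10, n_informative=6, n_classes=3, random_state=1)
    X = pd.DataFrame(X, columns=[f'f{i}' for i in range(X.shape[1])])
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
    return train_kmeansNN('demo.csv', X_tr, X_te, y_tr, y_te, numFolds=3, num_clusts=3)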
def train_NN_LLE(filename, X_train, X_test, y_train, y_test, debug=False, numFolds=10, njobs=-1, scalar=1,
                 make_graphs=False, pNN={}, nolegend=False, random_seed=1, num_dim=4):
    np.random.seed(random_seed)
    algo = 'LLE' + str(num_dim)

    start = time.time()
    # Reduce dimensionality with Locally Linear Embedding before training the network
    lle = LocallyLinearEmbedding(n_neighbors=10, n_components=num_dim, random_state=random_seed, n_jobs=-1)
    lle.fit(X_train)
    X_train = lle.transform(X_train)
    X_test = lle.transform(X_test)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],  # 'identity',
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    nn_classifier = MLPClassifier()
    grid_search = GridSearchCV(nn_classifier, param_grid, cv=numFolds, scoring='roc_auc_ovr_weighted',
                               return_train_score=True, n_jobs=njobs, verbose=debug)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_
    best_params = grid_search.best_params_
    util.save_gridsearch_to_csv(cvres, algo, filename[:-4] + '-' + str(num_dim), scalar, '')

    # Refit a classifier configured with the best parameters from the grid search
    nn_classifier = MLPClassifier()
    nn_classifier.set_params(**best_params)

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # compute Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier, algo, filename[:-4], X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
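
# Hedged usage sketch (illustrative, not part of the original module): projects a
# synthetic dataset onto num_dim LLE components before the network grid search.
# numFolds is lowered to keep the 8-combination grid cheap; the grid search still
# writes a CSV through the local util module.
def _demo_train_NN_LLE():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=300, n_features=10, n_informative=6, n_classes=3, random_state=1)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
    return train_NN_LLE('demo.csv', X_tr, X_te, y_tr, y_te, numFolds=3, num_dim=3)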
def train_BTree(filename, X_train, X_test, y_train, y_test, full_param=False, debug=False, numFolds=10, njobs=-1,
                scalar=1, make_graphs=False, pBTree={}):
    np.random.seed(1)
    start = time.time()
    algo = 'Boosted Tree'

    if len(pBTree) == 0:
        if full_param:
            param_grid = [{
                'base_estimator__criterion': ['gini', 'entropy'],
                'base_estimator__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100],
                # 'base_estimator__min_samples_split': [2, 3, 5, 6, 8, 10],
                # 'base_estimator__min_samples_leaf': [1, 2, 3, 5, 6, 8, 10],
                # 'base_estimator__max_features': [0.9, 1.0],  # 0.1, 0.3, 0.5,
                'base_estimator__max_leaf_nodes': [10, 100],  # 2, 4, 5, 7,
                'base_estimator__ccp_alpha': [0.0, 0.005, 0.01],  # 0.015, 0.02, 0.025, 0.030, 0.035, 0.04],
                'base_estimator__splitter': ['best'],  # 'random'],
                'n_estimators': [1, 50, 100, 150, 200, 250, 300],
                'learning_rate': [0.1, 0.5, 1],
                'random_state': [1]
            }]
        else:
            param_grid = [{
                'base_estimator__criterion': ['gini', 'entropy'],
                'base_estimator__max_depth': [3, 5, 7, 10],
                'base_estimator__ccp_alpha': [0.0, 0.005, 0.01, 0.035],
                # 'base_estimator__min_samples_split': [3, 5, 7, 10],
                # 'base_estimator__ccp_alpha': [0.0, 0.005, 0.015, 0.025, 0.35, 0.04],
                'n_estimators': [1, 50, 100, 150],
                # 'learning_rate': [0.1, 0.5, 1],
                'random_state': [1]
            }]

        DTC = DecisionTreeClassifier(random_state=11)
        adaTree = AdaBoostClassifier(base_estimator=DTC)

        # run grid search
        grid_search = GridSearchCV(adaTree, param_grid=param_grid, cv=numFolds, scoring='roc_auc_ovr_weighted',
                                   return_train_score=True, n_jobs=njobs, verbose=debug)
        grid_search.fit(X_train, y_train)

        cvres = grid_search.cv_results_
        best_params = grid_search.best_params_

        util.save_gridsearch_to_csv(cvres, algo, filename[:-4], scalar)

        btree_classifier = AdaBoostClassifier(base_estimator=DTC)
        btree_classifier.set_params(**best_params)
    else:
        DTC = DecisionTreeClassifier()
        btree_classifier = AdaBoostClassifier(base_estimator=DTC)
        btree_classifier.set_params(**pBTree)

    start = time.time()
    btree_classifier.fit(X_train, y_train)
    print('BTree Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = btree_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('BTree Train Score Time: ', time.time() - start)

    start = time.time()
    y_prob = btree_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('BTree Test Score Time: ', time.time() - start)

    DTC = DecisionTreeClassifier()
    test_class = AdaBoostClassifier(base_estimator=DTC)
    test_class.set_params(**pBTree)

    if make_graphs:
        util.boost_lr_vs_nest(X_train, y_train, pBTree, njobs, filename[:-4], train_score)
        util.compute_vc(algo, 'n_estimators',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=True, njobs=njobs, debug=debug, extraText='log')
        util.plot_learning_curve(btree_classifier, algo, filename[:-4], X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)
        util.compute_vc(algo, 'base_estimator__max_depth',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 40, 50, 60, 70,
                         80, 90, 100],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=True, njobs=njobs, debug=debug)
        util.compute_vc(algo, 'base_estimator__max_leaf_nodes',
                        [2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 50, 75, 100, 200, 500, 1000, 10000],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=True, njobs=njobs)

        # compute Model Complexity/Validation curves
        util.compute_vc(algo, 'base_estimator__criterion', ['gini', 'entropy'], X_train, y_train, X_test, y_test,
                        btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)
        util.compute_vc(algo, 'n_estimators',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=False, njobs=njobs, debug=debug)
        util.compute_vc(algo, 'learning_rate',
                        [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=True, njobs=njobs, debug=debug)
        util.compute_vc(algo, 'base_estimator__ccp_alpha',
                        [0.000001, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009,
                         0.0001, 0.00011, 0.00012, 0.00013, 0.00014, 0.00015, 0.00016, 0.00017, 0.00018, 0.00019,
                         0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.01, 0.1, 1],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=True, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__min_samples_split', [2, 3, 5, 6, 8, 10], X_train, y_train, X_test,
                        y_test, btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__min_samples_leaf',
                        [1, 2, 3, 5, 6, 8, 10, 25, 50, 75, 100, 250, 500, 750, 1000],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=True, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__max_features',
                        [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 0.99999, 1.0],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=False, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__splitter', ['best', 'random'], X_train, y_train, X_test, y_test,
                        btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
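
# Hedged usage sketch (illustrative, not part of the original module): fits the
# AdaBoost-over-decision-tree model with fixed parameters, bypassing the grid
# search. Parameter names use the base_estimator__ prefix, matching the older
# sklearn AdaBoostClassifier(base_estimator=...) API used above.
def _demo_train_BTree():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=300, n_features=10, n_informative=6, n_classes=3, random_state=1)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
    return train_BTree('demo.csv', X_tr, X_te, y_tr, y_te,
                       pBTree={'base_estimator__max_depth': 3, 'n_estimators': 50,
                               'learning_rate': 0.5, 'random_state': 1})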
def train_NN(filename, X_train, X_test, y_train, y_test, solver='adam', full_param=False, debug=False, numFolds=10,
             njobs=-1, scalar=1, make_graphs=False, pNN={}, nolegend=False):
    np.random.seed(1)
    algo = 'Neural Network'

    start = time.time()
    if len(pNN) == 0:
        if full_param:
            param_grid = [{
                'hidden_layer_sizes': [(8,), (16,), (32,), (8, 8), (16, 16), (32, 32), (8, 8, 8), (16, 16, 16),
                                       (32, 32, 32), (128,), (128, 128), (128, 128, 128), (128, 128, 128, 128),
                                       (256,), (256, 256), (512,), (512, 512), (256, 256, 256),
                                       (256, 256, 256, 256), (512, 512, 512), (512, 512, 512, 512)],
                'activation': ['logistic', 'tanh', 'relu'],  # 'identity',
                'solver': [solver],  # 'lbfgs',
                'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1],
                'batch_size': ['auto'],
                'learning_rate_init': [0.001, 0.01],
                'max_iter': [10000],
                'warm_start': [True],
                'early_stopping': [True],
                'random_state': [1]
            }]
            if solver == 'sgd':
                param_grid[0]['learning_rate'] = ['constant', 'invscaling', 'adaptive']  # Only used when solver='sgd'
        else:
            param_grid = [{
                'hidden_layer_sizes': [(8,), (16,), (32,), (8, 8), (16, 16), (32, 32), (8, 16), (8, 32), (16, 32),
                                       (128,), (128, 128), (128, 128, 128), (128, 128, 128, 128)],
                # 'hidden_layer_sizes': [(512, 512), (256, 256), (1024,), (1024, 1024),
                #                        (256, 256, 256), (256, 256, 256, 256), (512, 512, 512),
                #                        (512, 512, 512, 512)],
                'solver': [solver],
                'activation': ['identity', 'relu'],  # 'logistic', 'tanh'],
                'max_iter': [10000],
                'early_stopping': [True],
                'random_state': [1]
            }]

        nn_classifier = MLPClassifier()
        grid_search = GridSearchCV(nn_classifier, param_grid, cv=numFolds, scoring='roc_auc_ovr_weighted',
                                   return_train_score=True, n_jobs=njobs, verbose=debug)
        grid_search.fit(X_train, y_train)
        cvres = grid_search.cv_results_
        best_params = grid_search.best_params_

        util.save_gridsearch_to_csv(cvres, algo, filename[:-4], scalar, solver)

        nn_classifier = MLPClassifier()
        nn_classifier.set_params(**best_params)
    else:
        nn_classifier = MLPClassifier()
        nn_classifier.set_params(**pNN)

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('NN Train Score Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('NN Test Score Time: ', time.time() - start)

    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # compute Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier, algo, filename[:-4], X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)
        util.compute_vc(algo, 'activation', ['identity', 'logistic', 'tanh', 'relu'], X_train, y_train, X_test,
                        y_test, nn_classifier, filename[:-4], test_class, pNN, log=False, njobs=njobs, debug=debug)
        util.compute_vc(algo, 'max_iter',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500,
                         600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
                        X_train, y_train, X_test, y_test, nn_classifier, filename[:-4], test_class, pNN, log=True,
                        njobs=njobs, debug=debug)
        util.compute_vc(algo, 'hidden_layer_sizes', [(1), (2), (4), (8), (16), (32), (64), (128), (256), (512)],
                        X_train, y_train, X_test, y_test, nn_classifier, filename[:-4], test_class, pNN, log=False,
                        njobs=njobs, debug=debug, fString=True, extraText=' 1-Layer', rotatex=True,
                        nolegend=nolegend)
        util.compute_vc(algo, 'hidden_layer_sizes',
                        [(1, 1), (2, 2), (4, 4), (8, 8), (16, 16), (32, 32), (64, 64), (128, 128), (256, 256),
                         (512, 512)],
                        X_train, y_train, X_test, y_test, nn_classifier, filename[:-4], test_class, pNN, log=False,
                        njobs=njobs, debug=debug, fString=True, extraText=' 2-Layer', rotatex=True,
                        nolegend=nolegend)
        util.compute_vc(algo, 'hidden_layer_sizes',
                        [(1, 1, 1), (2, 2, 2), (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32), (64, 64, 64),
                         (128, 128, 128), (256, 256, 256), (512, 512, 512)],
                        X_train, y_train, X_test, y_test, nn_classifier, filename[:-4], test_class, pNN, log=False,
                        njobs=njobs, debug=debug, fString=True, extraText=' 3-Layer', rotatex=True,
                        nolegend=nolegend)
        util.compute_vc(algo, 'hidden_layer_sizes',
                        [(1, 1, 1, 1), (2, 2, 2, 2), (4, 4, 4, 4), (8, 8, 8, 8), (16, 16, 16, 16), (32, 32, 32, 32),
                         (64, 64, 64, 64), (128, 128, 128, 128), (256, 256, 256, 256), (512, 512, 512, 512)],
                        X_train, y_train, X_test, y_test, nn_classifier, filename[:-4], test_class, pNN, log=False,
                        njobs=njobs, debug=debug, fString=True, extraText=' 4-Layer', rotatex=True,
                        nolegend=nolegend)
        util.compute_vc(algo, 'solver', ['adam', 'sgd', 'lbfgs'], X_train, y_train, X_test, y_test, nn_classifier,
                        filename[:-4], test_class, pNN, log=False, njobs=njobs)
        util.compute_vc(algo, 'alpha',
                        [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500,
                         1000, 5000, 10000, 100000, 1000000],
                        X_train, y_train, X_test, y_test, nn_classifier, filename[:-4], test_class, pNN, log=True,
                        njobs=njobs, debug=debug)
        if solver == 'sgd':
            util.compute_vc(algo, 'learning_rate', ['constant', 'invscaling', 'adaptive'], X_train, y_train, X_test,
                            y_test, nn_classifier, filename[:-4], test_class, pNN, log=True, njobs=njobs)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
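
# Hedged usage sketch (illustrative, not part of the original module): trains the
# MLP with fixed parameters so the grid search is skipped. early_stopping keeps
# this toy run short; max_iter mirrors the grids above.
def _demo_train_NN():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=300, n_features=10, n_informative=6, n_classes=3, random_state=1)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
    return train_NN('demo.csv', X_tr, X_te, y_tr, y_te,
                    pNN={'hidden_layer_sizes': (32, 32), 'activation': 'relu', 'solver': 'adam',
                         'max_iter': 1000, 'early_stopping': True, 'random_state': 1})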
def train_DTree(filename, X_train, X_test, y_train, y_test, full_param=False, debug=False, numFolds=10, njobs=-1,
                scalar=1, make_graphs=False, pDTree={}):
    np.random.seed(1)
    algo = 'Decision Tree'

    start = time.time()
    if len(pDTree) == 0:
        if full_param:
            param_grid = [{
                'criterion': ['gini', 'entropy'],
                'max_depth': [3, 5, 7, 10, 100],
                'min_samples_split': [2, 3, 5, 7, 8, 10],
                # 'min_samples_leaf': [0.1, 0.2, 0.3, 0.5],
                'ccp_alpha': [0.0, 0.00001, 0.0001, 0.001, 0.005, 0.01, 0.015],
                'random_state': [1],
            }]
        else:
            param_grid = [{
                'criterion': ['gini', 'entropy'],
                'max_depth': [3, 5, 7, 10, 100],
                'min_samples_split': [2, 3, 5, 7, 10],
                'ccp_alpha': [0, .01, .02],
                'random_state': [1]
            }]

        tree_classifier = DecisionTreeClassifier()
        grid_search = GridSearchCV(tree_classifier, param_grid, cv=numFolds, scoring='roc_auc_ovr_weighted',
                                   return_train_score=True, n_jobs=njobs, verbose=debug)
        grid_search.fit(X_train, y_train)
        cvres = grid_search.cv_results_
        best_params = grid_search.best_params_

        util.save_gridsearch_to_csv(cvres, algo, filename[:-4], scalar)

        # Fit algo to best parameters and compute test score
        tree_classifier = DecisionTreeClassifier()
        tree_classifier.set_params(**best_params)
    else:
        # Fit algo to best parameters and compute test score
        tree_classifier = DecisionTreeClassifier()
        tree_classifier.set_params(**pDTree)

    start = time.time()
    tree_classifier.fit(X_train, y_train)
    print('DTree Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = tree_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('DTree Train Score Time: ', time.time() - start)

    start = time.time()
    y_prob = tree_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('DTree Test Score Time: ', time.time() - start)

    if make_graphs:
        '''# Plot DTree
        create_DT_image(tree_classifier, X_train, filename[:-4], scalar, True)

        # Plot without pruning; need to make it again with ccp_alpha = 0
        unprune_tree = DecisionTreeClassifier()
        unprune_tree.set_params(**pDTree)
        unprune_tree.set_params(**{'ccp_alpha': 0})
        unprune_tree.fit(X_train, y_train)
        create_DT_image(unprune_tree, X_train, filename[:-4], scalar, False)'''

        # compute Model Complexity/Validation curves
        test_class = DecisionTreeClassifier()
        test_class.set_params(**pDTree)

        util.compute_vc(algo, 'criterion', ['gini', 'entropy'], X_train, y_train, X_test, y_test, tree_classifier,
                        filename[:-4], test_class, pDTree, log=False, njobs=njobs, debug=debug, smalllegend=True)

        # Plot Learning Curve
        util.plot_learning_curve(tree_classifier, algo, filename[:-4], X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)
        util.compute_vc(algo, 'max_depth',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                         50, 75, 100],
                        X_train, y_train, X_test, y_test, tree_classifier, filename[:-4], test_class, pDTree,
                        log=True, njobs=njobs, debug=debug)
        util.compute_vc(algo, 'ccp_alpha',
                        [0.000001, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009,
                         0.0001, 0.00011, 0.00012, 0.00013, 0.00014, 0.00015, 0.00016, 0.00017, 0.00018, 0.00019,
                         0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.01, 0.1, 1],
                        X_train, y_train, X_test, y_test, tree_classifier, filename[:-4], test_class, pDTree,
                        log=True, njobs=njobs, debug=debug)
        util.compute_vc(algo, 'min_samples_split', [2, 3, 5, 7, 10], X_train, y_train, X_test, y_test,
                        tree_classifier, filename[:-4], test_class, pDTree, log=False, njobs=njobs, debug=debug)
        util.compute_vc(algo, 'min_samples_leaf',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 50, 75, 100, 250, 500, 750, 1000],
                        X_train, y_train, X_test, y_test, tree_classifier, filename[:-4], test_class, pDTree,
                        log=True, njobs=njobs)
        util.compute_vc(algo, 'max_leaf_nodes',
                        [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60, 100, 250, 500, 750, 1000, 2500,
                         5000, 7500, 10000],
                        X_train, y_train, X_test, y_test, tree_classifier, filename[:-4], test_class, pDTree,
                        log=True, njobs=njobs)
        util.compute_vc(algo, 'max_features',
                        [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.8, 0.85, 0.9, 0.95,
                         0.99999, 1.0],
                        X_train, y_train, X_test, y_test, tree_classifier, filename[:-4], test_class, pDTree,
                        log=False, njobs=njobs)
        util.compute_vc(algo, 'splitter', ['best', 'random'], X_train, y_train, X_test, y_test, tree_classifier,
                        filename[:-4], test_class, pDTree, log=False, njobs=njobs)

        # Re-plot max_depth with pruning disabled for comparison
        tree_classifier.set_params(**{'ccp_alpha': 0})
        test_class.set_params(**{'ccp_alpha': 0})
        pDTree['ccp_alpha'] = 0
        util.compute_vc(algo, 'max_depth',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                         50, 75, 100],
                        X_train, y_train, X_test, y_test, tree_classifier, filename[:-4], test_class, pDTree,
                        log=True, njobs=njobs, debug=debug, extraText='noprune')

    return time.time() - start, round(train_score, 4), round(test_score, 4)
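
# Hedged usage sketch (illustrative, not part of the original module): the
# smallest end-to-end run, with fixed tree parameters so no grid search or CSV
# output is triggered.
def _demo_train_DTree():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=300, n_features=10, n_informative=6, n_classes=3, random_state=1)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
    return train_DTree('demo.csv', X_tr, X_te, y_tr, y_te,
                       pDTree={'criterion': 'gini', 'max_depth': 5, 'ccp_alpha': 0.01, 'random_state': 1})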