def perform(self):
    # TODO: Clean up the older alpha stuff?
    max_depths = np.arange(1, 51, 1)
    params = {'DT__criterion': ['gini', 'entropy'], 'DT__max_depth': max_depths,
              'DT__class_weight': ['balanced', None]}  # , 'DT__max_leaf_nodes': max_leaf_nodes}
    complexity_param = {'name': 'DT__max_depth', 'display_name': 'Max Depth', 'values': max_depths}

    best_params = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # Dataset 1:
    # best_params = {'criterion': 'gini', 'max_depth': 5, 'class_weight': 'balanced'}
    #
    # Dataset 2:
    best_params = {'criterion': 'entropy', 'max_depth': 14, 'class_weight': 'balanced'}

    learner = learners.DTLearner(random_state=self._details.seed)
    if best_params is not None:
        learner.set_params(**best_params)

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'DT', 'DT', params, complexity_param=complexity_param,
                                   seed=self._details.seed, threads=self._details.threads,
                                   best_params=best_params, verbose=self._verbose)
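# The variants in this section all depend on learners.DTLearner, whose definition is not
# shown here. A minimal sketch of what such a wrapper presumably looks like -- a thin shim
# over sklearn's DecisionTreeClassifier so that set_params(**best_params) and the 'DT__*'
# pipeline keys above line up. The class itself is an assumption, not this repo's code.
from sklearn.tree import DecisionTreeClassifier

class DTLearner(DecisionTreeClassifier):
    """Hypothetical stand-in: inherits the full sklearn estimator API
    (fit/predict/get_params/set_params), so a Pipeline step named 'DT'
    exposes its parameters as DT__criterion, DT__max_depth, and so on."""
    pass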
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/Boosting.py

    # Search for good alphas
    alphas = np.arange(1, 11)
    max_depths = np.arange(1, 41, 1)  # np.arange(1, 11)
    base = learners.DTLearner(criterion='gini', class_weight='balanced', random_state=self._details.seed)
    of_base = learners.DTLearner(criterion='gini', class_weight='balanced', random_state=self._details.seed)
    booster = learners.BoostingLearner(algorithm='SAMME', learning_rate=1, base_estimator=base,
                                       random_state=self._details.seed)
    of_booster = learners.BoostingLearner(algorithm='SAMME', learning_rate=1, base_estimator=of_base,
                                          random_state=self._details.seed)

    # TODO: No 90 here?
    params = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
              'Boost__base_estimator__max_depth': max_depths}
    iteration_params = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
    of_params = {'Boost__base_estimator__max_depth': 100, 'Boost__n_estimators': 50}
    complexity_param = {'name': 'Boost__n_estimators', 'display_name': 'Estimator count', 'x_scale': 'log',
                        'values': [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   booster, 'Boost', 'Boost', params, complexity_param=complexity_param,
                                   seed=self._details.seed, threads=self._details.threads,
                                   verbose=self._verbose)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   of_booster, 'Boost_OF', 'Boost', of_params, seed=self._details.seed,
                                   iteration_params=iteration_params, threads=self._details.threads,
                                   verbose=self._verbose, iteration_lc_only=True)
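# For orientation: BoostingLearner is also not defined in this section; the calls above read
# like a wrapper over sklearn's AdaBoostClassifier. A self-contained sketch under that
# assumption (the names and the n_estimators value are illustrative only):
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

base = DecisionTreeClassifier(criterion='gini', class_weight='balanced', random_state=0)
booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1, estimator=base,
                             n_estimators=50, random_state=0)
# Older sklearn releases spell the base-learner argument `base_estimator`, which is what
# the 'Boost__base_estimator__max_depth' grid keys in this code assume.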
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/DT.py
    max_depths = np.arange(1, 25, 1)
    params = {
        'DT__criterion': ['gini', 'entropy'],
        'DT__max_depth': max_depths,
        'DT__class_weight': ['balanced']
    }
    complexity_param = {
        'name': 'DT__max_depth',
        'display_name': 'Max Depth',
        'values': max_depths
    }

    learner = learners.DTLearner(random_state=self._details.seed)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'DT', 'DT', params, complexity_param=complexity_param,
                                   seed=self._details.seed, threads=self._details.threads,
                                   verbose=self._verbose)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/KNN.py
    params = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform']
    }
    complexity_param = {
        'name': 'KNN__n_neighbors',
        'display_name': 'Neighbor count',
        'values': np.arange(1, 51, 1)
    }

    best_params = None
    if self._details.ds_best_params is not None and 'KNN' in self._details.ds_best_params:
        best_params = self._details.ds_best_params['KNN']

    learner = learners.KNNLearner(n_jobs=self._details.threads)
    if best_params is not None:
        learner.set_params(**best_params)

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'KNN', 'KNN', params, complexity_param=complexity_param,
                                   seed=self._details.seed, best_params=best_params,
                                   threads=self._details.threads, verbose=self._verbose)
def perform(self):
    # TODO: Clean up the older alpha stuff?
    max_depths = np.arange(1, 51, 1)
    params = {
        'DT__criterion': ['gini', 'entropy'],
        'DT__max_depth': max_depths,
        'DT__class_weight': ['balanced', None]
    }  # , 'DT__max_leaf_nodes': max_leaf_nodes}
    complexity_param = {
        'name': 'DT__max_depth',
        'display_name': 'Max Depth',
        'values': max_depths
    }

    best_params = None
    learner = learners.DTLearner(random_state=self._details.seed)
    if self._details.ds_best_params is not None and 'DT' in self._details.ds_best_params:
        best_params = self._details.ds_best_params['DT']
    if best_params is not None:
        learner.set_params(**best_params)

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'DT', 'DT', params, complexity_param=complexity_param,
                                   seed=self._details.seed, threads=self._details.threads,
                                   best_params=best_params, verbose=self._verbose)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/KNN.py
    params = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform']
    }
    complexity_param = {
        'name': 'KNN__n_neighbors',
        'display_name': 'Neighbor count',
        'values': np.arange(1, 51, 1)
    }

    best_params = None
    # Pin known best params from a previous grid search per dataset. Pinning narrows the grid
    # to a single candidate per key, so only the graphs are rebuilt.
    # NOTE: the keys must carry the 'KNN__' pipeline prefix to match the grid above; the
    # original unprefixed keys ('metric', ...) silently matched nothing in the loops below.
    #
    # Dataset 1:
    params_wine = {
        'KNN__metric': 'manhattan',
        'KNN__n_neighbors': 22,
        'KNN__weights': 'uniform'
    }
    if self._details.ds_name == "wine-qual":
        for k in params.keys():
            if k in params_wine.keys():
                params[k] = [params_wine.get(k)]
    #
    # Dataset 2:
    params_enhancer = {
        'KNN__metric': 'manhattan',
        'KNN__n_neighbors': 7,
        'KNN__weights': 'uniform'
    }
    if self._details.ds_name == "enhancer-b":
        for k in params.keys():
            if k in params_enhancer.keys():
                params[k] = [params_enhancer.get(k)]

    learner = learners.KNNLearner(n_jobs=self._details.threads)
    if best_params is not None:
        learner.set_params(**best_params)

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'KNN', 'KNN', params, complexity_param=complexity_param,
                                   seed=self._details.seed, best_params=best_params,
                                   threads=self._details.threads, verbose=self._verbose)
def perform(self): # TODO: Clean up the older alpha stuff? max_depths = np.arange(1, 51, 1) params = { "DT__criterion": ["gini", "entropy"], "DT__max_depth": max_depths, "DT__class_weight": ["balanced", None] } # , "DT__max_leaf_nodes": max_leaf_nodes} complexity_param = { "name": "DT__max_depth", "display_name": "Max Depth", "values": max_depths } # max_leaf_nodes = np.arange(10, 200, 10) # params = {"DT__criterion": ["gini", "entropy"], # "DT__class_weight": ["balanced", None], "DT__max_leaf_nodes": max_leaf_nodes} # complexity_param = { # "name": "DT__max_leaf_nodes", "display_name": "Max Leaf Nodes", "values": max_leaf_nodes} best_params = None # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild # the various graphs # # Dataset 1: # Seed: 2702306879, 3882803657 # best_params = {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 11} # # Dataset 2: # best_params = {"criterion": "entropy", "max_depth": 4, "class_weight": "balanced"} learner = learners.DTLearner(random_state=self._details.seed) if best_params is not None: learner.set_params(**best_params) self.log( "Best parameters are provided, GridSearchCV will is skipped") else: self.log( "Best parameters are not provided, GridSearchCV is scheduled") experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, "DT", "DT", params, complexity_param=complexity_param, seed=self._details.seed, threads=self._details.threads, best_params=best_params, verbose=self._verbose)
def perform(self):
    # TODO: Clean up the older alpha stuff?
    max_depths = np.arange(1, 50, 1)
    params = None
    complexity_param = None
    if self._details.ds_name == "poisonous_mushrooms":
        params = {
            "DT__criterion": ["gini"],
            "DT__max_depth": max_depths,
        }  # , 'DT__max_leaf_nodes': max_leaf_nodes}
        complexity_param = {
            "name": "DT__max_depth",
            "display_name": "Max Depth",
            "values": max_depths,
        }
    elif self._details.ds_name == "spam":
        params = {
            "DT__criterion": ["gini"],
            "DT__max_depth": max_depths,
        }  # , 'DT__max_leaf_nodes': max_leaf_nodes}
        complexity_param = {
            "name": "DT__max_depth",
            "display_name": "Max Depth",
            "values": max_depths,
        }

    best_params = None
    # if self._details.ds_name == "poisonous_mushrooms":
    #     best_params = {"criterion": "gini", "max_depth": 7}
    # elif self._details.ds_name == "spam":
    #     best_params = {"criterion": "gini", "max_depth": 50}

    learner = learners.DTLearner(random_state=self._details.seed)
    if best_params is not None:
        learner.set_params(**best_params)

    experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        learner,
        "DT",
        "DT",
        params,
        complexity_param=complexity_param,
        seed=self._details.seed,
        threads=self._details.threads,
        best_params=best_params,
        verbose=self._verbose,
    )
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/Boosting.py
    max_depths = np.arange(1, 11, 1)

    # NOTE: Criterion may need to be adjusted here depending on the dataset
    base = learners.DTLearner(criterion='entropy', class_weight='balanced', max_depth=10,
                              random_state=self._details.seed)
    of_base = learners.DTLearner(criterion='entropy', class_weight='balanced',
                                 random_state=self._details.seed)
    booster = learners.BoostingLearner(algorithm='SAMME', learning_rate=1, base_estimator=base,
                                       random_state=self._details.seed)
    of_booster = learners.BoostingLearner(algorithm='SAMME', learning_rate=1, base_estimator=of_base,
                                          random_state=self._details.seed)

    params = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 90, 100],
              'Boost__learning_rate': [(2**x) / 100 for x in range(7)] + [1],
              'Boost__base_estimator__max_depth': max_depths}
    iteration_details = {
        'params': {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
    }
    of_params = {'Boost__base_estimator__max_depth': None}
    # complexity_param = {'name': 'Boost__learning_rate', 'display_name': 'Learning rate', 'x_scale': 'log',
    #                     'values': [(2**x)/100 for x in range(7)]+[1]}
    complexity_param = {'name': 'Boost__n_estimators', 'display_name': 'N_estimators', 'x_scale': 'linear',
                        'values': [1, 2, 5, 10, 20, 30, 45, 60, 80, 90, 100]}

    best_params = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # Dataset 1:
    # best_params = {'base_estimator__max_depth': 4, 'learning_rate': 0.32, 'n_estimators': 20}
    #
    # Dataset 2:
    best_params = {'base_estimator__max_depth': 5, 'learning_rate': 0.64, 'n_estimators': 45}

    if best_params is not None:
        booster.set_params(**best_params)
        of_booster.set_params(**best_params)

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   booster, 'Boost', 'Boost', params, complexity_param=complexity_param,
                                   iteration_details=iteration_details, best_params=best_params,
                                   seed=self._details.seed, threads=self._details.threads,
                                   verbose=self._verbose)

    # TODO: This should turn OFF regularization
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   of_booster, 'Boost_OF', 'Boost', of_params, seed=self._details.seed,
                                   iteration_details=iteration_details, best_params=best_params,
                                   threads=self._details.threads, verbose=self._verbose,
                                   iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/SVM.py
    alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]
    samples = self._details.ds.features.shape[0]
    gamma_fracs = np.arange(0.2, 2.1, 0.2)

    params = {
        'SVM__alpha': alphas,
        'SVM__max_iter': [int((1e6 / samples) / .8) + 1],
        'SVM__gamma_frac': gamma_fracs
    }
    complexity_param = {
        'name': 'SVM__gamma_frac',
        'display_name': 'Gamma Fraction',
        'values': gamma_fracs
    }
    iteration_params = {'SVM__max_iter': [2**x for x in range(12)]}

    learner = learners.SVMLearner(tol=None)
    best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner,
        'SVM_RBF', 'SVM', params, complexity_param=complexity_param, seed=self._details.seed,
        iteration_params=iteration_params, threads=self._details.threads, verbose=self._verbose)

    of_params = best_params.copy()
    of_params['SVM__alpha'] = 1e-16
    learner = learners.SVMLearner(n_jobs=self._details.threads)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'SVM_RBF_OF', 'SVM', of_params, seed=self._details.seed,
                                   iteration_params=iteration_params, threads=self._details.threads,
                                   verbose=self._verbose, iteration_lc_only=True)
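# The single-element 'SVM__max_iter' grid above encodes a budget heuristic from the original
# CS-7641 SVM code: aim for roughly 1e6 updates in total, scaled by sample count and an 80%
# training split. A quick worked example (the sample count is illustrative):
samples = 5000
max_iter = int((1e6 / samples) / .8) + 1  # -> 251 iterations for a 5000-row dataset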
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/Boosting.py
    max_depths = np.arange(1, 11, 1)

    # NOTE: Criterion may need to be adjusted here depending on the dataset
    base = learners.DTLearner(criterion='gini', class_weight='balanced', max_depth=7,
                              random_state=self._details.seed)
    of_base = learners.DTLearner(criterion='gini', class_weight='balanced',
                                 random_state=self._details.seed)
    booster = learners.BoostingLearner(algorithm='SAMME', learning_rate=1, base_estimator=base,
                                       random_state=self._details.seed)
    of_booster = learners.BoostingLearner(algorithm='SAMME', learning_rate=1, base_estimator=of_base,
                                          random_state=self._details.seed)

    params = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 90, 100],
              'Boost__learning_rate': [(2**x) / 100 for x in range(7)] + [1],
              'Boost__base_estimator__max_depth': max_depths}
    iteration_details = {
        'params': {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
    }
    of_params = {'Boost__base_estimator__max_depth': None}
    complexity_param = {'name': 'Boost__learning_rate', 'display_name': 'Learning rate', 'x_scale': 'log',
                        'values': [(2**x) / 100 for x in range(7)] + [1]}

    best_params = None
    if self._details.ds_best_params is not None and 'Boost' in self._details.ds_best_params:
        best_params = self._details.ds_best_params['Boost']
    if best_params is not None:
        booster.set_params(**best_params)
        of_booster.set_params(**best_params)

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   booster, 'Boost', 'Boost', params, complexity_param=complexity_param,
                                   iteration_details=iteration_details, best_params=best_params,
                                   seed=self._details.seed, threads=self._details.threads,
                                   verbose=self._verbose)

    # TODO: This should turn OFF regularization
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   of_booster, 'Boost_OF', 'Boost', of_params, seed=self._details.seed,
                                   iteration_details=iteration_details, best_params=best_params,
                                   threads=self._details.threads, verbose=self._verbose,
                                   iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/KNN.py
    n_neighbors = np.arange(1, 51, 3)
    params = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': n_neighbors,
        'KNN__weights': ['uniform']
    }
    complexity_param = {
        'name': 'KNN__n_neighbors',
        'display_name': 'Neighbor count',
        'values': n_neighbors
    }

    best_params = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # Dataset 1:
    # best_params = {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
    #
    # Dataset 2:
    # best_params = {'metric': 'chebyshev', 'n_neighbors': 13, 'weights': 'uniform'}

    learner = learners.KNNLearner(n_jobs=self._details.threads)
    if best_params is not None:
        learner.set_params(**best_params)

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'KNN', 'KNN', params, complexity_param=complexity_param,
                                   seed=self._details.seed, best_params=best_params,
                                   threads=self._details.threads, verbose=self._verbose)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/ANN.py

    # Search for good alphas
    alphas = [10 ** -x for x in np.arange(-1, 5.01, 1 / 2)]

    # TODO: Allow for tuning of hidden layers based on dataset provided
    d = self._details.ds.features.shape[1]
    hiddens = [(h,) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    params = {'MLP__activation': ['relu', 'tanh'], 'MLP__alpha': alphas,
              'MLP__hidden_layer_sizes': hiddens}
    timing_params = {'MLP__early_stopping': True}
    iteration_params = {'MLP__max_iter': [2 ** x for x in range(11)] + [2000, 2200, 2400, 2600, 2800, 3000]}
    complexity_param = {'name': 'MLP__alpha', 'display_name': 'Alpha', 'x_scale': 'log', 'values': alphas}

    learner = learners.ANNLearner(tol=1e-8, verbose=self._verbose)
    best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'ANN', 'MLP',
        params, complexity_param=complexity_param, seed=self._details.seed, timing_params=timing_params,
        iteration_pipe_params=timing_params, iteration_params=iteration_params,
        threads=self._details.threads, verbose=self._verbose)

    of_params = best_params.copy()
    of_params['MLP__alpha'] = 0
    learner = learners.ANNLearner()
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'ANN_OF', 'MLP', of_params, seed=self._details.seed,
                                   timing_params=timing_params, iteration_pipe_params=timing_params,
                                   iteration_params=iteration_params, threads=self._details.threads,
                                   verbose=self._verbose, iteration_lc_only=True)
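# The alpha and hidden-layer comprehensions used throughout these ANN variants are compact
# but easy to misread; here is what they expand to for an illustrative feature count d:
import numpy as np

d = 16  # stand-in for self._details.ds.features.shape[1]
alphas = [10 ** -x for x in np.arange(-1, 5.01, 1 / 2)]
# -> [10.0, 3.16..., 1.0, 0.316..., ..., 1e-05]: a log-spaced sweep from weak to strong L2 penalty
hiddens = [(h,) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
# -> [(16,), (8,), (32,), (16, 16), (8, 8), (32, 32), (16, 16, 16), (8, 8, 8), (32, 32, 32)]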
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/SVM.py
    samples = self._details.ds.features.shape[0]
    features = self._details.ds.features.shape[1]
    gamma_fracs = np.arange(1 / features, 2.1, 0.2)
    tols = np.arange(1e-8, 1e-1, 0.01)
    C_values = np.arange(0.001, 2.5, 0.25)
    iters = [-1, int((1e6 / samples) / 0.8) + 1]

    best_params_linear = None
    best_params_rbf = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # if self._details.ds_name == "spam":
    #     best_params_linear = {
    #         "C": 0.101,
    #         "class_weight": "balanced",
    #         "loss": "squared_hinge",
    #         "max_iter": 33,
    #         "tol": 1.00e-08,
    #     }
    #     best_params_rbf = {
    #         "C": 0.251,
    #         "class_weight": "balanced",
    #         "decision_function_shape": "ovo",
    #         "tol": 0.07000001,
    #     }
    # elif self._details.ds_name == "poisonous_mushrooms":
    #     best_params_linear = {
    #         "C": 0.001,
    #         "class_weight": "balanced",
    #         "loss": "squared_hinge",
    #         "max_iter": 42,
    #         "tol": 1.00e-08,
    #     }
    #     best_params_rbf = {
    #         "C": 0.251,
    #         "class_weight": "balanced",
    #         "decision_function_shape": "ovo",
    #         "tol": 0.06000001,
    #     }

    # Linear SVM
    params = {
        "SVM__max_iter": iters,
        "SVM__tol": tols,
        "SVM__class_weight": [{1: 10}],
        "SVM__C": C_values,
    }
    complexity_param = {
        "name": "SVM__C",
        "display_name": "Penalty",
        "values": np.arange(0.001, 2.5, 0.1),
    }
    iteration_details = {
        "x_scale": "log",
        "params": {"SVM__max_iter": [2**x for x in range(12)]},
    }

    # NOTE: If this is causing issues, try the RBFSVMLearner. Passing use_linear=True will use a linear
    #       kernel and passing use_linear=False will use the RBF kernel. This method is slower but if
    #       libsvm is not available it may be your only option
    learner = learners.LinearSVMLearner(dual=False)
    if best_params_linear is not None:
        learner.set_params(**best_params_linear)
    best_params = experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        learner,
        "SVMLinear",
        "SVM",
        params,
        complexity_param=complexity_param,
        seed=self._details.seed,
        best_params=best_params_linear,
        threads=self._details.threads,
        verbose=self._verbose,
    )

    of_params = best_params.copy()
    learner = learners.LinearSVMLearner(dual=True)
    if best_params_linear is not None:
        learner.set_params(**best_params_linear)
    experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        learner,
        "SVMLinear_OF",
        "SVM",
        of_params,
        seed=self._details.seed,
        iteration_details=iteration_details,
        best_params=best_params_linear,
        threads=self._details.threads,
        verbose=self._verbose,
        iteration_lc_only=True,
    )

    # RBF SVM
    params = {
        "SVM__max_iter": iters,
        "SVM__tol": tols,
        "SVM__class_weight": ["balanced"],
        "SVM__C": C_values,
        "SVM__decision_function_shape": ["ovo", "ovr"],
        "SVM__gamma": gamma_fracs,
    }
    # NOTE: The reduced grid below immediately replaces the one above (max_iter and gamma are dropped).
    params = {
        "SVM__C": C_values,
        "SVM__class_weight": ["balanced"],
        "SVM__tol": tols,
        "SVM__decision_function_shape": ["ovo", "ovr"],
    }
    complexity_param = {
        "name": "SVM__C",
        "display_name": "Penalty",
        "values": np.arange(0.001, 2.5, 0.25),
    }

    learner = learners.SVMLearner(kernel="rbf")
    if best_params_rbf is not None:
        learner.set_params(**best_params_rbf)
    best_params = experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        learner,
        "SVM_RBF",
        "SVM",
        params,
        complexity_param=complexity_param,
        seed=self._details.seed,
        best_params=best_params_rbf,
        threads=self._details.threads,
        verbose=self._verbose,
    )

    of_params = best_params.copy()
    learner = learners.SVMLearner(kernel="rbf")
    if best_params_rbf is not None:
        learner.set_params(**best_params_rbf)
    experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        learner,
        "SVM_RBF_OF",
        "SVM",
        of_params,
        seed=self._details.seed,
        iteration_details=iteration_details,
        best_params=best_params_rbf,
        threads=self._details.threads,
        verbose=self._verbose,
        iteration_lc_only=True,
    )
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/ANN.py

    # Search for good alphas
    alphas = [10**-x for x in np.arange(-1, 9.01, 0.5)]

    # TODO: Allow for better tuning of hidden layers based on dataset provided
    d = self._details.ds.features.shape[1]
    hiddens = [(h,) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    learning_rates = sorted([(2**x) / 1000 for x in range(8)] + [0.000001])
    params = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__learning_rate_init': learning_rates,
        'MLP__hidden_layer_sizes': hiddens
    }
    timing_params = {'MLP__early_stopping': False}
    iteration_details = {
        'x_scale': 'log',
        'params': {
            'MLP__max_iter': [2**x for x in range(12)] +
                             [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        },
        'pipe_params': timing_params
    }
    complexity_param = {
        'name': 'MLP__alpha',
        'display_name': 'Alpha',
        'x_scale': 'log',
        'values': alphas
    }

    best_params = None
    if self._details.ds_best_params is not None and 'ANN' in self._details.ds_best_params:
        best_params = self._details.ds_best_params['ANN']

    learner = learners.ANNLearner(max_iter=3000, early_stopping=True,
                                  random_state=self._details.seed, verbose=self._verbose)
    if best_params is not None:
        learner.set_params(**best_params)

    cv_best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'ANN', 'MLP',
        params, complexity_param=complexity_param, seed=self._details.seed, timing_params=timing_params,
        iteration_details=iteration_details, best_params=best_params, threads=self._details.threads,
        verbose=self._verbose)

    # TODO: This should turn OFF regularization
    of_params = cv_best_params.copy()
    of_params['MLP__alpha'] = 0
    if best_params is not None:
        learner.set_params(**best_params)
    learner = learners.ANNLearner(max_iter=3000, early_stopping=True,
                                  random_state=self._details.seed, verbose=self._verbose)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'ANN_OF', 'MLP', of_params, seed=self._details.seed,
                                   timing_params=timing_params, iteration_details=iteration_details,
                                   best_params=best_params, threads=self._details.threads,
                                   verbose=self._verbose, iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/ANN.py

    # Search for good alphas
    alphas = [3, 1, 0.5, 0.25, 0.10, 0.005, 0.001]

    # TODO: Allow for better tuning of hidden layers based on dataset provided
    d = self._details.ds.features.shape[1]
    hiddens = [(2, 2), (4, 4), (8, 8), (16, 16), (32, 32), (64, 64), (128, 128)]
    learning_rates = [0.000001, 0.0001, 0.001, 0.01, 0.1, 0.5]
    params = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__learning_rate_init': learning_rates,
        'MLP__hidden_layer_sizes': hiddens
    }
    timing_params = {'MLP__early_stopping': False}
    iteration_details = {
        'x_scale': 'log',
        'params': {'MLP__max_iter': [1000, 2500, 5000, 10000, 30000]},
        'pipe_params': timing_params
    }
    complexity_param = {
        'name': 'MLP__alpha',
        'display_name': 'Alpha',
        'x_scale': 'log',
        'values': alphas
    }

    best_params = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # Dataset 1:
    # best_params = {'activation': 'relu', 'alpha': 1.0, 'hidden_layer_sizes': (36, 36),
    #                'learning_rate_init': 0.016}
    # Dataset 2:
    # best_params = {'activation': 'relu', 'alpha': 1e-05, 'hidden_layer_sizes': (16, 16),
    #                'learning_rate_init': 0.064}

    learner = learners.ANNLearner(max_iter=3000, early_stopping=True,
                                  random_state=self._details.seed, verbose=self._verbose)
    if best_params is not None:
        learner.set_params(**best_params)

    cv_best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'ANN', 'MLP',
        params, complexity_param=complexity_param, seed=self._details.seed, timing_params=timing_params,
        iteration_details=iteration_details, best_params=best_params, threads=self._details.threads,
        verbose=self._verbose)

    # TODO: This should turn OFF regularization
    of_params = cv_best_params.copy()
    of_params['MLP__alpha'] = 0
    if best_params is not None:
        learner.set_params(**best_params)
    learner = learners.ANNLearner(max_iter=3000, early_stopping=True,
                                  random_state=self._details.seed, verbose=self._verbose)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'ANN_OF', 'MLP', of_params, seed=self._details.seed,
                                   timing_params=timing_params, iteration_details=iteration_details,
                                   best_params=best_params, threads=self._details.threads,
                                   verbose=self._verbose, iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/ANN.py

    # TODO: Allow for better tuning of hidden layers based on dataset provided
    d = self._details.ds.features.shape[1]
    timing_params = {"MLP__early_stopping": False}
    iteration_details = {
        "x_scale": "log",
        "params": {
            "MLP__max_iter": [2 ** x for x in range(12)] +
                             [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        },
        "pipe_params": timing_params,
    }

    params = None
    complexity_param = None
    if self._details.ds_name == "spam":
        # Search for good alphas
        alphas = [10 ** -x for x in np.arange(0, 9.01, 0.5)]
        hiddens = [(h,) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
        learning_rates = [0.064]
        # learning_rates = sorted([(2 ** x) / 1000 for x in range(8)] + [0.000001])
        params = {
            "MLP__activation": ["logistic"],
            "MLP__alpha": alphas,
            "MLP__learning_rate_init": learning_rates,
            "MLP__hidden_layer_sizes": hiddens,
        }
        complexity_param = {
            "name": "MLP__alpha",
            "display_name": "Alpha",
            "x_scale": "log",
            "values": alphas,
        }
    elif self._details.ds_name == "poisonous_mushrooms":
        # Search for good alphas
        alphas = [10 ** -x for x in np.arange(0, 9.01, 0.5)]
        # hiddens = [(16,)]  # [(h,) * l for l in [1, 2] for h in [d, d // 2, d * 2]]
        hiddens = [(h,) * l for l in [1, 2] for h in [d, d // 2, d * 2]]
        learning_rates = [0.064]  # sorted([(2**x)/1000 for x in range(8)]+[0.000001])
        params = {
            "MLP__activation": ["logistic"],
            "MLP__alpha": alphas,
            "MLP__learning_rate_init": learning_rates,
            "MLP__hidden_layer_sizes": hiddens,
        }
        complexity_param = {
            "name": "MLP__alpha",
            "display_name": "Alpha",
            "x_scale": "log",
            "values": alphas,
        }

    best_params = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # if self._details.ds_name == 'spam':
    #     best_params = {'activation': 'logistic', 'alpha': 1.00E-06, 'hidden_layer_sizes': (16,),
    #                    'learning_rate_init': 0.064}
    # elif self._details.ds_name == 'poisonous_mushrooms':
    #     best_params = {'activation': 'logistic', 'alpha': 0.003162278, 'hidden_layer_sizes': (16,),
    #                    'learning_rate_init': 0.064}

    learner = learners.ANNLearner(
        max_iter=3000,
        early_stopping=True,
        random_state=self._details.seed,
        verbose=self._verbose,
    )
    if best_params is not None:
        learner.set_params(**best_params)

    cv_best_params = experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        learner,
        "ANN",
        "MLP",
        params,
        complexity_param=complexity_param,
        seed=self._details.seed,
        iteration_details=iteration_details,
        timing_params=timing_params,
        best_params=best_params,
        threads=self._details.threads,
        verbose=self._verbose,
    )

    # TODO: This should turn OFF regularization
    of_params = cv_best_params.copy()
    of_params["MLP__alpha"] = 0
    if best_params is not None:
        learner.set_params(**best_params)
    learner = learners.ANNLearner(
        max_iter=3000,
        early_stopping=True,
        random_state=self._details.seed,
        verbose=self._verbose,
    )
    experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        learner,
        "ANN_OF",
        "MLP",
        of_params,
        seed=self._details.seed,
        timing_params=timing_params,
        iteration_details=iteration_details,
        best_params=best_params,
        threads=self._details.threads,
        verbose=self._verbose,
        iteration_lc_only=True,
    )
def perform(self):
    # TODO: Clean up the older alpha stuff?
    max_depths = np.arange(1, 21, 1)
    # alphas = [-1, -1e-3, -(1e-3)*10**-0.5, -1e-2, -(1e-2)*10**-0.5, -1e-1, -(1e-1)*10**-0.5, 0,
    #           (1e-1)*10**-0.5, 1e-1, (1e-2)*10**-0.5, 1e-2, (1e-3)*10**-0.5, 1e-3]
    alphas = [x / 1000 for x in range(-40, 40, 4)]
    # params = {'DT__criterion': ['gini', 'entropy'],
    #           'DT__max_depth': max_depths,
    #           'alpha': alphas,
    #           'DT__class_weight': ['balanced', None]
    #           }  # , 'DT__max_leaf_nodes': max_leaf_nodes}
    params = {
        'DT__criterion': ['gini', 'entropy'],
        'DT__alpha': alphas,
        'DT__class_weight': ['balanced'],
        'DT__random_state': [self._details.seed]
    }
    complexity_param = {
        'name': 'DT__alpha',
        'display_name': 'alpha',
        'values': alphas
    }

    best_params = None
    # Pin known best params from a previous grid search per dataset. Pinning narrows the grid
    # to a single candidate per key, so only the graphs are rebuilt.
    #
    # Dataset 1:
    params_wine = {
        'DT__criterion': 'gini',
        'DT__alpha': 0.008,
        'DT__class_weight': 'balanced'
    }
    if self._details.ds_name == "wine-qual" and self._details.bparams:
        for k in params.keys():
            if k in params_wine.keys():
                params[k] = [params_wine.get(k)]
    #
    # Dataset 2:
    params_enhancer = {
        'DT__criterion': 'gini',
        'DT__alpha': 0.008,
        'DT__class_weight': 'balanced'
    }
    if self._details.ds_name == "enhancer-b" and self._details.bparams:
        for k in params.keys():
            if k in params_enhancer.keys():
                params[k] = [params_enhancer.get(k)]

    learner = learners.DTLearner(random_state=self._details.seed)
    if best_params is not None:
        learner.set_params(**best_params)

    best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'DT', 'DT',
        params, complexity_param=complexity_param, seed=self._details.seed,
        threads=self._details.threads, best_params=best_params, verbose=self._verbose,
        apply_pruning=True)
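# 'DT__alpha' together with apply_pruning=True reads like a cost-complexity pruning knob on
# DTLearner. In plain sklearn the analogous parameter is ccp_alpha -- a hedged sketch of the
# same idea, assuming that mapping (sklearn's ccp_alpha must be non-negative, so the negative
# grid values above imply custom pruning logic rather than a direct pass-through):
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='gini', class_weight='balanced',
                              ccp_alpha=0.008, random_state=0)
# Larger ccp_alpha prunes more aggressively; tree.cost_complexity_pruning_path(X, y)
# enumerates the effective alphas worth sweeping for a given training set.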
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/ANN.py

    # Search for good alphas
    alphas = [10**-x for x in np.arange(-3, 9.01, 0.5)]
    alphas = [0] + alphas

    # TODO: Allow for better tuning of hidden layers based on dataset provided
    d = self._details.ds.features.shape[1]
    hiddens = [(h,) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    learning_rates = sorted([(2**x) / 1000 for x in range(8)] + [0.000001])
    params = {
        'MLP__activation': ['relu', 'logistic', 'tanh'],
        'MLP__alpha': alphas,
        'MLP__learning_rate_init': learning_rates,
        'MLP__hidden_layer_sizes': hiddens,
        'MLP__random_state': [self._details.seed],
        'MLP__beta_1': [0.5, 0.9, 0.99, 0.999],
        'MLP__beta_2': [0.5, 0.9, 0.99, 0.999]
    }
    timing_params = {'MLP__early_stopping': False}
    iteration_details = {
        'x_scale': 'log',
        'params': {
            'MLP__max_iter': [2**x for x in range(12)] + [2100, 2400, 2700, 3000]
        },
        'pipe_params': timing_params
    }
    complexity_param = {
        'name': 'MLP__alpha',
        'display_name': 'Alpha',
        'x_scale': 'log',
        'values': alphas
    }

    best_params = None
    # Upstream template for reference:
    # Dataset 1:
    # best_params = {'activation': 'relu', 'alpha': 1.0, 'hidden_layer_sizes': (36, 36),
    #                'learning_rate_init': 0.016}
    # Dataset 2:
    # best_params = {'activation': 'relu', 'alpha': 1e-05, 'hidden_layer_sizes': (16, 16),
    #                'learning_rate_init': 0.064}

    # Pin known best params from a previous grid search per dataset. Pinning narrows the grid
    # to a single candidate per key, so only the graphs are rebuilt.
    alpha = 0
    # Dataset 1:
    params_wine = {
        'MLP__activation': 'tanh',
        'MLP__alpha': 0.1,
        'MLP__learning_rate_init': 0.064,
        'MLP__hidden_layer_sizes': (12, 12),
        'MLP__beta_1': 0.99,
        'MLP__beta_2': 0.99
    }
    if self._details.ds_name == "wine-qual" and self._details.bparams:
        alpha = 0.1
        for k in params.keys():
            if k in params_wine.keys():
                params[k] = [params_wine.get(k)]
    # Dataset 2:
    params_enhancer = {
        'MLP__activation': 'logistic',
        'MLP__alpha': 0.001,
        'MLP__learning_rate_init': 0.128,
        'MLP__hidden_layer_sizes': (38, 38),
        'MLP__beta_1': 0.5,
        'MLP__beta_2': 0.999
    }
    if self._details.ds_name == "enhancer-b" and self._details.bparams:
        alpha = 0.001
        for k in params.keys():
            if k in params_enhancer.keys():
                params[k] = [params_enhancer.get(k)]
    # if self._details.ds_name == "wine-qual":
    #     best_params = params_wine

    learner = learners.ANNLearner(max_iter=3000, early_stopping=True,
                                  random_state=self._details.seed, verbose=self._verbose)
    if best_params is not None:
        learner.set_params(**best_params)

    cv_best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'ANN', 'MLP',
        params, complexity_param=complexity_param, seed=self._details.seed, timing_params=timing_params,
        iteration_details=iteration_details, best_params=best_params, threads=self._details.threads,
        verbose=self._verbose)

    # TODO: This should turn OFF regularization
    # (this variant reruns with the dataset's best alpha rather than alpha=0)
    of_params = cv_best_params.copy()
    of_params['MLP__alpha'] = alpha
    if best_params is not None:
        learner.set_params(**best_params)
    learner = learners.ANNLearner(max_iter=3000, early_stopping=True,
                                  random_state=self._details.seed, verbose=self._verbose)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'ANN_', 'MLP', of_params, seed=self._details.seed,
                                   timing_params=timing_params, iteration_details=iteration_details,
                                   best_params=best_params, threads=self._details.threads,
                                   verbose=self._verbose, iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/ANN.py

    # Search for good alphas
    # alphas = [10 ** -x for x in np.arange(-1, 9.01, 0.5)]
    # YS trying larger intervals
    # alphas = [10 ** -x for x in np.arange(-1, 5.01, 1)]
    alphas = [10**-x for x in np.arange(1, 3.01, 1)]

    # TODO: Allow for better tuning of hidden layers based on dataset provided
    d = self._details.ds.features.shape[1]
    hiddens = [(h,) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    # learning_rates = sorted([(2**x)/1000 for x in range(8)]+[0.000001])
    # YS trying larger intervals
    learning_rates = sorted([(4**x) / 1000 for x in range(3)])

    params = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__learning_rate_init': learning_rates,
        'MLP__hidden_layer_sizes': hiddens
    }
    # YS changing early stopping to True
    # timing_params = {'MLP__early_stopping': False}
    timing_params = {'MLP__early_stopping': True}
    iteration_details = {
        'x_scale': 'log',
        # 'params': {'MLP__max_iter': [2 ** x for x in range(12)] +
        #            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]},
        # YS cutting the max_iter
        'params': {'MLP__max_iter': [2**x for x in range(11)]},
        'pipe_params': timing_params
    }
    complexity_param = {
        'name': 'MLP__alpha',
        'display_name': 'Alpha',
        'x_scale': 'log',
        'values': alphas
    }

    best_params = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # Dataset 1:
    # best_params = {'activation': 'relu', 'alpha': 1.0, 'hidden_layer_sizes': (36, 36),
    #                'learning_rate_init': 0.016}
    # Dataset 2:
    # best_params = {'activation': 'relu', 'alpha': 1e-05, 'hidden_layer_sizes': (16, 16),
    #                'learning_rate_init': 0.064}

    # learner = learners.ANNLearner(max_iter=3000, early_stopping=True, random_state=self._details.seed,
    #                               verbose=self._verbose)
    # YS cutting the max_iter
    learner = learners.ANNLearner(max_iter=2000, early_stopping=True,
                                  random_state=self._details.seed, verbose=self._verbose)
    if best_params is not None:
        learner.set_params(**best_params)

    cv_best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'ANN', 'MLP',
        params, complexity_param=complexity_param, seed=self._details.seed, timing_params=timing_params,
        iteration_details=iteration_details, best_params=best_params, threads=self._details.threads,
        verbose=self._verbose)

    # TODO: This should turn OFF regularization
    of_params = cv_best_params.copy()
    of_params['MLP__alpha'] = 0
    if best_params is not None:
        learner.set_params(**best_params)
    # learner = learners.ANNLearner(max_iter=3000, early_stopping=True, random_state=self._details.seed,
    #                               verbose=self._verbose)
    # YS cutting the max_iter
    learner = learners.ANNLearner(max_iter=2000, early_stopping=True,
                                  random_state=self._details.seed, verbose=self._verbose)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'ANN_OF', 'MLP', of_params, seed=self._details.seed,
                                   timing_params=timing_params, iteration_details=iteration_details,
                                   best_params=best_params, threads=self._details.threads,
                                   verbose=self._verbose, iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/SVM.py
    samples = self._details.ds.features.shape[0]
    features = self._details.ds.features.shape[1]
    gamma_fracs = np.arange(1 / features, 2.1, 0.2)
    tols = np.arange(1e-8, 1e-1, 0.01)
    C_values = np.arange(0.001, 2.5, 0.25)
    iters = [-1, int((1e6 / samples) / .8) + 1]

    best_params_linear = None
    best_params_rbf = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # Dataset 1 (credit default):
    '''
    best_params_linear = {
        "C": 0.251, "class_weight": "balanced", "dual": False, "fit_intercept": True,
        "intercept_scaling": 1, "loss": "squared_hinge", "max_iter": 42, "multi_class": "ovr",
        "penalty": "l2", "tol": 0.020000010000000002, "verbose": False
    }
    best_params_rbf = {
        "C": 0.751, "cache_size": 200, "class_weight": "balanced", "coef0": 0,
        "decision_function_shape": "ovo", "degree": 3, "gamma": 0.043478260869565216,
        "kernel": "rbf", "max_iter": -1, "probability": False, "shrinking": True,
        "tol": 0.08000001, "verbose": False
    }

    # Dataset 2:
    best_params_linear = {
        "C": 0.251, "class_weight": "balanced", "dual": False, "fit_intercept": True,
        "intercept_scaling": 1, "loss": "squared_hinge", "max_iter": 42, "multi_class": "ovr",
        "penalty": "l2", "tol": 0.020000010000000002, "verbose": False
    }
    best_params_rbf = {
        "C": 1.501, "cache_size": 200, "class_weight": "balanced", "coef0": 0,
        "decision_function_shape": "ovo", "degree": 3, "gamma": 0.0056179775280898875,
        "kernel": "rbf", "max_iter": -1, "probability": False, "shrinking": True,
        "tol": 0.09000000999999999, "verbose": False
    }
    '''

    # Linear SVM
    params = {
        'SVM__max_iter': iters,
        'SVM__tol': tols,
        'SVM__class_weight': ['balanced'],
        'SVM__C': C_values
    }
    complexity_param = {
        'name': 'SVM__C',
        'display_name': 'Penalty',
        'values': np.arange(0.001, 2.5, 0.1)
    }
    iteration_details = {
        'x_scale': 'log',
        'params': {'SVM__max_iter': [2**x for x in range(12)]},
    }

    # NOTE: If this is causing issues, try the RBFSVMLearner. Passing use_linear=True will use a linear
    #       kernel and passing use_linear=False will use the RBF kernel. This method is slower but if
    #       libsvm is not available it may be your only option
    learner = learners.LinearSVMLearner(dual=False)
    if best_params_linear is not None:
        learner.set_params(**best_params_linear)
    best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'SVMLinear',
        'SVM', params, complexity_param=complexity_param, seed=self._details.seed,
        iteration_details=iteration_details, best_params=best_params_linear,
        threads=self._details.threads, verbose=self._verbose)

    of_params = best_params.copy()
    learner = learners.LinearSVMLearner(dual=True)
    if best_params_linear is not None:
        learner.set_params(**best_params_linear)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'SVMLinear_OF', 'SVM', of_params, seed=self._details.seed,
                                   iteration_details=iteration_details, best_params=best_params_linear,
                                   threads=self._details.threads, verbose=self._verbose,
                                   iteration_lc_only=True)

    # RBF SVM
    params = {
        'SVM__max_iter': iters,
        'SVM__tol': tols,
        'SVM__class_weight': ['balanced'],
        'SVM__C': C_values,
        'SVM__decision_function_shape': ['ovo', 'ovr'],
        'SVM__gamma': gamma_fracs
    }
    complexity_param = {
        'name': 'SVM__C',
        'display_name': 'Penalty',
        'values': np.arange(0.001, 2.5, 0.1)
    }

    learner = learners.SVMLearner(kernel='rbf')
    if best_params_rbf is not None:
        learner.set_params(**best_params_rbf)
    best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'SVM_RBF',
        'SVM', params, complexity_param=complexity_param, seed=self._details.seed,
        iteration_details=iteration_details, best_params=best_params_rbf,
        threads=self._details.threads, verbose=self._verbose)

    of_params = best_params.copy()
    learner = learners.SVMLearner(kernel='rbf')
    if best_params_rbf is not None:
        learner.set_params(**best_params_rbf)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'SVM_RBF_OF', 'SVM', of_params, seed=self._details.seed,
                                   iteration_details=iteration_details, best_params=best_params_rbf,
                                   threads=self._details.threads, verbose=self._verbose,
                                   iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/Boosting.py
    alphas = [x / 1000 for x in range(-10, 40, 4)]
    crit = "entropy"
    lr = [(2**x) / 100 for x in range(7)] + [1]
    n_estimators = [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    n_estimators_iter = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 140, 160, 200, 240, 300]

    # /output-ew2
    if 'enhancer-b' == self._details.ds_name and self._details.bparams:
        alphas = [0.05]
        crit = "gini"
        lr = [0.16] + [10**(x / 8) for x in range(-32, 16)]
        lr = [0.32]  # overrides the exploratory range on the line above
        n_estimators = n_estimators_iter
        n_estimators = [5]  # overrides the line above
    if 'wine-qual' == self._details.ds_name and self._details.bparams:
        alphas = [0.014]
        crit = "gini"
        lr = [0.16]  # use old lr range here
        n_estimators = [20]

    # NOTE: Criterion may need to be adjusted here depending on the dataset
    base = learners.DTLearner(criterion=crit, class_weight='balanced', random_state=self._details.seed)
    of_base = learners.DTLearner(criterion=crit, class_weight='balanced', random_state=self._details.seed)
    booster = learners.BoostingLearner(algorithm='SAMME.R', learning_rate=1, base_estimator=base,
                                       random_state=self._details.seed)
    of_booster = learners.BoostingLearner(algorithm='SAMME.R', learning_rate=1, base_estimator=of_base,
                                          random_state=self._details.seed)

    params = {
        'Boost__n_estimators': n_estimators,
        'Boost__learning_rate': lr,
        'Boost__base_estimator__alpha': alphas,
        'Boost__random_state': [self._details.seed],
        'Boost__base_estimator__random_state': [self._details.seed]
    }
    iteration_details = {'params': {'Boost__n_estimators': n_estimators_iter}}
    of_params = {'Boost__base_estimator__alpha': -1}
    complexity_param = {
        'name': 'Boost__learning_rate',
        'display_name': 'Learning rate',
        'x_scale': 'log',
        'values': [10**(x / 8) for x in range(-32, 16)]
    }

    best_params = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # Dataset 1:
    # best_params = {'base_estimator__max_depth': 8, 'learning_rate': 0.32, 'n_estimators': 90}
    #
    # Dataset 2:
    # best_params = {'base_estimator__max_depth': 6, 'learning_rate': 0.16, 'n_estimators': 20}
    if best_params is not None:
        booster.set_params(**best_params)
        of_booster.set_params(**best_params)

    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   booster, 'Boost', 'Boost', params, complexity_param=complexity_param,
                                   iteration_details=iteration_details, best_params=best_params,
                                   seed=self._details.seed, threads=self._details.threads,
                                   verbose=self._verbose)
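# Aside on algorithm='SAMME.R' above: unlike the discrete SAMME used in the other boosting
# variants, SAMME.R weights by the base learner's predicted class probabilities (so the base
# tree must support predict_proba) and typically converges in fewer iterations. Recent
# sklearn releases deprecate and then remove SAMME.R, so running this variant there would
# require switching to algorithm='SAMME'.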
def perform(self): # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/KNN.py """original""" # params = {'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'], 'KNN__n_neighbors': np.arange(1, 51, 3), # 'KNN__weights': ['uniform']} """new: changed metric and n_neighbors""" """ https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html "distance" https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html KNN__metric is our distance functions: manhattan and euclidean manhattan: sum(|x - y|) euclidean: sqrt(sum((x - y)^2)) "weights" uniform: uniform weights. All points in each neighborhood are weighted equally. distance: weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. """ params = { 'KNN__metric': ['manhattan', 'euclidean'], 'KNN__n_neighbors': np.arange(1, 51, 5), 'KNN__weights': ['uniform'] } complexity_param = { 'name': 'KNN__n_neighbors', 'display_name': 'Neighbor count', 'values': np.arange(1, 51, 5) } best_params = None # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild # the various graphs # # Dataset 1: # best_params = {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'uniform'} # # Dataset 1: # best_params = {'metric': 'euclidean', 'n_neighbors': 4, 'weights': 'uniform'} learner = learners.KNNLearner(n_jobs=self._details.threads) if best_params is not None: learner.set_params(**best_params) """perform_experiment(ds, ds_name, ds_readable_name, clf, clf_name, clf_label, params, timing_params=None, iteration_details=None, complexity_param=None, seed=0, threads=1, iteration_lc_only=False, best_params=None, verbose=False)""" """pipe is built with pipe = Pipeline([('Scale', StandardScaler()), ('KNN', learner)]) """ experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'KNN', 'KNN', params, complexity_param=complexity_param, seed=self._details.seed, best_params=best_params, threads=self._details.threads, verbose=self._verbose)
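# The metric formulas quoted in the comment block above, checked on a tiny example
# (the vectors are illustrative):
import numpy as np

x, y = np.array([1.0, 4.0]), np.array([3.0, 1.0])
manhattan = np.sum(np.abs(x - y))          # |1-3| + |4-1| = 5.0
euclidean = np.sqrt(np.sum((x - y) ** 2))  # sqrt(4 + 9) ~= 3.606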
def perform(self): # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/ANN.py # Search for good alphas """ alphas: - constrains/penalizes our max weights: high of 10 and diminishing drastically - recall: the larger the weight for an attr, the more it dominates, can lead to OVERFITTING """ alphas = [10 ** -x for x in np.arange(-1, 9.01, 0.5)] # TODO: Allow for better tuning of hidden layers based on dataset provided d = self._details.ds.features.shape[1] """ hiddens - based on the number of features (Xs, or attrs) in our data set - we test 1-2-3 layers using a multiple or division of # Xs. - ex: 23 attributes: test 11, 23, 46 hidden layer sizes """ hiddens = [(h,) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]] """ https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/ learning_rates: - hyper-parameter that controls how much to change the model in response to the estimated error each time the model weights are updated. - a value too small may result in a long training process that could get stuck, - a value too large may result in learning a sub-optimal set of weights too fast or an unstable training process - may be the most important hyper-parameter when configuring ANN. - small positive value, often in the range between 0.0 and 1.0. """ learning_rates = sorted([(2**x)/1000 for x in range(8)]+[0.000001]) """ logistic: the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)). relu: the rectified linear unit function, returns f(x) = max(0, x) """ params = {'MLP__activation': ['relu', 'logistic'], 'MLP__alpha': alphas, 'MLP__learning_rate_init': learning_rates, 'MLP__hidden_layer_sizes': hiddens} timing_params = {'MLP__early_stopping': False} iteration_details = {'x_scale': 'log', 'params': {'MLP__max_iter': [2 ** x for x in range(12)] + [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]}, 'pipe_params': timing_params} complexity_param = {'name': 'MLP__alpha', 'display_name': 'Alpha', 'x_scale': 'log', 'values': alphas} best_params = None # Uncomment to select known best params from grid search. 
This will skip the grid search and just rebuild # the various graphs # # Dataset 1: # best_params = {'activation': 'relu', 'alpha': 1.0, 'hidden_layer_sizes': (36, 36), # 'learning_rate_init': 0.016} # Dataset 2: # best_params = {'activation': 'relu', 'alpha': 1e-05, 'hidden_layer_sizes': (16, 16), # 'learning_rate_init': 0.064} learner = learners.ANNLearner(max_iter=3000, early_stopping=True, random_state=self._details.seed, verbose=self._verbose) if best_params is not None: learner.set_params(**best_params) cv_best_params = experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'ANN', 'MLP', params, complexity_param=complexity_param, seed=self._details.seed, timing_params=timing_params, iteration_details=iteration_details, best_params=best_params, threads=self._details.threads, verbose=self._verbose) # TODO: This should turn OFF regularization of_params = cv_best_params.copy() of_params['MLP__alpha'] = 0 if best_params is not None: learner.set_params(**best_params) learner = learners.ANNLearner(max_iter=3000, early_stopping=True, random_state=self._details.seed, verbose=self._verbose) experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'ANN_OF', 'MLP', of_params, seed=self._details.seed, timing_params=timing_params, iteration_details=iteration_details, best_params=best_params, threads=self._details.threads, verbose=self._verbose, iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/KNN.py
    params = None
    complexity_param = None
    best_params = None
    if self._details.ds_name == "spam":
        params = {
            "KNN__metric": ["minkowski", "chebyshev", "euclidean"],
            "KNN__n_neighbors": np.arange(1, 21, 3),
            # "KNN__n_neighbors": [3, 5, 7, 9],
            "KNN__weights": ["uniform", "distance"],
        }
        complexity_param = {
            "name": "KNN__n_neighbors",
            "display_name": "Neighbor count",
            "values": np.arange(1, 21, 3),
            # "values": [3, 5, 7, 9],
            # "values": [1, 2, 3, 4],
        }
    elif self._details.ds_name == "poisonous_mushrooms":
        params = {
            "KNN__metric": ["minkowski", "chebyshev", "euclidean"],
            "KNN__n_neighbors": np.arange(1, 21, 3),
            "KNN__weights": ["uniform", "distance"],
        }
        complexity_param = {
            "name": "KNN__n_neighbors",
            "display_name": "Neighbor count",
            "values": np.arange(1, 21, 3),
        }

    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # if self._details.ds_name == "spam":
    #     best_params = {
    #         "metric": "chebyshev",
    #         "n_neighbors": 7,
    #         "weights": "uniform",
    #     }
    # elif self._details.ds_name == "poisonous_mushrooms":
    #     best_params = {
    #         "metric": "chebyshev",
    #         "n_neighbors": 7,
    #         "weights": "uniform",
    #     }

    learner = learners.KNNLearner(n_jobs=self._details.threads)
    if best_params is not None:
        learner.set_params(**best_params)

    experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        learner,
        "KNN",
        "KNN",
        params,
        complexity_param=complexity_param,
        seed=self._details.seed,
        best_params=best_params,
        threads=self._details.threads,
        verbose=self._verbose,
    )
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/Boosting.py
    max_depths = np.arange(1, 15, 1)

    # NOTE: Criterion may need to be adjusted here depending on the dataset
    base = learners.DTLearner(
        criterion="entropy",
        class_weight="balanced",
        max_depth=5,
        random_state=self._details.seed,
    )
    of_base = learners.DTLearner(
        criterion="entropy",
        class_weight="balanced",
        random_state=self._details.seed,
    )
    booster = learners.BoostingLearner(
        algorithm="SAMME",
        learning_rate=1,
        base_estimator=base,
        random_state=self._details.seed,
    )
    of_booster = learners.BoostingLearner(
        algorithm="SAMME",
        learning_rate=1,
        base_estimator=of_base,
        random_state=self._details.seed,
    )

    params = {
        "Boost__n_estimators": [1, 2, 5, 10, 20, 30, 45, 60, 80, 90, 100],
        "Boost__learning_rate": [(2**x) / 100 for x in range(7)] + [1],
        "Boost__base_estimator__max_depth": max_depths,
    }
    iteration_details = {
        "params": {
            "Boost__n_estimators": [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        }
    }
    of_params = {"Boost__base_estimator__max_depth": None}
    complexity_param = {
        "name": "Boost__learning_rate",
        "display_name": "Learning rate",
        "x_scale": "log",
        "values": [(2**x) / 100 for x in range(7)] + [1],
    }

    best_params = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # if self._details.ds_name == "spam":
    #     best_params = {
    #         "base_estimator__max_depth": 10,
    #         "learning_rate": 0.32,
    #         "n_estimators": 30,
    #     }
    # elif self._details.ds_name == "poisonous_mushrooms":
    #     best_params = {
    #         "base_estimator__max_depth": 10,
    #         "learning_rate": 0.08,
    #         "n_estimators": 60,
    #     }
    # # Dataset 1:
    # # best_params = {'base_estimator__max_depth': 8, 'learning_rate': 0.32, 'n_estimators': 90}
    # #
    # # Dataset 2:
    # # best_params = {'base_estimator__max_depth': 6, 'learning_rate': 0.16, 'n_estimators': 20}
    # if best_params is not None:
    #     booster.set_params(**best_params)
    #     of_booster.set_params(**best_params)

    experiments.perform_experiment(
        self._details.ds,
        self._details.ds_name,
        self._details.ds_readable_name,
        booster,
        "Boost",
        "Boost",
        params,
        complexity_param=complexity_param,
        iteration_details=iteration_details,
        best_params=best_params,
        seed=self._details.seed,
        threads=self._details.threads,
        verbose=self._verbose,
    )
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/SVM.py
    samples = self._details.ds.features.shape[0]
    features = self._details.ds.features.shape[1]

    # original
    # gamma_fracs = np.arange(1/features, 2.1, 0.2)
    # tols = np.arange(1e-8, 1e-1, 0.01)
    # C_values = np.arange(0.001, 2.5, 0.25)
    # iters = [-1, int((1e6/samples)/.8)+1]
    # YS changed
    gamma_fracs = np.arange(1 / features, 1.1, 0.25)
    tols = np.arange(1e-3, 1e-1, 0.03)
    C_values = np.arange(0.1, 2.5, 0.5)
    iters = [-1, int((1e6 / samples) / .8) + 1]

    best_params_linear = None
    best_params_rbf = None
    # Uncomment to select known best params from grid search. This will skip the grid search and just rebuild
    # the various graphs
    #
    # Dataset 1:
    # best_params_linear = {'C': 0.5, 'class_weight': 'balanced', 'loss': 'squared_hinge',
    #                       'max_iter': 1478, 'tol': 0.06000001}
    # best_params_rbf = {'C': 2.0, 'class_weight': 'balanced', 'decision_function_shape': 'ovo',
    #                    'gamma': 0.05555555555555555, 'max_iter': -1, 'tol': 1e-08}
    # Dataset 2:
    # best_params_linear = {'C': 1.0, 'class_weight': 'balanced', 'loss': 'hinge', 'dual': True,
    #                       'max_iter': 70, 'tol': 0.08000001}
    # best_params_rbf = {'C': 1.5, 'class_weight': 'balanced', 'decision_function_shape': 'ovo',
    #                    'gamma': 0.125, 'max_iter': -1, 'tol': 0.07000001}

    # Linear SVM
    params = {
        'SVM__max_iter': iters,
        'SVM__tol': tols,
        'SVM__class_weight': ['balanced'],
        'SVM__C': C_values
    }
    # original
    # complexity_param = {'name': 'SVM__C', 'display_name': 'Penalty', 'values': np.arange(0.001, 2.5, 0.1)}
    # YS changed
    complexity_param = {
        'name': 'SVM__C',
        'display_name': 'Penalty',
        'values': np.arange(0.1, 2.5, 0.5)
    }
    iteration_details = {
        'x_scale': 'log',
        # original
        # 'params': {'SVM__max_iter': [2**x for x in range(12)]},
        # YS changed from range(12)
        'params': {'SVM__max_iter': [4**x for x in range(6)]},
    }

    # NOTE: If this is causing issues, try the RBFSVMLearner. Passing use_linear=True will use a linear
    #       kernel and passing use_linear=False will use the RBF kernel. This method is slower but if
    #       libsvm is not available it may be your only option
    learner = learners.LinearSVMLearner(dual=False)
    if best_params_linear is not None:
        learner.set_params(**best_params_linear)
    best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'SVMLinear',
        'SVM', params, complexity_param=complexity_param, seed=self._details.seed,
        iteration_details=iteration_details, best_params=best_params_linear,
        threads=self._details.threads, verbose=self._verbose)

    of_params = best_params.copy()
    learner = learners.LinearSVMLearner(dual=True)
    if best_params_linear is not None:
        learner.set_params(**best_params_linear)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'SVMLinear_OF', 'SVM', of_params, seed=self._details.seed,
                                   iteration_details=iteration_details, best_params=best_params_linear,
                                   threads=self._details.threads, verbose=self._verbose,
                                   iteration_lc_only=True)

    # RBF SVM
    params = {
        'SVM__max_iter': iters,
        'SVM__tol': tols,
        'SVM__class_weight': ['balanced'],
        'SVM__C': C_values,
        'SVM__decision_function_shape': ['ovo', 'ovr'],
        'SVM__gamma': gamma_fracs
    }
    # original
    # complexity_param = {'name': 'SVM__C', 'display_name': 'Penalty', 'values': np.arange(0.001, 2.5, 0.1)}
    # YS changed
    complexity_param = {
        'name': 'SVM__C',
        'display_name': 'Penalty',
        'values': np.arange(0.1, 2.5, 0.5)
    }

    learner = learners.SVMLearner(kernel='rbf')
    if best_params_rbf is not None:
        learner.set_params(**best_params_rbf)
    best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'SVM_RBF',
        'SVM', params, complexity_param=complexity_param, seed=self._details.seed,
        iteration_details=iteration_details, best_params=best_params_rbf,
        threads=self._details.threads, verbose=self._verbose)

    of_params = best_params.copy()
    learner = learners.SVMLearner(kernel='rbf')
    if best_params_rbf is not None:
        learner.set_params(**best_params_rbf)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'SVM_RBF_OF', 'SVM', of_params, seed=self._details.seed,
                                   iteration_details=iteration_details, best_params=best_params_rbf,
                                   threads=self._details.threads, verbose=self._verbose,
                                   iteration_lc_only=True)
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-1/blob/master/SVM.py
    samples = self._details.ds.features.shape[0]
    features = self._details.ds.features.shape[1]
    gamma_fracs = np.arange(1 / features, 2.1, 0.2)
    tols = np.arange(1e-8, 1e-1, 0.01)
    C_values = np.arange(0.001, 2.5, 0.25)
    iters = [-1, int((1e6 / samples) / .8) + 1]

    best_params_linear = None
    if self._details.ds_best_params is not None and 'SVM_Linear' in self._details.ds_best_params:
        best_params_linear = self._details.ds_best_params['SVM_Linear']
    best_params_rbf = None
    if self._details.ds_best_params is not None and 'SVM_RBF' in self._details.ds_best_params:
        best_params_rbf = self._details.ds_best_params['SVM_RBF']

    # Linear SVM
    # NOTE: Only the RBF experiments are run in this variant; the linear grid and complexity
    # param below are immediately replaced by the RBF versions.
    params = {
        'SVM__max_iter': iters,
        'SVM__tol': tols,
        'SVM__class_weight': ['balanced'],
        'SVM__C': C_values
    }
    complexity_param = {
        'name': 'SVM__C',
        'display_name': 'Penalty',
        'values': np.arange(0.001, 2.5, 0.1)
    }
    iteration_details = {
        'x_scale': 'log',
        'params': {'SVM__max_iter': [2**x for x in range(12)]},
    }

    # RBF SVM
    if len(np.unique(self._details.ds.classes)) > 2:
        decision_functions = ['ovo']
    else:
        decision_functions = ['ovo', 'ovr']
    params = {
        'SVM__max_iter': iters,
        'SVM__tol': tols,
        'SVM__class_weight': ['balanced'],
        'SVM__C': C_values,
        'SVM__decision_function_shape': decision_functions,
        'SVM__gamma': gamma_fracs
    }
    complexity_param = {
        'name': 'SVM__C',
        'display_name': 'Penalty',
        'values': np.arange(0.001, 2.5, 0.1)
    }

    learner = learners.SVMLearner(kernel='rbf')
    if best_params_rbf is not None:
        learner.set_params(**best_params_rbf)
    best_params = experiments.perform_experiment(
        self._details.ds, self._details.ds_name, self._details.ds_readable_name, learner, 'SVM_RBF',
        'SVM', params, complexity_param=complexity_param, seed=self._details.seed,
        iteration_details=iteration_details, best_params=best_params_rbf,
        threads=self._details.threads, verbose=self._verbose)

    of_params = best_params.copy()
    learner = learners.SVMLearner(kernel='rbf')
    if best_params_rbf is not None:
        learner.set_params(**best_params_rbf)
    experiments.perform_experiment(self._details.ds, self._details.ds_name, self._details.ds_readable_name,
                                   learner, 'SVM_RBF_OF', 'SVM', of_params, seed=self._details.seed,
                                   iteration_details=iteration_details, best_params=best_params_rbf,
                                   threads=self._details.threads, verbose=self._verbose,
                                   iteration_lc_only=True)
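# Aside on the decision_functions branch above: sklearn's SVC always trains one-vs-one
# classifiers internally; decision_function_shape only changes how the decision values are
# reported ('ovr' reshapes the ovo votes), so restricting multiclass datasets to ['ovo']
# mainly trims redundant grid candidates. A minimal illustration:
from sklearn.svm import SVC

clf = SVC(kernel='rbf', decision_function_shape='ovr')  # same fitted model either way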