def run_model(DecisionTree_params, category):
    """Run the decision tree pipeline: fine tuning or full-scale training.

    When the ``fine_tuning`` flag is set, a hyper-parameter grid is built and
    handed to grid_search(). Otherwise one ActiveDecisionTree is created per
    amine and trained on every drawn dataset, optionally followed by active
    learning, export of the tree to a .dot file, metric storage and model
    saving.

    Args:
        DecisionTree_params:    A dictionary of the parameters for the
                                    decision tree model. See initialize() for
                                    more information.
        category:               A string representing the category the model
                                    is classified under. It selects the entry
                                    of DecisionTree_params['configs'] to use.
    """
    # Feature names hard-coded for decision tree visualization
    features = [
        '_rxn_M_acid', '_rxn_M_inorganic', '_rxn_M_organic',
        '_solv_GBL', '_solv_DMSO', '_solv_DMF',
        '_stoich_mmol_org', '_stoich_mmol_inorg', '_stoich_mmol_acid',
        '_stoich_mmol_solv', '_stoich_org/solv', '_stoich_inorg/solv',
        '_stoich_acid/solv', '_stoich_org+inorg/solv',
        '_stoich_org+inorg+acid/solv', '_stoich_org/liq',
        '_stoich_inorg/liq', '_stoich_org+inorg/liq', '_stoich_org/inorg',
        '_stoich_acid/inorg', '_rxn_Temperature_C', '_rxn_Reactiontime_s',
        '_feat_AvgPol', '_feat_Refractivity', '_feat_MaximalProjectionArea',
        '_feat_MaximalProjectionRadius', '_feat_maximalprojectionsize',
        '_feat_MinimalProjectionArea', '_feat_MinimalProjectionRadius',
        '_feat_minimalprojectionsize', '_feat_MolPol',
        '_feat_VanderWaalsSurfaceArea', '_feat_ASA', '_feat_ASA_H',
        '_feat_ASA_P', '_feat_ASA-', '_feat_ASA+',
        '_feat_ProtPolarSurfaceArea', '_feat_Hacceptorcount',
        '_feat_Hdonorcount', '_feat_RotatableBondCount',
        '_raw_standard_molweight', '_feat_AtomCount_N', '_feat_BondCount',
        '_feat_ChainAtomCount', '_feat_RingAtomCount', '_feat_primaryAmine',
        '_feat_secondaryAmine', '_rxn_plateEdgeQ', '_feat_maxproj_per_N',
        '_raw_RelativeHumidity'
    ]

    params = DecisionTree_params

    # Settings shared by both modes of operation
    config = params['configs'][category] if params['configs'] else None
    verbose = params['verbose']
    warning = params['warning']
    stats_path = params['stats_path']
    result_dict = params['result_dict']
    model_name = params['model_name']
    print(f'Running model {model_name}')

    # Settings describing how the training data is drawn
    num_draws = params['num_draws']
    train_size = params['train_size']
    active_learning_iter = params['active_learning_iter']
    cross_validation = params['cross_validate']
    full = params['full_dataset']
    active_learning = params['active_learning']
    w_hx = params['with_historical_data']
    w_k = params['with_k']
    draw_success = params['draw_success']

    # Flags selecting the desired operation
    fine_tuning = params['fine_tuning']
    save_model = params['save_model']
    visualize = params['visualize']
    to_file = True

    if fine_tuning:
        # Candidate class weights: 50 complementary pairs plus the two
        # non-numeric settings
        class_weights = [{0: weight, 1: 1.0 - weight}
                         for weight in np.linspace(.05, .95, num=50)]
        class_weights += ['balanced', None]

        ft_params = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': list(range(9, 26)) + [None],
            'min_samples_split': list(range(2, 11)),
            'min_samples_leaf': list(range(1, 4)),
            'class_weight': class_weights
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveDecisionTree,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)
        return

    # Load the desired sized dataset under desired option
    dataset = process_dataset(num_draw=num_draws,
                              train_size=train_size,
                              active_learning_iter=active_learning_iter,
                              verbose=verbose,
                              cross_validation=cross_validation,
                              full=full,
                              active_learning=active_learning,
                              w_hx=w_hx,
                              w_k=w_k,
                              success=draw_success)

    draws = list(dataset.keys())
    amine_list = list(dataset[0]['x_t'].keys())

    # Order in which the per-amine pieces are passed to load_dataset()
    data_keys = ('x_t', 'y_t', 'x_v', 'y_v', 'all_data', 'all_labels')

    for amine in amine_list:
        # One model instance per amine, reused across all draws
        ADT = ActiveDecisionTree(amine=amine,
                                 config=config,
                                 verbose=verbose,
                                 stats_path=stats_path,
                                 result_dict=result_dict,
                                 model_name=model_name)

        for set_id in draws:
            drawn = dataset[set_id]

            # Load the training and validation set into the model
            ADT.load_dataset(set_id,
                             *(drawn[key][amine] for key in data_keys))

            # Train the data on the training set
            ADT.train(warning=warning)

            # Conduct active learning with all the observations available
            # in the pool
            if active_learning:
                ADT.active_learning(num_iter=active_learning_iter,
                                    warning=warning)

            if visualize:
                # Plot the decision tree
                # To compile the graph, use the following command in terminal
                # dot -Tpng "{dt_file_name}.dot" -o "{desired file name}.png"
                # If using Jupyter Notebook, add ! in front to run command
                # lines
                file_name = './results/{0:s}_dt_{1:s}_{2:d}.dot'.format(
                    model_name, amine, set_id)
                export_graphviz(ADT.model,
                                feature_names=features,
                                class_names=['FAILURE', 'SUCCESS'],
                                out_file=file_name,
                                filled=True,
                                rounded=True,
                                special_characters=True)

        if to_file:
            ADT.store_metrics_to_file()

        # Save the model for future reproducibility
        if save_model:
            ADT.save_model(model_name)
def grid_search(clf,
                combinations,
                path,
                num_draws,
                train_size,
                active_learning_iter,
                active_learning=True,
                w_hx=True,
                w_k=True,
                draw_success=False,
                model_name=''):
    """Log the average performance of every hyper-parameter combination.

    Similar to GridSearchCV in scikit-learn package, we try out all the
    combinations and evaluate performance across all amine-specific models
    under different categories. The averaged accuracy, precision, recall and
    bcr of each combination are stored in a log that is pickled to ``path``.
    A baseline entry named 'Default' (models built without a config) is added
    when the log does not contain one yet for ``model_name``.

    Args:
        clf:                    A class object representing the classifier
                                    being fine tuned.
        combinations:           A list of dictionaries representing the
                                    possible hyper-parameter values to try
                                    out.
        path:                   A string representing the directory path to
                                    store the statistics of all combinations
                                    tried during one stage of fine tuning.
        num_draws:              An integer representing the number of random
                                    drawn to create the dataset.
        train_size:             An integer representing the number of
                                    amine-specific experiments used for
                                    training. Corresponds to the k in the
                                    category description.
        active_learning_iter:   An integer representing the number of
                                    iterations in an active learning loop.
                                    Corresponds to the x in the category
                                    description.
        active_learning:        A boolean representing if active learning
                                    will be involved in testing or not.
        w_hx:                   A boolean representing if the models are
                                    trained with historical data or not.
        w_k:                    A boolean representing if the modes are
                                    trained with amine-specific experiments.
        draw_success:           A boolean representing if the models are
                                    trained on regular randomly-drawn datasets
                                    or random datasets with at least one
                                    success for each amine.
        model_name:             A string representing the name of the model
                                    being fine tuned.

    Returns:
        None. The results are only written to the pickle file at ``path``,
        keyed by model name and then by 'Default' or str(option).

    Raises:
        ValueError: If a configuration has to be evaluated but no amine is
            left to evaluate it on (empty dataset or every amine skipped).
    """
    # The amine with only 1 successful experiment overall
    # Can't run 4-ii and 5-ii models on this amine
    skipped_amine = 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N'
    metric_names = ('accuracies', 'precisions', 'recalls', 'bcrs')
    # Order in which the per-amine pieces are passed to load_dataset()
    data_keys = ('x_t', 'y_t', 'x_v', 'y_v', 'all_data', 'all_labels')

    # Load or initialize dictionary to keep all configurations' performances
    if os.path.exists(path):
        # NOTE(review): unpickling can execute arbitrary code; only point
        # ``path`` at files produced by this project.
        with open(path, 'rb') as f:
            ft_log = pickle.load(f)
    else:
        ft_log = defaultdict(dict)

    if model_name not in ft_log:
        ft_log[model_name] = defaultdict(dict)

    # Load the full dataset under specific categorical option
    dataset = process_dataset(num_draw=num_draws,
                              train_size=train_size,
                              active_learning_iter=active_learning_iter,
                              verbose=False,
                              cross_validation=True,
                              full=True,
                              active_learning=active_learning,
                              w_hx=w_hx,
                              w_k=w_k,
                              success=draw_success)

    draws = list(dataset.keys())
    amine_list = list(dataset[0]['x_t'].keys())

    def log_average_performance(label, **clf_kwargs):
        """Train one model per amine and log the metrics averaged over amines."""
        scores = {name: [] for name in metric_names}

        for amine in amine_list:
            if amine == skipped_amine and draw_success:
                continue

            ACLF = clf(amine=amine, verbose=False, **clf_kwargs)

            for set_id in draws:
                drawn = dataset[set_id]
                # Load the training and validation set into the model
                ACLF.load_dataset(set_id,
                                  *(drawn[key][amine] for key in data_keys))
                # Train the data on the training set
                ACLF.train(warning=False)

            ACLF.find_inner_avg()

            for name in metric_names:
                scores[name].append(ACLF.metrics['average'][name][-1])

        # Fail with a clear message instead of a bare ZeroDivisionError
        if not scores[metric_names[0]]:
            raise ValueError(
                'No amine was evaluated, so no average performance can be '
                'computed.')

        for name in metric_names:
            ft_log[model_name][label][name] = sum(scores[name]) / len(
                scores[name])

    # Set baseline performance only once per model name
    if 'Default' not in ft_log[model_name]:
        log_average_performance('Default')

    # Try out each possible combinations of hyper-parameters
    for option in combinations:
        log_average_performance(str(option), config=option)

    # Save the fine tuning performances to pkl
    with open(path, 'wb') as f:
        pickle.dump(ft_log, f)
def grid_search(clf,
                ft_params,
                path,
                num_draws,
                train_size,
                active_learning_iter,
                active_learning=True,
                w_hx=True,
                w_k=True,
                draw_success=False,
                random=False,
                random_size=10,
                result_dict=None,
                model_name=''):
    """Log the average performance of every hyper-parameter combination.

    Similar to GridSearchCV in scikit-learn package, we try out all the
    combinations and evaluate performance across all amine-specific models
    under different categories. A baseline entry named 'Default' (models
    built without a config) is always evaluated first. The averaged accuracy,
    precision, recall and bcr of each configuration are stored in a log keyed
    by model name and then by 'Default' or str(option).

    Args:
        clf:                    A class object representing the classifier
                                    being fine tuned.
        ft_params:              A dictionary representing the possible
                                    hyper-parameter values to try out. An
                                    empty dictionary yields the single empty
                                    combination.
        path:                   A string representing the directory path to
                                    store the statistics of all combinations
                                    tried during one stage of fine tuning.
        num_draws:              An integer representing the number of random
                                    drawn to create the dataset.
        train_size:             An integer representing the number of
                                    amine-specific experiments used for
                                    training. Corresponds to the k in the
                                    category description.
        active_learning_iter:   An integer representing the number of
                                    iterations in an active learning loop.
                                    Corresponds to the x in the category
                                    description.
        active_learning:        A boolean representing if active learning
                                    will be involved in testing or not.
        w_hx:                   A boolean representing if the models are
                                    trained with historical data or not.
        w_k:                    A boolean representing if the modes are
                                    trained with amine-specific experiments.
        draw_success:           A boolean representing if the models are
                                    trained on regular randomly-drawn datasets
                                    or random datasets with at least one
                                    success for each amine.
        random:                 A boolean representing if we want to do
                                    random search or not.
        random_size:            An integer representing the number of random
                                    combinations to try and compare. Capped at
                                    the size of the full grid; combinations
                                    are drawn without repetition.
        result_dict:            A dictionary representing the result
                                    dictionary used during multi-thread
                                    processing.
        model_name:             A string representing the name of the model
                                    being fine tuned.

    Returns:
        None. The results are written into ``result_dict`` when a non-empty
        one is given, otherwise into the pickle file at ``path``.

    Raises:
        ValueError: If no amine is left to evaluate a configuration on
            (empty dataset or every amine skipped).
    """
    # The amine with only 1 successful experiment overall
    # Can't run 4-ii and 5-ii models on this amine
    skipped_amine = 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N'
    metric_names = ('accuracies', 'precisions', 'recalls', 'bcrs')
    # Order in which the per-amine pieces are passed to load_dataset()
    data_keys = ('x_t', 'y_t', 'x_v', 'y_v', 'all_data', 'all_labels')

    # NOTE(review): an EMPTY result_dict is treated like None here (the log
    # is then read from / written to ``path``). Kept as in the original
    # behavior; confirm with callers whether ``is not None`` is intended.
    use_shared_log = bool(result_dict)

    # Load or initialize dictionary to keep all configurations' performances
    if use_shared_log:
        ft_log = result_dict
    elif os.path.exists(path):
        # NOTE(review): unpickling can execute arbitrary code; only point
        # ``path`` at files produced by this project.
        with open(path, 'rb') as f:
            ft_log = pickle.load(f)
    else:
        ft_log = defaultdict(dict)

    if model_name not in ft_log:
        ft_log[model_name] = defaultdict(dict)

    # Set all possible combinations
    combinations = [
        dict(zip(ft_params, bundle))
        for bundle in itertools.product(*ft_params.values())
    ]

    # Random search if we are not searching through the whole grid.
    # Indices are drawn without replacement so that no combination is
    # evaluated twice, and the sample never exceeds the grid size.
    if random and combinations:
        sample_size = min(random_size, len(combinations))
        picked = np.random.choice(len(combinations),
                                  size=sample_size,
                                  replace=False)
        combinations = [combinations[index] for index in picked]

    # Load the full dataset under specific categorical option
    dataset = process_dataset(num_draw=num_draws,
                              train_size=train_size,
                              active_learning_iter=active_learning_iter,
                              verbose=False,
                              cross_validation=True,
                              full=True,
                              active_learning=active_learning,
                              w_hx=w_hx,
                              w_k=w_k,
                              success=draw_success)

    draws = list(dataset.keys())
    amine_list = list(dataset[0]['x_t'].keys())

    def log_average_performance(label, **clf_kwargs):
        """Train one model per amine and log the metrics averaged over amines."""
        scores = {name: [] for name in metric_names}

        for amine in amine_list:
            if amine == skipped_amine and draw_success:
                continue

            ACLF = clf(amine=amine, verbose=False, **clf_kwargs)

            for set_id in draws:
                drawn = dataset[set_id]
                # Load the training and validation set into the model
                ACLF.load_dataset(set_id,
                                  *(drawn[key][amine] for key in data_keys))
                # Train the data on the training set
                ACLF.train(warning=False)

            ACLF.find_inner_avg()

            for name in metric_names:
                scores[name].append(ACLF.metrics['average'][name][-1])

        # Fail with a clear message instead of a bare ZeroDivisionError
        if not scores[metric_names[0]]:
            raise ValueError(
                'No amine was evaluated, so no average performance can be '
                'computed.')

        for name in metric_names:
            ft_log[model_name][label][name] = sum(scores[name]) / len(
                scores[name])

    # Log the starting time of fine tuning
    start_time = time.time()

    # Set baseline performance
    log_average_performance('Default')

    # Try out each possible combinations of hyper-parameters
    print(f'There are {len(combinations)} many combinations to try.')
    for option in combinations:
        log_average_performance(str(option), config=option)

    # Find the total time used for fine tuning
    time_lapsed = time.time() - start_time

    # Make time used more readable
    total_minutes, seconds = divmod(time_lapsed, 60)
    total_hours, minutes = divmod(int(total_minutes), 60)
    days, hours = divmod(total_hours, 24)
    seconds = round(seconds, 2)

    print(f'Fine tuning for {model_name} completed.')
    print(
        f'Total time used: {days} days {hours} hours {minutes} minutes {seconds} seconds.'
    )
    # An empty grid has no per-combination time; skipping the line avoids a
    # division by zero that would lose the baseline results before saving.
    if combinations:
        per_combo = round(time_lapsed / len(combinations), 4)
        print(f'Or about {per_combo} seconds per combination.')

    # Save the fine tuning performances to pkl if not multi-processing
    if not use_shared_log:
        with open(path, 'wb') as f:
            pickle.dump(ft_log, f)
def run_model(RandomForest_params, category):
    """Run the random forest pipeline: fine tuning or full-scale training.

    When the ``fine_tuning`` flag is set, a hyper-parameter grid is built and
    handed to grid_search(). Otherwise one ActiveRandomForest is created per
    amine and trained on every drawn dataset, optionally followed by active
    learning, metric storage and model saving.

    Args:
        RandomForest_params:    A dictionary of the parameters for the random
                                    forest model. See initialize() for more
                                    information.
        category:               A string representing the category the model
                                    is classified under. It selects the entry
                                    of RandomForest_params['config'] to use.
    """
    params = RandomForest_params

    # Unload common parameters
    config = params['config'][category] if params['config'] else None
    verbose = params['verbose']
    warning = params['warning']
    stats_path = params['stats_path']
    result_dict = params['result_dict']
    model_name = params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = params['num_draws']
    train_size = params['train_size']
    cross_validation = params['cross_validate']
    active_learning = params['active_learning']
    w_hx = params['with_historical_data']
    w_k = params['with_k']
    active_learning_iter = params['active_learning_iter']
    full = params['full_dataset']
    draw_success = params['draw_success']

    # Specify the desired operation
    fine_tuning = params['fine_tuning']
    save_model = params['save_model']
    to_file = True

    if fine_tuning:
        # Candidate class weights: 50 complementary pairs plus the two
        # non-numeric settings
        class_weights = [{0: weight, 1: 1.0 - weight}
                         for weight in np.linspace(.05, .95, num=50)]
        class_weights += ['balanced', None]

        ft_params = {
            'n_estimators': [100, 200, 500, 1000],
            'criterion': ['gini', 'entropy'],
            'max_depth': list(range(1, 9)),
            # 'auto' is intentionally not listed: for scikit-learn forest
            # classifiers it is the same setting as 'sqrt' (so it only
            # duplicated work) and newer releases reject it.
            # NOTE(review): assumes ActiveRandomForest forwards the config
            # to scikit-learn's RandomForestClassifier - confirm.
            'max_features': ['sqrt', 'log2', None],
            'bootstrap': [True],
            'min_samples_leaf': list(range(1, 6)),
            'min_samples_split': list(range(2, 11)),
            'ccp_alpha': [.1 * i for i in range(1)],
            'class_weight': class_weights
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveRandomForest,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)
        return

    # Load the desired sized dataset under desired option
    dataset = process_dataset(num_draw=num_draws,
                              train_size=train_size,
                              active_learning_iter=active_learning_iter,
                              verbose=verbose,
                              cross_validation=cross_validation,
                              full=full,
                              active_learning=active_learning,
                              w_hx=w_hx,
                              w_k=w_k,
                              success=draw_success)

    draws = list(dataset.keys())
    amine_list = list(dataset[0]['x_t'].keys())

    # Order in which the per-amine pieces are passed to load_dataset()
    data_keys = ('x_t', 'y_t', 'x_v', 'y_v', 'all_data', 'all_labels')

    for amine in amine_list:
        # Create the RandomForest model instance for the specific amine
        ARF = ActiveRandomForest(amine=amine,
                                 config=config,
                                 verbose=verbose,
                                 stats_path=stats_path,
                                 result_dict=result_dict,
                                 model_name=model_name)

        for set_id in draws:
            drawn = dataset[set_id]

            # Load the training and validation set into the model
            ARF.load_dataset(set_id,
                             *(drawn[key][amine] for key in data_keys))

            # Train the data on the training set
            ARF.train(warning=warning)

            # Conduct active learning with all the observations available
            # in the pool
            if active_learning:
                ARF.active_learning(num_iter=active_learning_iter,
                                    warning=warning)

        if to_file:
            ARF.store_metrics_to_file()

        # Save the model for future reproducibility
        if save_model:
            ARF.save_model(model_name)
def run_model(LinearSVM_params, category):
    """Run the LinearSVM pipeline: fine tuning or cross-validated training.

    When the ``fine_tuning`` flag is set, a hyper-parameter grid is built and
    handed to grid_search(). Otherwise, and only when cross validation is
    requested, one ActiveLinearSVM is created, trained and evaluated per
    amine.

    Args:
        LinearSVM_params:       A dictionary of the parameters for the
                                    LinearSVM model. See initialize() for
                                    more information.
        category:               A string representing the category the model
                                    is classified under. It selects the entry
                                    of LinearSVM_params['configs'] to use.
    """
    params = LinearSVM_params

    # Settings shared by both modes of operation
    config = params['configs'][category] if params['configs'] else None
    verbose = params['verbose']
    warning = params['warning']
    stats_path = params['stats_path']
    model_name = params['model_name']
    print(f'Running model {model_name}')

    # Settings describing how the training data is built
    train_size = params['train_size']
    active_learning_iter = params['active_learning_iter']
    cross_validation = params['cross_validate']
    full = params['full_dataset']
    active_learning = params['active_learning']
    w_hx = params['with_historical_data']
    w_k = params['with_k']

    # Flags selecting the desired operation
    fine_tuning = params['fine_tuning']
    save_model = params['save_model']
    to_params = True

    if fine_tuning:
        # Candidate class weights: 9 complementary pairs plus the two
        # non-numeric settings
        class_weights = [{0: weight, 1: 1.0 - weight}
                         for weight in np.linspace(.1, .9, num=9)]
        class_weights += ['balanced', None]

        # The commented-out lists are alternative, currently disabled grids
        ft_params = {
            # 'penalty': ['l1', 'l2'],
            'penalty': ['l1'],
            # 'loss': ['hinge', 'squared_hinge'],
            'loss': ['squared_hinge'],
            'dual': [False],
            # 'C': [.001, .01, .1, 1, 10],
            'C': list(np.linspace(0.001, 0.01, num=10)),
            # 'tol': [.0001, .001, .01, .1, 1],
            'tol': list(np.linspace(0.01, 0.1, num=10)),
            'fit_intercept': [True],
            'class_weight': class_weights,
        }

        # The returned best option is not used here
        grid_search(ActiveLinearSVM,
                    ft_params,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    info=True)
        return

    # Load the desired sized dataset under desired option
    amine_list, x_t, y_t, x_v, y_v, all_data, all_labels = process_dataset(
        train_size=train_size,
        active_learning_iter=active_learning_iter,
        verbose=verbose,
        cross_validation=cross_validation,
        full=full,
        active_learning=active_learning,
        w_hx=w_hx,
        w_k=w_k)

    for amine in amine_list:
        # Nothing is done for an amine unless cross validation is requested
        if not cross_validation:
            continue

        # Create the LinearSVM model instance for the specific amine
        ALSVM = ActiveLinearSVM(amine=amine,
                                config=config,
                                verbose=verbose,
                                stats_path=stats_path,
                                model_name=model_name)

        # Load the training and validation set into the model
        ALSVM.load_dataset(x_t[amine], y_t[amine], x_v[amine], y_v[amine],
                           all_data[amine], all_labels[amine])

        # Train the data on the training set
        ALSVM.train(warning=warning)

        # Conduct active learning with all the observations available in
        # the pool, or store the metrics of the single training run
        if active_learning:
            ALSVM.active_learning(num_iter=active_learning_iter,
                                  warning=warning,
                                  to_params=to_params)
        else:
            ALSVM.store_metrics_to_params()

        # Save the model for future reproducibility
        if save_model:
            ALSVM.save_model(model_name)
def run_model(SVM_params, category):
    """Run the SVM pipeline: fine tuning or cross-validated training.

    When the ``fine_tuning`` flag is set, a hyper-parameter grid is built and
    handed to grid_search(). Otherwise, and only when cross validation is
    requested, one ActiveSVC is created, trained and evaluated per amine.

    Args:
        SVM_params:         A dictionary of the parameters for the SVM model.
                                See initialize() for more information.
        category:           A string representing the category the model is
                                classified under. It selects the entry of
                                SVM_params['configs'] to use.
    """
    params = SVM_params

    # Settings shared by both modes of operation
    config = params['configs'][category] if params['configs'] else None
    verbose = params['verbose']
    stats_path = params['stats_path']
    model_name = params['model_name']
    print(f'Running model {model_name}')

    # Settings describing how the training data is built
    train_size = params['train_size']
    active_learning_iter = params['active_learning_iter']
    cross_validation = params['cross_validate']
    full = params['full_dataset']
    active_learning = params['active_learning']
    w_hx = params['with_historical_data']
    w_k = params['with_k']

    # Flags selecting the desired operation
    fine_tuning = params['fine_tuning']
    save_model = params['save_model']
    to_params = True

    if fine_tuning:
        # Candidate values for the '-w0' option: 9 evenly spaced values
        # followed by 1
        w0 = list(np.linspace(.1, .9, num=9))
        w0.append(1)

        ft_params = {
            '-t': [0, 1, 2, 3],
            '-d': list(range(1, 6)),
            '-g': [.0001, .001, .01, 1 / 51, .1, 1],
            '-c': [.0001, .001, .01, .1, 1, 10],
            '-m': [4000],
            '-w0': w0,
        }

        # The returned best option is not used here
        grid_search(ActiveSVC,
                    ft_params,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    info=True)
        return

    # Load the desired sized dataset under desired option
    amine_list, x_t, y_t, x_v, y_v, all_data, all_labels = process_dataset(
        train_size=train_size,
        active_learning_iter=active_learning_iter,
        verbose=verbose,
        cross_validation=cross_validation,
        full=full,
        active_learning=active_learning,
        w_hx=w_hx,
        w_k=w_k)

    for amine in amine_list:
        # Nothing is done for an amine unless cross validation is requested
        if not cross_validation:
            continue

        # Create the SVM model instance for the specific amine
        ASVM = ActiveSVC(amine=amine,
                         config=config,
                         verbose=verbose,
                         stats_path=stats_path,
                         model_name=model_name)

        # Load the training and validation set into the model
        ASVM.load_dataset(x_t[amine], y_t[amine], x_v[amine], y_v[amine],
                          all_data[amine], all_labels[amine])

        # Train the data on the training set
        ASVM.train()

        # Conduct active learning with all the observations available in
        # the pool, or store the metrics of the single training run
        if active_learning:
            ASVM.active_learning(num_iter=active_learning_iter,
                                 to_params=to_params)
        else:
            ASVM.store_metrics_to_params()

        # Save the model for future reproducibility
        if save_model:
            ASVM.save_model(model_name)
def grid_search(clf,
                params,
                train_size,
                active_learning_iter,
                active_learning=True,
                w_hx=True,
                w_k=True,
                info=False):
    """Fine tune the model by comparing average auc and recall per config.

    Similar to GridSearchCV in scikit-learn package, we try out all the
    combinations and evaluate performance across all amine-specific models
    under different categories. A configuration replaces the current best one
    when its average auc is less than .01 below the current best auc and its
    average recall is strictly higher than the current best recall.

    Args:
        clf:                    A class object representing the classifier
                                    being fine tuned.
        params:                 A dictionary representing the possible
                                    hyper-parameter values to try out. It is
                                    expected to hold the '-t' and '-d' keys,
                                    which are read when filtering the grid.
        train_size:             An integer representing the number of
                                    amine-specific experiments used for
                                    training. Corresponds to the k in the
                                    category description.
        active_learning_iter:   An integer representing the number of
                                    iterations in an active learning loop.
                                    Corresponds to the x in the category
                                    description.
        active_learning:        A boolean representing if active learning
                                    will be involved in testing or not.
        w_hx:                   A boolean representing if the models are
                                    trained with historical data or not.
        w_k:                    A boolean representing if the modes are
                                    trained with amine-specific experiments.
        info:                   A boolean. Setting it to True will make the
                                    function print out additional information
                                    during the fine-tuning stage.
                                    Default to False.

    Returns:
        best_option:            The configuration, as produced by
                                    dict_to_str_config(), that was selected
                                    last by the rule above, or an empty
                                    dictionary when no configuration was
                                    selected.
    """
    # Set all possible combinations
    combinations = []
    names = list(params.keys())
    for bundle in itertools.product(*params.values()):
        config_dict = dict(zip(names, bundle))
        # Delete duplicate configs where the kernel is not poly: keep a
        # config only if '-t' is 1 or '-d' is 1
        if config_dict['-t'] == 1 or config_dict['-d'] == 1:
            combinations.append(dict_to_str_config(config_dict))

    # Load the full dataset under specific categorical option
    (amine_list, train_data, train_labels, val_data, val_labels, all_data,
     all_labels) = process_dataset(
         train_size=train_size,
         active_learning_iter=active_learning_iter,
         verbose=False,
         cross_validation=True,
         full=True,
         active_learning=active_learning,
         w_hx=w_hx,
         w_k=w_k)

    def average_scores(**clf_kwargs):
        """Return average accuracy, precision, recall, bcr and auc over amines."""
        tracked = ('accuracies', 'precisions', 'recalls', 'bcrs')
        collected = {name: [] for name in tracked}
        auc_values = []

        for amine in amine_list:
            ACLF = clf(amine=amine, verbose=False, **clf_kwargs)

            # Extract and load the training and validation set into the model
            ACLF.load_dataset(train_data[amine], train_labels[amine],
                              val_data[amine], val_labels[amine],
                              all_data[amine], all_labels[amine])
            ACLF.train(warning=False)

            # Calculate AUC
            auc = roc_auc_score(ACLF.all_labels, ACLF.y_preds)

            for name in tracked:
                collected[name].append(ACLF.metrics[name][-1])
            auc_values.append(auc)

        averages = [sum(collected[name]) / len(collected[name])
                    for name in tracked]
        averages.append(sum(auc_values) / len(auc_values))
        return averages

    # Set baseline performance with models built without a config
    (base_avg_accuracy, base_avg_precision, base_avg_recall, base_avg_bcr,
     base_avg_auc) = average_scores()

    best_metric = base_avg_auc
    previous_recall = base_avg_recall

    if info:
        print(f'Baseline average accuracy is {base_avg_accuracy}')
        print(f'Baseline average precision is {base_avg_precision}')
        print(f'Baseline average recall is {base_avg_recall}')
        print(f'Baseline average bcr is {base_avg_bcr}')
        print(f'Baseline average auc is {base_avg_auc}')

    best_option = {}

    # Try out each possible combinations of hyper-parameters
    print(f'There are {len(combinations)} many combinations to try.')
    for option_no, option in enumerate(combinations, start=1):
        print(f'Trying option {option_no}')

        (avg_accuracy, avg_precision, avg_recall, avg_bcr,
         avg_auc) = average_scores(config=option)

        auc_close_enough = best_metric - avg_auc < .01
        if not (auc_close_enough and avg_recall > previous_recall):
            continue

        if info:
            print(f'The previous best option is {best_option}')
            print(f'The current best setting is {option}')
            print(
                f'The fine-tuned average accuracy is {avg_accuracy} vs. the base accuracy {base_avg_accuracy}'
            )
            print(
                f'The fine-tuned average precision is {avg_precision} vs. the base precision {base_avg_precision}'
            )
            print(
                f'The fine-tuned average recall rate is {avg_recall} vs. the base recall rate {base_avg_recall}'
            )
            print(
                f'The fine-tuned average bcr is {avg_bcr} vs. the base bcr {base_avg_bcr}'
            )
            print(
                f'The fine-tuned average auc is {avg_auc} vs. the base auc {base_avg_auc}'
            )
            print()

        best_metric = avg_auc
        previous_recall = avg_recall
        best_option = option

    if info:
        print()
        print(f'The best setting for all amines is {best_option}')
        print(f'With an average auc of {best_metric}')

    return best_option
def run_model(GradientBoosting_params, category):
    """Run the Gradient Boosting pipeline as a grid search or a full run.

    When the ``fine_tuning`` flag is set, a hyper-parameter grid is handed to
    ``grid_search`` and nothing else happens. Otherwise the dataset is built
    with ``process_dataset`` and one ``ActiveGradientBoosting`` instance per
    amine is trained on every draw, optionally followed by active learning,
    writing the metrics to file and saving the model.

    Args:
        GradientBoosting_params: A dictionary of the parameters for the
            Gradient Boosting model. See initialize() for more information.
        category: A string representing the category the model is
            classified under. It selects the entry of the configuration
            mapping when that mapping is non-empty.
    """
    params = GradientBoosting_params

    # --- Settings shared by every model runner ---
    # NOTE(review): the key read here is 'config', while the other run_model
    # variants read 'configs' -- confirm which one initialize() provides.
    all_configs = params['config']
    config = all_configs[category] if all_configs else None
    verbose = params['verbose']
    warning = params['warning']
    stats_path = params['stats_path']
    result_dict = params['result_dict']
    model_name = params['model_name']
    print(f'Running model {model_name}')

    # --- Settings describing how the training data is drawn ---
    num_draws = params['num_draws']
    train_size = params['train_size']
    active_learning_iter = params['active_learning_iter']
    active_learning = params['active_learning']
    cross_validation = params['cross_validate']
    full = params['full_dataset']
    w_hx = params['with_historical_data']
    w_k = params['with_k']
    draw_success = params['draw_success']

    # --- Which operation to perform ---
    fine_tuning = params['fine_tuning']
    save_model = params['save_model']
    to_file = True

    if fine_tuning:
        # Candidate values for every hyper-parameter explored by the search
        ft_params = {
            'loss': ['deviance', 'exponential'],
            'learning_rate': [0.1, 0.01, 0.001],
            'n_estimators': [100, 200, 500, 1000],
            'criterion': ['friedman_mse', 'mse', 'mae'],
            'max_depth': list(range(1, 9)),
            'max_features': ['auto', 'sqrt', 'log2', None],
            'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [2, 5, 10],
            # range(1) keeps a single candidate, 0.0
            'ccp_alpha': [.1 * step for step in range(1)],
        }
        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(
            ActiveGradientBoosting,
            ft_params,
            result_path,
            num_draws,
            train_size,
            active_learning_iter,
            active_learning=active_learning,
            w_hx=w_hx,
            w_k=w_k,
            draw_success=draw_success,
            result_dict=result_dict,
            model_name=model_name,
        )
        return

    # Build the dataset with the requested size and options
    dataset = process_dataset(
        num_draw=num_draws,
        train_size=train_size,
        active_learning_iter=active_learning_iter,
        verbose=verbose,
        cross_validation=cross_validation,
        full=full,
        active_learning=active_learning,
        w_hx=w_hx,
        w_k=w_k,
        success=draw_success,
    )

    draws = list(dataset.keys())
    amine_list = list(dataset[0]['x_t'].keys())

    for amine in amine_list:
        if amine == 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N' and draw_success:
            # Skipping the amine with only 1 successful experiment overall
            # Can't run 4-ii and 5-ii models on this amine
            continue

        # One model instance per amine, reused across all draws
        learner = ActiveGradientBoosting(
            amine=amine,
            config=config,
            verbose=verbose,
            stats_path=stats_path,
            result_dict=result_dict,
            model_name=model_name,
        )

        for set_id in draws:
            # Pull out the pieces of this randomly drawn dataset
            draw = dataset[set_id]
            x_t, y_t = draw['x_t'], draw['y_t']
            x_v, y_v = draw['x_v'], draw['y_v']
            all_data, all_labels = draw['all_data'], draw['all_labels']

            # Hand the amine-specific training and validation sets to the model
            learner.load_dataset(
                set_id,
                x_t[amine],
                y_t[amine],
                x_v[amine],
                y_v[amine],
                all_data[amine],
                all_labels[amine],
            )

            # Fit on the training set
            learner.train(warning=warning)

            # Optionally continue with active learning over the pool
            if active_learning:
                learner.active_learning(num_iter=active_learning_iter,
                                        warning=warning)

        if to_file:
            learner.store_metrics_to_file()

        # Keep the model around for future reproducibility
        if save_model:
            learner.save_model(model_name)
def run_model(KNN_params, category):
    """Run the KNN pipeline as a grid search or a full run.

    When the ``fine_tuning`` flag is set, a hyper-parameter grid is handed to
    ``grid_search`` and nothing else happens. Otherwise the dataset is built
    with ``process_dataset`` and one ``ActiveKNN`` instance per amine is
    trained on every draw, optionally followed by active learning, writing
    the metrics to file and saving the model.

    Args:
        KNN_params: A dictionary of the parameters for the KNN model.
            See initialize() for more information.
        category: A string representing the category the model is
            classified under. It selects the entry of the configuration
            mapping when that mapping is non-empty.
    """
    params = KNN_params

    # --- Settings shared by every model runner ---
    all_configs = params['configs']
    config = all_configs[category] if all_configs else None
    verbose = params['verbose']
    warning = params['warning']
    stats_path = params['stats_path']
    result_dict = params['result_dict']
    model_name = params['model_name']
    print(f'Running model {model_name}')

    # --- Settings describing how the training data is drawn ---
    num_draws = params['num_draws']
    train_size = params['train_size']
    active_learning_iter = params['active_learning_iter']
    cross_validation = params['cross_validate']
    full = params['full_dataset']
    active_learning = params['active_learning']
    w_hx = params['with_historical_data']
    w_k = params['with_k']
    draw_success = params['draw_success']

    # --- Which operation to perform ---
    fine_tuning = params['fine_tuning']
    save_model = params['save_model']
    to_file = True

    if fine_tuning:
        # Candidate values for every hyper-parameter explored by the search
        ft_params = {
            'n_neighbors': list(range(1, 10)),
            'leaf_size': list(range(1, 51)),
            'p': list(range(1, 4)),
        }
        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(
            ActiveKNN,
            ft_params,
            result_path,
            num_draws,
            train_size,
            active_learning_iter,
            active_learning=active_learning,
            w_hx=w_hx,
            w_k=w_k,
            draw_success=draw_success,
            result_dict=result_dict,
            model_name=model_name,
        )
        return

    # Build the dataset with the requested size and options
    dataset = process_dataset(
        num_draw=num_draws,
        train_size=train_size,
        active_learning_iter=active_learning_iter,
        verbose=verbose,
        cross_validation=cross_validation,
        full=full,
        active_learning=active_learning,
        w_hx=w_hx,
        w_k=w_k,
        success=draw_success,
    )

    draws = list(dataset.keys())
    amine_list = list(dataset[0]['x_t'].keys())

    for amine in amine_list:
        # One model instance per amine, reused across all draws
        learner = ActiveKNN(
            amine=amine,
            config=config,
            verbose=verbose,
            stats_path=stats_path,
            result_dict=result_dict,
            model_name=model_name,
        )

        for set_id in draws:
            # Pull out the pieces of this randomly drawn dataset
            draw = dataset[set_id]
            x_t, y_t = draw['x_t'], draw['y_t']
            x_v, y_v = draw['x_v'], draw['y_v']
            all_data, all_labels = draw['all_data'], draw['all_labels']

            # Hand the amine-specific training and validation sets to the model
            learner.load_dataset(
                set_id,
                x_t[amine],
                y_t[amine],
                x_v[amine],
                y_v[amine],
                all_data[amine],
                all_labels[amine],
            )

            # Fit on the training set
            learner.train(warning=warning)

            # Optionally continue with active learning over the pool
            if active_learning:
                learner.active_learning(num_iter=active_learning_iter,
                                        warning=warning)

        if to_file:
            learner.store_metrics_to_file()

        # Keep the model around for future reproducibility
        if save_model:
            learner.save_model(model_name)