def grid_search_AB():
    grid_search(data,
        lambda **kwargs: TimeCombiner(AvgModel(), TimeConcepts(concepts=concepts, **kwargs)),
        {"K": 0.25}, {
            "alpha": np.arange(0.2, 0.7, 0.2),
            "beta": np.arange(0.02, 0.1, 0.02),
        }, plot_axes=['alpha', 'beta'], time=True,
    )
def grid_search_AB2():
    grid_search(data,
        lambda **kwargs: TimeCombiner(AvgModel(), TimePriorCurrentModel(**kwargs)),
        {"KI": 0.3, "KC": 0.3}, {
            "alpha": np.arange(0.2, 1.1, 0.2),
            "beta": np.arange(0.02, 0.2, 0.02),
        }, plot_axes=['alpha', 'beta'], time=True,
    )
def grid_search_AB_basic():
    grid_search(data,
        lambda **kwargs: TimeCombiner(AvgModel(), BasicTimeModel(**kwargs)),
        {"K": 0.2}, {
            "alpha": np.arange(0.4, 1.3, 0.2),
            "beta": np.arange(0.06, 0.2, 0.02),
        }, plot_axes=['alpha', 'beta'], time=True,
    )
def grid(data, model):
    utils.grid_search(data, model,
        {"KC": 3, "KI": 0.5}, {
        # {"alpha": 0.25, "beta": 0.02}, {
            "alpha": np.arange(0.4, 1.7, 0.2),
            "beta": np.arange(0., 0.2, 0.02),
            # "KC": np.arange(1.5, 5.0, 0.25),
            # "KI": np.arange(0, 2.5, 0.25),
        # }, plot_axes=["KC", "KI"])
        }, plot_axes=["alpha", "beta"])

    plt.show()
def grid_search_Ks():
    grid_search(data,
        lambda **kwargs: TimeCombiner(AvgModel(), TimePriorCurrentModel(**kwargs)),
        {"alpha": 0.6, "beta": 0.1}, {
            "KC": np.arange(0.1, 0.7, 0.1),
            "KI": np.arange(0.1, 0.7, 0.1),
        }, plot_axes=['KI', 'KC'], time=True,
    )
def grid_search_K():
    grid_search(data,
        lambda **kwargs: TimeCombiner(AvgModel(), TimeConcepts(concepts=concepts, **kwargs)),
        {"alpha": 0.4, "beta": 0.05},
        {"K": np.arange(0, 0.5, 0.05)},
        plot_axes='K', time=True,
    )
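# A minimal standalone sketch of what the `grid_search` helper above is assumed
# to do: evaluate the model factory over the Cartesian product of the swept
# parameter ranges while holding the fixed parameters constant, and report a
# score per grid point (the real helper also plots along `plot_axes`). The
# `evaluate` callback here is hypothetical; the actual metric lives inside the
# project's grid_search implementation.
import itertools


def grid_search_sketch(data, model_factory, fixed, sweep, evaluate):
    """Return {swept-value tuple: score} for every point on the grid."""
    names = sorted(sweep)
    scores = {}
    for values in itertools.product(*(sweep[name] for name in names)):
        params = dict(fixed, **dict(zip(names, values)))
        scores[values] = evaluate(model_factory(**params), data)
    return scores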
def run_model(DecisionTree_params, category):
    """Full-scale training, validation and testing using all amines.

    Args:
        DecisionTree_params: A dictionary of the parameters for the decision tree
            model. See initialize() for more information.
        category: A string representing the category the model is classified under.
    """

    # Feature names hard-coded for decision tree visualization
    features = [
        '_rxn_M_acid', '_rxn_M_inorganic', '_rxn_M_organic', '_solv_GBL',
        '_solv_DMSO', '_solv_DMF', '_stoich_mmol_org', '_stoich_mmol_inorg',
        '_stoich_mmol_acid', '_stoich_mmol_solv', '_stoich_org/solv',
        '_stoich_inorg/solv', '_stoich_acid/solv', '_stoich_org+inorg/solv',
        '_stoich_org+inorg+acid/solv', '_stoich_org/liq', '_stoich_inorg/liq',
        '_stoich_org+inorg/liq', '_stoich_org/inorg', '_stoich_acid/inorg',
        '_rxn_Temperature_C', '_rxn_Reactiontime_s', '_feat_AvgPol',
        '_feat_Refractivity', '_feat_MaximalProjectionArea',
        '_feat_MaximalProjectionRadius', '_feat_maximalprojectionsize',
        '_feat_MinimalProjectionArea', '_feat_MinimalProjectionRadius',
        '_feat_minimalprojectionsize', '_feat_MolPol',
        '_feat_VanderWaalsSurfaceArea', '_feat_ASA', '_feat_ASA_H',
        '_feat_ASA_P', '_feat_ASA-', '_feat_ASA+',
        '_feat_ProtPolarSurfaceArea', '_feat_Hacceptorcount',
        '_feat_Hdonorcount', '_feat_RotatableBondCount',
        '_raw_standard_molweight', '_feat_AtomCount_N', '_feat_BondCount',
        '_feat_ChainAtomCount', '_feat_RingAtomCount', '_feat_primaryAmine',
        '_feat_secondaryAmine', '_rxn_plateEdgeQ', '_feat_maxproj_per_N',
        '_raw_RelativeHumidity'
    ]

    # Unload common parameters
    config = DecisionTree_params['configs'][category] if DecisionTree_params['configs'] else None
    verbose = DecisionTree_params['verbose']
    warning = DecisionTree_params['warning']
    stats_path = DecisionTree_params['stats_path']
    result_dict = DecisionTree_params['result_dict']
    model_name = DecisionTree_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = DecisionTree_params['num_draws']
    train_size = DecisionTree_params['train_size']
    active_learning_iter = DecisionTree_params['active_learning_iter']
    cross_validation = DecisionTree_params['cross_validate']
    full = DecisionTree_params['full_dataset']
    active_learning = DecisionTree_params['active_learning']
    w_hx = DecisionTree_params['with_historical_data']
    w_k = DecisionTree_params['with_k']
    draw_success = DecisionTree_params['draw_success']

    # Specify the desired operation
    fine_tuning = DecisionTree_params['fine_tuning']
    save_model = DecisionTree_params['save_model']
    visualize = DecisionTree_params['visualize']
    to_file = True

    if fine_tuning:
        class_weights = [{0: i, 1: 1.0 - i} for i in np.linspace(.05, .95, num=50)]
        class_weights.append('balanced')
        class_weights.append(None)

        max_depths = [i for i in range(9, 26)]
        max_depths.append(None)

        ft_params = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': max_depths,
            'min_samples_split': [i for i in range(2, 11)],
            'min_samples_leaf': [i for i in range(1, 4)],
            'class_weight': class_weights
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveDecisionTree,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)
    else:
        # Load the desired sized dataset under desired option
        dataset = process_dataset(num_draw=num_draws,
                                  train_size=train_size,
                                  active_learning_iter=active_learning_iter,
                                  verbose=verbose,
                                  cross_validation=cross_validation,
                                  full=full,
                                  active_learning=active_learning,
                                  w_hx=w_hx,
                                  w_k=w_k,
                                  success=draw_success)

        draws = list(dataset.keys())
        amine_list = list(dataset[0]['x_t'].keys())

        for amine in amine_list:
            # Create the decision tree model instance for the specific amine
            ADT = ActiveDecisionTree(amine=amine,
                                     config=config,
                                     verbose=verbose,
                                     stats_path=stats_path,
                                     result_dict=result_dict,
                                     model_name=model_name)
            for set_id in draws:
                # Unload the randomly drawn dataset values
                x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                    dataset[set_id]['y_t'], \
                    dataset[set_id]['x_v'], \
                    dataset[set_id]['y_v'], \
                    dataset[set_id]['all_data'], \
                    dataset[set_id]['all_labels']

                # Load the training and validation set into the model
                ADT.load_dataset(set_id, x_t[amine], y_t[amine], x_v[amine],
                                 y_v[amine], all_data[amine], all_labels[amine])

                # Train the data on the training set
                ADT.train(warning=warning)

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    ADT.active_learning(num_iter=active_learning_iter,
                                        warning=warning)

                if visualize:
                    # Plot the decision tree.
                    # To compile the graph, use the following command in terminal:
                    #     dot -Tpng "{dt_file_name}.dot" -o "{desired file name}.png"
                    # If using Jupyter Notebook, add ! in front to run command lines.
                    file_name = './results/{0:s}_dt_{1:s}_{2:d}.dot'.format(
                        model_name, amine, set_id)
                    export_graphviz(ADT.model,
                                    feature_names=features,
                                    class_names=['FAILURE', 'SUCCESS'],
                                    out_file=file_name,
                                    filled=True,
                                    rounded=True,
                                    special_characters=True)

            if to_file:
                ADT.store_metrics_to_file()

            # Save the model for future reproducibility
            if save_model:
                ADT.save_model(model_name)
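# Hypothetical invocation sketch: the exact keys are dictated by initialize()
# (not shown here), so treat this dictionary as an illustration of the shape
# run_model() expects rather than a documented configuration. All values below
# are made-up examples.
example_params = {
    'configs': None,              # fall back to the model's default config
    'verbose': True,
    'warning': True,
    'stats_path': './results/stats.pkl',
    'result_dict': {},
    'model_name': 'DecisionTree_example',
    'num_draws': 5,
    'train_size': 10,
    'active_learning_iter': 10,
    'cross_validate': True,
    'full_dataset': True,
    'active_learning': True,
    'with_historical_data': True,
    'with_k': True,
    'draw_success': False,
    'fine_tuning': False,
    'save_model': False,
    'visualize': False,
}
run_model(example_params, category='category_example')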
def run_model(RandomForest_params, category):
    """Full-scale training, validation and testing using all amines.

    Args:
        RandomForest_params: A dictionary of the parameters for the random forest
            model. See initialize() for more information.
        category: A string representing the category the model is classified under.
    """

    # Unload common parameters
    config = RandomForest_params['config'][category] if RandomForest_params['config'] else None
    verbose = RandomForest_params['verbose']
    warning = RandomForest_params['warning']
    stats_path = RandomForest_params['stats_path']
    result_dict = RandomForest_params['result_dict']
    model_name = RandomForest_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = RandomForest_params['num_draws']
    train_size = RandomForest_params['train_size']
    cross_validation = RandomForest_params['cross_validate']
    active_learning = RandomForest_params['active_learning']
    w_hx = RandomForest_params['with_historical_data']
    w_k = RandomForest_params['with_k']
    active_learning_iter = RandomForest_params['active_learning_iter']
    full = RandomForest_params['full_dataset']
    draw_success = RandomForest_params['draw_success']

    # Specify the desired operation
    fine_tuning = RandomForest_params['fine_tuning']
    save_model = RandomForest_params['save_model']
    to_file = True

    if fine_tuning:
        class_weights = [{0: i, 1: 1.0 - i} for i in np.linspace(.05, .95, num=50)]
        class_weights.append('balanced')
        class_weights.append(None)

        ft_params = {
            'n_estimators': [100, 200, 500, 1000],
            'criterion': ['gini', 'entropy'],
            'max_depth': [i for i in range(1, 9)],
            'max_features': ['auto', 'sqrt', 'log2', None],
            'bootstrap': [True],
            'min_samples_leaf': [i for i in range(1, 6)],
            'min_samples_split': [i for i in range(2, 11)],
            'ccp_alpha': [.1 * i for i in range(1)],
            'class_weight': class_weights
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveRandomForest,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)
    else:
        # Load the desired sized dataset under desired option
        dataset = process_dataset(num_draw=num_draws,
                                  train_size=train_size,
                                  active_learning_iter=active_learning_iter,
                                  verbose=verbose,
                                  cross_validation=cross_validation,
                                  full=full,
                                  active_learning=active_learning,
                                  w_hx=w_hx,
                                  w_k=w_k,
                                  success=draw_success)

        draws = list(dataset.keys())
        amine_list = list(dataset[0]['x_t'].keys())

        for amine in amine_list:
            # Create the RandomForest model instance for the specific amine
            ARF = ActiveRandomForest(amine=amine,
                                     config=config,
                                     verbose=verbose,
                                     stats_path=stats_path,
                                     result_dict=result_dict,
                                     model_name=model_name)
            for set_id in draws:
                # Unload the randomly drawn dataset values
                x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                    dataset[set_id]['y_t'], \
                    dataset[set_id]['x_v'], \
                    dataset[set_id]['y_v'], \
                    dataset[set_id]['all_data'], \
                    dataset[set_id]['all_labels']

                # Load the training and validation set into the model
                ARF.load_dataset(set_id, x_t[amine], y_t[amine], x_v[amine],
                                 y_v[amine], all_data[amine], all_labels[amine])

                # Train the data on the training set
                ARF.train(warning=warning)

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    ARF.active_learning(num_iter=active_learning_iter,
                                        warning=warning)

            if to_file:
                ARF.store_metrics_to_file()

            # Save the model for future reproducibility
            if save_model:
                ARF.save_model(model_name)
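# Side note (a sketch, not part of the original pipeline): the fine-tuning
# grids above grow multiplicatively, so it can help to print the number of
# candidate combinations before launching grid_search. Works on dict-of-lists
# grids such as ft_params.
from functools import reduce


def grid_size(param_grid):
    """Number of parameter combinations in a dict-of-lists grid."""
    return reduce(lambda acc, values: acc * len(values), param_grid.values(), 1)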
def run_model(LinearSVM_params, category):
    """Full-scale training, validation and testing using all amines.

    Args:
        LinearSVM_params: A dictionary of the parameters for the LinearSVM model.
            See initialize() for more information.
        category: A string representing the category the model is classified under.
    """

    # Unload common parameters
    config = LinearSVM_params['configs'][category] if LinearSVM_params['configs'] else None
    verbose = LinearSVM_params['verbose']
    warning = LinearSVM_params['warning']
    stats_path = LinearSVM_params['stats_path']
    model_name = LinearSVM_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    train_size = LinearSVM_params['train_size']
    active_learning_iter = LinearSVM_params['active_learning_iter']
    cross_validation = LinearSVM_params['cross_validate']
    full = LinearSVM_params['full_dataset']
    active_learning = LinearSVM_params['active_learning']
    w_hx = LinearSVM_params['with_historical_data']
    w_k = LinearSVM_params['with_k']

    # Specify the desired operation
    fine_tuning = LinearSVM_params['fine_tuning']
    save_model = LinearSVM_params['save_model']
    to_params = True

    if fine_tuning:
        class_weights = [{0: i, 1: 1.0 - i} for i in np.linspace(.1, .9, num=9)]
        class_weights.append('balanced')
        class_weights.append(None)

        ft_params = {
            # 'penalty': ['l1', 'l2'],
            'penalty': ['l1'],
            # 'loss': ['hinge', 'squared_hinge'],
            'loss': ['squared_hinge'],
            'dual': [False],
            # 'C': [.001, .01, .1, 1, 10],
            'C': [i for i in np.linspace(0.001, 0.01, num=10)],
            # 'tol': [.0001, .001, .01, .1, 1],
            'tol': [i for i in np.linspace(0.01, 0.1, num=10)],
            'fit_intercept': [True],
            'class_weight': class_weights,
        }

        _ = grid_search(ActiveLinearSVM,
                        ft_params,
                        train_size,
                        active_learning_iter,
                        active_learning=active_learning,
                        w_hx=w_hx,
                        w_k=w_k,
                        info=True)
    else:
        # Load the desired sized dataset under desired option
        amine_list, x_t, y_t, x_v, y_v, all_data, all_labels = process_dataset(
            train_size=train_size,
            active_learning_iter=active_learning_iter,
            verbose=verbose,
            cross_validation=cross_validation,
            full=full,
            active_learning=active_learning,
            w_hx=w_hx,
            w_k=w_k)
        # print(amine_list)

        for amine in amine_list:
            if cross_validation:
                # print("Training and cross validation on {} amine.".format(amine))

                # Create the LinearSVM model instance for the specific amine
                ALSVM = ActiveLinearSVM(amine=amine,
                                        config=config,
                                        verbose=verbose,
                                        stats_path=stats_path,
                                        model_name=model_name)

                # Load the training and validation set into the model
                ALSVM.load_dataset(x_t[amine], y_t[amine], x_v[amine],
                                   y_v[amine], all_data[amine], all_labels[amine])

                # Train the data on the training set
                ALSVM.train(warning=warning)

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    ALSVM.active_learning(num_iter=active_learning_iter,
                                          warning=warning,
                                          to_params=to_params)
                else:
                    ALSVM.store_metrics_to_params()

                # Save the model for future reproducibility
                if save_model:
                    ALSVM.save_model(model_name)
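# Note that this variant's process_dataset() returns a flat tuple rather than
# the draw-indexed dictionary consumed by the other run_model() functions. A
# hypothetical adapter between the two layouts (assuming draw 0 exists) might
# look like:
def flatten_first_draw(dataset):
    """Sketch: reduce a draw-indexed dataset to the flat tuple layout
    (amine_list, x_t, y_t, x_v, y_v, all_data, all_labels)."""
    first = dataset[0]
    return (list(first['x_t'].keys()), first['x_t'], first['y_t'],
            first['x_v'], first['y_v'], first['all_data'], first['all_labels'])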
        SkipHandler(EloHierarchicalModel(KC=1, KI=0.75, alpha=0.8, beta=0.02)),
        # EloHierarchicalModel(alpha=0.25, beta=0.02),
        # EloConcepts(),
    ],
    dont=0,
    force_evaluate=0,
    force_run=0,
    runs=5,
    hue_order=False,
    answer_filters={
        "long (50) student": data.filter_students_with_many_answers(),
        "long (30) student": data.filter_students_with_many_answers(number_of_answers=30),
        "long (11) student": data.filter_students_with_many_answers(number_of_answers=11),
        "response >5s-0.5": data.transform_response_by_time(((5, 0.5),)),
    },
    # palette=sns.color_palette()[:2] * 4
)

# evaluator.Evaluator(d, EloHierarchicalModel(alpha=0.25, beta=0.02)).brier_graphs()
# evaluator.Evaluator(d, EloPriorCurrentModel()).brier_graphs()
# evaluator.Evaluator(d, ItemAvgModel()).brier_graphs()

if 0:
    utils.grid_search(d, EloHierarchicalModel,
        # {"KC": 1, "KI": 0.75}, {
        {"alpha": 0.25, "beta": 0.02}, {
            # "alpha": np.arange(0.2, 1.3, 0.2),
            # "beta": np.arange(0., 0.2, 0.02),
            "KC": np.arange(1.5, 5.0, 0.25),
            "KI": np.arange(1.25, 4.5, 0.25),
        }, plot_axes=["KC", "KI"])
        # }, plot_axes=["alpha", "beta"])

    plt.show()
def grid_search_K_basic():
    grid_search(data,
        lambda **kwargs: TimeCombiner(AvgModel(), BasicTimeModel(**kwargs)),
        {"alpha": 0.6, "beta": 0.1},
        {"K": np.arange(0, 1, 0.05)},
        plot_axes='K', time=True,
    )
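# Hypothetical driver for the sweeps above (assumes `data`, `concepts`, and the
# model classes are already defined in this module): run one sweep at a time
# and inspect the resulting plot.
if __name__ == "__main__":
    grid_search_Ks()
    plt.show()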
def run_model(GradientBoosting_params, category):
    """Full-scale training, validation and testing using all amines.

    Args:
        GradientBoosting_params: A dictionary of the parameters for the Gradient
            Boosting model. See initialize() for more information.
        category: A string representing the category the model is classified under.
    """

    # Unload common parameters
    config = GradientBoosting_params['config'][category] if GradientBoosting_params['config'] else None
    verbose = GradientBoosting_params['verbose']
    warning = GradientBoosting_params['warning']
    stats_path = GradientBoosting_params['stats_path']
    result_dict = GradientBoosting_params['result_dict']
    model_name = GradientBoosting_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = GradientBoosting_params['num_draws']
    train_size = GradientBoosting_params['train_size']
    active_learning_iter = GradientBoosting_params['active_learning_iter']
    active_learning = GradientBoosting_params['active_learning']
    cross_validation = GradientBoosting_params['cross_validate']
    full = GradientBoosting_params['full_dataset']
    w_hx = GradientBoosting_params['with_historical_data']
    w_k = GradientBoosting_params['with_k']
    draw_success = GradientBoosting_params['draw_success']

    # Specify the desired operation
    fine_tuning = GradientBoosting_params['fine_tuning']
    save_model = GradientBoosting_params['save_model']
    to_file = True

    if fine_tuning:
        ft_params = {
            'loss': ['deviance', 'exponential'],
            'learning_rate': [0.1, 0.01, 0.001],
            'n_estimators': [100, 200, 500, 1000],
            'criterion': ['friedman_mse', 'mse', 'mae'],
            'max_depth': [i for i in range(1, 9)],
            'max_features': ['auto', 'sqrt', 'log2', None],
            'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [2, 5, 10],
            'ccp_alpha': [.1 * i for i in range(1)]
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveGradientBoosting,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)
    else:
        # Load the desired sized dataset under desired option
        dataset = process_dataset(num_draw=num_draws,
                                  train_size=train_size,
                                  active_learning_iter=active_learning_iter,
                                  verbose=verbose,
                                  cross_validation=cross_validation,
                                  full=full,
                                  active_learning=active_learning,
                                  w_hx=w_hx,
                                  w_k=w_k,
                                  success=draw_success)

        draws = list(dataset.keys())
        amine_list = list(dataset[0]['x_t'].keys())
        # print(training_batches.keys())

        for amine in amine_list:
            if amine == 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N' and draw_success:
                # Skipping the amine with only 1 successful experiment overall:
                # can't run the 4-ii and 5-ii models on this amine
                continue
            else:
                # Create the GradientBoosting model instance for the specific amine
                AGB = ActiveGradientBoosting(amine=amine,
                                             config=config,
                                             verbose=verbose,
                                             stats_path=stats_path,
                                             result_dict=result_dict,
                                             model_name=model_name)
                for set_id in draws:
                    # Unload the randomly drawn dataset values
                    x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                        dataset[set_id]['y_t'], \
                        dataset[set_id]['x_v'], \
                        dataset[set_id]['y_v'], \
                        dataset[set_id]['all_data'], \
                        dataset[set_id]['all_labels']

                    # Load the training and validation set into the model
                    AGB.load_dataset(set_id, x_t[amine], y_t[amine],
                                     x_v[amine], y_v[amine],
                                     all_data[amine], all_labels[amine])

                    # Train the data on the training set
                    AGB.train(warning=warning)

                    # Conduct active learning with all the observations available in the pool
                    if active_learning:
                        AGB.active_learning(num_iter=active_learning_iter,
                                            warning=warning)

                if to_file:
                    AGB.store_metrics_to_file()

                # Save the model for future reproducibility
                if save_model:
                    AGB.save_model(model_name)
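# Sketch (an assumption, not project code): the hard-coded skip of the
# single-success amine above could instead be derived from the pooled labels,
# e.g. with a helper like this:
def amines_with_enough_successes(all_labels, minimum=2):
    """Return amines whose pooled 0/1 labels contain at least `minimum` successes."""
    return [amine for amine, labels in all_labels.items()
            if int(sum(labels)) >= minimum]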
def run_model(KNN_params, category):
    """Full-scale training, validation and testing using all amines.

    Args:
        KNN_params: A dictionary of the parameters for the KNN model.
            See initialize() for more information.
        category: A string representing the category the model is classified under.
    """

    # Unload common parameters
    config = KNN_params['configs'][category] if KNN_params['configs'] else None
    verbose = KNN_params['verbose']
    warning = KNN_params['warning']
    stats_path = KNN_params['stats_path']
    result_dict = KNN_params['result_dict']
    model_name = KNN_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = KNN_params['num_draws']
    train_size = KNN_params['train_size']
    active_learning_iter = KNN_params['active_learning_iter']
    cross_validation = KNN_params['cross_validate']
    full = KNN_params['full_dataset']
    active_learning = KNN_params['active_learning']
    w_hx = KNN_params['with_historical_data']
    w_k = KNN_params['with_k']
    draw_success = KNN_params['draw_success']

    # Specify the desired operation
    fine_tuning = KNN_params['fine_tuning']
    save_model = KNN_params['save_model']
    to_file = True

    if fine_tuning:
        # Set all possible combinations
        ft_params = {
            'n_neighbors': [i for i in range(1, 10)],
            'leaf_size': [i for i in range(1, 51)],
            'p': [i for i in range(1, 4)]
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveKNN,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)
    else:
        # Load the desired sized dataset under desired option
        dataset = process_dataset(num_draw=num_draws,
                                  train_size=train_size,
                                  active_learning_iter=active_learning_iter,
                                  verbose=verbose,
                                  cross_validation=cross_validation,
                                  full=full,
                                  active_learning=active_learning,
                                  w_hx=w_hx,
                                  w_k=w_k,
                                  success=draw_success)

        draws = list(dataset.keys())
        amine_list = list(dataset[0]['x_t'].keys())

        for amine in amine_list:
            # Create the KNN model instance for the specific amine
            KNN = ActiveKNN(amine=amine,
                            config=config,
                            verbose=verbose,
                            stats_path=stats_path,
                            result_dict=result_dict,
                            model_name=model_name)
            for set_id in draws:
                # Unload the randomly drawn dataset values
                x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                    dataset[set_id]['y_t'], \
                    dataset[set_id]['x_v'], \
                    dataset[set_id]['y_v'], \
                    dataset[set_id]['all_data'], \
                    dataset[set_id]['all_labels']

                # Load the training and validation set into the model
                KNN.load_dataset(set_id, x_t[amine], y_t[amine], x_v[amine],
                                 y_v[amine], all_data[amine], all_labels[amine])

                # Train the data on the training set
                KNN.train(warning=warning)

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    KNN.active_learning(num_iter=active_learning_iter,
                                        warning=warning)

            if to_file:
                KNN.store_metrics_to_file()

            # Save the model for future reproducibility
            if save_model:
                KNN.save_model(model_name)
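# Illustrative sketch of the nested layout process_dataset() must return,
# inferred from the indexing above (draw id -> split name -> amine -> array).
# This is an assumption about the layout, not its documented contract; the
# amine key and array sizes below are made up for demonstration.
import numpy as np

fake_amine = 'EXAMPLE-AMINE'  # hypothetical key
fake_dataset = {
    set_id: {
        'x_t': {fake_amine: np.random.rand(10, 51)},      # 51 features, cf. `features`
        'y_t': {fake_amine: np.random.randint(0, 2, 10)},
        'x_v': {fake_amine: np.random.rand(40, 51)},
        'y_v': {fake_amine: np.random.randint(0, 2, 40)},
        'all_data': {fake_amine: np.random.rand(50, 51)},
        'all_labels': {fake_amine: np.random.randint(0, 2, 50)},
    }
    for set_id in range(5)
}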