def _risk_assessment_helper(self, experiment_class, exp_path, debug=False, other=None):
    dataset_getter = DatasetGetter(None)

    best_config = self.model_selector.model_selection(dataset_getter, experiment_class, exp_path,
                                                      self.model_configs, debug, other)

    # Retrain with the best configuration and test
    experiment = experiment_class(best_config['config'], exp_path)

    # Set up a log file for this experiment (I am in a forked process)
    logger = Logger(str(os.path.join(experiment.exp_path, 'experiment.log')), mode='a')

    dataset_getter.set_inner_k(None)

    training_scores, test_scores = [], []

    # Mitigate bad random initializations
    for i in range(3):
        training_score, test_score = experiment.run_test(dataset_getter, logger, other)
        print(f'Final training run {i + 1}: {training_score}, {test_score}')

        training_scores.append(training_score)
        test_scores.append(test_score)

    training_score = sum(training_scores) / 3
    test_score = sum(test_scores) / 3

    logger.log('TR score: ' + str(training_score) + ' TS score: ' + str(test_score))

    with open(os.path.join(self._HOLDOUT_FOLDER, self._ASSESSMENT_FILENAME), 'w') as fp:
        json.dump({'best_config': best_config, 'HOLDOUT_TR': training_score, 'HOLDOUT_TS': test_score}, fp)
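# --- Hypothetical read-back sketch (assumptions noted below) ---
# The helper above serialises the winning configuration together with the
# averaged hold-out scores. This is a minimal sketch of loading that file
# again, assuming the same folder / filename pair used by
# _risk_assessment_helper; the function name load_holdout_assessment is
# illustrative only.
import json
import os

def load_holdout_assessment(holdout_folder, assessment_filename):
    with open(os.path.join(holdout_folder, assessment_filename), 'r') as fp:
        assessment = json.load(fp)
    # Keys written by _risk_assessment_helper above
    return assessment['best_config'], assessment['HOLDOUT_TR'], assessment['HOLDOUT_TS']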
def _model_selection_helper(self, dataset_getter, experiment_class, config, exp_config_name, other=None):
    """
    :param dataset_getter: data provider that returns the training/validation splits
    :param experiment_class: class of the experiment to instantiate
    :param config: the configuration (hyper-parameters) to evaluate
    :param exp_config_name: folder in which to store this experiment's outputs
    :param other: optional extra arguments forwarded to the experiment
    :return:
    """
    # Create the experiment object which will be responsible for running a specific experiment
    experiment = experiment_class(config, exp_config_name)

    # Set up a log file for this experiment (run in a separate process)
    logger = Logger(str(os.path.join(experiment.exp_path, 'experiment.log')), mode='a')
    logger.log('Configuration: ' + str(experiment.model_config))

    config_filename = os.path.join(experiment.exp_path, self._CONFIG_FILENAME)

    # ------------- PREPARE DICTIONARY TO STORE RESULTS -------------- #
    selection_dict = {
        'config': experiment.model_config.config_dict,
        'TR_score': 0.,
        'VL_score': 0.,
    }

    dataset_getter.set_inner_k(None)  # needs to stay this way

    training_score, validation_score = experiment.run_valid(dataset_getter, logger, other)

    selection_dict['TR_score'] = float(training_score)
    selection_dict['VL_score'] = float(validation_score)

    logger.log('TR Accuracy: ' + str(training_score) + ' VL Accuracy: ' + str(validation_score))

    with open(config_filename, 'w') as fp:
        json.dump(selection_dict, fp)
class AbstractTool(object):
    """
    This is an abstract class to represent a tool.
    """

    def __init__(self, tool_name=None, path_for_log_file='tmp/', parameters=None):
        """
        Class constructor

        @param tool_name: tool name for debugging purposes
        @type tool_name: string
        @param path_for_log_file: path to save the logs generated by the tool.
        @type path_for_log_file: string
        @param parameters: optional tool-specific parameters (not used by the base class).
        """
        self.tool_name = tool_name
        self.log = Logger(tool_name, path_for_log_file)

    def execute_agent(self, agent):
        """
        Executes the main method of the tool on an agent.

        @param agent: the agent the method should be executed on.
        @type agent: class Agent
        """
        raise NotImplementedError(
            "This method is abstract and must be implemented in derived classes."
        )

    def execute_model(self, model):
        """
        Executes the main method of the tool on a single model.

        @param model: the model the method should be executed on.
        @type model: class derived from tools.AbstractModel class
        """
        raise NotImplementedError(
            "This method is abstract and must be implemented in derived classes."
        )

    def _create_log(self, data):
        print('Saving log...')
        self.log.log(data)
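# --- Hypothetical usage sketch (assumptions noted below) ---
# A minimal example of how a concrete tool might subclass AbstractTool.
# The class name PrintSummaryTool is an illustrative assumption, not part of
# the framework; only execute_model is overridden here, so execute_agent
# still raises NotImplementedError as in the base class.
class PrintSummaryTool(AbstractTool):

    def __init__(self, path_for_log_file='tmp/'):
        super().__init__(tool_name='PrintSummaryTool',
                         path_for_log_file=path_for_log_file)

    def execute_model(self, model):
        # Record which model the tool was run on using the base-class logger.
        self._create_log('Executing on model: ' + str(model))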
def _model_selection_helper(self, dataset_getter, experiment_class, config, exp_config_name, other=None): # Set up a log file for this experiment (run in a separate process) logger = Logger(str(os.path.join(exp_config_name, 'experiment.log')), mode='a') logger.log('Configuration: ' + str(config)) config_filename = os.path.join(exp_config_name, self._CONFIG_FILENAME) # ------------- PREPARE DICTIONARY TO STORE RESULTS -------------- # k_fold_dict = { 'config': config, 'folds': [{} for _ in range(self.folds)], 'avg_TR_score': 0., 'avg_VL_score': 0., 'std_TR_score': 0., 'std_VL_score': 0. } for k in range(self.folds): dataset_getter.set_inner_k(k) fold_exp_folder = os.path.join(exp_config_name, 'FOLD_' + str(k + 1)) # Create the experiment object which will be responsible for running a specific experiment experiment = experiment_class(config, fold_exp_folder) training_score, validation_score = experiment.run_valid( dataset_getter, logger, other) logger.log( str(k + 1) + ' split, TR Accuracy: ' + str(training_score) + ' VL Accuracy: ' + str(validation_score)) k_fold_dict['folds'][k]['TR_score'] = training_score k_fold_dict['folds'][k]['VL_score'] = validation_score tr_scores = np.array( [k_fold_dict['folds'][k]['TR_score'] for k in range(self.folds)]) vl_scores = np.array( [k_fold_dict['folds'][k]['VL_score'] for k in range(self.folds)]) k_fold_dict['avg_TR_score'] = tr_scores.mean() k_fold_dict['std_TR_score'] = tr_scores.std() k_fold_dict['avg_VL_score'] = vl_scores.mean() k_fold_dict['std_VL_score'] = vl_scores.std() logger.log('TR avg is ' + str(k_fold_dict['avg_TR_score']) + ' std is ' + str(k_fold_dict['std_TR_score']) + ' VL avg is ' + str(k_fold_dict['avg_VL_score']) + ' std is ' + str(k_fold_dict['std_VL_score'])) with open(config_filename, 'w') as fp: json.dump(k_fold_dict, fp)
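# --- Hypothetical post-processing sketch (assumptions noted below) ---
# One way the per-configuration JSON files written by the k-fold helper above
# could be aggregated to pick a winner by average validation score. The
# directory layout (one sub-folder per configuration, each containing a copy
# of self._CONFIG_FILENAME) and the function name pick_best_config are
# illustrative assumptions; the framework's own selection logic may differ.
import glob
import json
import os

def pick_best_config(selection_folder, config_filename):
    best = None
    for path in glob.glob(os.path.join(selection_folder, '*', config_filename)):
        with open(path, 'r') as fp:
            result = json.load(fp)
        # Keys written by _model_selection_helper above
        if best is None or result['avg_VL_score'] > best['avg_VL_score']:
            best = result
    return best  # contains 'config', 'folds', 'avg_VL_score', ...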
def run_final_model(self, outer_k, debug): outer_folder = osp.join(self._ASSESSMENT_FOLDER, self._OUTER_FOLD_BASE + str(outer_k + 1)) config_fname = osp.join(outer_folder, self._SELECTION_FOLDER, self._WINNER_CONFIG) with open(config_fname, 'r') as f: best_config = json.load(f) dataset_getter_class = s2c(self.model_configs.dataset_getter) dataset_getter = dataset_getter_class( self.model_configs.data_root, self.splits_folder, s2c(self.model_configs.dataset_class), self.model_configs.dataset_name, self.outer_folds, self.inner_folds, self.model_configs.num_dataloader_workers, self.model_configs.pin_memory) # Tell the data provider to take data relative # to a specific OUTER split dataset_getter.set_outer_k(outer_k) dataset_getter.set_inner_k(None) # Mitigate bad random initializations for i in range(self.final_training_runs): final_run_exp_path = osp.join(outer_folder, f"final_run{i+1}") final_run_torch_path = osp.join(final_run_exp_path, f'run_{i+1}_results.torch') # Retrain with the best configuration and test # Set up a log file for this experiment (run in a separate process) logger = Logger(osp.join(final_run_exp_path, 'experiment.log'), mode='a') logger.log( json.dumps(dict(outer_k=dataset_getter.outer_k, inner_k=dataset_getter.inner_k, **best_config), sort_keys=False, indent=4)) if not debug: @ray.remote(num_cpus=1, num_gpus=self.gpus_per_task) def foo(): if not osp.exists(final_run_torch_path): experiment = self.experiment_class( best_config['config'], final_run_exp_path) res = experiment.run_test(dataset_getter, logger) torch.save(res, final_run_torch_path) return outer_k, i # Launch the job and append to list of final runs jobs future = foo.remote() self.final_runs_job_list.append(future) self.progress_manager.update_state( dict(type='START_FINAL_RUN', outer_fold=outer_k, run_id=i)) else: if not osp.exists(final_run_torch_path): experiment = self.experiment_class(best_config['config'], final_run_exp_path) training_score, test_score = experiment.run_test( dataset_getter, logger) torch.save((training_score, test_score), final_run_torch_path) if debug: self.process_final_runs(outer_k)
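# --- Hypothetical aggregation sketch (assumptions noted below) ---
# The final runs above persist (training_score, test_score) tuples as torch
# files named run_{i+1}_results.torch. The framework's own process_final_runs
# is not shown here; the function below is only an illustrative sketch of how
# such files could be averaged once every run has finished.
import os.path as osp
import torch

def average_final_runs(outer_folder, n_runs):
    training_scores, test_scores = [], []
    for i in range(n_runs):
        path = osp.join(outer_folder, f'final_run{i + 1}', f'run_{i + 1}_results.torch')
        training_score, test_score = torch.load(path)
        training_scores.append(training_score)
        test_scores.append(test_score)
    return sum(training_scores) / n_runs, sum(test_scores) / n_runs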
def model_selection(self, kfold_folder, outer_k, debug): """ Performs model selection by launching each configuration in parallel, unless debug is True. Each process trains the same configuration for each inner fold. :param kfold_folder: The root folder for model selection :param outer_k: the current outer fold to consider :param debug: whether to run the procedure in debug mode (no multiprocessing) """ SELECTION_FOLDER = osp.join(kfold_folder, self._SELECTION_FOLDER) # Create the dataset provider dataset_getter_class = s2c(self.model_configs.dataset_getter) dataset_getter = dataset_getter_class( self.model_configs.data_root, self.splits_folder, s2c(self.model_configs.dataset_class), self.model_configs.dataset_name, self.outer_folds, self.inner_folds, self.model_configs.num_dataloader_workers, self.model_configs.pin_memory) # Tell the data provider to take data relative # to a specific OUTER split dataset_getter.set_outer_k(outer_k) if not osp.exists(SELECTION_FOLDER): os.makedirs(SELECTION_FOLDER) # if the # of configs to try is 1, simply skip model selection if len(self.model_configs) > 1: # Launch one job for each inner_fold for each configuration for config_id, config in enumerate(self.model_configs): # I need to make a copy of this dictionary # It seems it gets shared between processes! cfg = deepcopy(config) # Create a separate folder for each configuration config_folder = osp.join( SELECTION_FOLDER, self._CONFIG_BASE + str(config_id + 1)) if not osp.exists(config_folder): os.makedirs(config_folder) for k in range(self.inner_folds): # Create a separate folder for each fold for each config. fold_exp_folder = osp.join( config_folder, self._INNER_FOLD_BASE + str(k + 1)) fold_results_torch_path = osp.join( fold_exp_folder, f'fold_{str(k+1)}_results.torch') # Tell the data provider to take data relative # to a specific INNER split dataset_getter.set_inner_k(k) logger = Logger(osp.join(fold_exp_folder, 'experiment.log'), mode='a') logger.log( json.dumps(dict(outer_k=dataset_getter.outer_k, inner_k=dataset_getter.inner_k, **config), sort_keys=False, indent=4)) if not debug: @ray.remote(num_cpus=1, num_gpus=self.gpus_per_task) def foo(): if not osp.exists(fold_results_torch_path): experiment = self.experiment_class( config, fold_exp_folder) res = experiment.run_valid( dataset_getter, logger) torch.save(res, fold_results_torch_path) return dataset_getter.outer_k, dataset_getter.inner_k, config_id # Launch the job and append to list of outer jobs future = foo.remote() self.outer_folds_job_list.append(future) self.progress_manager.update_state( dict(type='START_CONFIG', outer_fold=outer_k, inner_fold=k, config_id=config_id)) else: # debug mode if not osp.exists(fold_results_torch_path): experiment = self.experiment_class( config, fold_exp_folder) training_score, validation_score = experiment.run_valid( dataset_getter, logger) torch.save((training_score, validation_score), fold_results_torch_path) if debug: self.process_config(config_folder, deepcopy(config)) if debug: self.process_inner_results(SELECTION_FOLDER, config_id) else: # Performing model selection for a single configuration is useless with open(osp.join(SELECTION_FOLDER, self._WINNER_CONFIG), 'w') as fp: json.dump(dict(best_config_id=0, config=self.model_configs[0]), fp, sort_keys=False, indent=4)
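# --- Hypothetical driver sketch (assumptions noted below) ---
# One way the two methods above could be chained for a full nested
# cross-validation run. The loop is shown in debug mode, where inner results
# are processed synchronously and the winning configuration file exists before
# the final runs start; with Ray enabled, the framework schedules these steps
# through its own job lists instead. The function name run_nested_cv and the
# attribute accesses below are illustrative assumptions.
import os.path as osp

def run_nested_cv(assessor):
    for outer_k in range(assessor.outer_folds):
        outer_folder = osp.join(assessor._ASSESSMENT_FOLDER,
                                assessor._OUTER_FOLD_BASE + str(outer_k + 1))
        assessor.model_selection(outer_folder, outer_k, debug=True)
        assessor.run_final_model(outer_k, debug=True)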