def __init__(self, gradient_clipping_class_name, **kwargs):
    """
    Instantiates the gradient clipper object.

    :param gradient_clipping_class_name: dotted path of the gradient
        clipping class to use; resolved to a class via ``s2c``
    :param kwargs: keyword arguments forwarded verbatim to the gradient
        clipper's constructor
    """
    self.gradient_clipper = s2c(gradient_clipping_class_name)(**kwargs)
def __init__(self, model, optimizer_class_name, accumulate_gradients=False, **kwargs):
    """
    Builds an optimizer, resolved by name, over the given model's parameters.

    :param model: the model whose ``parameters()`` are handed to the optimizer
    :param optimizer_class_name: dotted path of the optimizer class; resolved
        to a class via ``s2c``
    :param accumulate_gradients: stored as-is; presumably controls gradient
        accumulation across mini-batches elsewhere
    :param kwargs: keyword arguments forwarded verbatim to the optimizer's
        constructor
    """
    super().__init__()
    optimizer_class = s2c(optimizer_class_name)
    self.optimizer = optimizer_class(model.parameters(), **kwargs)
    self.accumulate_gradients = accumulate_gradients
def __init__(self, scheduler_class_name, optimizer, **kwargs):
    """
    Instantiates a scheduler, resolved by name, bound to the given optimizer.

    :param scheduler_class_name: dotted path of the scheduler class; resolved
        to a class via ``s2c``
    :param optimizer: the optimizer instance the scheduler operates on
    :param kwargs: keyword arguments forwarded verbatim to the scheduler's
        constructor
    """
    scheduler_class = s2c(scheduler_class_name)
    self.scheduler = scheduler_class(optimizer, **kwargs)
def run_final_model(self, outer_k, debug):
    """
    Retrains and tests the winning configuration of an outer fold.

    Loads the winner config JSON produced by model selection, then runs
    ``self.final_training_runs`` independent training runs (to mitigate bad
    random initializations). Each run is launched as a Ray remote task,
    unless ``debug`` is True, in which case runs execute sequentially in
    this process and results are aggregated at the end.

    :param outer_k: zero-based index of the outer fold to process
    :param debug: whether to run sequentially in this process
        (no Ray multiprocessing)
    """
    outer_folder = osp.join(self._ASSESSMENT_FOLDER, self._OUTER_FOLD_BASE + str(outer_k + 1))
    config_fname = osp.join(outer_folder, self._SELECTION_FOLDER, self._WINNER_CONFIG)

    with open(config_fname, 'r') as f:
        best_config = json.load(f)

    # Build the data provider for this experiment
    dataset_getter_class = s2c(self.model_configs.dataset_getter)
    dataset_getter = dataset_getter_class(
        self.model_configs.data_root,
        self.splits_folder,
        s2c(self.model_configs.dataset_class),
        self.model_configs.dataset_name,
        self.outer_folds,
        self.inner_folds,
        self.model_configs.num_dataloader_workers,
        self.model_configs.pin_memory)

    # Tell the data provider to take data relative
    # to a specific OUTER split
    dataset_getter.set_outer_k(outer_k)
    # inner_k=None: final runs train on the whole outer training split,
    # not on an inner fold
    dataset_getter.set_inner_k(None)

    # Mitigate bad random initializations
    for i in range(self.final_training_runs):
        final_run_exp_path = osp.join(outer_folder, f"final_run{i+1}")
        final_run_torch_path = osp.join(final_run_exp_path, f'run_{i+1}_results.torch')

        # Retrain with the best configuration and test
        # Set up a log file for this experiment (run in a separate process)
        logger = Logger(osp.join(final_run_exp_path, 'experiment.log'), mode='a')
        logger.log(
            json.dumps(dict(outer_k=dataset_getter.outer_k,
                            inner_k=dataset_getter.inner_k,
                            **best_config),
                       sort_keys=False, indent=4))

        if not debug:
            # NOTE(review): foo closes over this iteration's loop variables
            # (final_run_torch_path, i, ...); Ray serializes the closure at
            # the .remote() call below, within the same iteration, so each
            # task should see its own iteration's values — confirm.
            @ray.remote(num_cpus=1, num_gpus=self.gpus_per_task)
            def foo():
                # Skip work if a previous (possibly interrupted) execution
                # already produced results for this run
                if not osp.exists(final_run_torch_path):
                    experiment = self.experiment_class(
                        best_config['config'], final_run_exp_path)
                    res = experiment.run_test(dataset_getter, logger)
                    torch.save(res, final_run_torch_path)
                return outer_k, i

            # Launch the job and append to list of final runs jobs
            future = foo.remote()
            self.final_runs_job_list.append(future)
            self.progress_manager.update_state(
                dict(type='START_FINAL_RUN', outer_fold=outer_k, run_id=i))
        else:
            if not osp.exists(final_run_torch_path):
                experiment = self.experiment_class(best_config['config'],
                                                   final_run_exp_path)
                # NOTE(review): here the result is unpacked and saved as a
                # (training_score, test_score) tuple, while the Ray branch
                # saves run_test's return value as-is — presumably run_test
                # returns exactly that tuple; verify both files have the
                # same on-disk shape.
                training_score, test_score = experiment.run_test(
                    dataset_getter, logger)
                torch.save((training_score, test_score), final_run_torch_path)

    if debug:
        # In debug mode runs completed synchronously, so aggregate now;
        # otherwise aggregation presumably happens when Ray futures resolve
        self.process_final_runs(outer_k)
def model_selection(self, kfold_folder, outer_k, debug):
    """
    Performs model selection by launching each configuration in parallel,
    unless debug is True. Each process trains the same configuration for
    each inner fold.

    If only one configuration exists, model selection is skipped and that
    configuration is written directly as the winner.

    :param kfold_folder: The root folder for model selection
    :param outer_k: the current outer fold to consider
    :param debug: whether to run the procedure in debug mode
        (no multiprocessing)
    """
    SELECTION_FOLDER = osp.join(kfold_folder, self._SELECTION_FOLDER)

    # Create the dataset provider
    dataset_getter_class = s2c(self.model_configs.dataset_getter)
    dataset_getter = dataset_getter_class(
        self.model_configs.data_root,
        self.splits_folder,
        s2c(self.model_configs.dataset_class),
        self.model_configs.dataset_name,
        self.outer_folds,
        self.inner_folds,
        self.model_configs.num_dataloader_workers,
        self.model_configs.pin_memory)

    # Tell the data provider to take data relative
    # to a specific OUTER split
    dataset_getter.set_outer_k(outer_k)

    if not osp.exists(SELECTION_FOLDER):
        os.makedirs(SELECTION_FOLDER)

    # if the # of configs to try is 1, simply skip model selection
    if len(self.model_configs) > 1:
        # Launch one job for each inner_fold for each configuration
        for config_id, config in enumerate(self.model_configs):
            # I need to make a copy of this dictionary
            # It seems it gets shared between processes!
            # NOTE(review): cfg is never used below — the loops and the Ray
            # closure all reference `config` directly; confirm whether cfg
            # should be passed instead.
            cfg = deepcopy(config)

            # Create a separate folder for each configuration
            config_folder = osp.join(
                SELECTION_FOLDER, self._CONFIG_BASE + str(config_id + 1))
            if not osp.exists(config_folder):
                os.makedirs(config_folder)

            for k in range(self.inner_folds):
                # Create a separate folder for each fold for each config.
                fold_exp_folder = osp.join(
                    config_folder, self._INNER_FOLD_BASE + str(k + 1))
                fold_results_torch_path = osp.join(
                    fold_exp_folder, f'fold_{str(k+1)}_results.torch')

                # Tell the data provider to take data relative
                # to a specific INNER split
                dataset_getter.set_inner_k(k)

                logger = Logger(osp.join(fold_exp_folder, 'experiment.log'),
                                mode='a')
                logger.log(
                    json.dumps(dict(outer_k=dataset_getter.outer_k,
                                    inner_k=dataset_getter.inner_k,
                                    **config),
                               sort_keys=False, indent=4))

                if not debug:
                    # NOTE(review): foo closes over this iteration's
                    # variables; Ray serializes the closure at .remote()
                    # within the same iteration, so each task should see
                    # its own fold/config values — confirm.
                    @ray.remote(num_cpus=1, num_gpus=self.gpus_per_task)
                    def foo():
                        # Skip folds already computed by a previous
                        # (possibly interrupted) execution
                        if not osp.exists(fold_results_torch_path):
                            experiment = self.experiment_class(
                                config, fold_exp_folder)
                            res = experiment.run_valid(
                                dataset_getter, logger)
                            torch.save(res, fold_results_torch_path)
                        return dataset_getter.outer_k, dataset_getter.inner_k, config_id

                    # Launch the job and append to list of outer jobs
                    future = foo.remote()
                    self.outer_folds_job_list.append(future)
                    self.progress_manager.update_state(
                        dict(type='START_CONFIG', outer_fold=outer_k,
                             inner_fold=k, config_id=config_id))
                else:  # debug mode
                    if not osp.exists(fold_results_torch_path):
                        experiment = self.experiment_class(
                            config, fold_exp_folder)
                        training_score, validation_score = experiment.run_valid(
                            dataset_getter, logger)
                        torch.save((training_score, validation_score),
                                   fold_results_torch_path)

            if debug:
                # Folds ran synchronously: aggregate this config's results now
                self.process_config(config_folder, deepcopy(config))

        if debug:
            # All configs ran synchronously: pick the winner now
            self.process_inner_results(SELECTION_FOLDER, config_id)
    else:
        # Performing model selection for a single configuration is useless
        with open(osp.join(SELECTION_FOLDER, self._WINNER_CONFIG), 'w') as fp:
            json.dump(dict(best_config_id=0, config=self.model_configs[0]),
                      fp, sort_keys=False, indent=4)
def _istantiate_scorer(self, scorer):
    """
    Builds a scorer instance from its configuration.

    :param scorer: either a dotted class-path string (instantiated with no
        arguments) or a dict with a ``class_name`` key and an ``args`` list
        of positional constructor arguments
    :return: the instantiated scorer object
    """
    # Plain string: resolve the class and build it with no arguments
    if not isinstance(scorer, dict):
        return s2c(scorer)()

    # Dict form: 'args' holds positional constructor arguments
    positional_args = scorer["args"]
    scorer_class = s2c(scorer["class_name"])
    return scorer_class(*positional_args)
def preprocess_data(options):
    """
    Instantiates the dataset (with optional pre_transform / pre_filter /
    transform pipelines), persists its construction kwargs to disk, and
    creates the data splits if they do not already exist.

    :param options: configuration dict holding at least a ``dataset`` and a
        ``splitter`` section; NOTE: the dict and its sub-dicts are consumed
        in place via ``pop``
    :raises ValueError: if ``class_name`` is missing from the dataset or
        splitter sections
    """
    data_info = options.pop("dataset")
    if "class_name" not in data_info:
        raise ValueError("You must specify 'class_name' in your dataset.")
    dataset_class = s2c(data_info.pop("class_name"))
    dataset_args = data_info.pop("args")
    data_root = data_info.pop("root")

    ################################
    # more experimental stuff here
    dataset_kwargs = data_info.pop('other_args', {})

    # Optional list of pre-transform specs; composed in declaration order
    pre_transforms = None
    pre_transforms_opt = data_info.pop("pre_transform", None)
    if pre_transforms_opt is not None:
        pre_transforms = []
        for pre_transform in pre_transforms_opt:
            pre_transform_class = s2c(pre_transform["class_name"])
            args = pre_transform.pop("args", {})
            pre_transforms.append(pre_transform_class(**args))
        dataset_kwargs.update(pre_transform=Compose(pre_transforms))

    # Optional list of pre-filter specs; only applied when the dataset class
    # actually accepts a 'pre_filter' argument (checked via check_argument).
    # NOTE(review): pre_transform/transform above and below do not perform
    # this check — confirm whether the asymmetry is intentional.
    pre_filters = None
    pre_filters_opt = data_info.pop("pre_filter", None)
    if pre_filters_opt is not None and check_argument(dataset_class, "pre_filter"):
        pre_filters = []
        for pre_filter in pre_filters_opt:
            pre_filter_class = s2c(pre_filter["class_name"])
            args = pre_filter.pop("args", {})
            pre_filters.append(pre_filter_class(**args))
        dataset_kwargs.update(pre_filter=Compose(pre_filters))

    # Optional list of runtime transform specs (config key is plural
    # "transforms", dataset kwarg is singular "transform")
    transforms = None
    transforms_opt = data_info.pop("transforms", None)
    if transforms_opt is not None:
        transforms = []
        for transform in transforms_opt:
            transform_class = s2c(transform["class_name"])
            args = transform.pop("args", {})
            transforms.append(transform_class(**args))
        dataset_kwargs.update(transform=Compose(transforms))

    dataset_args.update(dataset_kwargs)
    ################################

    dataset = dataset_class(**dataset_args)
    assert hasattr(dataset, 'name'), "Dataset instance should have a name attribute!"

    # Store dataset additional arguments in a separate file
    # (assumes the '<root>/<name>/processed' dir exists after dataset
    # construction — TODO confirm)
    kwargs_path = osp.join(data_root, dataset.name, 'processed', 'dataset_kwargs.pt')
    torch.save(dataset_args, kwargs_path)

    # Process data splits
    splits_info = options.pop("splitter")
    splits_root = splits_info.pop("root")
    if "class_name" not in splits_info:
        raise ValueError("You must specify 'class_name' in your splitter.")
    splitter_class = s2c(splits_info.pop("class_name"))
    splitter_args = splits_info.pop("args")
    splitter = splitter_class(**splitter_args)

    splits_dir = get_or_create_dir(osp.join(splits_root, dataset.name))
    splits_path = osp.join(
        splits_dir,
        f"{dataset.name}_outer{splitter.n_outer_folds}_inner{splitter.n_inner_folds}.splits"
    )

    if not os.path.exists(splits_path):
        # If there is a single target for each element of the dataset,
        # we can try to stratify samples according to the target
        # ow (node/link tasks) it is best if the specific splitter does the job for us
        # todo: this code could be improved to handle different cases more elegantly
        has_targets, targets = get_graph_targets(dataset)
        splitter.split(dataset, targets=targets if has_targets else None)
        splitter.save(splits_path)
    else:
        print("Data splits are already present, I will not overwrite them.")