Example #1
    def __init__(self, gradient_clipping_class_name, **kwargs):
        """
        Instantiates the gradient clipper object
        :param gradient_clipping_class_name: dotted path of the gradient clipping class
        :param kwargs: keyword arguments forwarded to its constructor
        """
        self.gradient_clipper = s2c(gradient_clipping_class_name)(**kwargs)
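Every example on this page delegates class resolution to a helper named s2c ("string to class"), whose definition is not shown here. A minimal sketch of such a resolver, assuming it simply maps a fully qualified dotted path to the corresponding class object via importlib, might look like the following (an illustration, not the library's actual implementation):

import importlib

import torch


def s2c(class_name):
    # Assumed behaviour: split 'package.module.ClassName' into module path and
    # attribute name, import the module, and return the class (not an instance).
    module_path, _, attr = class_name.rpartition('.')
    return getattr(importlib.import_module(module_path), attr)


# The pattern used throughout these examples: resolve a class from a
# configuration string, then instantiate it with keyword arguments.
model = torch.nn.Linear(4, 2)
optimizer = s2c('torch.optim.Adam')(model.parameters(), lr=1e-3)
scheduler = s2c('torch.optim.lr_scheduler.StepLR')(optimizer, step_size=10)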
Example #2
    def __init__(self,
                 model,
                 optimizer_class_name,
                 accumulate_gradients=False,
                 **kwargs):
        super().__init__()
        self.optimizer = s2c(optimizer_class_name)(model.parameters(),
                                                    **kwargs)
        self.accumulate_gradients = accumulate_gradients
Example #3
    def __init__(self, scheduler_class_name, optimizer, **kwargs):
        self.scheduler = s2c(scheduler_class_name)(optimizer, **kwargs)
Example #4
    def run_final_model(self, outer_k, debug):
        outer_folder = osp.join(self._ASSESSMENT_FOLDER,
                                self._OUTER_FOLD_BASE + str(outer_k + 1))
        config_fname = osp.join(outer_folder, self._SELECTION_FOLDER,
                                self._WINNER_CONFIG)

        with open(config_fname, 'r') as f:
            best_config = json.load(f)

        dataset_getter_class = s2c(self.model_configs.dataset_getter)
        dataset_getter = dataset_getter_class(
            self.model_configs.data_root, self.splits_folder,
            s2c(self.model_configs.dataset_class),
            self.model_configs.dataset_name, self.outer_folds,
            self.inner_folds, self.model_configs.num_dataloader_workers,
            self.model_configs.pin_memory)
        # Tell the data provider to take data relative
        # to a specific OUTER split
        dataset_getter.set_outer_k(outer_k)
        dataset_getter.set_inner_k(None)

        # Mitigate bad random initializations
        for i in range(self.final_training_runs):

            final_run_exp_path = osp.join(outer_folder, f"final_run{i+1}")
            final_run_torch_path = osp.join(final_run_exp_path,
                                            f'run_{i+1}_results.torch')

            # Retrain with the best configuration and test
            # Set up a log file for this experiment (run in a separate process)
            logger = Logger(osp.join(final_run_exp_path, 'experiment.log'),
                            mode='a')
            logger.log(
                json.dumps(dict(outer_k=dataset_getter.outer_k,
                                inner_k=dataset_getter.inner_k,
                                **best_config),
                           sort_keys=False,
                           indent=4))

            if not debug:

                @ray.remote(num_cpus=1, num_gpus=self.gpus_per_task)
                def foo():
                    if not osp.exists(final_run_torch_path):

                        experiment = self.experiment_class(
                            best_config['config'], final_run_exp_path)
                        res = experiment.run_test(dataset_getter, logger)
                        torch.save(res, final_run_torch_path)
                    return outer_k, i

                # Launch the job and append to list of final runs jobs
                future = foo.remote()
                self.final_runs_job_list.append(future)
                self.progress_manager.update_state(
                    dict(type='START_FINAL_RUN', outer_fold=outer_k, run_id=i))
            else:
                if not osp.exists(final_run_torch_path):
                    experiment = self.experiment_class(best_config['config'],
                                                       final_run_exp_path)
                    training_score, test_score = experiment.run_test(
                        dataset_getter, logger)
                    torch.save((training_score, test_score),
                               final_run_torch_path)
        if debug:
            self.process_final_runs(outer_k)
Example #5
    def model_selection(self, kfold_folder, outer_k, debug):
        """
        Performs model selection by launching each configuration in parallel, unless debug is True. Each process
        trains the same configuration for each inner fold.
        :param kfold_folder: The root folder for model selection
        :param outer_k: the current outer fold to consider
        :param debug: whether to run the procedure in debug mode (no multiprocessing)
        """
        SELECTION_FOLDER = osp.join(kfold_folder, self._SELECTION_FOLDER)

        # Create the dataset provider
        dataset_getter_class = s2c(self.model_configs.dataset_getter)
        dataset_getter = dataset_getter_class(
            self.model_configs.data_root, self.splits_folder,
            s2c(self.model_configs.dataset_class),
            self.model_configs.dataset_name, self.outer_folds,
            self.inner_folds, self.model_configs.num_dataloader_workers,
            self.model_configs.pin_memory)

        # Tell the data provider to take data relative
        # to a specific OUTER split
        dataset_getter.set_outer_k(outer_k)

        if not osp.exists(SELECTION_FOLDER):
            os.makedirs(SELECTION_FOLDER)

        # if the # of configs to try is 1, simply skip model selection
        if len(self.model_configs) > 1:

            # Launch one job for each inner_fold for each configuration
            for config_id, config in enumerate(self.model_configs):
                # I need to make a copy of this dictionary
                # It seems it gets shared between processes!
                cfg = deepcopy(config)

                # Create a separate folder for each configuration
                config_folder = osp.join(
                    SELECTION_FOLDER, self._CONFIG_BASE + str(config_id + 1))
                if not osp.exists(config_folder):
                    os.makedirs(config_folder)

                for k in range(self.inner_folds):
                    # Create a separate folder for each fold for each config.
                    fold_exp_folder = osp.join(
                        config_folder, self._INNER_FOLD_BASE + str(k + 1))
                    fold_results_torch_path = osp.join(
                        fold_exp_folder, f'fold_{str(k+1)}_results.torch')

                    # Tell the data provider to take data relative
                    # to a specific INNER split
                    dataset_getter.set_inner_k(k)

                    logger = Logger(osp.join(fold_exp_folder,
                                             'experiment.log'),
                                    mode='a')
                    logger.log(
                        json.dumps(dict(outer_k=dataset_getter.outer_k,
                                        inner_k=dataset_getter.inner_k,
                                        **config),
                                   sort_keys=False,
                                   indent=4))
                    if not debug:

                        @ray.remote(num_cpus=1, num_gpus=self.gpus_per_task)
                        def foo():
                            if not osp.exists(fold_results_torch_path):
                                experiment = self.experiment_class(
                                    config, fold_exp_folder)
                                res = experiment.run_valid(
                                    dataset_getter, logger)
                                torch.save(res, fold_results_torch_path)
                            return dataset_getter.outer_k, dataset_getter.inner_k, config_id

                        # Launch the job and append to list of outer jobs
                        future = foo.remote()
                        self.outer_folds_job_list.append(future)
                        self.progress_manager.update_state(
                            dict(type='START_CONFIG',
                                 outer_fold=outer_k,
                                 inner_fold=k,
                                 config_id=config_id))
                    else:  # debug mode
                        if not osp.exists(fold_results_torch_path):
                            experiment = self.experiment_class(
                                config, fold_exp_folder)
                            training_score, validation_score = experiment.run_valid(
                                dataset_getter, logger)
                            torch.save((training_score, validation_score),
                                       fold_results_torch_path)

                if debug:
                    self.process_config(config_folder, deepcopy(config))
            if debug:
                self.process_inner_results(SELECTION_FOLDER, config_id)
        else:
            # Performing model selection for a single configuration is useless
            with open(osp.join(SELECTION_FOLDER, self._WINNER_CONFIG),
                      'w') as fp:
                json.dump(dict(best_config_id=0, config=self.model_configs[0]),
                          fp,
                          sort_keys=False,
                          indent=4)
Example #6
    def _istantiate_scorer(self, scorer):
        if isinstance(scorer, dict):
            args = scorer["args"]
            return s2c(scorer['class_name'])(*args)
        else:
            return s2c(scorer)()
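The method above accepts a scorer specification either as a dictionary carrying 'class_name' plus positional 'args', or as a plain string that is resolved and instantiated with no arguments. Illustrative specifications (the dotted paths below are hypothetical placeholders, not names taken from the library) would look like this:

# Hypothetical scorer specifications for _istantiate_scorer; the dotted paths
# are placeholders and must point to real scorer classes in your project.
plain_spec = 'myproject.scorers.AccuracyScorer'       # instantiated as AccuracyScorer()
dict_spec = {'class_name': 'myproject.scorers.TopKAccuracy',
             'args': [5]}                             # instantiated as TopKAccuracy(5)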
Example #7
def preprocess_data(options):

    data_info = options.pop("dataset")
    if "class_name" not in data_info:
        raise ValueError("You must specify 'class_name' in your dataset.")
    dataset_class = s2c(data_info.pop("class_name"))
    dataset_args = data_info.pop("args")
    data_root = data_info.pop("root")

    ################################

    # more experimental stuff here

    dataset_kwargs = data_info.pop('other_args', {})

    pre_transforms = None
    pre_transforms_opt = data_info.pop("pre_transform", None)
    if pre_transforms_opt is not None:
        pre_transforms = []
        for pre_transform in pre_transforms_opt:
            pre_transform_class = s2c(pre_transform["class_name"])
            args = pre_transform.pop("args", {})
            pre_transforms.append(pre_transform_class(**args))
        dataset_kwargs.update(pre_transform=Compose(pre_transforms))

    pre_filters = None
    pre_filters_opt = data_info.pop("pre_filter", None)
    if pre_filters_opt is not None and check_argument(dataset_class,
                                                      "pre_filter"):
        pre_filters = []
        for pre_filter in pre_filters_opt:
            pre_filter_class = s2c(pre_filter["class_name"])
            args = pre_filter.pop("args", {})
            pre_filters.append(pre_filter_class(**args))
        dataset_kwargs.update(pre_filter=Compose(pre_filters))

    transforms = None
    transforms_opt = data_info.pop("transforms", None)
    if transforms_opt is not None:
        transforms = []
        for transform in transforms_opt:
            transform_class = s2c(transform["class_name"])
            args = transform.pop("args", {})
            transforms.append(transform_class(**args))
        dataset_kwargs.update(transform=Compose(transforms))

    dataset_args.update(dataset_kwargs)

    ################################

    dataset = dataset_class(**dataset_args)
    assert hasattr(dataset,
                   'name'), "Dataset instance should have a name attribute!"

    # Store dataset additional arguments in a separate file
    kwargs_path = osp.join(data_root, dataset.name, 'processed',
                           'dataset_kwargs.pt')
    torch.save(dataset_args, kwargs_path)

    # Process data splits

    splits_info = options.pop("splitter")
    splits_root = splits_info.pop("root")
    if "class_name" not in splits_info:
        raise ValueError("You must specify 'class_name' in your splitter.")
    splitter_class = s2c(splits_info.pop("class_name"))
    splitter_args = splits_info.pop("args")
    splitter = splitter_class(**splitter_args)

    splits_dir = get_or_create_dir(osp.join(splits_root, dataset.name))
    splits_path = osp.join(
        splits_dir,
        f"{dataset.name}_outer{splitter.n_outer_folds}_inner{splitter.n_inner_folds}.splits"
    )

    if not os.path.exists(splits_path):
        # If there is a single target for each element of the dataset,
        # we can try to stratify samples according to the target
        # otherwise (node/link tasks) it is best if the specific splitter does the job for us
        # todo: this code could be improved to handle different cases more elegantly
        has_targets, targets = get_graph_targets(dataset)
        splitter.split(dataset, targets=targets if has_targets else None)
        splitter.save(splits_path)
    else:
        print("Data splits are already present, I will not overwrite them.")
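preprocess_data reads its input by popping a fixed set of keys, so the options dictionary has to follow a specific shape: a 'dataset' section (with 'class_name', 'root', 'args', and optional 'other_args', 'pre_transform', 'pre_filter', 'transforms') and a 'splitter' section (with 'root', 'class_name', 'args'). A sketch of a compatible dictionary, with placeholder class names and argument values, could be:

# Placeholder configuration matching the keys popped above; the class names
# and argument values are illustrative only.
options = {
    'dataset': {
        'class_name': 'myproject.data.MyGraphDataset',
        'root': 'DATA',
        'args': {'use_node_attrs': True},
        'other_args': {},
        # 'pre_transform', 'pre_filter' and 'transforms' all take a list of
        # {'class_name': ..., 'args': {...}} entries, composed via Compose
        'pre_transform': [{'class_name': 'torch_geometric.transforms.AddSelfLoops',
                           'args': {}}],
    },
    'splitter': {
        'root': 'SPLITS',
        'class_name': 'myproject.splits.HoldoutSplitter',
        'args': {'n_outer_folds': 1, 'n_inner_folds': 1},
    },
}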