Example #1
def experiment(
    experiment_config,
    exp_prefix,
    variant,
    gpu_kwargs=None,
    log_to_wandb=False,
):
    """
    Reset timers
    (Useful if running multiple seeds from same command)
    """

    gt.reset()
    gt.start()
    """
    Setup logging
    """

    seed = variant['seed']
    setup_logger(exp_prefix,
                 variant=variant,
                 seed=seed,
                 log_to_wandb=log_to_wandb)
    output_csv = logger.get_tabular_output()
    """
    Set GPU mode for PyTorch (+ possibly other things later)
    """

    if gpu_kwargs is None:
        gpu_kwargs = {'mode': False}
    ptu.set_gpu_mode(**gpu_kwargs)
    """
    Set experiment seeds
    """

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    """
    Environment setup
    """

    envs_list = variant.get('envs_list', None)

    if envs_list is None:
        expl_env, env_infos = make_env(variant['env_name'],
                                       **variant.get('env_kwargs', {}))

    else:
        # TODO: not sure if this is tested
        if len(envs_list) == 0:
            raise AttributeError('length of envs_list is zero')
        switch_every = variant['switch_every']
        expl_envs = []
        for env_params in envs_list:
            expl_env, env_infos = make_env(**env_params)
            expl_envs.append(expl_env)
        expl_env = ContinualLifelongEnv(expl_envs[0], switch_every, expl_envs)

    obs_dim = get_dim(expl_env.observation_space)
    action_dim = get_dim(expl_env.action_space)

    if env_infos['mujoco']:
        replay_buffer = MujocoReplayBuffer(variant['replay_buffer_size'],
                                           expl_env)
    else:
        replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'],
                                        expl_env)

    eval_env = FollowerEnv(expl_env)
    """
    Import any teacher data
    """

    if 'teacher_data_files' in variant:
        for data_file in variant['teacher_data_files']:
            if 'max_teacher_transitions' in variant:
                add_transitions(
                    replay_buffer,
                    data_file,
                    obs_dim,
                    action_dim,
                    max_transitions=variant['max_teacher_transitions'],
                )
            else:
                add_transitions(replay_buffer, data_file, obs_dim, action_dim)
    """
    Experiment-specific configuration
    """

    config = experiment_config['get_config'](
        variant,
        expl_env=expl_env,
        eval_env=eval_env,
        obs_dim=obs_dim,
        action_dim=action_dim,
        replay_buffer=replay_buffer,
    )

    if 'load_config' in experiment_config:
        experiment_config['load_config'](config, variant, gpu_kwargs)

    if 'algorithm_kwargs' not in config:
        config['algorithm_kwargs'] = variant.get('algorithm_kwargs', dict())
    if 'offline_kwargs' not in config:
        config['offline_kwargs'] = variant.get('offline_kwargs', dict())
    """
    Path collectors for sampling from environment
    """

    collector_type = variant.get('collector_type', 'step')
    exploration_policy = config['exploration_policy']
    if collector_type == 'step':
        expl_path_collector = MdpStepCollector(expl_env, exploration_policy)
    elif collector_type == 'batch':
        expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    elif collector_type == 'batch_latent':
        expl_path_collector = LatentPathCollector(
            sample_latent_every=None,
            env=expl_env,
            policy=exploration_policy,
        )
    elif collector_type == 'rf':
        expl_path_collector = RFCollector(expl_env, exploration_policy)
    else:
        raise NotImplementedError(
            'Unrecognized collector_type: {}'.format(collector_type))

    if collector_type == 'gcr':
        eval_path_collector = GoalConditionedReplayStepCollector(
            eval_env,
            config['evaluation_policy'],
            replay_buffer,
            variant['resample_goal_every'],
        )
    else:
        eval_path_collector = MdpPathCollector(
            eval_env,
            config['evaluation_policy'],
        )
    """
    Finish timer
    """

    gt.stamp('initialization', unique=False)
    """
    Offline RL pretraining
    """

    if 'get_offline_algorithm' in experiment_config and variant.get(
            'do_offline_training', False):
        logger.set_tabular_output(
            os.path.join(logger.log_dir, 'offline_progress.csv'))

        offline_algorithm = experiment_config['get_offline_algorithm'](
            config,
            eval_path_collector=eval_path_collector,
        )
        offline_algorithm.to(ptu.device)
        offline_algorithm.train()

        logger.set_tabular_output(output_csv)
    """
    Generate algorithm that performs training
    """

    if 'get_algorithm' in experiment_config and variant.get(
            'do_online_training', True):
        algorithm = experiment_config['get_algorithm'](
            config,
            expl_path_collector=expl_path_collector,
            eval_path_collector=eval_path_collector,
        )
        algorithm.to(ptu.device)
        algorithm.train()
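
For reference, here is a minimal sketch of how this entry point might be invoked. Only the keys that experiment() actually reads above are shown; the environment name, prefix, and numeric values are placeholders, and experiment_config itself is project-specific.

variant = {
    'seed': 0,
    'env_name': 'HalfCheetah-v2',        # placeholder; forwarded to make_env
    'env_kwargs': {},
    'replay_buffer_size': int(1e6),
    'collector_type': 'batch',           # 'step' | 'batch' | 'batch_latent' | 'rf'
    'algorithm_kwargs': dict(num_epochs=100),
    'do_offline_training': False,
    'do_online_training': True,
}

# experiment_config must provide 'get_config' and, for online training,
# 'get_algorithm' (optionally also 'load_config' and 'get_offline_algorithm').
experiment(
    experiment_config,
    exp_prefix='my-experiment',
    variant=variant,
    gpu_kwargs={'mode': True},           # passed through to ptu.set_gpu_mode
    log_to_wandb=False,
)
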
Example #2
    def _train(self):

        batch_idxes = np.arange(self.num_tasks)

        gt.start()

        for epoch in gt.timed_for(
                trange(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            # Distribute the evaluation. We ship the
            # params of each needed network to the
            # remote path collector

            params_list = []
            for net in self.policy.networks:
                params_list.append(ptu.state_dict_cpu(net))

            self.path_collector.set_policy_params(params_list)

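            # Dispatch evaluation of the training goals in chunks of at most
            # num_workers goals per call; each async_evaluate call returns Ray
            # object ids that are gathered with ray.get() after the training
            # loops below.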
            evaluation_train_obj_id_list = []
            count = 0
            while count < len(self.train_goals):
                if len(self.train_goals) - count < self.num_workers:
                    evaluation_obj_id = self.path_collector.async_evaluate(
                        self.train_goals[count:])
                    count = len(self.train_goals)
                else:
                    evaluation_obj_id = self.path_collector.async_evaluate(
                        self.train_goals[count:count + self.num_workers])
                    count += self.num_workers
                evaluation_train_obj_id_list.extend(evaluation_obj_id)

            assert len(evaluation_train_obj_id_list) == len(
                self.train_goals
            ), f'{len(evaluation_train_obj_id_list)}, {len(self.train_goals)}'

            evaluation_wd_obj_id_list = []
            count = 0
            while count < len(self.wd_goals):
                if len(self.wd_goals) - count < self.num_workers:
                    evaluation_obj_id = self.path_collector.async_evaluate(
                        self.wd_goals[count:])
                    count = len(self.wd_goals)
                else:
                    evaluation_obj_id = self.path_collector.async_evaluate(
                        self.wd_goals[count:count + self.num_workers])
                    count += self.num_workers
                evaluation_wd_obj_id_list.extend(evaluation_obj_id)

            assert len(evaluation_wd_obj_id_list) == len(self.wd_goals)

            # evaluation_ood_obj_id_list = []
            # count = 0
            # while count < len(self.ood_goals) :
            #     if len(self.ood_goals) - count < self.num_workers:
            #         evaluation_obj_id = self.path_collector.async_evaluate(self.ood_goals[count:])
            #         count = len(self.ood_goals)
            #     else:
            #         evaluation_obj_id = self.path_collector.async_evaluate(self.ood_goals[count:count + self.num_workers])
            #         count += self.num_workers
            #     evaluation_ood_obj_id_list.extend(evaluation_obj_id)

            # assert len(evaluation_ood_obj_id_list) == len(self.ood_goals)

            gt.stamp('set_up_evaluation', unique=False)

            train_batch_obj_id = self.train_buffer.sample_training_data(
                batch_idxes)

            for _ in trange(self.num_train_loops_per_epoch):
                train_raw_batch = ray.get(train_batch_obj_id)

                gt.stamp('sample_training_data', unique=False)

                # In this way, we can start the data sampling job for the
                # next training while doing training for the current loop.
                train_batch_obj_id = self.train_buffer.sample_training_data(
                    batch_idxes)

                gt.stamp('set_up_sampling', unique=False)

                train_data = self.construct_training_batch(train_raw_batch)
                gt.stamp('construct_training_batch', unique=False)

                self.policy.train(train_data)
                gt.stamp('training', unique=False)

            eval_train_returns = ray.get(evaluation_train_obj_id_list)

            self.avg_train_episode_returns = [
                item[0] for item in eval_train_returns
            ]
            self.final_train_achieved = [
                item[1] for item in eval_train_returns
            ]
            self.train_avg_returns = np.mean(self.avg_train_episode_returns)

            eval_wd_returns = ray.get(evaluation_wd_obj_id_list)

            self.avg_wd_episode_returns = [item[0] for item in eval_wd_returns]
            self.final_wd_achieved = [item[1] for item in eval_wd_returns]
            self.wd_avg_returns = np.mean(self.avg_wd_episode_returns)

            # eval_ood_returns = ray.get(evaluation_ood_obj_id_list)

            # self.avg_ood_episode_returns = [item[0] for item in eval_ood_returns]
            # self.final_ood_achieved = [item[1] for item in eval_ood_returns]
            # self.ood_avg_returns = np.mean(self.avg_ood_episode_returns)

            gt.stamp('evaluation', unique=False)

            self._end_epoch(epoch)
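
The inner loop above overlaps data sampling with training: it blocks on the batch requested in the previous iteration and immediately queues the next request before training on the current data. Below is a small self-contained sketch of that prefetching pattern, with a hypothetical Ray actor (ToyBuffer) standing in for self.train_buffer.

import numpy as np
import ray

ray.init(ignore_reinit_error=True)


@ray.remote
class ToyBuffer:
    """Hypothetical stand-in for the remote replay buffer."""

    def sample_training_data(self, batch_idxes):
        return {int(i): np.random.randn(8, 4) for i in batch_idxes}


buffer = ToyBuffer.remote()
batch_idxes = np.arange(4)

# Request the first batch before entering the loop.
batch_obj_id = buffer.sample_training_data.remote(batch_idxes)

for _ in range(3):
    # Block on the batch needed for this iteration...
    raw_batch = ray.get(batch_obj_id)
    # ...and immediately queue the next request so sampling runs in the
    # background while this iteration trains on raw_batch.
    batch_obj_id = buffer.sample_training_data.remote(batch_idxes)
    # (the training step on raw_batch goes here)
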
Example #3
        coreset = None  # not needed
        data.x_mean, data.x_std = data.get_statistics(
            data.X[data.index['train']])
    else:
        raise ValueError('Invalid coreset algorithm: {}'.format(args.coreset))

    test_performances = {
        'LL': [],
        'RMSE': [],
        'wt': [0.],
        'wt_batch': [0.],
        'num_samples': []
    }
    test_nll, test_performance = np.zeros(1), np.zeros(1)

    gt.start()
    while len(data.index['train']) < args.init_num_labeled + args.budget:
        print('{}: Number of samples {}/{}'.format(
            args.seed,
            len(data.index['train']) - args.init_num_labeled, args.budget))

        optim_params = {
            'num_epochs': args.training_epochs,
            'batch_size': get_batch_size(args.dataset, data),
            'weight_decay': args.weight_decay,
            'initial_lr': args.initial_lr
        }
        nl = NeuralLinearTB(data, out_features=out_features, **kwargs)
        # nl = NeuralLinear(data, out_features=out_features, **kwargs)
        nl = utils.to_gpu(nl)
        nl.optimize(data, **optim_params)
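
All three snippets above follow the same gtimer idiom: call gt.start() once, gt.stamp(...) after each phase, and wrap the epoch loop in gt.timed_for(...) so per-phase wall-clock times accumulate automatically. A minimal standalone sketch of that idiom follows; the phase names and sleep workload are invented.

import time

import gtimer as gt

gt.reset()   # clear state left over from a previous run in this process
gt.start()

time.sleep(0.1)
gt.stamp('initialization', unique=False)

for epoch in gt.timed_for(range(3), save_itrs=True):
    time.sleep(0.05)
    gt.stamp('sampling', unique=False)
    time.sleep(0.05)
    gt.stamp('training', unique=False)

# Accumulated timings can then be inspected, e.g. via gt.get_times().
print(gt.get_times())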