Example #1
    def train_batch(self, start_epoch):
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            # This implementation is rather naive. If you want to (e.g.)
            # parallelize data collection, this would be the place to do it.
            for _ in range(self.num_env_steps_per_epoch):
                observation = self._take_step_in_env(observation)
            gt.stamp('sample')

            self._try_to_train()
            gt.stamp('train')

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')

            self._try_to_fit(epoch)
            gt.stamp('env_fit')
            self.logger.record_tabular(self.env_loss_key, self.env_loss)

            self._end_epoch(epoch)
            self.logger.dump_tabular(with_prefix=False, with_timestamp=False)
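All of the examples on this page follow the same gtimer pattern: wrap the epoch iterable in gt.timed_for(..., save_itrs=True) so per-iteration times are saved, call gt.stamp(...) after each phase to record a named sub-interval, and read the results back through gt.report() or gt.get_times(). A minimal, self-contained sketch of that pattern (the phase names and time.sleep calls below are placeholders, not taken from any example on this page):

import time
import gtimer as gt

gt.reset()                 # start from a clean timer state
gt.set_def_unique(False)   # allow re-using the same stamp name every iteration

for epoch in gt.timed_for(range(3), save_itrs=True):
    time.sleep(0.01)       # stand-in for environment sampling
    gt.stamp('sample')

    time.sleep(0.02)       # stand-in for gradient updates
    gt.stamp('train')

times = gt.get_times()
print(times.stamps.itrs['train'])     # per-iteration durations of the 'train' phase
print(gt.report(include_itrs=False))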
Example #2
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            for _ in range(self.num_env_steps_per_epoch):
                observation = self._take_step_in_env(observation)
                gt.stamp('sample')

                self._try_to_fit(epoch)
                gt.stamp('env_fit')

                self._try_to_train()
                gt.stamp('train')

            self.logger.record_tabular(self.env_loss_key, self.env_loss)

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)

            self.logger.dump_tabular(with_prefix=False, with_timestamp=False)
Example #3
    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            if it_ == 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:
                    self.task_idx = idx
                    self.env.reset_task(idx)
                    self.collect_data(self.num_initial_steps, 1, np.inf)
            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):
                idx = np.random.randint(len(self.train_tasks))
                self.task_idx = idx
                self.env.reset_task(idx)
                self.enc_replay_buffer.task_buffers[idx].clear()

                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    self.collect_data(self.num_steps_prior, 1, np.inf)
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    self.collect_data(self.num_steps_posterior, 1,
                                      self.update_post_train)
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    self.collect_data(self.num_extra_rl_steps_posterior,
                                      1,
                                      self.update_post_train,
                                      add_to_enc_buffer=False)

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(self.num_train_steps_per_itr):
                indices = np.random.choice(self.train_tasks, self.meta_batch)
                self._do_training(indices)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()
Example #4
    def train_online(self, start_epoch=0):
        if not self.environment_farming:
            observation = self._start_new_rollout()
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)

            for _ in range(self.num_env_steps_per_epoch):
                if not self.environment_farming:
                    observation = self.play_one_step(observation)
                else:
                    # acquire a remote environment
                    remote_env = self.farmer.force_acq_env()
                    self.play_ignore(remote_env)

                # Training out of threads
                self._try_to_train()
                gt.stamp('train')

            if epoch % 10 == 0:
                self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch()
Example #5
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()

        observation = self._start_new_rollout()
        self.sample_z = self.sample_z_vec()
        observation = np.concatenate([observation, self.sample_z])

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            for t in range(self.num_env_steps_per_epoch):
                #print("step", t, "pool", self.replay_buffer.num_steps_can_sample())

                observation = self._take_step_in_env(observation)
                gt.stamp('sample')

                self._try_to_train()
                gt.stamp('train')

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Example #6
    def debug(self):
        gt.reset()
        gt.set_def_unique(False)

        for i in gt.timed_for(range(self.iteration_num), save_itrs=True):
            self._start_iteration(i)

            if i % self.task_sample_frequency == 0:
                self.logger.log('Data Collection')
                task = self._sample_task()
                rollouts = self._collect_traj(task, debug=True)
                self._n_rollouts_total += 1
                self.dataset.extend(rollouts)
            gt.stamp('sample')

            self.logger.log('Adaptation Update')

            for _ in range(self.adaptation_update_num):
                trajs = self._sample_traj(debug=True)
                self.theta_loss = self._compute_adaptation_loss(
                    self.theta, trajs)
                self._meta_update(self.theta_loss)

            gt.stamp('adaptation')
            gt.stamp('meta')

            if i % self.eval_frequency == 0:
                self.logger.log('Evaluation')
                self.evaluate()
            gt.stamp('eval')

            self._end_iteration(i)
Example #7
    def train_online(self, start_epoch=0):
        # Training mode does not need to be True while generating trajectories:
        # _try_to_train sets it to True itself and reverts it to False
        # before returning.
        self.training_mode(False)
        self._current_path_builder = PathBuilder()
        self._n_rollouts_total = 0

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            print('EPOCH STARTED')
            # print('epoch')
            for _ in range(self.num_rollouts_per_epoch):
                # print('rollout')
                task_params, obs_task_params = (
                    self.train_task_params_sampler.sample())
                self.generate_exploration_rollout(
                    task_params=task_params, obs_task_params=obs_task_params)

                # print(self._n_rollouts_total)
                if self._n_rollouts_total % self.num_rollouts_between_updates == 0:
                    gt.stamp('sample')
                    # print('train')
                    if not self.do_not_train: self._try_to_train(epoch)
                    gt.stamp('train')

            if not self.do_not_eval:
                self._try_to_eval(epoch)
                gt.stamp('eval')

            self._end_epoch()
Example #8
    def train_batch(self, start_epoch):
        self._current_path_builder = PathBuilder()

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):

            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            # This implementation is rather naive. If you want to (e.g.)
            # parallelize data collection, this would be the place to do it.
            for i in range(self.num_env_steps_per_epoch):
                observation, terminal = self._take_step_in_env(observation)

                #print(i, terminal)
            assert terminal[0] == True
            gt.stamp('sample')

            self._try_to_train()
            gt.stamp('train')

            set_to_eval_mode(self.env)
            #print(i, terminal)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Example #9
    def _train(self):

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_path_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            new_expl_paths = self.expl_path_collector.collect_new_paths(
                self.max_path_length,
                self.num_expl_steps_per_train_loop,
                discard_incomplete_paths=False,
            )
            gt.stamp('exploration sampling', unique=False)
            self.replay_buffer.add_paths(new_expl_paths)
            gt.stamp('data storing', unique=False)

            self.training_mode(True)
            train_data = self.replay_buffer.random_batch(self.batch_size)
            self.algo.train(train_data)
            gt.stamp('training', unique=False)
            self.training_mode(False)

            self._end_epoch(epoch)
Example #10
def experiment(variant):
    cuda = True
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    R = 84
    env = HalfCheetahEnv()
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    gt.stamp("start")
    for i in range(100):
        img = env.sim.render(R, R, device_id=1)

    gt.stamp("warmstart")
    for i in gt.timed_for(range(1000)):
        env.step(np.random.rand(6))
        gt.stamp('step')

        img = env.sim.render(R, R, device_id=1)
        gt.stamp('render')

        x = np_to_var(img)
        if cuda:
            x = x.cuda()
            torch.cuda.synchronize()
        gt.stamp('transfer')
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")

    print(img)

    print(gt.report(include_itrs=False))
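np_to_var, get_numpy, and Convnet in the example above come from the surrounding project (rlkit-style PyTorch utilities), not from gtimer. A minimal stand-in for the two conversion helpers, under the assumption that they simply move data between NumPy and torch (the real versions may handle dtypes and devices differently):

import numpy as np
import torch

def np_to_var(x):
    # NumPy array -> float32 torch tensor (hypothetical stand-in)
    return torch.from_numpy(np.asarray(x, dtype=np.float32))

def get_numpy(t):
    # torch tensor -> NumPy array, detached and moved to CPU (hypothetical stand-in)
    return t.detach().cpu().numpy()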
Example #11
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        evaluation_env = deep_clone(env) if self._eval_n_episodes else None
        # TODO: use Ezpickle to deep_clone???
        # evaluation_env = env

        with tf_utils.get_default_session().as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

            self.sampler.terminate()
Example #12
    def train(self, start_epoch=0):
        # Get snapshot of initial algo state
        if start_epoch == 0:
            self._log_initial_data()

        self.training_mode(False)
        self._n_env_steps_total = start_epoch * self.num_train_steps_per_epoch

        gt.reset()
        gt.set_def_unique(False)

        self._current_path = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            obs = self._start_new_rollout()
            for ss in range(self.num_train_steps_per_epoch):
                obs = self._take_step_in_env(obs)
                gt.stamp('sample')

                if self._algo_mode == 'online':
                    self._try_to_train()
                    gt.stamp('train')

            if self._algo_mode == 'episode':
                self._try_to_train()
                gt.stamp('train')

            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Example #13
    def _train(self):
        self.training_mode(False)

        for epoch in gt.timed_for(range(self._start_epoch, self.num_epochs),
                                  save_itrs=True):
            print(
                f"in train, with eval to go: {self.num_eval_steps_per_epoch}")
            for step in range(self.num_eval_steps_per_epoch):

                self.eval_data_collector.collect_one_step(
                    step, self.num_eval_steps_per_epoch)
            gt.stamp("evaluation sampling")
            print("done with eval")

            for _ in range(self.num_train_loops_per_epoch):
                # this if check could be moved inside the function
                if self.use_linear_lr_decay:
                    # decrease learning rate linearly
                    self.trainer.decay_lr(epoch, self.num_epochs)

                for step in range(self.num_expl_steps_per_train_loop):
                    self.expl_data_collector.collect_one_step(
                        step, self.num_expl_steps_per_train_loop)
                    # time.sleep(1)

                gt.stamp("exploration sampling", unique=False)

                rollouts = self.expl_data_collector.get_rollouts()
                gt.stamp("data storing", unique=False)
                self.training_mode(True)
                self.trainer.train(rollouts)
                gt.stamp("training", unique=False)
                self.training_mode(False)

            self._end_epoch(epoch)
Example #14
    def train_online(self, start_epoch=0):
        # Training mode does not need to be True while generating trajectories:
        # _try_to_train sets it to True itself and reverts it to False
        # before returning.
        self.training_mode(False)
        self._current_path_builder = PathBuilder()

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            for _ in range(self.num_rollouts_per_epoch):
                task_params, obs_task_params = (
                    self.train_task_params_sampler.sample())
                self.generate_exploration_rollout(
                    task_params=task_params, obs_task_params=obs_task_params)

            # essentially in each epoch we gather data then do a certain amount of training
            gt.stamp('sample')
            if not self.do_not_train: self._try_to_train()
            gt.stamp('train')

            if epoch % self.freq_eval == 0:
                # and then we evaluate it
                if not self.do_not_eval: self._try_to_eval(epoch)
                gt.stamp('eval')

            self._end_epoch()
Example #15
    def start_training(self, start_epoch=0):
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            steps_this_epoch = 0
            steps_since_train_call = 0
            while steps_this_epoch < self.min_steps_per_epoch:
                task_params = self.train_task_params_sampler.sample()
                rollout_len = self.do_task_rollout(task_params)

                steps_this_epoch += rollout_len
                steps_since_train_call += rollout_len

                if steps_since_train_call > self.min_steps_between_train_calls:
                    steps_since_train_call = 0
                    gt.stamp('sample')
                    self._try_to_train(epoch)
                    gt.stamp('train')

            gt.stamp('sample')
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch()
Example #16
    def _train(self):
        # Pretrain the model at the beginning of training until convergence.
        # Note that convergence is measured against a holdout set of max size
        # 8192 (see the early-stopping sketch after this example).
        if self.train_at_start:
            self.model_trainer.train_from_buffer(
                self.replay_buffer,
                max_grad_steps=self.model_max_grad_steps,
                epochs_since_last_update=self.model_epochs_since_last_update,
            )
        gt.stamp('model training', unique=False)

        for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            self.training_mode(True)
            for _ in range(self.num_train_loops_per_epoch):
                for t in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(self.batch_size)
                    self.trainer.train(train_data)
                    gt.stamp('policy training', unique=False)
            self.training_mode(False)

            self._end_epoch(epoch)
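The comment at the top of this function says convergence is measured against a holdout set. A common way to implement that check is early stopping on the holdout loss; the sketch below assumes a generic train_step/holdout_loss callable interface rather than the project's actual model_trainer API:

def train_until_converged(train_step, holdout_loss, max_grad_steps=10000,
                          patience=5, tol=1e-3):
    # train_step(): performs one gradient step on the training split.
    # holdout_loss(): returns the current loss on the held-out split.
    best = float('inf')
    steps_since_improvement = 0
    for _ in range(max_grad_steps):
        train_step()
        loss = holdout_loss()
        if loss < best * (1 - tol):          # meaningful relative improvement
            best = loss
            steps_since_improvement = 0
        else:
            steps_since_improvement += 1
        if steps_since_improvement >= patience:
            break                            # holdout loss has plateaued
    return best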
Example #17
    def _train(self):
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
            )
            gt.stamp('evaluation sampling')

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                for _ in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(self.batch_size)
                    self.trainer.train(train_data)
                gt.stamp('training', unique=False)

            self._end_epoch(epoch)
Example #18
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        if self.epoch_list is not None:
            iters = list(self.epoch_list)
        else:
            iters = list(range(start_epoch, self.num_epochs, self.epoch_freq))
        if self.num_epochs - 1 not in iters and self.num_epochs - 1 > iters[-1]:
            iters.append(self.num_epochs - 1)
        for epoch in gt.timed_for(
                iters,
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            env_utils.mode(self.training_env, 'train')
            observation = self._start_new_rollout()
            for _ in range(self.num_env_steps_per_epoch):
                if self.do_training:
                    observation = self._take_step_in_env(observation)

                gt.stamp('sample')
                self._try_to_train()
                gt.stamp('train')
            env_utils.mode(self.env, 'eval')
            # TODO steven: move dump_tabular to be conditionally called in
            # end_epoch and move post_epoch after eval
            self._post_epoch(epoch)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch()
Example #19
    def train(self, start_epoch=0):

        batch_idxes = np.arange(self.num_tasks)
        batch_idxes = np.concatenate([batch_idxes[self.train_goal_id:], batch_idxes[:self.train_goal_id]])

        for epoch in gt.timed_for(
                trange(start_epoch, self.num_epochs),
                save_itrs=True,
        ):

            # Sample meta-training tasks and hand the transition-sampling job
            # off to each remote replay buffer.

            train_batch_obj_id = self.train_buffer.sample_training_data(batch_idxes)

            for _ in range(self.num_train_loops_per_epoch):
                train_raw_batch = ray.get(train_batch_obj_id)
                gt.stamp('sample_training_data', unique=False)

                # This lets us kick off the sampling job for the next training
                # loop while training on the current batch (see the prefetch
                # sketch after this example).
                train_batch_obj_id = self.train_buffer.sample_training_data(batch_idxes)
                gt.stamp('set_up_sampling', unique=False)

                train_data = self.construct_training_batch(train_raw_batch)
                gt.stamp('construct_training_batch', unique=False)
                
                self.trainer.train(train_data, batch_idxes, epoch)
                
            gt.stamp('training', unique=False)

            self._end_epoch(epoch)
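The comments in the loop above describe a simple prefetching trick: request the next batch from the remote buffer before blocking on the batch already in flight. A minimal sketch of that pattern, using a hypothetical ray actor in place of the remote replay buffer:

import time
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
class RemoteBuffer:
    # Hypothetical stand-in for the remote replay buffer used above.
    def sample_training_data(self):
        time.sleep(0.1)              # pretend sampling is expensive
        return list(range(8))

buffer = RemoteBuffer.remote()
batch_ref = buffer.sample_training_data.remote()      # request the first batch
for loop in range(3):
    batch = ray.get(batch_ref)                        # block on the batch in flight
    batch_ref = buffer.sample_training_data.remote()  # immediately request the next one
    # ... train on `batch` here while the next batch is sampled remotely ...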
Example #20
    def _train(self):
        """Called by superclass BaseRLAlgorithm, conducts the training loop.

        Before training (i.e., the minimum number of steps before trainnig) Get
        new paths for _exploration_, with noise added (in the case of DDPG).
        Add the paths to replay buffer.

        Then we begin the actual cycle of evaluation and exploration. Each
        epoch consists of an evaluator data collector collecting paths
        (discarding incomplete ones), and then exploration data collection, and
        only exploration data is added to the buffer. The number of training
        loops is 1 by default so usually it will be one cycle of (evaluate,
        explore). Each explore, though, will do a bunch of training loops,
        e.g., 1000 by default.

        When we talk about 'steps' we really should be talking about training
        (or exploration) steps, right? The evaluation steps is for reporting
        results.
        """
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                    discard_incomplete_paths=False,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(
                        self.batch_size)
                    self.trainer.train(train_data)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._end_epoch(epoch)
Example #21
def experiment(variant):

    root = 0

    E = 20
    R = 84
    U = 6
    cuda = True

    envs = []

    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)

    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(i, stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in envs:
                e.reset()
        for e in envs:
            img = e.sim.render(R, R, device_id=0).transpose()
            imgs.append(img)
        gt.stamp('render') if stamp else 0

        imgs = np.array(imgs)

        torch_img = np_to_var(imgs)
        if cuda:
            torch_img = torch_img.cuda()
            torch.cuda.synchronize()
        gt.stamp('transfer') if stamp else 0

        u = get_numpy(c.forward(torch_img).cpu())
        torch.cuda.synchronize()
        gt.stamp('forward') if stamp else 0

        for i, e in enumerate(envs):
            e.step(u[i, :])
        gt.stamp('step') if stamp else 0

    for i in range(10):
        step(i, False)

    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step(i)
    gt.stamp('end')

    print(gt.report(include_itrs=False, format_options=dict(itr_num_width=10)))
Example #22
    def main_loop(self, max_epochs):
        # populate replay buffer before training
        for epoch in gt.timed_for(range(max_epochs)):
            if epoch % self.args.eval_every == 0:
                # somehow this is not deterministic for decentralized?
                self.eval_step(epoch)
            self.train_step(epoch)
        self.finish_training()
Example #23
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)
        self.sampler.initialize(env, policy, pool)

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    # TODO.codeconsolidation: Add control interval to sampler
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            self.sampler.terminate()
Example #24
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        evaluation_env = deep_clone(env) if self._eval_n_episodes else None

        with tf_utils.get_default_session().as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
Example #25
    def train(self):
        """Negative epochs are offline, positive epochs are online"""
        for self.epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.offline_rl = self.epoch < 0
            self._begin_epoch(self.epoch)
            self._train()
            self._end_epoch(self.epoch)
Example #26
    def train(self):
        if self.min_num_steps_before_training > 0:
            for _ in range(0, self.min_num_steps_before_training,
                           self.max_path_length):
                patch_trajectory = rollout(self.expl_env, self.trainer.policy,
                                           self.trainer.qf1, self.trainer.qf2,
                                           self.max_path_length,
                                           self.rnn_seq_len)
                self.replay_buffer.add_trajectory(patch_trajectory)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            rewards, seen_area, total_rotate, right_rotate = eval_rollout(
                self.eval_env, self.trainer.eval_policy, epoch,
                self.num_eval_steps_per_epoch)
            self.writer.add_scalar('eval/mean_reward', np.mean(rewards), epoch)
            self.writer.add_scalar('eval/mean_seen_area', np.mean(seen_area),
                                   epoch)
            self.writer.add_scalar('eval/max_reward', np.max(rewards), epoch)
            self.writer.add_scalar('eval/max_seen_area', np.max(seen_area),
                                   epoch)
            self.writer.add_scalar('eval/min_reward', np.min(rewards), epoch)
            self.writer.add_scalar('eval/min_seen_area', np.min(seen_area),
                                   epoch)
            self.writer.add_scalar(
                'eval/mean_rotate_ratio',
                abs(0.5 - np.sum(right_rotate) / np.sum(total_rotate)), epoch)

            gt.stamp('evaluation_sampling', unique=False)

            for _ in range(self.num_train_loops_per_epoch):
                for _ in range(0, self.num_expl_steps_per_train_loop,
                               self.max_path_length):
                    patch_trajectory = rollout(self.expl_env,
                                               self.trainer.policy,
                                               self.trainer.qf1,
                                               self.trainer.qf2,
                                               self.max_path_length,
                                               self.rnn_seq_len)
                    gt.stamp('exploration sampling', unique=False)

                    self.replay_buffer.add_trajectory(patch_trajectory)
                    gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    train_batch_data = self.replay_buffer.random_batch(
                        self.batch_size)
                    self.trainer.train(train_batch_data)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._end_epoch()
Example #27
    def _train(self):
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)
            self.estimate_obs_stats(init_expl_paths[0]['observations'],
                                    init_flag=True)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_data_collector.collect_normalized_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
                input_mean=self._obs_mean,
                input_std=self._obs_std,
            )
            gt.stamp('evaluation sampling')

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_normalized_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                    discard_incomplete_paths=False,
                    input_mean=self._obs_mean,
                    input_std=self._obs_std,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(
                        self.batch_size)
                    self.estimate_obs_stats(train_data['observations'],
                                            init_flag=False)
                    train_data['observations'] = self.apply_normalize_obs(
                        train_data['observations'])
                    self.trainer.train(train_data)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._end_epoch(epoch)
            if self.save_frequency > 0:
                if epoch % self.save_frequency == 0:
                    self.trainer.save_models(epoch)
                    self.replay_buffer.save_buffer(epoch)
Example #28
    def _train(self):
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            # curriculum update - we now have the new probabilities
            if epoch >= self.min_ep_curriculum and epoch % self.curr_update_freq == 0:
                gt.stamp('curriculum update')
                self.proba = self.curr_fn(self.trainer.policy,
                                     self.trainer.qf1,
                                     self.trainer.qf2,
                                     self.evaluation_env,
                                     **self.curr_kwargs)
                self.evaluation_env.set_init_proba(self.proba)
                self.exploration_env.set_init_proba(self.proba)

                self.all_probas[epoch] = self.proba

            # Eval step
            self.curr_state = self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                    discard_incomplete_paths=False,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(
                        self.batch_size)
                    self.trainer.train(train_data)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._end_epoch(epoch)
Example #29
    def _train(self):
        st = time.time()
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                runtime_policy=self.pretrain_policy,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)
        self.total_train_expl_time += time.time() - st
        self.trainer.buffer = self.replay_buffer  # TODO: find a cleaner way of doing this
        self.training_mode(True)
        for _ in range(self.num_pretrain_steps):
            train_data = self.replay_buffer.random_batch(self.batch_size)
            self.trainer.train(train_data)
        self.training_mode(False)

        for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
            )
            gt.stamp("evaluation sampling")
            st = time.time()
            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                )
                gt.stamp("exploration sampling", unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp("data storing", unique=False)

                self.training_mode(True)
                for train_step in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(self.batch_size)
                    self.trainer.train(train_data)
                gt.stamp("training", unique=False)
                self.training_mode(False)

            if self.eval_buffer:
                eval_data = self.eval_buffer.random_batch(self.batch_size)
                self.trainer.evaluate(eval_data, buffer_data=False)
                eval_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.evaluate(eval_data, buffer_data=True)
            self.total_train_expl_time += time.time() - st

            self._end_epoch(epoch)
Example #30
    def _train(self):
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)

            self._fit_input_stats()

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            self.training_mode(True)
            if self.replay_buffer.num_steps_can_sample() > 0:
                self.model_trainer.train_from_buffer(
                    self.replay_buffer,
                    max_grad_steps=self.model_max_grad_steps,
                    epochs_since_last_update=self.model_epochs_since_last_update,
                )
            gt.stamp('model training', unique=False)

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                    discard_incomplete_paths=False,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    self.trainer.train_from_paths(new_expl_paths)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._fit_input_stats()

            self._end_epoch(epoch)
Example #31
def experiment(variant):
    E = 10
    R = 84
    cuda = True

    envs = []

    renderer = MjCudaRender(R, R)


    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in range(E):
                envs[e].reset()
        for e in range(E):
            # img = renderer.get_cuda_tensor(envs[e].sim)
            img = envs[e].sim.render(R, R, device_id=1).transpose()
        gt.stamp('render') if stamp else 0

        # imgs =np.array(imgs)
        # torch_img = np_to_var(imgs)
        # if cuda:
        #     torch_img = torch_img.cuda()
        #     torch.cuda.synchronize()
        # gt.stamp('transfer') if stamp else 0

        # u = get_numpy(c.forward(torch_img).cpu())
        # torch.cuda.synchronize()
        # gt.stamp('forward') if stamp else 0

        # for e in range(E):
        #     envs[e].step(u[e, :])
        # gt.stamp('step') if stamp else 0

    for i in range(10):
        step(False)

    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step()
    gt.stamp('end')
Example #32
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)

        with self._sess.as_default():
            observation = env.reset()
            policy.reset()

            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                if self.iter_callback is not None:
                    self.iter_callback(locals(), globals())

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(observation)
                    next_ob, reward, terminal, info = env.step(action)
                    path_length += 1
                    path_return += reward

                    self.pool.add_sample(
                        observation,
                        action,
                        reward,
                        terminal,
                        next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        observation = env.reset()
                        policy.reset()
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                    else:
                        observation = next_ob
                    gt.stamp('sample')

                    if self.pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self.pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('episodes', n_episodes)
                logger.record_tabular('max-path-return', max_path_return)
                logger.record_tabular('last-path-return', last_path_return)
                logger.record_tabular('pool-size', self.pool.size)

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            env.terminate()