def train_batch(self, start_epoch):
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        observation = self._start_new_rollout()
        # This implementation is rather naive. If you want to (e.g.)
        # parallelize data collection, this would be the place to do it.
        for _ in range(self.num_env_steps_per_epoch):
            observation = self._take_step_in_env(observation)
            gt.stamp('sample')
            self._try_to_train()
            gt.stamp('train')
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._try_to_fit(epoch)
        gt.stamp('env_fit')
        self.logger.record_tabular(self.env_loss_key, self.env_loss)
        self._end_epoch(epoch)
        self.logger.dump_tabular(with_prefix=False, with_timestamp=False)

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        observation = self._start_new_rollout()
        for _ in range(self.num_env_steps_per_epoch):
            observation = self._take_step_in_env(observation)
            gt.stamp('sample')
            self._try_to_fit(epoch)
            gt.stamp('env_fit')
            self._try_to_train()
            gt.stamp('train')
        self.logger.record_tabular(self.env_loss_key, self.env_loss)
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)
        self.logger.dump_tabular(with_prefix=False, with_timestamp=False)

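# The two loops above (and most of the loops below) share one timing pattern:
# wrap the epoch iterator in gt.timed_for(..., save_itrs=True) and call
# gt.stamp() after each phase. A minimal, self-contained sketch of just that
# pattern, using only gtimer calls already present in this file; the
# time.sleep() calls are stand-ins for real sampling and training work.
import time

import gtimer as gt


def timed_loop_sketch(num_epochs=3):
    gt.reset()
    gt.set_def_unique(False)
    for epoch in gt.timed_for(range(num_epochs), save_itrs=True):
        time.sleep(0.01)  # stand-in for environment sampling
        gt.stamp('sample')
        time.sleep(0.02)  # stand-in for gradient updates
        gt.stamp('train')
    # Prints a per-stamp breakdown of where the time went.
    print(gt.report(include_itrs=False))
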
def train(self):
    '''
    meta-training loop
    '''
    self.pretrain()
    params = self.get_epoch_snapshot(-1)
    logger.save_itr_params(-1, params)
    gt.reset()
    gt.set_def_unique(False)
    self._current_path_builder = PathBuilder()

    # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
    for it_ in gt.timed_for(
            range(self.num_iterations),
            save_itrs=True,
    ):
        self._start_epoch(it_)
        self.training_mode(True)
        if it_ == 0:
            print('collecting initial pool of data for train and eval')
            # temp for evaluating
            for idx in self.train_tasks:
                self.task_idx = idx
                self.env.reset_task(idx)
                self.collect_data(self.num_initial_steps, 1, np.inf)
        # Sample data from train tasks.
        for i in range(self.num_tasks_sample):
            idx = np.random.randint(len(self.train_tasks))
            self.task_idx = idx
            self.env.reset_task(idx)
            self.enc_replay_buffer.task_buffers[idx].clear()

            # collect some trajectories with z ~ prior
            if self.num_steps_prior > 0:
                self.collect_data(self.num_steps_prior, 1, np.inf)
            # collect some trajectories with z ~ posterior
            if self.num_steps_posterior > 0:
                self.collect_data(self.num_steps_posterior, 1,
                                  self.update_post_train)
            # even if encoder is trained only on samples from the prior, the
            # policy needs to learn to handle z ~ posterior
            if self.num_extra_rl_steps_posterior > 0:
                self.collect_data(self.num_extra_rl_steps_posterior, 1,
                                  self.update_post_train,
                                  add_to_enc_buffer=False)

        # Sample train tasks and compute gradient updates on parameters.
        for train_step in range(self.num_train_steps_per_itr):
            indices = np.random.choice(self.train_tasks, self.meta_batch)
            self._do_training(indices)
            self._n_train_steps_total += 1
        gt.stamp('train')

        self.training_mode(False)

        # eval
        self._try_to_eval(it_)
        gt.stamp('eval')

        self._end_epoch()

def train_online(self, start_epoch=0):
    if not self.environment_farming:
        observation = self._start_new_rollout()
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        for _ in range(self.num_env_steps_per_epoch):
            if not self.environment_farming:
                observation = self.play_one_step(observation)
            else:
                # acquire a remote environment
                remote_env = self.farmer.force_acq_env()
                self.play_ignore(remote_env)
            # Training out of threads
            self._try_to_train()
            gt.stamp('train')
        if epoch % 10 == 0:
            self._try_to_eval(epoch)
            gt.stamp('eval')
        self._end_epoch()

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    observation = self._start_new_rollout()
    self.sample_z = self.sample_z_vec()
    observation = np.concatenate([observation, self.sample_z])
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        for t in range(self.num_env_steps_per_epoch):
            # print("step", t, "pool", self.replay_buffer.num_steps_can_sample())
            observation = self._take_step_in_env(observation)
            gt.stamp('sample')
            self._try_to_train()
            gt.stamp('train')
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)

def debug(self):
    gt.reset()
    gt.set_def_unique(False)
    for i in gt.timed_for(range(self.iteration_num), save_itrs=True):
        self._start_iteration(i)
        if i % self.task_sample_frequency == 0:
            self.logger.log('Data Collection')
            task = self._sample_task()
            rollouts = self._collect_traj(task, debug=True)
            self._n_rollouts_total += 1
            self.dataset.extend(rollouts)
            gt.stamp('sample')
        self.logger.log('Adaptation Update')
        for _ in range(self.adaptation_update_num):
            trajs = self._sample_traj(debug=True)
            self.theta_loss = self._compute_adaptation_loss(self.theta, trajs)
            self._meta_update(self.theta_loss)
            gt.stamp('adaptation')
        gt.stamp('meta')
        if i % self.eval_frequency == 0:
            self.logger.log('Evaluation')
            self.evaluate()
            gt.stamp('eval')
        self._end_iteration(i)

def train_online(self, start_epoch=0):
    # No need for training mode to be True when generating trajectories:
    # training mode is automatically set to True in _try_to_train, and that
    # function reverts it to False before exiting.
    self.training_mode(False)
    self._current_path_builder = PathBuilder()
    self._n_rollouts_total = 0
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        print('EPOCH STARTED')
        for _ in range(self.num_rollouts_per_epoch):
            task_params, obs_task_params = self.train_task_params_sampler.sample()
            self.generate_exploration_rollout(
                task_params=task_params, obs_task_params=obs_task_params)
            if self._n_rollouts_total % self.num_rollouts_between_updates == 0:
                gt.stamp('sample')
                if not self.do_not_train:
                    self._try_to_train(epoch)
                    gt.stamp('train')
        if not self.do_not_eval:
            self._try_to_eval(epoch)
            gt.stamp('eval')
        self._end_epoch()

def train_batch(self, start_epoch):
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        observation = self._start_new_rollout()
        # This implementation is rather naive. If you want to (e.g.)
        # parallelize data collection, this would be the place to do it.
        for i in range(self.num_env_steps_per_epoch):
            observation, terminal = self._take_step_in_env(observation)
            # print(i, terminal)
            assert terminal[0]
            gt.stamp('sample')
            self._try_to_train()
            gt.stamp('train')
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)

def _train(self):
    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self.eval_path_collector.collect_new_paths(
            self.max_path_length,
            self.num_eval_steps_per_epoch,
            discard_incomplete_paths=True,
        )
        gt.stamp('evaluation sampling')

        new_expl_paths = self.expl_path_collector.collect_new_paths(
            self.max_path_length,
            self.num_expl_steps_per_train_loop,
            discard_incomplete_paths=False,
        )
        gt.stamp('exploration sampling', unique=False)

        self.replay_buffer.add_paths(new_expl_paths)
        gt.stamp('data storing', unique=False)

        self.training_mode(True)
        train_data = self.replay_buffer.random_batch(self.batch_size)
        self.algo.train(train_data)
        gt.stamp('training', unique=False)
        self.training_mode(False)

        self._end_epoch(epoch)

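# A hypothetical, minimal replay buffer exposing just the add_paths /
# random_batch interface that the rlkit-style loops in this file call. This
# is a sketch of the assumed interface, not the library's implementation;
# real buffers are preallocated, bounded, and also store terminals and next
# observations.
import numpy as np


class MinimalReplayBuffer:
    def __init__(self):
        self._obs, self._actions, self._rewards = [], [], []

    def add_paths(self, paths):
        # Each path is assumed to be a dict of per-step arrays, as produced
        # by the path collectors used above.
        for path in paths:
            self._obs.extend(path['observations'])
            self._actions.extend(path['actions'])
            self._rewards.extend(path['rewards'])

    def random_batch(self, batch_size):
        # Sample transitions uniformly at random, with replacement.
        idx = np.random.randint(0, len(self._obs), size=batch_size)
        return dict(
            observations=np.asarray(self._obs)[idx],
            actions=np.asarray(self._actions)[idx],
            rewards=np.asarray(self._rewards)[idx],
        )
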
def experiment(variant):
    cuda = True
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    R = 84
    env = HalfCheetahEnv()
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()
    gt.stamp("start")
    for i in range(100):
        img = env.sim.render(R, R, device_id=1)
    gt.stamp("warmstart")
    for i in gt.timed_for(range(1000)):
        env.step(np.random.rand(6))
        gt.stamp('step')
        img = env.sim.render(R, R, device_id=1)
        gt.stamp('render')
        x = np_to_var(img)
        if cuda:
            x = x.cuda()
            torch.cuda.synchronize()
        gt.stamp('transfer')
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")
    print(img)
    print(gt.report(include_itrs=False))

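# The benchmark above calls torch.cuda.synchronize() before stamping the
# 'transfer' phase because CUDA operations are asynchronous: without the
# synchronize, gt.stamp() would record only the launch time, not the time the
# copy actually takes. A minimal illustration of the same idea (assumes a
# CUDA device is available; timed_gpu_transfer is a toy name):
import gtimer as gt
import torch


def timed_gpu_transfer(n=1024):
    x = torch.randn(n, n)
    gt.stamp('alloc')
    x = x.cuda()
    torch.cuda.synchronize()  # wait for the copy before reading the clock
    gt.stamp('transfer')
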
def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)

    evaluation_env = deep_clone(env) if self._eval_n_episodes else None
    # TODO: use Ezpickle to deep_clone???
    # evaluation_env = env

    with tf_utils.get_default_session().as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

        self.sampler.terminate()

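# The logging block above reads per-iteration stamp times back out of gtimer.
# A compact sketch of just that read path, assuming the same
# 'sample'/'train' stamps were recorded inside a gt.timed_for loop
# (last_itr_times is a toy helper name):
import gtimer as gt


def last_itr_times():
    times_itrs = gt.get_times().stamps.itrs  # dict: stamp name -> per-itr times
    return {
        'train': times_itrs.get('train', [0])[-1],
        'sample': times_itrs.get('sample', [0])[-1],
        'total': gt.get_times().total,
    }
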
def train(self, start_epoch=0):
    # Get snapshot of initial algo state
    if start_epoch == 0:
        self._log_initial_data()
    self.training_mode(False)
    self._n_env_steps_total = start_epoch * self.num_train_steps_per_epoch
    gt.reset()
    gt.set_def_unique(False)
    self._current_path = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        obs = self._start_new_rollout()
        for ss in range(self.num_train_steps_per_epoch):
            obs = self._take_step_in_env(obs)
            gt.stamp('sample')
            if self._algo_mode == 'online':
                self._try_to_train()
                gt.stamp('train')
        if self._algo_mode == 'episode':
            self._try_to_train()
            gt.stamp('train')
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)

def _train(self):
    self.training_mode(False)
    for epoch in gt.timed_for(range(self._start_epoch, self.num_epochs),
                              save_itrs=True):
        print(f"in train, with eval to go: {self.num_eval_steps_per_epoch}")
        for step in range(self.num_eval_steps_per_epoch):
            self.eval_data_collector.collect_one_step(
                step, self.num_eval_steps_per_epoch)
        gt.stamp("evaluation sampling")
        print("done with eval")

        for _ in range(self.num_train_loops_per_epoch):
            # this if check could be moved inside the function
            if self.use_linear_lr_decay:
                # decrease learning rate linearly
                self.trainer.decay_lr(epoch, self.num_epochs)
            for step in range(self.num_expl_steps_per_train_loop):
                self.expl_data_collector.collect_one_step(
                    step, self.num_expl_steps_per_train_loop)
            gt.stamp("exploration sampling", unique=False)

            rollouts = self.expl_data_collector.get_rollouts()
            gt.stamp("data storing", unique=False)

            self.training_mode(True)
            self.trainer.train(rollouts)
            gt.stamp("training", unique=False)
            self.training_mode(False)
        self._end_epoch(epoch)

def train_online(self, start_epoch=0):
    # No need for training mode to be True when generating trajectories:
    # training mode is automatically set to True in _try_to_train, and that
    # function reverts it to False before exiting.
    self.training_mode(False)
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        # In each epoch we gather data, then do a certain amount of training.
        for _ in range(self.num_rollouts_per_epoch):
            task_params, obs_task_params = self.train_task_params_sampler.sample()
            self.generate_exploration_rollout(
                task_params=task_params, obs_task_params=obs_task_params)
        gt.stamp('sample')
        if not self.do_not_train:
            self._try_to_train()
            gt.stamp('train')
        # ...and then we evaluate.
        if epoch % self.freq_eval == 0:
            if not self.do_not_eval:
                self._try_to_eval(epoch)
                gt.stamp('eval')
        self._end_epoch()

def start_training(self, start_epoch=0):
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        steps_this_epoch = 0
        steps_since_train_call = 0
        while steps_this_epoch < self.min_steps_per_epoch:
            task_params = self.train_task_params_sampler.sample()
            rollout_len = self.do_task_rollout(task_params)
            steps_this_epoch += rollout_len
            steps_since_train_call += rollout_len
            if steps_since_train_call > self.min_steps_between_train_calls:
                steps_since_train_call = 0
                gt.stamp('sample')
                self._try_to_train(epoch)
                gt.stamp('train')
        gt.stamp('sample')
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch()

def _train(self):
    # Pretrain the model at the beginning of training until convergence.
    # Note that convergence is measured against a holdout set of max size 8192.
    if self.train_at_start:
        self.model_trainer.train_from_buffer(
            self.replay_buffer,
            max_grad_steps=self.model_max_grad_steps,
            epochs_since_last_update=self.model_epochs_since_last_update,
        )
    gt.stamp('model training', unique=False)
    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self.eval_data_collector.collect_new_paths(
            self.max_path_length,
            self.num_eval_steps_per_epoch,
            discard_incomplete_paths=True,
        )
        gt.stamp('evaluation sampling')

        self.training_mode(True)
        for _ in range(self.num_train_loops_per_epoch):
            for t in range(self.num_trains_per_train_loop):
                train_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.train(train_data)
        gt.stamp('policy training', unique=False)
        self.training_mode(False)

        self._end_epoch(epoch)

def _train(self):
    if self.min_num_steps_before_training > 0:
        init_expl_paths = self.expl_data_collector.collect_new_paths(
            self.max_path_length,
            self.min_num_steps_before_training,
        )
        self.replay_buffer.add_paths(init_expl_paths)
        self.expl_data_collector.end_epoch(-1)

    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self.eval_data_collector.collect_new_paths(
            self.max_path_length,
            self.num_eval_steps_per_epoch,
        )
        gt.stamp('evaluation sampling')

        for _ in range(self.num_train_loops_per_epoch):
            new_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_expl_steps_per_train_loop,
            )
            gt.stamp('exploration sampling', unique=False)

            self.replay_buffer.add_paths(new_expl_paths)
            gt.stamp('data storing', unique=False)

            for _ in range(self.num_trains_per_train_loop):
                train_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.train(train_data)
            gt.stamp('training', unique=False)

        self._end_epoch(epoch)

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    if self.epoch_list is not None:
        iters = list(self.epoch_list)
    else:
        iters = list(range(start_epoch, self.num_epochs, self.epoch_freq))
    if self.num_epochs - 1 not in iters and self.num_epochs - 1 > iters[-1]:
        iters.append(self.num_epochs - 1)
    for epoch in gt.timed_for(
            iters,
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        env_utils.mode(self.training_env, 'train')
        observation = self._start_new_rollout()
        for _ in range(self.num_env_steps_per_epoch):
            if self.do_training:
                observation = self._take_step_in_env(observation)
                gt.stamp('sample')
                self._try_to_train()
                gt.stamp('train')
        env_utils.mode(self.env, 'eval')
        # TODO steven: move dump_tabular to be conditionally called in
        # end_epoch and move post_epoch after eval
        self._post_epoch(epoch)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch()

def train(self, start_epoch=0):
    batch_idxes = np.arange(self.num_tasks)
    batch_idxes = np.concatenate([
        batch_idxes[self.train_goal_id:],
        batch_idxes[:self.train_goal_id],
    ])
    for epoch in gt.timed_for(
            trange(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        # Sample meta-training tasks and hand the transition-sampling job
        # to each remote replay buffer.
        train_batch_obj_id = self.train_buffer.sample_training_data(batch_idxes)
        for _ in range(self.num_train_loops_per_epoch):
            train_raw_batch = ray.get(train_batch_obj_id)
            gt.stamp('sample_training_data', unique=False)
            # Start the sampling job for the next loop, so it runs in the
            # background while we train on the current batch.
            train_batch_obj_id = self.train_buffer.sample_training_data(batch_idxes)
            gt.stamp('set_up_sampling', unique=False)
            train_data = self.construct_training_batch(train_raw_batch)
            gt.stamp('construct_training_batch', unique=False)
            self.trainer.train(train_data, batch_idxes, epoch)
            gt.stamp('training', unique=False)
        self._end_epoch(epoch)

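# The loop above overlaps sampling with training: it holds a Ray object
# reference for the *next* batch while training on the current one, so the
# remote buffer samples in the background. A minimal sketch of that prefetch
# pattern with toy stand-ins (sample_batch and train_on are hypothetical):
import time

import ray


@ray.remote
def sample_batch(seed):
    time.sleep(0.05)  # stand-in for remote transition sampling
    return seed


def train_on(batch):
    time.sleep(0.05)  # stand-in for a gradient step


def prefetch_loop(num_loops=5):
    ray.init(ignore_reinit_error=True)
    next_ref = sample_batch.remote(0)
    for i in range(1, num_loops + 1):
        batch = ray.get(next_ref)          # block on the batch started last loop
        next_ref = sample_batch.remote(i)  # kick off the next sample now
        train_on(batch)                    # train while the next sample runs
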
def _train(self):
    """Called by superclass BaseRLAlgorithm; conducts the training loop.

    Before training proper (i.e., for the minimum number of steps before
    training), collect new paths for _exploration_, with noise added (in
    the case of DDPG), and add them to the replay buffer.

    Then the actual cycle of evaluation and exploration begins. Each epoch
    consists of the evaluation data collector collecting paths (discarding
    incomplete ones), followed by exploration data collection; only
    exploration data is added to the buffer. The number of training loops
    is 1 by default, so usually each epoch is one cycle of (evaluate,
    explore). Each exploration phase, though, runs a batch of training
    steps, e.g., 1000 by default.

    When we talk about 'steps' we really mean training (or exploration)
    steps; the evaluation steps are only for reporting results.
    """
    if self.min_num_steps_before_training > 0:
        init_expl_paths = self.expl_data_collector.collect_new_paths(
            self.max_path_length,
            self.min_num_steps_before_training,
            discard_incomplete_paths=False,
        )
        self.replay_buffer.add_paths(init_expl_paths)
        self.expl_data_collector.end_epoch(-1)

    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self.eval_data_collector.collect_new_paths(
            self.max_path_length,
            self.num_eval_steps_per_epoch,
            discard_incomplete_paths=True,
        )
        gt.stamp('evaluation sampling')

        for _ in range(self.num_train_loops_per_epoch):
            new_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_expl_steps_per_train_loop,
                discard_incomplete_paths=False,
            )
            gt.stamp('exploration sampling', unique=False)

            self.replay_buffer.add_paths(new_expl_paths)
            gt.stamp('data storing', unique=False)

            self.training_mode(True)
            for _ in range(self.num_trains_per_train_loop):
                train_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.train(train_data)
            gt.stamp('training', unique=False)
            self.training_mode(False)

        self._end_epoch(epoch)

def experiment(variant):
    root = 0
    E = 20
    R = 84
    U = 6
    cuda = True
    envs = []
    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()
    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(i, stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in envs:
                e.reset()
        for e in envs:
            img = e.sim.render(R, R, device_id=0).transpose()
            imgs.append(img)
        if stamp:
            gt.stamp('render')
        imgs = np.array(imgs)
        torch_img = np_to_var(imgs)
        if cuda:
            torch_img = torch_img.cuda()
            torch.cuda.synchronize()
        if stamp:
            gt.stamp('transfer')
        u = get_numpy(c.forward(torch_img).cpu())
        torch.cuda.synchronize()
        if stamp:
            gt.stamp('forward')
        for i, e in enumerate(envs):
            e.step(u[i, :])
        if stamp:
            gt.stamp('step')

    for i in range(10):
        step(i, False)
    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step(i)
    gt.stamp('end')
    print(gt.report(include_itrs=False, format_options=dict(itr_num_width=10)))

def main_loop(self, max_epochs):
    # populate replay buffer before training
    for epoch in gt.timed_for(range(max_epochs)):
        if epoch % self.args.eval_every == 0:
            # somehow this is not deterministic for decentralized?
            self.eval_step(epoch)
        self.train_step(epoch)
    self.finish_training()

def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)
    self.sampler.initialize(env, policy, pool)

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()

def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)

    evaluation_env = deep_clone(env) if self._eval_n_episodes else None

    with tf_utils.get_default_session().as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

def train(self):
    """Negative epochs are offline, positive epochs are online."""
    for self.epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self.offline_rl = self.epoch < 0
        self._begin_epoch(self.epoch)
        self._train()
        self._end_epoch(self.epoch)

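# The sign convention above in miniature: epochs counted up from a negative
# start epoch are offline, non-negative ones are online (toy illustration,
# e.g. _start_epoch = -2, num_epochs = 3):
for epoch in range(-2, 3):
    offline_rl = epoch < 0
    print(epoch, 'offline' if offline_rl else 'online')
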
def train(self):
    if self.min_num_steps_before_training > 0:
        for _ in range(0, self.min_num_steps_before_training,
                       self.max_path_length):
            patch_trajectory = rollout(self.expl_env, self.trainer.policy,
                                       self.trainer.qf1, self.trainer.qf2,
                                       self.max_path_length, self.rnn_seq_len)
            self.replay_buffer.add_trajectory(patch_trajectory)
    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        rewards, seen_area, total_rotate, right_rotate = eval_rollout(
            self.eval_env, self.trainer.eval_policy, epoch,
            self.num_eval_steps_per_epoch)
        self.writer.add_scalar('eval/mean_reward', np.mean(rewards), epoch)
        self.writer.add_scalar('eval/mean_seen_area', np.mean(seen_area), epoch)
        self.writer.add_scalar('eval/max_reward', np.max(rewards), epoch)
        self.writer.add_scalar('eval/max_seen_area', np.max(seen_area), epoch)
        self.writer.add_scalar('eval/min_reward', np.min(rewards), epoch)
        self.writer.add_scalar('eval/min_seen_area', np.min(seen_area), epoch)
        self.writer.add_scalar(
            'eval/mean_rotate_ratio',
            abs(0.5 - np.sum(right_rotate) / np.sum(total_rotate)), epoch)
        gt.stamp('evaluation sampling', unique=False)

        for _ in range(self.num_train_loops_per_epoch):
            for _ in range(0, self.num_expl_steps_per_train_loop,
                           self.max_path_length):
                patch_trajectory = rollout(self.expl_env, self.trainer.policy,
                                           self.trainer.qf1, self.trainer.qf2,
                                           self.max_path_length,
                                           self.rnn_seq_len)
                gt.stamp('exploration sampling', unique=False)
                self.replay_buffer.add_trajectory(patch_trajectory)
                gt.stamp('data storing', unique=False)
            self.training_mode(True)
            for _ in range(self.num_trains_per_train_loop):
                train_batch_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.train(train_batch_data)
            gt.stamp('training', unique=False)
            self.training_mode(False)
        self._end_epoch()

def _train(self):
    if self.min_num_steps_before_training > 0:
        init_expl_paths = self.expl_data_collector.collect_new_paths(
            self.max_path_length,
            self.min_num_steps_before_training,
            discard_incomplete_paths=False,
        )
        self.replay_buffer.add_paths(init_expl_paths)
        self.expl_data_collector.end_epoch(-1)
        self.estimate_obs_stats(init_expl_paths[0]['observations'],
                                init_flag=True)

    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self.eval_data_collector.collect_normalized_new_paths(
            self.max_path_length,
            self.num_eval_steps_per_epoch,
            discard_incomplete_paths=True,
            input_mean=self._obs_mean,
            input_std=self._obs_std,
        )
        gt.stamp('evaluation sampling')

        for _ in range(self.num_train_loops_per_epoch):
            new_expl_paths = self.expl_data_collector.collect_normalized_new_paths(
                self.max_path_length,
                self.num_expl_steps_per_train_loop,
                discard_incomplete_paths=False,
                input_mean=self._obs_mean,
                input_std=self._obs_std,
            )
            gt.stamp('exploration sampling', unique=False)

            self.replay_buffer.add_paths(new_expl_paths)
            gt.stamp('data storing', unique=False)

            self.training_mode(True)
            for _ in range(self.num_trains_per_train_loop):
                train_data = self.replay_buffer.random_batch(self.batch_size)
                self.estimate_obs_stats(train_data['observations'],
                                        init_flag=False)
                train_data['observations'] = self.apply_normalize_obs(
                    train_data['observations'])
                self.trainer.train(train_data)
            gt.stamp('training', unique=False)
            self.training_mode(False)

        self._end_epoch(epoch)

        if self.save_frequency > 0 and epoch % self.save_frequency == 0:
            self.trainer.save_models(epoch)
            self.replay_buffer.save_buffer(epoch)

def _train(self):
    if self.min_num_steps_before_training > 0:
        init_expl_paths = self.expl_data_collector.collect_new_paths(
            self.max_path_length,
            self.min_num_steps_before_training,
            discard_incomplete_paths=False,
        )
        self.replay_buffer.add_paths(init_expl_paths)
        self.expl_data_collector.end_epoch(-1)

    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        # Curriculum update: compute the new initial-state probabilities.
        if epoch >= self.min_ep_curriculum and epoch % self.curr_update_freq == 0:
            gt.stamp('curriculum update')
            self.proba = self.curr_fn(self.trainer.policy, self.trainer.qf1,
                                      self.trainer.qf2, self.evaluation_env,
                                      **self.curr_kwargs)
            self.evaluation_env.set_init_proba(self.proba)
            self.exploration_env.set_init_proba(self.proba)
            self.all_probas[epoch] = self.proba

        # Eval step
        self.curr_state = self.eval_data_collector.collect_new_paths(
            self.max_path_length,
            self.num_eval_steps_per_epoch,
            discard_incomplete_paths=True,
        )
        gt.stamp('evaluation sampling')

        for _ in range(self.num_train_loops_per_epoch):
            new_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_expl_steps_per_train_loop,
                discard_incomplete_paths=False,
            )
            gt.stamp('exploration sampling', unique=False)

            self.replay_buffer.add_paths(new_expl_paths)
            gt.stamp('data storing', unique=False)

            self.training_mode(True)
            for _ in range(self.num_trains_per_train_loop):
                train_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.train(train_data)
            gt.stamp('training', unique=False)
            self.training_mode(False)

        self._end_epoch(epoch)

def _train(self):
    st = time.time()
    if self.min_num_steps_before_training > 0:
        init_expl_paths = self.expl_data_collector.collect_new_paths(
            self.max_path_length,
            self.min_num_steps_before_training,
            runtime_policy=self.pretrain_policy,
        )
        self.replay_buffer.add_paths(init_expl_paths)
        self.expl_data_collector.end_epoch(-1)
    self.total_train_expl_time += time.time() - st

    self.trainer.buffer = self.replay_buffer  # TODO: find a cleaner way of doing this
    self.training_mode(True)
    for _ in range(self.num_pretrain_steps):
        train_data = self.replay_buffer.random_batch(self.batch_size)
        self.trainer.train(train_data)
    self.training_mode(False)

    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self.eval_data_collector.collect_new_paths(
            self.max_path_length,
            self.num_eval_steps_per_epoch,
        )
        gt.stamp("evaluation sampling")

        st = time.time()
        for _ in range(self.num_train_loops_per_epoch):
            new_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_expl_steps_per_train_loop,
            )
            gt.stamp("exploration sampling", unique=False)

            self.replay_buffer.add_paths(new_expl_paths)
            gt.stamp("data storing", unique=False)

            self.training_mode(True)
            for train_step in range(self.num_trains_per_train_loop):
                train_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.train(train_data)
            gt.stamp("training", unique=False)
            self.training_mode(False)

            if self.eval_buffer:
                eval_data = self.eval_buffer.random_batch(self.batch_size)
                self.trainer.evaluate(eval_data, buffer_data=False)
                eval_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.evaluate(eval_data, buffer_data=True)
        self.total_train_expl_time += time.time() - st

        self._end_epoch(epoch)

def _train(self):
    if self.min_num_steps_before_training > 0:
        init_expl_paths = self.expl_data_collector.collect_new_paths(
            self.max_path_length,
            self.min_num_steps_before_training,
            discard_incomplete_paths=False,
        )
        self.replay_buffer.add_paths(init_expl_paths)
        self.expl_data_collector.end_epoch(-1)
        self._fit_input_stats()

    for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self.eval_data_collector.collect_new_paths(
            self.max_path_length,
            self.num_eval_steps_per_epoch,
            discard_incomplete_paths=True,
        )
        gt.stamp('evaluation sampling')

        self.training_mode(True)
        if self.replay_buffer.num_steps_can_sample() > 0:
            self.model_trainer.train_from_buffer(
                self.replay_buffer,
                max_grad_steps=self.model_max_grad_steps,
                epochs_since_last_update=self.model_epochs_since_last_update,
            )
        gt.stamp('model training', unique=False)

        for _ in range(self.num_train_loops_per_epoch):
            new_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_expl_steps_per_train_loop,
                discard_incomplete_paths=False,
            )
            gt.stamp('exploration sampling', unique=False)

            self.replay_buffer.add_paths(new_expl_paths)
            gt.stamp('data storing', unique=False)

            self.training_mode(True)
            for _ in range(self.num_trains_per_train_loop):
                self.trainer.train_from_paths(new_expl_paths)
            gt.stamp('training', unique=False)
            self.training_mode(False)

        self._fit_input_stats()
        self._end_epoch(epoch)

def experiment(variant):
    E = 10
    R = 84
    cuda = True
    envs = []
    renderer = MjCudaRender(R, R)
    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()
    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in range(E):
                envs[e].reset()
        for e in range(E):
            # img = renderer.get_cuda_tensor(envs[e].sim)
            img = envs[e].sim.render(R, R, device_id=1).transpose()
        if stamp:
            gt.stamp('render')
        # imgs = np.array(imgs)
        # torch_img = np_to_var(imgs)
        # if cuda:
        #     torch_img = torch_img.cuda()
        #     torch.cuda.synchronize()
        # gt.stamp('transfer') if stamp else 0
        # u = get_numpy(c.forward(torch_img).cpu())
        # torch.cuda.synchronize()
        # gt.stamp('forward') if stamp else 0
        # for e in range(E):
        #     envs[e].step(u[e, :])
        # gt.stamp('step') if stamp else 0

    for i in range(10):
        step(False)
    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step()
    gt.stamp('end')

def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)

    with self._sess.as_default():
        observation = env.reset()
        policy.reset()

        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0

        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            if self.iter_callback is not None:
                self.iter_callback(locals(), globals())

            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length

                action, _ = policy.get_action(observation)
                next_ob, reward, terminal, info = env.step(action)
                path_length += 1
                path_return += reward

                self.pool.add_sample(
                    observation,
                    action,
                    reward,
                    terminal,
                    next_ob,
                )

                if terminal or path_length >= self._max_path_length:
                    observation = env.reset()
                    policy.reset()
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return
                    path_return = 0
                    n_episodes += 1
                else:
                    observation = next_ob
                gt.stamp('sample')

                if self.pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self.pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self.pool.size)

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        env.terminate()