def _run_env_eval(self, step, do_sampler_step=True, calculate_holdout=True):
    timeit.start('eval')
    ### calculate holdout costs
    self._policy.eval_holdout()
    timeit.stop('eval')
def _run_env_eval(self, step, do_sampler_step=True, calculate_holdout=True):
    timeit.start('eval')

    ### add to eval buffer
    if self._sampler_eval and do_sampler_step:
        self._sampler_eval.reset()
        eval_step = step
        num_dones = 0
        while num_dones < self._rollouts_per_eval:
            _, _, _, _, done, _ = \
                self._sampler_eval.step(eval_step, explore=False)
            eval_step += 1
            num_dones += int(done)
        self._sampler.reset()

    ### calculate holdout costs
    if self._replay_pool_eval.can_sample(batch_size=self._batch_size) and calculate_holdout:
        indices, weights, steps, observations, goals, actions, rewards, dones, _ = \
            self._replay_pool_eval.sample(self._batch_size)
        self._policy.eval_holdout(step,
                                  steps=steps,
                                  observations=observations,
                                  goals=goals,
                                  actions=actions,
                                  rewards=rewards,
                                  dones=dones)

    timeit.stop('eval')
def _run_env_step(self, step):
    timeit.start('sample')
    self._sampler.step(step,
                       take_random_actions=(step < self._onpolicy_after_n_steps),
                       explore=True)
    timeit.stop('sample')
    return step
def _run_init(self):
    self._restore()
    # note this is the right step, but the trajectories might not all have been saved
    save_itr = self._fm.get_train_itr()
    start_step = save_itr * self._save_every_n_steps
    self._sampler.reset()
    timeit.reset()
    timeit.start('total')
    return start_step, save_itr
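# NOTE: `timeit` as used throughout is a project-local profiling helper, not
# the stdlib `timeit` module (which has no start/stop/reset). A minimal sketch
# of the assumed interface -- the class name and output format here are
# hypothetical, inferred from how `str(timeit)` is consumed in the log code:
import time
from collections import defaultdict

class TimeItSketch(object):
    """Accumulates wall-clock seconds per named section."""

    def __init__(self):
        self.reset()

    def reset(self):
        # drop all accumulated timings and any currently open sections
        self._elapsed = defaultdict(float)
        self._starts = dict()

    def start(self, name):
        self._starts[name] = time.time()

    def stop(self, name):
        self._elapsed[name] += time.time() - self._starts.pop(name)

    def __str__(self):
        # one "<name>: <seconds>" line per section, split on '\n' by the loggers
        return '\n'.join('{0}: {1:.2f}s'.format(name, sec)
                         for name, sec in sorted(self._elapsed.items()))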
def _run_init_inference(self):
    inference_itr = self._fm.get_inference_itr()
    self._restore_rollouts('train')
    self._restore_rollouts('eval')
    save_itr = inference_itr
    start_step = save_itr * self._save_every_n_steps
    timeit.reset()
    timeit.start('total')
    return start_step, save_itr
def _run_init_train(self):
    train_itr = self._fm.get_train_itr()
    if train_itr > 0:
        logger.info('Restore train iteration {0}'.format(train_itr - 1))
        self._policy.restore(self._fm.train_policy_fname(train_itr - 1), train=True)
    save_itr = train_itr
    start_step = save_itr * self._save_every_n_steps
    timeit.reset()
    timeit.start('total')
    return start_step, save_itr
def _run_init_train(self):
    train_itr = self._fm.get_train_itr()
    if train_itr > 0:
        self._policy.restore(self._fm.train_policy_fname(train_itr - 1), train=True)
    save_itr = train_itr
    start_step = save_itr * self._save_every_n_steps
    timeit.reset()
    timeit.start('total')
    return start_step, save_itr
def train(self):
    self._start_train_batch()

    logger.info('Training model')
    alg_args = self._params['alg']
    total_steps = int(alg_args['total_steps'])
    save_every_n_steps = int(alg_args['save_every_n_steps'])
    update_target_after_n_steps = int(alg_args['update_target_after_n_steps'])
    update_target_every_n_steps = int(alg_args['update_target_every_n_steps'])
    log_every_n_steps = int(alg_args['log_every_n_steps'])

    timeit.reset()
    timeit.start('total')
    save_itr = 0
    for step in range(total_steps):
        timeit.start('sample')
        # steps, observations, actions, rewards, dones, _ = self._replay_pool.sample(batch_size)
        steps, observations, actions, rewards, dones, _ = self._batch_queue.get()
        timeit.stop('sample')

        timeit.start('train')
        self._model.train_step(step,
                               steps=steps,
                               observations=observations,
                               actions=actions,
                               rewards=rewards,
                               dones=dones,
                               use_target=True)
        timeit.stop('train')

        ### update target network
        if step > update_target_after_n_steps and step % update_target_every_n_steps == 0:
            self._model.update_target()

        ### log
        if step > 0 and step % log_every_n_steps == 0:
            logger.record_tabular('Step', step)
            self._model.log()
            logger.dump_tabular(print_func=logger.info)
            timeit.stop('total')
            for line in str(timeit).split('\n'):
                logger.debug(line)
            timeit.reset()
            timeit.start('total')

        ### save model
        if step > 0 and step % save_every_n_steps == 0:
            logger.info('Saving files for itr {0}'.format(save_itr))
            self._save_train_policy(save_itr)
            save_itr += 1

    ### always save at the end
    self._save_train_policy(save_itr)

    self._stop_train_batch()
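# `_start_train_batch` / `_stop_train_batch` are assumed to manage a background
# thread that pre-samples batches from the replay pool into `_batch_queue`, so
# sampling overlaps with the gradient step above. A minimal sketch under that
# assumption -- the worker design and `maxsize` are hypothetical:
import queue
import threading

def _start_train_batch(self, maxsize=10):
    self._batch_queue = queue.Queue(maxsize=maxsize)
    self._batch_thread_stop = threading.Event()

    def worker():
        while not self._batch_thread_stop.is_set():
            batch = self._replay_pool.sample(self._batch_size)
            while not self._batch_thread_stop.is_set():
                try:
                    self._batch_queue.put(batch, timeout=1.0)
                    break
                except queue.Full:
                    continue  # consumer is behind; retry while staying responsive to stop

    self._batch_thread = threading.Thread(target=worker, daemon=True)
    self._batch_thread.start()

def _stop_train_batch(self):
    self._batch_thread_stop.set()
    self._batch_thread.join()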
def _run_log(self, step):
    logger.record_tabular('Step', step)
    self._env.log()
    self._replay_pool.log()
    if self._env_eval:
        self._env_eval.log(prefix='Eval')
    if self._replay_pool_eval:
        self._replay_pool_eval.log(prefix='Eval')
    self._policy.log()
    logger.dump_tabular(print_func=logger.info)
    timeit.stop('total')
    for line in str(timeit).split('\n'):
        logger.debug(line)
    timeit.reset()
    timeit.start('total')
def do_train_step():
    timeit.start('batch')
    indices, weights, steps, observations, goals, actions, rewards, dones, _ = \
        self._replay_pool.sample(self._batch_size)
    timeit.stop('batch')

    timeit.start('train')
    rew_errors = self._policy.train_step(step,
                                         steps=steps,
                                         observations=observations,
                                         goals=goals,
                                         actions=actions,
                                         rewards=rewards,
                                         dones=dones,
                                         weights=weights)
    self._replay_pool.update_priorities(indices, rew_errors)
    timeit.stop('train')
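# The (indices, weights) pair returned by the sample plus the
# `update_priorities(indices, rew_errors)` call is the prioritized experience
# replay pattern: transitions are re-sampled in proportion to their latest
# error. A minimal sketch of the assumed pool-side update, proportional
# variant -- `alpha`, `eps`, and `_priorities` are hypothetical names:
import numpy as np

def update_priorities(self, indices, errors, alpha=0.6, eps=1e-6):
    # p_i = (|error_i| + eps) ** alpha; transition i is later drawn
    # with probability p_i / sum_j p_j
    self._priorities[indices] = (np.abs(errors) + eps) ** alpha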
def train(self):
    ### restore where we left off
    save_itr = self._restore()

    target_updated = False
    eval_rollouts = []

    self._sampler.reset()
    if self._eval_sampler is not None:
        self._eval_sampler.reset()

    timeit.reset()
    timeit.start('total')
    for step in range(0, self._total_steps, self._sampler.n_envs):
        ### sample and add to buffer
        if step > self._sample_after_n_steps:
            timeit.start('sample')
            self._sampler.step(step,
                               take_random_actions=(step <= self._onpolicy_after_n_steps),
                               explore=True)
            timeit.stop('sample')

        ### sample and DON'T add to buffer (for validation)
        if self._eval_sampler is not None and step > 0 and step % self._eval_every_n_steps == 0:
            timeit.start('eval')
            for _ in range(self._rollouts_per_eval):
                eval_rollouts_step = []
                eval_step = step
                while len(eval_rollouts_step) == 0:
                    self._eval_sampler.step(eval_step, explore=False)
                    eval_rollouts_step = self._eval_sampler.get_recent_paths()
                    eval_step += 1
                eval_rollouts += eval_rollouts_step
            timeit.stop('eval')

        if step >= self._learn_after_n_steps:
            ### training step
            # train once every `train_every_n_steps` env steps when >= 1;
            # otherwise take 1 / train_every_n_steps gradient steps per env step
            if self._train_every_n_steps >= 1:
                n_train_steps = 1 if step % int(self._train_every_n_steps) == 0 else 0
            else:
                n_train_steps = int(1. / self._train_every_n_steps)
            for _ in range(n_train_steps):
                timeit.start('batch')
                steps, observations, goals, actions, rewards, dones, _ = \
                    self._sampler.sample(self._batch_size)
                timeit.stop('batch')
                timeit.start('train')
                self._policy.train_step(step,
                                        steps=steps,
                                        observations=observations,
                                        goals=goals,
                                        actions=actions,
                                        rewards=rewards,
                                        dones=dones,
                                        use_target=target_updated)
                timeit.stop('train')

            ### update target network
            if step > self._update_target_after_n_steps and step % self._update_target_every_n_steps == 0:
                self._policy.update_target()
                target_updated = True

        ### log
        if step % self._log_every_n_steps == 0:
            logger.record_tabular('Step', step)
            self._sampler.log()
            if self._eval_sampler is not None:  # the eval sampler is optional
                self._eval_sampler.log(prefix='Eval')
            self._policy.log()
            logger.dump_tabular(print_func=logger.info)
            timeit.stop('total')
            for line in str(timeit).split('\n'):
                logger.debug(line)
            timeit.reset()
            timeit.start('total')

        ### save model
        if step > 0 and step % self._save_every_n_steps == 0:
            logger.info('Saving files for itr {0}'.format(save_itr))
            self._save(save_itr, self._sampler.get_recent_paths(), eval_rollouts)
            save_itr += 1
            eval_rollouts = []

    self._save(save_itr, self._sampler.get_recent_paths(), eval_rollouts)
def inference(self):
    ### restore where we left off
    self._restore_inference()
    inference_itr = self._get_inference_itr()
    inference_step = self._get_inference_step()
    train_itr = self._get_train_itr()

    self._run_rsync()

    train_rollouts = []
    eval_rollouts = []

    self._inference_reset_sampler()

    timeit.reset()
    timeit.start('total')
    while True:
        train_step = self._get_train_step()
        if inference_step > self._total_steps:
            break

        ### sample and add to buffer
        if inference_step > self._sample_after_n_steps:
            timeit.start('sample')
            inference_step = self._inference_step(inference_step)
            timeit.stop('sample')
        else:
            inference_step += self._sampler.n_envs

        ### sample and DON'T add to buffer (for validation)
        if self._eval_sampler is not None and inference_step > 0 and inference_step % self._eval_every_n_steps == 0:
            timeit.start('eval')
            eval_rollouts_step = []
            eval_step = inference_step
            while len(eval_rollouts_step) == 0:
                self._eval_sampler.step(eval_step, explore=False)
                eval_rollouts_step = self._eval_sampler.get_recent_paths()
                eval_step += 1
            eval_rollouts += eval_rollouts_step
            timeit.stop('eval')

        ### log
        if inference_step % self._log_every_n_steps == 0:
            logger.info('train itr {0:04d} inference itr {1:04d}'.format(train_itr, inference_itr))
            logger.record_tabular('Train step', train_step)
            logger.record_tabular('Inference step', inference_step)
            self._sampler.log()
            if self._eval_sampler:
                self._eval_sampler.log(prefix='Eval')
            logger.dump_tabular(print_func=logger.info)
            timeit.stop('total')
            for line in str(timeit).split('\n'):
                logger.debug(line)
            timeit.reset()
            timeit.start('total')

        ### save rollouts / load model
        train_rollouts += self._sampler.get_recent_paths()
        if inference_step > 0 and inference_step % self._inference_save_every_n_steps == 0:
            self._inference_reset_sampler()

            ### save rollouts
            logger.debug('Saving files for itr {0}'.format(inference_itr))
            self._save_inference(inference_itr, train_rollouts, eval_rollouts)
            inference_itr += 1
            train_rollouts = []
            eval_rollouts = []

            ### load model
            with self._rsync_lock:  # to ensure the ckpt has been fully transferred over
                new_train_itr = self._get_train_itr()
                if train_itr < new_train_itr:
                    logger.debug('Loading policy for itr {0}'.format(new_train_itr - 1))
                    try:
                        self._policy.restore(self._inference_policy_file_name(new_train_itr - 1),
                                             train=False)
                        train_itr = new_train_itr
                    except Exception:
                        logger.debug('Failed to load model for itr {0}'.format(new_train_itr - 1))
                        self._policy.restore(self._inference_policy_file_name(train_itr - 1),
                                             train=False)
                        logger.debug('As backup, restored itr {0}'.format(train_itr - 1))

    self._save_inference(inference_itr, self._sampler.get_recent_paths(), eval_rollouts)
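# `_run_rsync` is assumed to launch a background loop that copies policy
# checkpoints over from the training machine, holding `_rsync_lock` during
# each transfer so the restore above never reads a half-copied file. A sketch
# under that assumption -- the source/destination attributes and the sync
# period are hypothetical:
import subprocess
import threading
import time

def _run_rsync(self, period=30.):
    self._rsync_lock = threading.Lock()

    def worker():
        while True:
            with self._rsync_lock:
                # pull new checkpoints; readers of the ckpt dir block on the lock
                subprocess.call(['rsync', '-az',
                                 self._remote_ckpt_dir + '/',  # hypothetical attribute
                                 self._local_ckpt_dir])        # hypothetical attribute
            time.sleep(period)

    threading.Thread(target=worker, daemon=True).start()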
def train(self):
    ### restore where we left off
    init_inference_step = len(self._sampler)  # don't count offpolicy
    self._restore_train()
    train_itr = self._get_train_itr()
    train_step = self._get_train_step()
    inference_itr = self._get_inference_itr()

    target_updated = False

    timeit.reset()
    timeit.start('total')
    while True:
        inference_step = len(self._sampler) - init_inference_step
        if inference_step > self._total_steps or train_step > self._train_total_steps:
            break

        if inference_step >= self._learn_after_n_steps:
            ### training step
            train_step += 1
            timeit.start('batch')
            steps, observations, goals, actions, rewards, dones, _ = \
                self._sampler.sample(self._batch_size)
            timeit.stop('batch')
            timeit.start('train')
            self._policy.train_step(train_step,
                                    steps=steps,
                                    observations=observations,
                                    goals=goals,
                                    actions=actions,
                                    rewards=rewards,
                                    dones=dones,
                                    use_target=target_updated)
            timeit.stop('train')

            ### update target network
            if train_step > self._update_target_after_n_steps and train_step % self._update_target_every_n_steps == 0:
                self._policy.update_target()
                target_updated = True

            ### log
            if train_step % self._log_every_n_steps == 0:
                logger.info('train itr {0:04d} inference itr {1:04d}'.format(train_itr, inference_itr))
                logger.record_tabular('Train step', train_step)
                logger.record_tabular('Inference step', inference_step)
                self._policy.log()
                logger.dump_tabular(print_func=logger.info)
                timeit.stop('total')
                for line in str(timeit).split('\n'):
                    logger.debug(line)
                timeit.reset()
                timeit.start('total')
        else:
            time.sleep(1)

        ### save model
        if train_step > 0 and train_step % self._train_save_every_n_steps == 0:
            logger.debug('Saving files for itr {0}'.format(train_itr))
            self._save_train(train_itr)
            train_itr += 1

        ### reset model
        if train_step > 0 and self._train_reset_every_n_steps is not None and \
                train_step % self._train_reset_every_n_steps == 0:
            logger.debug('Resetting model')
            self._policy.reset_weights()

        ### load data
        inference_itr = self._train_load_data(inference_itr)
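# `_train_load_data` is assumed to pull any rollout files the inference
# process has saved since the last load into the sampler's buffer; the later
# train() variant inlines what looks like the same loop, so this sketch
# mirrors that code:
def _train_load_data(self, inference_itr):
    new_inference_itr = self._get_inference_itr()
    for i in range(inference_itr, new_inference_itr):
        try:
            logger.debug('Loading files for itr {0}'.format(i))
            self._sampler.add_rollouts([self._train_rollouts_file_name(i)])
            inference_itr = i + 1
        except Exception:
            logger.debug('Failed to load files for itr {0}'.format(i))
    return inference_itr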
def inference(self):
    ### restore where we left off
    self._restore_inference()
    inference_itr = self._get_inference_itr()
    inference_step = self._get_inference_step()
    train_itr = self._get_train_itr()

    self._run_rsync()

    assert (self._eval_sampler is None)  # TODO: temporary

    train_rollouts = []
    eval_rollouts = []

    self._reset_sampler()

    timeit.reset()
    timeit.start('total')
    while True:
        train_step = self._get_train_step()
        if inference_step > self._total_steps:
            break

        ### sample and add to buffer
        if inference_step > self._sample_after_n_steps:
            timeit.start('sample')
            try:
                self._sampler.step(inference_step,
                                   take_random_actions=(inference_step <= self._learn_after_n_steps or
                                                        inference_step <= self._onpolicy_after_n_steps),
                                   explore=True)
                inference_step += self._sampler.n_envs
            except Exception as e:
                logger.warn('Sampler exception {0}'.format(str(e)))
                trashed_steps = self._sampler.trash_current_rollouts()
                inference_step -= trashed_steps
                logger.warn('Trashed {0} steps'.format(trashed_steps))
                while not self._env.ros_is_good(print=False):  # TODO hard coded
                    time.sleep(0.25)
                self._reset_sampler()
                logger.warn('Continuing...')
            timeit.stop('sample')
        else:
            inference_step += self._sampler.n_envs

        ### sample and DON'T add to buffer (for validation)
        if self._eval_sampler is not None and inference_step > 0 and inference_step % self._eval_every_n_steps == 0:
            timeit.start('eval')
            eval_rollouts_step = []
            eval_step = inference_step
            while len(eval_rollouts_step) == 0:
                self._eval_sampler.step(eval_step, explore=False)
                eval_rollouts_step = self._eval_sampler.get_recent_paths()
                eval_step += 1
            eval_rollouts += eval_rollouts_step
            timeit.stop('eval')

        ### log
        if inference_step % self._log_every_n_steps == 0:
            logger.info('train itr {0:04d} inference itr {1:04d}'.format(train_itr, inference_itr))
            logger.record_tabular('Train step', train_step)
            logger.record_tabular('Inference step', inference_step)
            self._sampler.log()
            if self._eval_sampler:
                self._eval_sampler.log(prefix='Eval')
            logger.dump_tabular(print_func=logger.info)
            timeit.stop('total')
            for line in str(timeit).split('\n'):
                logger.debug(line)
            timeit.reset()
            timeit.start('total')

        ### save rollouts / load model
        train_rollouts += self._sampler.get_recent_paths()
        if inference_step > 0 and inference_step % self._inference_save_every_n_steps == 0 and \
                len(train_rollouts) > 0:
            response = input('Keep rollouts?')
            if response != 'y':
                train_rollouts = []
                continue

            ### reset to stop rollout
            self._sampler.reset()

            ### save rollouts
            logger.debug('Saving files for itr {0}'.format(inference_itr))
            self._save_inference(inference_itr, train_rollouts, eval_rollouts)
            inference_itr += 1
            train_rollouts = []
            eval_rollouts = []

            ### load model
            with self._rsync_lock:  # to ensure the ckpt has been fully transferred over
                new_train_itr = self._get_train_itr()
                if train_itr < new_train_itr:
                    logger.debug('Loading policy for itr {0}'.format(new_train_itr - 1))
                    try:
                        self._policy.restore(self._inference_policy_file_name(new_train_itr - 1),
                                             train=False)
                        train_itr = new_train_itr
                    except Exception:
                        logger.debug('Failed to load model for itr {0}'.format(new_train_itr - 1))
                        self._policy.restore(self._inference_policy_file_name(train_itr - 1),
                                             train=False)
                        logger.debug('As backup, restored itr {0}'.format(train_itr - 1))

    self._save_inference(inference_itr, self._sampler.get_recent_paths(), eval_rollouts)
def do_train_step():
    timeit.start('train')
    self._policy.train_step(step)
    timeit.stop('train')
def train(self):
    ### restore where we left off
    self._restore_train()
    train_itr = self._get_train_itr()
    train_step = self._get_train_step()
    inference_itr = self._get_inference_itr()
    init_inference_step = len(self._sampler)

    target_updated = False

    timeit.reset()
    timeit.start('total')
    while True:
        inference_step = len(self._sampler) - init_inference_step
        if inference_step > self._total_steps:
            break

        if inference_step >= self._learn_after_n_steps:
            ### update preprocess
            if train_step % self._update_preprocess_every_n_steps == 0:
                self._policy.update_preprocess(self._sampler.statistics)

            ### training step
            train_step += 1
            timeit.start('batch')
            batch = self._sampler.sample(self._batch_size)
            timeit.stop('batch')
            timeit.start('train')
            self._policy.train_step(train_step, *batch, use_target=target_updated)
            timeit.stop('train')

            ### update target network
            if train_step > self._update_target_after_n_steps and train_step % self._update_target_every_n_steps == 0:
                self._policy.update_target()
                target_updated = True

            ### log
            if train_step % self._log_every_n_steps == 0:
                logger.info('train itr {0:04d} inference itr {1:04d}'.format(train_itr, inference_itr))
                logger.record_tabular('Train step', train_step)
                logger.record_tabular('Inference step', inference_step)
                self._policy.log()
                logger.dump_tabular(print_func=logger.info)
                timeit.stop('total')
                for line in str(timeit).split('\n'):
                    logger.debug(line)
                timeit.reset()
                timeit.start('total')
        else:
            time.sleep(1)

        ### save model
        if train_step > 0 and train_step % self._train_save_every_n_steps == 0:
            logger.debug('Saving files for itr {0}'.format(train_itr))
            self._save_train(train_itr)
            train_itr += 1

        ### reset model
        if train_step > 0 and self._train_reset_every_n_steps is not None and \
                train_step % self._train_reset_every_n_steps == 0:
            logger.debug('Resetting model')
            self._policy.reset_weights()

        ### load data
        new_inference_itr = self._get_inference_itr()
        if inference_itr < new_inference_itr:
            for i in range(inference_itr, new_inference_itr):
                try:
                    logger.debug('Loading files for itr {0}'.format(i))
                    self._sampler.add_rollouts([self._train_rollouts_file_name(i)])
                    inference_itr = i + 1
                except Exception:
                    logger.debug('Failed to load files for itr {0}'.format(i))
def _run_save(self, save_itr):
    timeit.start('save')
    logger.info('Saving files for itr {0}'.format(save_itr))
    self._save(save_itr,
               self._replay_pool.get_recent_rollouts(),
               self._replay_pool_eval.get_recent_rollouts())
    timeit.stop('save')