def eval(self):
    average_episode_reward = 0
    average_episode_success = 0
    video_recorder = VideoRecorder()
    video_recorder.init()
    for episode in range(self.num_eval_episodes):
        obs_dict = self.env.reset()
        obs = obs_dict[self.observation_key]
        obs_g = obs_dict[self.desired_goal_key]
        done = False
        episode_reward = 0
        episode_step = 0
        while not done:
            # note: this variant evaluates with stochastic actions
            action = self.agent.act(obs, obs_g, sample=True)
            next_obs_dict, reward, done, info = self.env.step(action)
            done = float(done)
            episode_reward += reward
            achieved_goal = next_obs_dict[self.achieved_goal_key]
            obs = next_obs_dict[self.observation_key]
            obs_g = next_obs_dict[self.desired_goal_key]
            episode_step += 1
            video_recorder.record(next_obs_dict)
        average_episode_reward += episode_reward / self.num_eval_episodes
        average_episode_success += float(
            info['is_success']) / self.num_eval_episodes
    video_recorder.save(f'{self.step}.mp4')
    tune.report(
        eval_reward=average_episode_reward,
        eval_is_success=average_episode_success,
        timesteps_this_iter=0,
    )
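# A minimal sketch (assumptions labeled): the *_key attributes used above
# follow the standard gym.GoalEnv dict-observation layout, and tune.report()
# assumes this eval runs inside a Ray Tune trainable. Hypothetical values:
GOAL_ENV_KEYS = {
    'observation_key': 'observation',
    'desired_goal_key': 'desired_goal',
    'achieved_goal_key': 'achieved_goal',
}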
class Workspace(object):
    def __init__(self,
                 log_save_tb=True,
                 log_frequency_step=10000,
                 agent_name='drq',
                 # device='cuda',
                 device='cpu',
                 env='cartpole_swingup',
                 seed=1,
                 image_size=84,
                 action_repeat=8,
                 frame_stack=3,
                 replay_buffer_capacity=100000,
                 image_pad=4,
                 save_video=True):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.logger = Logger(self.work_dir,
                             save_tb=log_save_tb,
                             log_frequency=log_frequency_step,
                             agent=agent_name,
                             action_repeat=action_repeat)

        utils.set_seed_everywhere(seed)
        self.device = torch.device(device)
        self.env = make_env(env, seed, image_size, action_repeat, frame_stack)

        self.agent = DRQAgent(
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            action_range=(float(self.env.action_space.low.min()),
                          float(self.env.action_space.high.max())),
            device=self.device)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          replay_buffer_capacity, image_pad,
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if save_video else None)
        self.step = 0

    def evaluate(self, num_eval_episodes=10):
        average_episode_reward = 0
        for episode in range(num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self,
            num_train_steps=1000000,
            num_train_iters=1,
            num_seed_steps=1000,
            eval_frequency=5000):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step,
                                     save=(self.step > num_seed_steps))

                # evaluate agent periodically
                if self.step % eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= num_seed_steps:
                for _ in range(num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
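# A minimal entry point for the kwargs-based Workspace above (a sketch; the
# surrounding script is assumed to define make_env, DRQAgent, ReplayBuffer,
# Logger and VideoRecorder):
if __name__ == '__main__':
    workspace = Workspace(device='cpu', env='cartpole_swingup', seed=1)
    workspace.run()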
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(
            self.work_dir + "_" + self.cfg.env +
            "_eval2k_effective_{}_seed_{}".format(self.cfg.effective_aug,
                                                  self.cfg.seed),
            save_tb=cfg.log_save_tb,
            log_frequency=cfg.log_frequency_step,
            agent=cfg.agent.name,
            action_repeat=cfg.action_repeat)
        self.effective_aug = self.cfg.effective_aug

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device,
                                          self.effective_aug)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        # Hack to adjust action_repeat
        adjust_action_repeat_hack(cfg)
        print(f"CFG:\n{'-'*100}\n{cfg}\n{'-'*100}")
        self.cfg = cfg

        experiment_name = f"{cfg.full_title}_{cfg.run_id}"
        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             save_wb=cfg.log_save_wandb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             cfg=dict(flatten_cfg(cfg)),
                             plot_project="drqtest",
                             experiment=experiment_name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)
        print(f"ACTOR:\n{'-'*100}\n{self.agent.actor}\n{'-'*100}")
        print(f"CRITIC:\n{'-'*100}\n{self.agent.critic}\n{'-'*100}")

        self.replay_buffer = ReplayBuffer(
            self.env.observation_space.shape,
            self.env.action_space.shape,
            cfg.replay_buffer_capacity,
            self.cfg.image_pad,
            self.device,
            use_aug=cfg.replay_buffer_augmentation)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = utils.make_env(cfg)

        self.obs_shape = self.env.observation_space['observation'].shape
        self.goal_shape = self.env.observation_space['desired_goal'].shape
        cfg.agent.params.obs_dim = self.obs_shape[0]
        cfg.agent.params.goal_dim = self.goal_shape[0]
        cfg.agent.params.action_dim = self.env.action_space.shape[0]
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.obs_shape, self.goal_shape,
                                          self.env.action_space.shape,
                                          int(cfg.replay_buffer_capacity),
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs['observation'],
                                            obs['desired_goal'],
                                            sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
            self.video_recorder.save(f'{self.step}.mp4')
            self.logger.log('eval/episode_reward', episode_reward, self.step)
        self.logger.dump(self.step)

    def run_her(self, path_buffer):
        # first_obs = path_buffer[0][0]
        # last_obs = path_buffer[-1][0]
        # first_goal = first_obs['achieved_goal']
        # last_goal = last_obs['achieved_goal']
        # goal_changed = np.mean(last_goal - first_goal)**2 > 1e-6
        # if goal_changed:
        for n, ts in enumerate(path_buffer):
            # select goal id
            if self.cfg.her_strat == 'future':
                i = np.random.randint(n, len(path_buffer))
            elif self.cfg.her_strat == 'last':
                i = -1
            new_goal_obs = path_buffer[i][3]
            new_goal = new_goal_obs['achieved_goal']
            # relabel
            obs, action, reward, next_obs, done, done_no_max = ts
            obs['desired_goal'] = new_goal
            next_obs['desired_goal'] = new_goal
            reward = self.env.compute_reward(next_obs['achieved_goal'],
                                             new_goal, None)
            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        path_buffer = []
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    if self.cfg.save_model:
                        self.agent.save()
                        self.agent.load()
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                self.agent.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

                # her: relabel the trajectory that just finished
                if self.cfg.her_iters > 0 and len(path_buffer):
                    for k in range(self.cfg.her_iters):
                        self.run_her(path_buffer)
                    path_buffer = []

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs['observation'],
                                            obs['desired_goal'],
                                            sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)

            next_obs, reward, done, _ = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)
            path_buffer.append(
                [obs, action, reward, next_obs, done, done_no_max])

            obs = next_obs
            episode_step += 1
            self.step += 1
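# A toy standalone illustration of the 'future' goal index used by run_her()
# above: for transition n, the relabeling goal is drawn uniformly from the
# remainder of the same trajectory (n itself included).
import numpy as np

def sample_future_index(n, path_len):
    return np.random.randint(n, path_len)  # uniform over [n, path_len)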
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = dmc.make_meta(cfg.env, cfg.episode_length, cfg.seed)
        self.eval_env = dmc.make_meta(cfg.env, cfg.episode_length,
                                      cfg.seed + 1)

        obs_spec = self.env.observation_spec()['features']
        action_spec = self.env.action_spec()
        cfg.agent.params.obs_shape = obs_spec.shape
        cfg.agent.params.action_shape = action_spec.shape
        cfg.agent.params.action_range = [
            float(action_spec.minimum.min()),
            float(action_spec.maximum.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = MetaReplayBuffer(cfg.train_tasks, obs_spec.shape,
                                              action_spec.shape,
                                              cfg.replay_buffer_capacity,
                                              self.device)

        self.eval_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_total_reward = 0
        for task_id in self.cfg.eval_tasks:
            # adaptation phase
            # reset the agent once, so the memory persists across episodes
            state = self.agent.reset()
            for episode in range(self.cfg.num_adapt_episodes):
                time_step = self.eval_env.reset(task_id)
                while not time_step.last():
                    with utils.eval_mode(self.agent):
                        obs = time_step.observation['features']
                        action = self.agent.act(obs, state, sample=False)
                    time_step = self.eval_env.step(action)
                    next_obs = time_step.observation['features']
                    # update agent's memory
                    state = self.agent.step(state, obs, action,
                                            time_step.reward, next_obs)

            # evaluation phase
            # agent's memory should be initialized by now
            average_episode_reward = 0
            for episode in range(self.cfg.num_eval_episodes):
                time_step = self.eval_env.reset(task_id)
                self.eval_video_recorder.init(enabled=(episode == 0))
                episode_reward = 0
                episode_success = 0
                episode_step = 0
                while not time_step.last():
                    with utils.eval_mode(self.agent):
                        obs = time_step.observation['features']
                        action = self.agent.act(obs, state, sample=False)
                    time_step = self.eval_env.step(action)
                    next_obs = time_step.observation['features']
                    # update agent's memory
                    state = self.agent.step(state, obs, action,
                                            time_step.reward, next_obs)
                    self.eval_video_recorder.record(self.eval_env)
                    episode_reward += time_step.reward
                    episode_step += 1
                average_episode_reward += episode_reward
                self.eval_video_recorder.save(
                    f'task_{task_id}_step_{self.step}.mp4')
            average_episode_reward /= self.cfg.num_eval_episodes
            average_total_reward += average_episode_reward
            self.logger.log(f'eval/task_{task_id}_episode_reward',
                            average_episode_reward / self.cfg.episode_length,
                            self.step)
        average_total_reward /= len(self.cfg.eval_tasks)
        self.logger.log('eval/episode_reward',
                        average_total_reward / self.cfg.episode_length,
                        self.step)
        self.logger.dump(self.step, ty='eval')

    def run(self):
        episode, episode_reward, episode_step = 0, 0, 0
        start_time = time.time()
        done = True
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    fps = episode_step / (time.time() - start_time)
                    self.logger.log('train/fps', fps, self.step)
                    start_time = time.time()
                    self.logger.log('train/episode_reward',
                                    episode_reward / self.cfg.episode_length,
                                    self.step)
                    self.logger.log('train/episode', episode, self.step)
                    self.logger.dump(
                        self.step,
                        save=(self.step > self.cfg.num_seed_steps),
                        ty='train')

                # initially try each task once, then sample tasks at random
                if episode < len(self.cfg.train_tasks):
                    task_id = self.cfg.train_tasks[episode]
                else:
                    task_id = np.random.choice(self.cfg.train_tasks)

                state = self.agent.reset()
                time_step = self.env.reset(task_id)
                obs = time_step.observation['features']
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # evaluate agent periodically
            if self.step % self.cfg.eval_frequency == 0:
                self.logger.log('eval/episode', episode, self.step)
                self.evaluate()

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                spec = self.env.action_spec()
                action = np.random.uniform(spec.minimum, spec.maximum,
                                           spec.shape)
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, state, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            time_step = self.env.step(action)
            next_obs = time_step.observation['features']

            # allow infinite bootstrap
            done = time_step.last()
            episode_reward += time_step.reward

            self.replay_buffer.add(task_id, obs, action, time_step.reward,
                                   next_obs, done)

            # update agent's memory
            state = self.agent.step(state, obs, action, time_step.reward,
                                    next_obs)

            obs = next_obs
            episode_step += 1
            self.step += 1
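# The recurrent agent interface assumed by the meta-RL loop above, written
# out as a contract sketch (not the agent's implementation):
#   state = agent.reset()                                     # fresh memory
#   action = agent.act(obs, state, sample=False)              # act on memory
#   state = agent.step(state, obs, action, reward, next_obs)  # update memory
# The same `state` threads through all adaptation episodes of a task, so
# experience from earlier episodes informs behavior in later ones.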
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd().split('runs')[0] + 'runs/'
        self.work_dir = self.work_dir + \
            '2020.10.21/jaco_reach_site_features_drq_agent.cls=agents.drq_agent.DRQAgent,agent.name=drq,batch_size=64,lr=0.005/seed=0/'
        self.model_dir = self.work_dir + '/agent_model'
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg
        # Use a separate eval dir to avoid overwriting training files
        self.log_eval_dir = self.work_dir + '/eval_standalone'
        if not os.path.exists(self.log_eval_dir):
            os.makedirs(self.log_eval_dir)
        self.logger = Logger(self.log_eval_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Environment Sampler
        self.num_train_envs = cfg.num_envs
        self.env_sampler = utils.EnvSampler(cfg, False, False,
                                            work_dir=self.work_dir)
        experiment_identifier = self.work_dir.split('runs')[1]
        self.eval_envs = self.env_sampler.sample_eval_envs(
            experiment_identifier)

        env_sample_key = list(self.eval_envs.keys())[0]
        sample_env = self.eval_envs[env_sample_key]
        cfg.agent.params.obs_shape = sample_env.observation_space.shape
        cfg.agent.params.action_shape = sample_env.action_space.shape
        cfg.agent.params.action_range = [
            float(sample_env.action_space.low.min()),
            float(sample_env.action_space.high.max())
        ]
        if cfg.lowobs_append:
            if cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.render_train_samples = True
        if self.render_train_samples:
            if cfg.env.startswith('jaco'):
                height = 256
                width = 256
            else:
                height = width = 500
            from PIL import Image
            for env_idx, env in self.eval_envs.items():
                name = 'StandAloneEval_Unseen_Environment_' + str(
                    env_idx) + '.png'
                img_path = self.work_dir + name
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(img_path)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None,
            phase='eval_standalone')
        self.reload_weights = cfg.reload_weights
        self.train_vid_interval = cfg.train_vid_interval
        self.eval_trials = 100
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.eval_trials):
            print('Episode Trial ', episode)
            self.video_recorder.init(enabled=True)
            eval_env = self.eval_envs[random.sample(list(self.eval_envs),
                                                    1)[0]]
            obs = eval_env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            while (episode_step <= eval_env._max_episode_steps - 1):
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = eval_env.step(action)
                self.video_recorder.record(eval_env)
                episode_reward += reward
                episode_step += 1
                self.step += 1
                if done:
                    break
            average_episode_reward += episode_reward
            print('Episode Reward ', episode_reward)
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.eval_trials
        self.logger.log('eval_standalone/episode_reward',
                        average_episode_reward, self.step)
        self.logger.dump(self.step, ty='eval_standalone')

    def run(self):
        if os.path.exists(self.model_dir):
            latest_step = utils.get_latest_file(self.model_dir)
            self.agent.load(self.model_dir, latest_step)
        else:
            raise ValueError('Could not reload weights!')
        self.evaluate()
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg
        config_file = self.work_dir.split('runs')[0] + 'configs/' \
            + cfg.env.replace('-', '_') + '.yaml'
        shutil.copy(config_file, self.work_dir)

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)
        experiment_identifier = self.work_dir.split('runs')[1]

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Interventions
        interventions = cfg.internvention
        image_augmentation = False
        if 'type_1' in interventions:
            image_augmentation = True
        if 'type_2' in interventions:
            cfg.apply_mod = True

        # Environment Sampler
        self.num_train_envs = cfg.num_envs
        self.env_sampler = utils.EnvSampler(cfg, work_dir=self.work_dir)
        self.eval_envs = self.env_sampler.sample_eval_envs(
            experiment_identifier)
        self.train_envs = self.env_sampler.sample_all_train_envs(
            experiment_identifier)
        self.resample_envs = cfg.resample_env
        self.env_resample_rate = cfg.env_resample_rate

        self.render_train_samples = True
        if self.render_train_samples:
            if cfg.env.startswith('jaco'):
                height = 256
                width = 256
            else:
                height = width = 500
            from PIL import Image
            for env_idx, env in self.train_envs.items():
                name = 'Environment_' + str(env_idx) + '.png'
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(name)
            for env_idx, env in self.eval_envs.items():
                name = 'Eval_Unseen_Environment_' + str(env_idx) + '.png'
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(name)

        env_sample_key = list(self.eval_envs.keys())[0]
        sample_env = self.eval_envs[env_sample_key]
        cfg.agent.params.obs_shape = sample_env.observation_space.shape
        cfg.agent.params.action_shape = sample_env.action_space.shape
        cfg.agent.params.action_range = [
            float(sample_env.action_space.low.min()),
            float(sample_env.action_space.high.max())
        ]

        state_append = cfg.lowobs_append
        if state_append:
            if cfg.env == 'window-open-v1':
                # Double check this
                cfg.agent.params.lstate_shape = 9
            elif cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = MultiEnvReplayBuffer(
            sample_env.observation_space.shape,
            sample_env.action_space.shape,
            cfg.replay_buffer_capacity,
            self.cfg.image_pad,
            self.device,
            image_augmentation,
            num_envs=self.num_train_envs,
            state_append=state_append,
            state_lstate_shape=cfg.agent.params.lstate_shape)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.train_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None, phase='train')
        self.model_dir = self.work_dir + '/agent_model'
        self.step = [0] * self.num_train_envs
        self.reload_weights = cfg.reload_weights
        self.train_vid_interval = cfg.train_vid_interval

    def evaluate(self, phase, eval_env):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = eval_env.reset()
            if phase == 'unseen':
                self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            # `while not done` doesn't work for metaworld, so bound by steps
            while (episode_step <= eval_env._max_episode_steps - 1):
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = eval_env.step(action)
                if phase == 'unseen':
                    self.video_recorder.record(eval_env)
                episode_reward += reward
                episode_step += 1
                if done:
                    break
            average_episode_reward += episode_reward
        if phase == 'unseen':
            self.video_recorder.save(f'{self.step[0]}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        if phase == 'seen':
            self.logger.log('eval_seen/episode_reward',
                            average_episode_reward, self.step[0])
            self.logger.dump(self.step[0], ty='eval_seen')
        elif phase == 'unseen':
            self.logger.log('eval_unseen/episode_reward',
                            average_episode_reward, self.step[0])
            self.logger.dump(self.step[0], ty='eval_unseen')
        eval_env.reset()

    def run(self):
        init_env = None
        keys_to_sample = random.sample(list(self.train_envs),
                                       self.num_train_envs)
        sampled_train_envs = {
            key: self.train_envs[key]
            for key in keys_to_sample
        }
        # Better way to access first elem of OrderedDict?
        for env_idx, env in sampled_train_envs.items():
            init_env = env
            break

        episode, episode_reward, episode_step, done = \
            [0] * self.num_train_envs, [0] * self.num_train_envs, \
            [0] * self.num_train_envs, [True] * self.num_train_envs
        obs = [init_env.reset()] * self.num_train_envs
        next_obs = [init_env.reset()] * self.num_train_envs
        start_time = time.time()
        train_recording = False
        env_to_rec = 0

        if self.reload_weights and os.path.exists(self.model_dir):
            # Continue training
            try:
                latest_step = utils.get_latest_file(self.model_dir)
                self.agent.load(self.model_dir, latest_step)
            except Exception:
                print('Could not reload weights!')

        while self.step[0] < self.cfg.num_train_steps:
            if self.resample_envs and self.step[0] > 0 and \
                    self.step[0] % self.env_resample_rate == 0:
                keys_to_sample = random.sample(list(self.train_envs),
                                               self.num_train_envs)
                sampled_train_envs = {
                    key: self.train_envs[key]
                    for key in keys_to_sample
                }

            for env_idx, (env_tag,
                          env) in enumerate(sampled_train_envs.items()):
                episode_step[env_idx] = 0
                while (episode_step[env_idx] <= env._max_episode_steps - 1):
                    if not train_recording and env_idx == env_to_rec and \
                            self.step[env_idx] % self.train_vid_interval == 0:
                        train_recording = True
                        self.train_video_recorder.init(enabled=True)

                    if done[env_idx] or (episode_step[env_idx] >=
                                         env._max_episode_steps - 1):
                        if self.step[env_idx] > 0:
                            self.logger.log('train/duration',
                                            time.time() - start_time,
                                            self.step[env_idx])
                            start_time = time.time()

                        if self.step[env_idx] > 0 and env_idx == env_to_rec \
                                and train_recording:
                            file_name = str(self.step[env_idx]) + '_' + env_tag
                            self.train_video_recorder.save(f'{file_name}.mp4')
                            self.train_video_recorder.frames = []
                            train_recording = False
                            env_to_rec = random.randint(
                                0, len(sampled_train_envs) - 1)

                        # Evaluate agent periodically
                        if env_idx == 0 and episode[
                                env_idx] % self.cfg.eval_frequency == 0:
                            # Evaluate an env from training
                            self.logger.log('eval_seen/episode',
                                            episode[env_idx],
                                            self.step[env_idx])
                            eval_env = self.train_envs[random.sample(
                                list(self.train_envs), 1)[0]]
                            self.evaluate(phase='seen', eval_env=eval_env)
                            # Evaluate an unseen env
                            self.logger.log('eval_unseen/episode',
                                            episode[env_idx],
                                            self.step[env_idx])
                            eval_env = self.eval_envs[random.sample(
                                list(self.eval_envs), 1)[0]]
                            self.evaluate(phase='unseen', eval_env=eval_env)

                        if episode[env_idx] % self.cfg.ckpt_frequency == 0:
                            self.agent.save(self.model_dir, episode[env_idx])

                        self.logger.log('train/episode_reward',
                                        episode_reward[env_idx],
                                        self.step[env_idx])

                        obs[env_idx] = env.reset()
                        done[env_idx] = False
                        episode_reward[env_idx] = 0
                        episode[env_idx] += 1

                        self.logger.log('train/episode', episode[env_idx],
                                        self.step[env_idx])
                        self.logger.log('train/env_idx', env_tag,
                                        self.step[env_idx])

                    # sample action for data collection
                    if self.step[env_idx] < self.cfg.num_seed_steps:
                        action = env.action_space.sample()
                    else:
                        with utils.eval_mode(self.agent):
                            action = self.agent.act(obs[env_idx], sample=True)

                    next_obs[env_idx], reward, done[env_idx], _ = env.step(
                        action)
                    if train_recording and env_idx == env_to_rec:
                        self.train_video_recorder.record(env)

                    # allow infinite bootstrap
                    done[env_idx] = float(done[env_idx])
                    done_no_max = 0 if episode_step[env_idx] + 1 == \
                        env._max_episode_steps - 1 else done[env_idx]
                    episode_reward[env_idx] += reward

                    self.replay_buffer.add(env_idx, obs[env_idx], action,
                                           reward, next_obs[env_idx],
                                           done[env_idx], done_no_max)

                    obs[env_idx] = next_obs[env_idx]
                    episode_step[env_idx] += 1
                    self.step[env_idx] += 1

                    # Run training update
                    if self.step[env_idx] >= self.cfg.num_seed_steps:
                        # print('Running train update')
                        for _ in range(self.cfg.num_train_iters):
                            self.agent.update(self.replay_buffer,
                                              self.num_train_envs,
                                              self.logger,
                                              self.step[env_idx], env_tag,
                                              env_idx)

                # At the end of each episode, log
                self.logger.dump(
                    self.step[env_idx],
                    save=(self.step[env_idx] > self.cfg.num_seed_steps),
                    ty='train')
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd().split('runs')[0] + 'runs/'
        self.work_dir = self.work_dir + \
            '2020.10.22/jaco_reach_site_features_drq_agent.cls=agents.drq_agent.DRQAgent,agent.name=drq,batch_size=64,lr=0.005/'
        self.model_dir = self.work_dir + '/agent_model'
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg
        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Environment Sampler
        self.num_train_envs = cfg.num_envs
        self.frame_stack = 5
        self.env_sampler = utils.EnvSampler(cfg, False, False,
                                            work_dir=self.work_dir)
        self.eval_env_sim = self.env_sampler.make_env()
        self.jaco_real_env = JacoPhysics('j2s7s300',
                                         robot_server_ip='127.0.0.1',
                                         robot_server_port=9030,
                                         control_type='position')
        self.frame_size = 84
        self.jaco_real_env = utils.FrameStackJacoReal(
            self.jaco_real_env,
            k=self.frame_stack,
            frame_size=self.frame_size,
            dummy_env=self.eval_env_sim)

        cfg.agent.params.obs_shape = self.eval_env_sim.observation_space.shape
        cfg.agent.params.action_shape = self.eval_env_sim.action_space.shape
        cfg.agent.params.action_range = [
            float(self.eval_env_sim.action_space.low.min()),
            float(self.eval_env_sim.action_space.high.max())
        ]
        if cfg.lowobs_append:
            if cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.sim_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None,
            dir_name='jaco_sim_video',
            phase='eval')
        self.real_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None,
            dir_name='jaco_real_video',
            phase='eval',
            height=640,
            width=480)
        self.reload_weights = cfg.reload_weights
        self.train_vid_interval = cfg.train_vid_interval
        self.num_eval_episodes = 1
        self.episode_max_step = 30
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for trial in range(self.num_eval_episodes):
            # This will send jaco to real home
            obs = self.jaco_real_env.reset()
            sim_obs = self.eval_env_sim.reset()
            obs['state_low_obs'] = sim_obs['state_low_obs']
            # Now lets go to sim home
            self.send_robot_to_sim_home()
            print('Done sending robot to sim home')
            self.sim_video_recorder.init(enabled=(trial == 0))
            self.real_video_recorder.init(enabled=(trial == 0))
            # What to do with done? Make sim indicate done?
            # done = False
            episode_reward = 0
            episode_step = 0
            while (episode_step <= self.episode_max_step):
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                translated_act = self.translate_action_sim_to_real(action)
                obs = self.jaco_real_env.step(translated_act)
                obs['state_low_obs'] = sim_obs['state_low_obs']
                print('Translated Act ', translated_act)
                # Take a sim step with the original action
                sim_obs, reward, done, _ = self.eval_env_sim.step(action)
                self.sim_video_recorder.record(self.eval_env_sim)
                self.real_video_recorder.record(self.jaco_real_env,
                                                real_jaco=True)
                episode_reward += reward
                episode_step += 1
                # if done: break
            average_episode_reward += episode_reward
            self.sim_video_recorder.save(f'{trial}.mp4')
            self.real_video_recorder.save(f'{trial}.mp4')
        average_episode_reward /= self.num_eval_episodes
        print('Rewards ', average_episode_reward)

    def run(self):
        if os.path.exists(self.model_dir):
            latest_step = utils.get_latest_file(self.model_dir)
            self.agent.load(self.model_dir, latest_step)
        self.evaluate()

    def translate_action_sim_to_real(self, action):
        # step the sim with the policy action and read off the resulting
        # joint positions, which the position-controlled real arm can track
        self.eval_env_sim.step(action)
        sim_qpos = self.eval_env_sim.physics.data.qpos
        return sim_qpos

    def send_robot_to_sim_home(self):
        self.eval_env_sim.reset()
        home_sim = self.eval_env_sim.physics.data.qpos
        self.jaco_real_env.step(home_sim)
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.model_dir = utils.make_dir(self.work_dir, 'model')
        self.buffer_dir = utils.make_dir(self.work_dir, 'buffer')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             action_repeat=cfg.action_repeat,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = dmc.make(cfg.env, cfg.frame_stack, cfg.action_repeat,
                            cfg.seed)
        self.eval_env = dmc.make(cfg.env, cfg.frame_stack, cfg.action_repeat,
                                 cfg.seed + 1)

        obs_spec = self.env.observation_spec()['pixels']
        action_spec = self.env.action_spec()
        cfg.agent.params.obs_shape = obs_spec.shape
        cfg.agent.params.action_shape = action_spec.shape
        cfg.agent.params.action_range = [
            float(action_spec.minimum.min()),
            float(action_spec.maximum.max())
        ]
        # exploration agent uses intrinsic reward
        self.expl_agent = hydra.utils.instantiate(cfg.agent,
                                                  task_agnostic=True)
        # task agent uses extrinsic reward
        self.task_agent = hydra.utils.instantiate(cfg.agent,
                                                  task_agnostic=False)
        self.task_agent.assign_modules_from(self.expl_agent)

        if cfg.load_pretrained:
            pretrained_path = utils.find_pretrained_agent(
                cfg.pretrained_dir, cfg.env, cfg.seed, cfg.pretrained_step)
            print(f'snapshot is taken from: {pretrained_path}')
            pretrained_agent = utils.load(pretrained_path)
            self.task_agent.assign_modules_from(pretrained_agent)

        # buffer for the task-agnostic phase
        self.expl_buffer = ReplayBuffer(obs_spec.shape, action_spec.shape,
                                        cfg.replay_buffer_capacity,
                                        self.device)
        # buffer for the task-specific phase
        self.task_buffer = ReplayBuffer(obs_spec.shape, action_spec.shape,
                                        cfg.replay_buffer_capacity,
                                        self.device)

        self.eval_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def get_agent(self):
        if self.step < self.cfg.num_expl_steps:
            return self.expl_agent
        return self.task_agent

    def get_buffer(self):
        if self.step < self.cfg.num_expl_steps:
            return self.expl_buffer
        return self.task_buffer

    def evaluate(self):
        avg_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            time_step = self.eval_env.reset()
            self.eval_video_recorder.init(enabled=(episode == 0))
            episode_reward = 0
            episode_success = 0
            episode_step = 0
            while not time_step.last():
                agent = self.get_agent()
                with utils.eval_mode(agent):
                    obs = time_step.observation['pixels']
                    action = agent.act(obs, sample=False)
                time_step = self.eval_env.step(action)
                self.eval_video_recorder.record(self.eval_env)
                episode_reward += time_step.reward
                episode_step += 1
            avg_episode_reward += episode_reward
            self.eval_video_recorder.save(f'{self.step}.mp4')
        avg_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', avg_episode_reward, self.step)
        self.logger.dump(self.step, ty='eval')

    def run(self):
        episode, episode_reward, episode_step = 0, 0, 0
        start_time = time.time()
        done = True
        while self.step <= self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    fps = episode_step / (time.time() - start_time)
                    self.logger.log('train/fps', fps, self.step)
                    start_time = time.time()
                    self.logger.log('train/episode_reward', episode_reward,
                                    self.step)
                    self.logger.log('train/episode', episode, self.step)
                    self.logger.dump(self.step, ty='train')

                time_step = self.env.reset()
                obs = time_step.observation['pixels']
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            agent = self.get_agent()
            replay_buffer = self.get_buffer()

            # evaluate agent periodically
            if self.step % self.cfg.eval_frequency == 0:
                self.logger.log('eval/episode', episode - 1, self.step)
                self.evaluate()

            # save agent periodically
            if self.cfg.save_model and self.step % self.cfg.save_frequency == 0:
                utils.save(
                    self.expl_agent,
                    os.path.join(self.model_dir, f'expl_agent_{self.step}.pt'))
                utils.save(
                    self.task_agent,
                    os.path.join(self.model_dir, f'task_agent_{self.step}.pt'))
            if self.cfg.save_buffer and self.step % self.cfg.save_frequency == 0:
                replay_buffer.save(self.buffer_dir, self.cfg.save_pixels)

            # sample action for data collection
            if self.step < self.cfg.num_random_steps:
                spec = self.env.action_spec()
                action = np.random.uniform(spec.minimum, spec.maximum,
                                           spec.shape)
            else:
                with utils.eval_mode(agent):
                    action = agent.act(obs, sample=True)

            agent.update(replay_buffer, self.step)

            time_step = self.env.step(action)
            next_obs = time_step.observation['pixels']

            # allow infinite bootstrap
            done = time_step.last()
            episode_reward += time_step.reward

            replay_buffer.add(obs, action, time_step.reward, next_obs, done)

            obs = next_obs
            episode_step += 1
            self.step += 1
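# Illustrative standalone version of the two-phase switch that
# get_agent()/get_buffer() implement above (not the class's own code):
def phase(step, num_expl_steps):
    # task-agnostic exploration first, task-specific training after
    return 'expl' if step < num_expl_steps else 'task'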
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.train_envs, self.test_envs = utils.make_env(cfg)

        cfg.agent.params.obs_dim = self.train_envs[
            0].observation_space.shape[0] + cfg.noise_dims
        cfg.agent.params.action_dim = self.train_envs[0].action_space.shape[0]
        if cfg.agent.name != 'sac':
            cfg.agent.params.num_envs = cfg.num_train_envs
        cfg.agent.params.action_range = [
            float(self.train_envs[0].action_space.low.min()),
            float(self.train_envs[0].action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)
        self.agent.seq_len = cfg.seq_len

        self.replay_buffer = MultiEnvReplayBuffer(
            (cfg.agent.params.obs_dim, ),  # hard coded
            self.train_envs[0].action_space.shape,
            int(cfg.replay_buffer_capacity),
            self.device,
            num_envs=cfg.num_train_envs,
            seq_len=cfg.seq_len)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = [0] * cfg.num_train_envs

    def evaluate(self, env, train=False):
        for episode in range(self.cfg.num_eval_episodes):
            obs = env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = env.step(action)
                self.video_recorder.record(env)
                episode_reward += reward
            self.video_recorder.save(f'{self.step[0]}.mp4')
            if train:
                self.logger.log('eval/train_episode_reward', episode_reward,
                                self.step[0])
            else:
                self.logger.log('eval/eval_episode_reward', episode_reward,
                                self.step[0])

    def run(self):
        episode, episode_reward, episode_step, done = \
            [0] * self.cfg.num_train_envs, [0] * self.cfg.num_train_envs, \
            [0] * self.cfg.num_train_envs, [True] * self.cfg.num_train_envs
        obs = [self.train_envs[0].reset()] * self.cfg.num_train_envs
        next_obs = [self.train_envs[0].reset()] * self.cfg.num_train_envs
        start_time = time.time()
        while self.step[0] < self.cfg.num_train_steps:
            for e_idx, env in enumerate(self.train_envs):
                if done[e_idx]:
                    if self.step[e_idx] > 0:
                        self.logger.log('train/duration',
                                        time.time() - start_time,
                                        self.step[e_idx])
                        start_time = time.time()
                        self.logger.dump(
                            self.step[e_idx],
                            save=(self.step[e_idx] > self.cfg.num_seed_steps))

                    # evaluate agent periodically
                    if self.step[0] > 0 and \
                            self.step[0] % self.cfg.eval_frequency == 0:
                        self.logger.log('eval/episode', episode[e_idx],
                                        self.step[e_idx])
                        self.evaluate(env, train=True)
                        self.evaluate(self.test_envs[0], train=False)
                        self.logger.dump(self.step[e_idx])

                    self.logger.log('train/episode_reward',
                                    episode_reward[e_idx], self.step[e_idx])

                    obs[e_idx] = env.reset()
                    self.agent.reset()
                    done[e_idx] = False
                    episode_reward[e_idx] = 0
                    episode_step[e_idx] = 0
                    episode[e_idx] += 1

                    self.logger.log('train/episode', episode[e_idx],
                                    self.step[e_idx])

                # sample action for data collection
                if self.step[e_idx] < self.cfg.num_seed_steps:
                    action = env.action_space.sample()
                else:
                    with utils.eval_mode(self.agent):
                        action = self.agent.act(obs[e_idx], sample=True)

                # run training update for encoder
                if self.step[e_idx] > self.cfg.num_seed_steps and \
                        self.step[e_idx] <= self.cfg.num_train_encoder_steps:
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step[e_idx], train_sac=True,
                                      train_encoder=True)
                # run training update for sac only; the encoder is frozen
                # once num_train_encoder_steps is reached
                elif self.step[e_idx] >= self.cfg.num_train_encoder_steps:
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step[e_idx], train_sac=True,
                                      train_encoder=False)

                try:
                    next_obs[e_idx], reward, done[e_idx], _ = env.step(action)
                except Exception:
                    next_obs[e_idx] = obs[e_idx]
                    reward = 0
                    print('Invalid action. Terminating episode.')
                    done[e_idx] = True

                # allow infinite bootstrap
                done[e_idx] = float(done[e_idx])
                done_no_max = 0 if episode_step[e_idx] + 1 == \
                    env._max_episode_steps else done[e_idx]
                episode_reward[e_idx] += reward

                self.replay_buffer.add(e_idx, obs[e_idx], action, reward,
                                       next_obs[e_idx], done[e_idx],
                                       done_no_max)

                obs[e_idx] = next_obs[e_idx]
                episode_step[e_idx] += 1
                self.step[e_idx] += 1
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = utils.make_env(cfg)

        cfg.agent.params.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.params.action_dim = self.env.action_space.shape[0]
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          int(cfg.replay_buffer_capacity),
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                # save agent periodically
                if self.step > 0 and self.step % self.cfg.save_frequency == 0:
                    self.agent.save(self.work_dir, self.step)

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                self.agent.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
                # uniform density over the action box
                propensity = 1.0 / np.prod(self.env.action_space.high -
                                           self.env.action_space.low)
            else:
                with utils.eval_mode(self.agent):
                    if self.cfg.log_propensities:
                        action, propensity = self.agent.act(obs,
                                                            sample=True,
                                                            propensity=True)
                    else:
                        action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)

            try:
                next_obs, reward, done, _ = self.env.step(action)
            except Exception:
                # drop into the debugger if the env rejects the action
                import ipdb
                ipdb.set_trace()

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            if self.cfg.log_propensities:
                self.replay_buffer.add(obs, action, reward, next_obs, done,
                                       done_no_max, propensity)
            else:
                self.replay_buffer.add(obs, action, reward, next_obs, done,
                                       done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
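# Quick standalone check of the seed-phase propensity above: it is the
# density of a uniform distribution over the action box (these bounds are
# hypothetical):
import numpy as np

low = np.array([-1.0, -1.0])
high = np.array([1.0, 1.0])
propensity = 1.0 / np.prod(high - low)  # 0.25 for this 2 x 2 box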
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

        if self.cfg.episode_dir:
            self.load_episodes(cfg.episode_dir)

    def load_episodes(self, directory):
        directory = pathlib.Path(directory).expanduser()
        print(f'Loading episodes from {directory}')
        num_loaded_episodes = 0
        for filename in directory.glob('*.npz'):
            try:
                with filename.open('rb') as f:
                    episode = np.load(f)
                    episode = {k: episode[k] for k in episode.keys()}
            except Exception as e:
                print(f'Could not load episode: {e}')
                continue
            images = process_images(episode['image'])
            obses = images[:-1]
            actions = episode['action'][:-1]
            rewards = episode['sparse_reward'][:-1]
            next_obses = images[1:]
            dones = np.zeros(len(episode['action']))
            dones_no_max = dones
            for transition in zip(obses, actions, rewards, next_obses, dones,
                                  dones_no_max):
                self.replay_buffer.add(*transition)
            num_loaded_episodes += 1
        print(f'Loaded {num_loaded_episodes} episodes.')

    def evaluate(self):
        average_episode_reward = 0
        average_episode_success = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1
            average_episode_reward += episode_reward
            average_episode_success += float(episode_reward > 0)
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        average_episode_success /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.log('eval/episode_success', average_episode_success,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)
                self.logger.log('train/episode_success',
                                float(episode_reward > 0), self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
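# Sketch of the .npz episode layout load_episodes() expects (key names come
# from the code above; shapes and dtypes are assumptions):
import numpy as np

T = 10  # hypothetical episode length
np.savez('episode_000.npz',
         image=np.zeros((T + 1, 84, 84, 3), dtype=np.uint8),
         action=np.zeros((T + 1, 6), dtype=np.float32),
         sparse_reward=np.zeros(T + 1, dtype=np.float32))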
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        setSeedEverywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        # self.env = utils.makeEnv(cfg)
        self.env = hydra.utils.call(cfg.env)

        cfg.agent.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.action_dim = self.env.action_space.shape[0]
        cfg.agent.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        cfg.agent.n_step = cfg.replay_buffer.n_step  # n-step experience replay
        self.agent = hydra.utils.instantiate(cfg.agent, _recursive_=False)

        self.replay_buffer = ReplayBuffer(
            capacity=cfg.replay_buffer.capacity,
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            obs_dtype=self.env.observation_space.dtype,
            action_dtype=self.env.action_space.dtype,
            n_step=cfg.replay_buffer.n_step,  # n-step experience replay
            discount=cfg.agent.discount,  # per step discount
            device=self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with evalMode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        num_train_steps = self.cfg.num_train_steps  # total training steps
        num_seed_steps = self.cfg.num_seed_steps  # steps prior to training
        env = self.env
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step,
                                     save=(self.step > num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)
                self.logger.log('train/episode', episode, self.step)

                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.agent.reset()
                obs = env.reset()
                self.replay_buffer.onEpisodeEnd()

            # sample action for data collection
            if self.step < num_seed_steps:
                action = env.action_space.sample()
            else:
                with evalMode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)

            next_obs, reward, done, _ = env.step(action)

            max_episode_step_reached = (episode_step + 1 ==
                                        env._max_episode_steps)
            # allow infinite bootstrap
            not_done = True if max_episode_step_reached else (not done)
            done = done or max_episode_step_reached  # signals episode ended

            self.replay_buffer.add(obs, action, reward, next_obs, not_done)

            obs = next_obs
            episode_step += 1
            self.step += 1
            episode_reward += reward
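# A minimal sketch of the n-step return the buffer above is parameterized
# for (illustrative only, not the buffer's internals): the per-step discount
# compounds across the n stored rewards.
def n_step_return(rewards, discount):
    # sum_{k=0}^{n-1} discount**k * rewards[k]
    return sum(discount ** k * r for k, r in enumerate(rewards))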