def evaluate(self):
    average_total_reward = 0
    for task_id in self.cfg.eval_tasks:
        # adaptation phase
        state = self.agent.reset()  # reset agent once, so the memory persists across episodes
        for episode in range(self.cfg.num_adapt_episodes):
            time_step = self.eval_env.reset(task_id)
            while not time_step.last():
                with utils.eval_mode(self.agent):
                    obs = time_step.observation['features']
                    action = self.agent.act(obs, state, sample=False)
                time_step = self.eval_env.step(action)
                next_obs = time_step.observation['features']
                # update agent's memory
                state = self.agent.step(state, obs, action, time_step.reward, next_obs)

        # evaluation phase
        # agent's memory should be initialized by now
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            time_step = self.eval_env.reset(task_id)
            self.eval_video_recorder.init(enabled=(episode == 0))
            episode_reward = 0
            episode_success = 0
            episode_step = 0
            while not time_step.last():
                with utils.eval_mode(self.agent):
                    obs = time_step.observation['features']
                    action = self.agent.act(obs, state, sample=False)
                time_step = self.eval_env.step(action)
                next_obs = time_step.observation['features']
                # update agent's memory
                state = self.agent.step(state, obs, action, time_step.reward, next_obs)
                self.eval_video_recorder.record(self.eval_env)
                episode_reward += time_step.reward
                episode_step += 1

            average_episode_reward += episode_reward
            self.eval_video_recorder.save(f'task_{task_id}_step_{self.step}.mp4')

        average_episode_reward /= self.cfg.num_eval_episodes
        average_total_reward += average_episode_reward
        self.logger.log(f'eval/task_{task_id}_episode_reward',
                        average_episode_reward / self.cfg.episode_length, self.step)

    average_total_reward /= len(self.cfg.eval_tasks)
    self.logger.log('eval/episode_reward',
                    average_total_reward / self.cfg.episode_length, self.step)
    self.logger.dump(self.step, ty='eval')
def evaluate(self, phase, eval_env):
    average_episode_reward = 0
    for episode in range(self.cfg.num_eval_episodes):
        obs = eval_env.reset()
        if phase == 'unseen':
            self.video_recorder.init(enabled=(episode == 0))
        done = False
        episode_reward = 0
        episode_step = 0
        # `while not done` does not work for MetaWorld, so bound the loop by the episode length
        while episode_step <= eval_env._max_episode_steps - 1:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=False)
            obs, reward, done, _ = eval_env.step(action)
            if phase == 'unseen':
                self.video_recorder.record(eval_env)
            episode_reward += reward
            episode_step += 1
            if done:
                break
        average_episode_reward += episode_reward
        if phase == 'unseen':
            self.video_recorder.save(f'{self.step[0]}.mp4')

    average_episode_reward /= self.cfg.num_eval_episodes
    if phase == 'seen':
        self.logger.log('eval_seen/episode_reward', average_episode_reward, self.step[0])
        self.logger.dump(self.step[0], ty='eval_seen')
    elif phase == 'unseen':
        self.logger.log('eval_unseen/episode_reward', average_episode_reward, self.step[0])
        self.logger.dump(self.step[0], ty='eval_unseen')
    eval_env.reset()
def evaluate(env, policy, num_episodes=10, max_episode_steps=None):
    """Evaluates the policy.

    Args:
        env: Environment to evaluate the policy on.
        policy: Policy to evaluate.
        num_episodes: Number of episodes to average the policy over.
        max_episode_steps: Maximum number of steps per episode.

    Returns:
        Average return per episode and average number of steps per episode.
    """
    total_timesteps = 0
    total_returns = 0

    for _ in range(num_episodes):
        state = env.reset()
        done = False
        episode_timesteps = 0
        while not done:
            with utils.eval_mode(policy):
                action = policy.act(state)
            next_state, reward, done, _ = env.step(action)

            if (max_episode_steps is not None
                    and episode_timesteps + 1 == max_episode_steps):
                done = True

            total_returns += reward
            total_timesteps += 1
            episode_timesteps += 1
            state = next_state

    return total_returns / num_episodes, total_timesteps / num_episodes
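# Every loop in this file wraps action selection in utils.eval_mode(...). As a point of
# reference, below is a minimal sketch of such a context manager, assuming each wrapped
# object exposes a PyTorch-style .train(bool) method and a .training attribute; the
# concrete utils.eval_mode used by these snippets may differ in detail.
class eval_mode:
    def __init__(self, *models):
        self.models = models

    def __enter__(self):
        # remember each model's training flag and switch to eval mode
        self.prev_states = []
        for model in self.models:
            self.prev_states.append(model.training)
            model.train(False)

    def __exit__(self, *args):
        # restore the original training flags on exit
        for model, state in zip(self.models, self.prev_states):
            model.train(state)
        return False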
def evaluate(self): print("evaluate") average_episode_reward = 0 for episode in range(self.cfg.num_eval_episodes): self.env.reset() obs = get_grid_state(self.env) self.agent.reset() # self.video_recorder.init(enabled=(episode == 0)) done = False episode_reward = 0 step_count = 0 while not done and step_count < self.max_episode_steps: with utils.eval_mode(self.agent): action_vec = self.agent.act(obs, sample=False) # TRANSFORM action_vec to action action = self.cont_to_disc(action_vec) step_count += 1 _, reward, done, _ = self.env.step(action) obs = get_grid_state(self.env) # self.video_recorder.record(self.env) episode_reward += reward average_episode_reward += episode_reward # self.video_recorder.save(f'{self.step}.mp4') average_episode_reward /= self.cfg.num_eval_episodes self.logger.log('eval/episode_reward', average_episode_reward, self.step) self.logger.dump(self.step)
def evaluate(self):
    average_episode_reward = 0
    for episode in range(self.eval_trials):
        print('Episode Trial ', episode)
        self.video_recorder.init(enabled=True)
        eval_env = self.eval_envs[random.sample(list(self.eval_envs), 1)[0]]
        obs = eval_env.reset()
        done = False
        episode_reward = 0
        episode_step = 0
        while episode_step <= eval_env._max_episode_steps - 1:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=False)
            # keep the done flag so the early break below can actually trigger
            obs, reward, done, _ = eval_env.step(action)
            self.video_recorder.record(eval_env)
            episode_reward += reward
            episode_step += 1
            self.step += 1
            if done:
                break
        average_episode_reward += episode_reward
        print('Episode Reward ', episode_reward)
        self.video_recorder.save(f'{self.step}.mp4')

    average_episode_reward /= self.eval_trials
    self.logger.log('eval_standalone/episode_reward', average_episode_reward, self.step)
    self.logger.dump(self.step, ty='eval_standalone')
def evaluate_step(env, agent, video, args, num_episodes, L, step, all_ep_rewards):
    start_time = time.time()
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        obs_list = []
        while not done:
            obs = obs / 255.
            with utils.eval_mode(agent):
                if random.random() < args.attack_prob:
                    obs_adv = adversarial_obs(agent, obs, args.adversarial_iters)
                    action = agent.select_action(obs_adv)
                    if args.save_image:
                        obs_list.append((obs, obs_adv))
                else:
                    action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        if args.save_image and len(obs_list):
            save_images(obs_list, step, args)
        video.save('%d.mp4' % step)
        L.log('eval/' + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    return time.time() - start_time
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel':
                obs = utils.center_crop_image(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs)
                else:
                    action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
    L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        obs = env.reset()
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel':
                obs = utils.center_crop_image(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs)
                else:
                    action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        all_ep_rewards.append(episode_reward)

    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    logger.log({
        'mean_reward': mean_ep_reward,
        'max_reward': best_ep_reward,
    })
def evaluate(env, agent, video, num_episodes, eval_mode, adapt=False):
    episode_rewards = []
    for i in tqdm(range(num_episodes)):
        if adapt:
            ep_agent = deepcopy(agent)
            ep_agent.init_pad_optimizer()
        else:
            ep_agent = agent
        obs = env.reset()
        video.init(enabled=True)
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(ep_agent):
                action = ep_agent.select_action(obs)
            next_obs, reward, done, _ = env.step(action)
            video.record(env, eval_mode)
            episode_reward += reward
            if adapt:
                ep_agent.update_inverse_dynamics(
                    *augmentations.prepare_pad_batch(obs, next_obs, action))
            obs = next_obs

        video.save(f'eval_{eval_mode}_{i}.mp4')
        episode_rewards.append(episode_reward)

    return np.mean(episode_rewards)
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        obs = env.reset()
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel':
                obs = utils.center_crop_image(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    print("sample_stochastically")
                    action = random.randint(0, 11)  # random discrete action
                else:
                    print("agent selected")
                    action = agent.select_action(obs)
            # this environment returns a 3-tuple (no info dict)
            obs, reward, done = env.step(action)
            episode_reward += reward

        L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
    L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
def evaluate(self, num_eval_episodes=10):
    average_episode_reward = 0
    for episode in range(num_eval_episodes):
        obs = self.env.reset()
        self.video_recorder.init(enabled=(episode == 0))
        done = False
        episode_reward = 0
        episode_step = 0
        while not done:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=False)
            obs, reward, done, info = self.env.step(action)
            self.video_recorder.record(self.env)
            episode_reward += reward
            episode_step += 1

        average_episode_reward += episode_reward
        self.video_recorder.save(f'{self.step}.mp4')

    average_episode_reward /= num_eval_episodes
    self.logger.log('eval/episode_reward', average_episode_reward, self.step)
    self.logger.dump(self.step)
def evaluate(self):
    average_episode_reward = 0
    for episode in range(self.cfg.num_eval_episodes):
        obs = self.env.reset()
        self.video_recorder.init(enabled=(episode == 0))
        done = False
        episode_reward = 0
        episode_step = 0
        print(episode)
        while not done:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=False)
            # print(action.shape)
            action = action[0]
            obs, reward, done, info = self.env.step(action)
            self.video_recorder.record(obs)
            episode_reward += reward
            episode_step += 1
            if episode_step > 10000:
                break
        average_episode_reward += episode_reward
        self.video_recorder.save(f'{self.step}.mp4')

    average_episode_reward /= self.cfg.num_eval_episodes
    self.logger.log('eval/episode_reward', average_episode_reward, self.step)
    self.logger.dump(self.step)
def run_eval_loop(sample_stochastically=True): start_time = time.time() prefix = "stochastic_" if sample_stochastically else "" for i in range(num_episodes): obs = env.reset() video.init(enabled=(i == 0)) done = False episode_reward = 0 while not done: # center crop image if args.encoder_type == "pixel" and "crop" in args.data_augs: obs = utils.center_crop_image(obs, args.image_size) if args.encoder_type == "pixel" and "translate" in args.data_augs: # first crop the center with pre_image_size obs = utils.center_crop_image( obs, args.pre_transform_image_size) # then translate cropped to center obs = utils.center_translate(obs, args.image_size) with utils.eval_mode(agent): if sample_stochastically: action = agent.sample_action(obs / 255.0) else: action = agent.select_action(obs / 255.0) obs, reward, done, _ = env.step(action) video.record(env) episode_reward += reward video.save("%d.mp4" % step) L.log("eval/" + prefix + "episode_reward", episode_reward, step) all_ep_rewards.append(episode_reward) L.log("eval/" + prefix + "eval_time", time.time() - start_time, step) mean_ep_reward = np.mean(all_ep_rewards) best_ep_reward = np.max(all_ep_rewards) std_ep_reward = np.std(all_ep_rewards) L.log("eval/" + prefix + "mean_episode_reward", mean_ep_reward, step) L.log("eval/" + prefix + "best_episode_reward", best_ep_reward, step) filename = (args.work_dir + "/" + args.domain_name + "--" + args.task_name + "-" + args.data_augs + "--s" + str(args.seed) + "--eval_scores.npy") key = args.domain_name + "-" + args.task_name + "-" + args.data_augs try: log_data = np.load(filename, allow_pickle=True) log_data = log_data.item() except: log_data = {} if key not in log_data: log_data[key] = {} log_data[key][step] = {} log_data[key][step]["step"] = step log_data[key][step]["mean_ep_reward"] = mean_ep_reward log_data[key][step]["max_ep_reward"] = best_ep_reward log_data[key][step]["std_ep_reward"] = std_ep_reward log_data[key][step]["env_step"] = step * args.action_repeat np.save(filename, log_data)
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel' and 'crop' in args.data_augs:
                obs = utils.center_crop_image(obs, args.image_size)
            if args.encoder_type == 'pixel' and 'translate' in args.data_augs:
                # first crop the center with pre_image_size
                obs = utils.center_crop_image(obs, args.pre_transform_image_size)
                # then translate cropped to center
                obs = utils.center_translate(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs / 255.)
                else:
                    action = agent.select_action(obs / 255.)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    std_ep_reward = np.std(all_ep_rewards)
    L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
    L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)

    filename = (args.work_dir + '/' + args.domain_name + '--' + args.task_name +
                '-' + args.data_augs + '--s' + str(args.seed) + '--eval_scores.npy')
    key = args.domain_name + '-' + args.task_name + '-' + args.data_augs
    try:
        log_data = np.load(filename, allow_pickle=True)
        log_data = log_data.item()
    except:
        log_data = {}
    if key not in log_data:
        log_data[key] = {}

    log_data[key][step] = {}
    log_data[key][step]['step'] = step
    log_data[key][step]['mean_ep_reward'] = mean_ep_reward
    log_data[key][step]['max_ep_reward'] = best_ep_reward
    log_data[key][step]['std_ep_reward'] = std_ep_reward
    log_data[key][step]['env_step'] = step * args.action_repeat
    np.save(filename, log_data)

    return log_data[key][step]
def run(self, num_train_steps=1000000, num_train_iters=1, num_seed_steps=1000,
        eval_frequency=5000):
    episode, episode_reward, episode_step, done = 0, 0, 1, True
    start_time = time.time()
    while self.step < num_train_steps:
        if done:
            if self.step > 0:
                self.logger.log('train/duration', time.time() - start_time, self.step)
                start_time = time.time()
                self.logger.dump(self.step, save=(self.step > num_seed_steps))

            # evaluate agent periodically
            if self.step % eval_frequency == 0:
                self.logger.log('eval/episode', episode, self.step)
                self.evaluate()

            self.logger.log('train/episode_reward', episode_reward, self.step)

            obs = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

            self.logger.log('train/episode', episode, self.step)

        # sample action for data collection
        if self.step < num_seed_steps:
            action = self.env.action_space.sample()
        else:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=True)

        # run training update
        if self.step >= num_seed_steps:
            for _ in range(num_train_iters):
                self.agent.update(self.replay_buffer, self.logger, self.step)

        next_obs, reward, done, info = self.env.step(action)

        # allow infinite bootstrap
        done = float(done)
        done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
        episode_reward += reward

        self.replay_buffer.add(obs, action, reward, next_obs, done, done_no_max)

        obs = next_obs
        episode_step += 1
        self.step += 1
def evaluate(env, agent, video, num_episodes, L, step):
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(agent):
                action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/episode_reward', episode_reward, step)
    L.dump(step)
def evaluate(self):
    for episode in range(self.cfg.num_eval_episodes):
        obs = self.env.reset()
        self.agent.reset()
        self.video_recorder.init(enabled=(episode == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=False)
            obs, reward, done, _ = self.env.step(action)
            self.video_recorder.record(self.env)
            episode_reward += reward

        self.video_recorder.save(f'{self.step}.mp4')
        self.logger.log('eval/episode_reward', episode_reward, self.step)
    self.logger.dump(self.step)
def evaluate(env, agent, args, video):
    """Evaluate an agent, optionally adapt using PAD"""
    episode_rewards = []
    episode_inv_pred_vars = []
    for i in tqdm(range(args.num_eval_episodes)):
        # ep_agent = deepcopy(agent)  # make a new copy
        video.init(enabled=True)
        obs = env.reset()
        done = False
        episode_reward = 0
        obs_buf = []
        next_obs_buf = []
        action_buf = []
        losses = []
        step = 0
        # ep_agent.train()
        while not done:
            # Take step
            with utils.eval_mode(agent):
                action = agent.act(obs)
            next_obs, reward, done, _ = env.step(action)
            episode_reward += reward
            obs_buf.append(obs)
            next_obs_buf.append(next_obs)
            action_buf.append(action)
            video.record(env, losses)
            obs = next_obs
            step += 1

        video.save('{}_{}.mp4'.format(args.mode, i))
        episode_rewards.append(episode_reward)

        # Compute self-supervised ensemble variance
        if args.use_inv:
            episode_inv_pred_vars.append(
                np.mean(
                    agent.ss_preds_var(
                        np.asarray(obs_buf, dtype=obs.dtype),
                        np.asarray(next_obs_buf, dtype=obs.dtype),
                        np.asarray(action_buf, dtype=action.dtype))))

    return np.mean(episode_rewards), np.mean(episode_inv_pred_vars)
def run_eval_loop2(sample_stochastically=True, cor_func="no_cor", cor_sev=1):
    cor = Corruptor(cor_func=cor_func, severity=cor_sev)
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    all_ep_rewards = []
    for i in range(num_episodes):
        obs = env.reset()
        obs = cor.corrupt_stacked_images(obs, args.frame_stack)  # added corruption after env
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel' and 'crop' in args.data_augs:
                obs = utils.center_crop_image(obs, args.image_size)
            if args.encoder_type == 'pixel' and 'translate' in args.data_augs:
                # first crop the center with pre_image_size
                obs = utils.center_crop_image(obs, args.pre_transform_image_size)
                # then translate cropped to center
                obs = utils.center_translate(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs / 255.)
                else:
                    action = agent.select_action(obs / 255.)
            obs, reward, done, _ = env.step(action)
            obs = cor.corrupt_stacked_images(obs, args.frame_stack)  # added corruption after env
            episode_reward += reward

        all_ep_rewards.append(episode_reward)

    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    std_ep_reward = np.std(all_ep_rewards)
    end_time = time.time()
    return step, mean_ep_reward, best_ep_reward, std_ep_reward, end_time - start_time
def evaluate(self, env, train=False):
    for episode in range(self.cfg.num_eval_episodes):
        obs = env.reset()
        self.agent.reset()
        self.video_recorder.init(enabled=(episode == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=False)
            obs, reward, done, _ = env.step(action)
            self.video_recorder.record(env)
            episode_reward += reward

        self.video_recorder.save(f"{self.step}.mp4")
        if train:
            self.logger.log("eval/train_episode_reward", episode_reward, self.step[0])
        else:
            self.logger.log("eval/eval_episode_reward", episode_reward, self.step[0])
def evaluate(env, agent, video, num_episodes, L, step, test_env=False):
    episode_rewards = []
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(agent):
                action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        if L is not None:
            _test_env = '_test_env' if test_env else ''
            video.save(f'{step}{_test_env}.mp4')
            L.log(f'eval/episode_reward{_test_env}', episode_reward, step)
        episode_rewards.append(episode_reward)

    return np.mean(episode_rewards)
def evaluate(self):
    average_episode_reward = 0
    eps_reward = []
    eps_done = 0
    # while eps_done < self.cfg.num_eval_episodes:
    for episode in range(self.cfg.num_eval_episodes):
        obs = self.env.reset()
        # self.video_recorder.init(enabled=(episode == 0))
        done = False
        episode_reward = 0
        episode_step = 0
        while not done:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=False)
            # This is unnecessary here...
            self.agent.osl.train(True)
            obs, reward, done, info = self.env.step(action)
            # self.video_recorder.record(self.env)
            episode_reward += reward
            episode_step += 1
        # if episode_reward > 0:
        #     eps_reward.append(episode_reward)
        #     average_episode_reward += episode_reward
        #     eps_done += 1
        # else:
        #     continue
        average_episode_reward += episode_reward

    # self.video_recorder.save(f'{self.step}.mp4')
    average_episode_reward /= self.cfg.num_eval_episodes
    # NOTE: eps_reward is only populated by the commented-out filtering branch above,
    # so this std is NaN unless that branch is re-enabled.
    sd_episode_reward = np.std(eps_reward)
    self.logger.log('eval/episode_reward', average_episode_reward, self.step)
    self.logger.dump(self.step)
    return average_episode_reward, sd_episode_reward
def evaluate(env, agent, cfg):
    average_episode_reward = 0
    for episode in range(cfg.num_eval_episodes):
        obs = env.reset()
        agent.reset()
        # self.video_recorder.init(enabled=(episode == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(agent):
                if attach_state:
                    obs = np.concatenate((obs, get_env_state(env, cfg)), axis=0)
                action = agent.act(obs, sample=False)
            obs, reward, done, _ = env.step(action)
            # video_recorder.record(self.env)
            episode_reward += reward
        average_episode_reward += episode_reward

    # video_recorder.save(f'{self.step}.mp4')
    average_episode_reward /= cfg.num_eval_episodes
    return average_episode_reward
def run_eval_loop(sample_stochastically=True): start_time = time.time() prefix = "stochastic_" if sample_stochastically else "" for i in tqdm(range(num_episodes), desc='eval', unit='ep'): obs = env.reset() video.init(enabled=(i == 0)) done = False episode_reward = 0 episode_info = defaultdict(int) while not done: # center crop image if args.encoder_type == "mixed": state, img = utils.split_obs(obs) img = utils.center_crop_image(img, args.image_size) obs = utils.combine_obs(state, img) with utils.eval_mode(agent): if sample_stochastically: action = agent.sample_action(obs) else: action = agent.select_action(obs) obs, reward, done, info = env.step(action) for k in keys_to_monitor: episode_info[k] += info[k] video.record(env, yaw=i) episode_reward += reward for k in keys_to_monitor: L.log("eval/" + prefix + k, np.sum(episode_info[k]), step) video.save("%d.mp4" % step) L.log("eval/" + prefix + "episode_reward", episode_reward, step) all_ep_rewards.append(episode_reward) L.log("eval/" + prefix + "eval_time", time.time() - start_time, step) mean_ep_reward = np.mean(all_ep_rewards) best_ep_reward = np.max(all_ep_rewards) L.log("eval/" + prefix + "mean_episode_reward", mean_ep_reward, step) L.log("eval/" + prefix + "best_episode_reward", best_ep_reward, step)
def evaluate(self):
    avg_episode_reward = 0
    for episode in range(self.cfg.num_eval_episodes):
        time_step = self.eval_env.reset()
        self.eval_video_recorder.init(enabled=(episode == 0))
        episode_reward = 0
        episode_success = 0
        episode_step = 0
        while not time_step.last():
            agent = self.get_agent()
            with utils.eval_mode(agent):
                obs = time_step.observation['pixels']
                action = agent.act(obs, sample=False)
            time_step = self.eval_env.step(action)
            self.eval_video_recorder.record(self.eval_env)
            episode_reward += time_step.reward
            episode_step += 1

        avg_episode_reward += episode_reward
        self.eval_video_recorder.save(f'{self.step}.mp4')

    avg_episode_reward /= self.cfg.num_eval_episodes
    self.logger.log('eval/episode_reward', avg_episode_reward, self.step)
    self.logger.dump(self.step, ty='eval')
def evaluate(env, agent, video, num_episodes, L, step):
    for i in range(num_episodes):
        a_anti_stuck = np.array([0, 0, 0.1, 0, 0, 0])
        env.step(a_anti_stuck)
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        step_counter = 0
        while not done:
            with utils.eval_mode(agent):
                action = agent.select_action(obs)
            action = np.multiply(action, env.action_space.high)
            obs, reward, done, _ = env.step(action)
            print("TE: {} | TS: {} | TR: {:.4f} | TER: {:.4f} | TA: {}".format(
                i, step_counter, round(reward, 4), round(episode_reward, 4), action))
            step_counter += 1
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/episode_reward', episode_reward, step)
    L.dump(step)
def evaluate(self):
    average_episode_reward = 0
    for trial in range(self.num_eval_episodes):
        # This will send the Jaco to its real home position
        obs = self.jaco_real_env.reset()
        sim_obs = self.eval_env_sim.reset()
        obs['state_low_obs'] = sim_obs['state_low_obs']
        # Now send it to the sim home position
        self.send_robot_to_sim_home()
        print('Done sending him home')
        self.sim_video_recorder.init(enabled=(trial == 0))
        self.real_video_recorder.init(enabled=(trial == 0))
        # What to do with done? Make sim indicate done?
        # done = False
        episode_reward = 0
        episode_step = 0
        while episode_step <= self.episode_max_step:
            with utils.eval_mode(self.agent):
                action = self.agent.act(obs, sample=False)
            translated_act = self.translate_action_sim_to_real(action)
            obs = self.jaco_real_env.step(translated_act)
            obs['state_low_obs'] = sim_obs['state_low_obs']
            print('Translated Act ', translated_act)
            # Take a sim step with the original action
            sim_obs, reward, done, _ = self.eval_env_sim.step(action)
            self.sim_video_recorder.record(self.eval_env_sim)
            self.real_video_recorder.record(self.jaco_real_env, real_jaco=True)
            episode_reward += reward
            episode_step += 1
            # if done: break
        average_episode_reward += episode_reward
        self.sim_video_recorder.save(f'{trial}.mp4')
        self.real_video_recorder.save(f'{trial}.mp4')

    # divide by the same episode count that was used in the loop above
    average_episode_reward /= self.num_eval_episodes
    print('Rewards ', average_episode_reward)
if global_steps % 2500 == 0:
    torch.save(
        agent.critic.state_dict(),
        f'{data_root}/{args.experiment_name}/{args.name}_encoder/{global_steps}.pt')

s = env.reset()
s = torch.tensor(s / 255.).float()
done = False
steps = 0
episode_reward = 0
while not done:
    with eval_mode(agent):
        a = agent.sample_action(s.unsqueeze(0).float().to('cuda:0'))
        # print(a)
    s_, r, done, _ = env.step(a)
    # print(r)
    steps += 1
    episode_reward += r
    s_ = torch.tensor(s_ / 255.).float()
    agent.update(replay_memory, global_steps, True, cpc, noise)

    # Infinite bootstrapping: never return the '1.0' done flag when the episode ends
    # only because the time limit was hit, since there is no target goal state.
    done_bool = 0 if steps == env._max_episode_steps else float(done)
def main(cfg):
    from omegaconf import OmegaConf

    attach_state = cfg.attach_state
    from_pixels = cfg.from_pixels
    encoder_type = cfg.encoder_type
    if cfg.user_config:
        print("+++++++++++++++++ Using user specified config")
        cfg = OmegaConf.load(cfg.user_config)
        cfg.attach_state = attach_state
        cfg.from_pixels = from_pixels
        cfg.encoder_type = encoder_type
    print("+++++++++++++++++ Configuration : \n", cfg)

    expert_path = home + "/pytorch_sac/expert/" + cfg.env + "_state"
    print("+++++++++++++++++ Expert Path : ", expert_path)
    actor_path = expert_path + "/actor.pt"

    env = utils.make_env(cfg)  # make env based on cfg
    # if cfg.frame_stack:
    #     self.env = utils.FrameStack(self.env, k=3)

    cfg.agent.params.obs_dim = env.observation_space.shape[0]
    if attach_state:
        cfg.agent.params.obs_dim += get_env_state_dim(cfg)
    cfg.agent.params.action_dim = env.action_space.shape[0]
    cfg.agent.params.action_range = [
        float(env.action_space.low.min()),
        float(env.action_space.high.max())
    ]
    agent = hydra.utils.instantiate(cfg.agent)
    print("Observation Dimension : ", cfg.agent.params.obs_dim)

    conf = OmegaConf.load(expert_path + '/config.yaml')
    assert conf.env == cfg.env
    conf.agent.params.action_dim = env.action_space.shape[0]
    conf.agent.params.action_range = [
        float(env.action_space.low.min()),
        float(env.action_space.high.max())
    ]
    conf.agent.params.obs_dim = get_env_state_dim(conf)
    agent_expert = hydra.utils.instantiate(conf.agent)
    agent_expert.actor.load_state_dict(torch.load(actor_path))

    # video_recorder = VideoRecorder(None)
    data = Dataset((cfg.agent.params.obs_dim, ), (conf.agent.params.obs_dim, ),
                   env.action_space.shape, 1000000, torch.device("cuda"))
    collect_steps = 1000000
    print("DATASET CAPACITY : 1000000, Collecting Steps : ", collect_steps)
    loss = nn.MSELoss()
    # collect_ep = 4000

    step = 0
    ep = 0
    start_time = time.time()
    while step < collect_steps:
        obs = env.reset()
        state = get_env_state(env, cfg)
        action_expert = None
        done = False
        episode_step = 0
        episode_reward = 0
        ep_start_time = time.time()
        while not done:
            with utils.eval_mode(agent_expert):
                action_expert = agent_expert.act(state, sample=False)
            # inject exploration noise depending on the episode index
            if ep % 4 == 0:
                action_expert = env.action_space.sample()
            if ep % 4 == 1:
                action_expert += np.random.rand(*action_expert.shape) * 0.1
            if ep % 4 == 2:
                action_expert += np.random.rand(*action_expert.shape) * 1
            action_expert = np.clip(action_expert, -1, 1)

            next_obs, reward, done, extra = env.step(action_expert)
            next_state = get_env_state(env, cfg)
            # print("XXXXXX\n", obs.shape, "\n", state.shape)
            data.add(obs, state, action_expert, reward, done)

            step += 1
            episode_step += 1
            episode_reward += reward
            done_no_max = 0 if episode_step + 1 == env._max_episode_steps else done
            obs = next_obs
            state = next_state

        ep += 1
        if ep % 100 == 0:
            print("Episode : ", ep, " Episode Reward : ", episode_reward,
                  " Time taken by one episode : ", time.time() - ep_start_time)

    print("Total Time taken : ", time.time() - start_time)
    data.save(home + "/pytorch_sac/Data", prefix=cfg.env + "_" + cfg.encoder_type)
def evaluate(env, agent, args, video, adapt=False):
    """Evaluate an agent, optionally adapt using PAD"""
    episode_rewards = []
    for i in tqdm(range(args.pad_num_episodes)):
        ep_agent = deepcopy(agent)  # make a new copy
        if args.use_curl:  # initialize replay buffer for CURL
            replay_buffer = utils.ReplayBuffer(
                obs_shape=env.observation_space.shape,
                action_shape=env.action_space.shape,
                capacity=args.train_steps,
                batch_size=args.pad_batch_size)
        video.init(enabled=True)

        obs = env.reset()
        done = False
        episode_reward = 0
        losses = []
        step = 0
        ep_agent.train()

        while not done:
            # Take step
            with utils.eval_mode(ep_agent):
                action = ep_agent.select_action(obs)
            next_obs, reward, done, _ = env.step(action)
            episode_reward += reward

            # Make self-supervised update if flag is true
            if adapt:
                if args.use_rot:  # rotation prediction
                    # Prepare batch of cropped observations
                    batch_next_obs = utils.batch_from_obs(
                        torch.Tensor(next_obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_next_obs = utils.random_crop(batch_next_obs)

                    # Adapt using rotation prediction
                    losses.append(ep_agent.update_rot(batch_next_obs))

                if args.use_inv:  # inverse dynamics model
                    # Prepare batch of observations
                    batch_obs = utils.batch_from_obs(
                        torch.Tensor(obs).cuda(), batch_size=args.pad_batch_size)
                    batch_next_obs = utils.batch_from_obs(
                        torch.Tensor(next_obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_action = torch.Tensor(action).cuda().unsqueeze(0).repeat(
                        args.pad_batch_size, 1)

                    # Adapt using inverse dynamics prediction
                    losses.append(
                        ep_agent.update_inv(utils.random_crop(batch_obs),
                                            utils.random_crop(batch_next_obs),
                                            batch_action))

                if args.use_curl:  # CURL
                    # Add observation to replay buffer for use as negative samples
                    # (only first argument obs is used, but we store all for convenience)
                    replay_buffer.add(obs, action, reward, next_obs, True)

                    # Prepare positive and negative samples
                    obs_anchor, obs_pos = get_curl_pos_neg(next_obs, replay_buffer)

                    # Adapt using CURL
                    losses.append(ep_agent.update_curl(obs_anchor, obs_pos, ema=True))

            video.record(env, losses)
            obs = next_obs
            step += 1

        video.save(f'{args.mode}_pad_{i}.mp4' if adapt else f'{args.mode}_{i}.mp4')
        episode_rewards.append(episode_reward)

    return np.mean(episode_rewards)