Example #1
File: train.py Project: denisyarats/metarl
    def evaluate(self):
        average_total_reward = 0
        for task_id in self.cfg.eval_tasks:
            # adaptation phase
            state = self.agent.reset()  # reset agent once, so the memory persists across episodes
            for episode in range(self.cfg.num_adapt_episodes):
                time_step = self.eval_env.reset(task_id)
                while not time_step.last():
                    with utils.eval_mode(self.agent):
                        obs = time_step.observation['features']
                        action = self.agent.act(obs, state, sample=False)
                    time_step = self.eval_env.step(action)
                    next_obs = time_step.observation['features']
                    # update agent's memory
                    state = self.agent.step(state, obs, action,
                                            time_step.reward, next_obs)

            # evaluation phase
            # agent's memory should be initialized by now
            average_episode_reward = 0
            for episode in range(self.cfg.num_eval_episodes):
                time_step = self.eval_env.reset(task_id)
                self.eval_video_recorder.init(enabled=(episode == 0))
                episode_reward = 0
                episode_success = 0
                episode_step = 0
                while not time_step.last():
                    with utils.eval_mode(self.agent):
                        obs = time_step.observation['features']
                        action = self.agent.act(obs, state, sample=False)
                    time_step = self.eval_env.step(action)
                    next_obs = time_step.observation['features']
                    # update agent's memory
                    state = self.agent.step(state, obs, action,
                                            time_step.reward, next_obs)
                    self.eval_video_recorder.record(self.eval_env)
                    episode_reward += time_step.reward
                    episode_step += 1

                average_episode_reward += episode_reward
                self.eval_video_recorder.save(
                    f'task_{task_id}_step_{self.step}.mp4')
            average_episode_reward /= self.cfg.num_eval_episodes
            average_total_reward += average_episode_reward
            self.logger.log(f'eval/task_{task_id}_episode_reward',
                            average_episode_reward / self.cfg.episode_length,
                            self.step)
        average_total_reward /= len(self.cfg.eval_tasks)
        self.logger.log('eval/episode_reward',
                        average_total_reward / self.cfg.episode_length,
                        self.step)
        self.logger.dump(self.step, ty='eval')
Example #2
File: train.py Project: melfm/ibit
 def evaluate(self, phase, eval_env):
     average_episode_reward = 0
     for episode in range(self.cfg.num_eval_episodes):
         obs = eval_env.reset()
         if phase == 'unseen':
             self.video_recorder.init(enabled=(episode == 0))
         done = False
         episode_reward = 0
         episode_step = 0
         # looping on "not done" doesn't work for metaworld, so the step count is used instead
         while (episode_step <= eval_env._max_episode_steps - 1):
             with utils.eval_mode(self.agent):
                 action = self.agent.act(obs, sample=False)
             obs, reward, done, _ = eval_env.step(action)
             if phase == 'unseen':
                 self.video_recorder.record(eval_env)
             episode_reward += reward
             episode_step += 1
             if done: break
         average_episode_reward += episode_reward
         if phase == 'unseen':
             self.video_recorder.save(f'{self.step[0]}.mp4')
     average_episode_reward /= self.cfg.num_eval_episodes
     if phase == 'seen':
         self.logger.log('eval_seen/episode_reward', average_episode_reward,
                         self.step[0])
         self.logger.dump(self.step[0], ty='eval_seen')
     elif phase == 'unseen':
         self.logger.log('eval_unseen/episode_reward',
                         average_episode_reward, self.step[0])
         self.logger.dump(self.step[0], ty='eval_unseen')
     eval_env.reset()
Example #3
def evaluate(env, policy, num_episodes=10, max_episode_steps=None):
    """Evaluates the policy.

  Args:
    env: Environment to evaluate the policy on.
    num_episodes: A number of episodes to average the policy on.
    max_episode_steps: Max steps in an episode.

  Returns:
    Averaged reward and a total number of steps.
  """
    total_timesteps = 0
    total_returns = 0

    for _ in range(num_episodes):
        state = env.reset()
        done = False
        episode_timesteps = 0
        while not done:
            with utils.eval_mode(policy):
                action = policy.act(state)

            next_state, reward, done, _ = env.step(action)
            if (max_episode_steps is not None
                    and episode_timesteps + 1 == max_episode_steps):
                done = True

            total_returns += reward
            total_timesteps += 1
            episode_timesteps += 1
            state = next_state

    return total_returns / num_episodes, total_timesteps / num_episodes
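For reference, here is a minimal usage sketch of the standalone evaluate() helper above. It assumes a classic Gym environment with the 4-tuple step API and a policy object exposing act() plus a train() toggle (so a typical utils.eval_mode-style context manager can switch modes); the environment name and the RandomPolicy stand-in are illustrative assumptions, not part of the original code.

# Hypothetical usage sketch (illustrative names; not from the original project).
import gym

class RandomPolicy:
    """Stand-in policy; replace with a trained agent exposing the same interface."""
    def __init__(self, action_space):
        self.action_space = action_space
        self.training = False  # mimic the torch.nn.Module flag that eval_mode helpers usually toggle

    def train(self, training=True):
        self.training = training

    def act(self, state):
        return self.action_space.sample()

env = gym.make('Pendulum-v1')  # any env with the 4-tuple (obs, reward, done, info) step API
policy = RandomPolicy(env.action_space)
avg_return, avg_steps = evaluate(env, policy, num_episodes=5, max_episode_steps=200)
print(f'average return: {avg_return:.2f}, average episode steps: {avg_steps:.1f}')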
Example #4
    def evaluate(self):
        print("evaluate")
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            self.env.reset()
            obs = get_grid_state(self.env)
            self.agent.reset()
            # self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            step_count = 0
            while not done and step_count < self.max_episode_steps:
                with utils.eval_mode(self.agent):
                    action_vec = self.agent.act(obs, sample=False)

                # TRANSFORM action_vec to action
                action = self.cont_to_disc(action_vec)
                step_count += 1
                _, reward, done, _ = self.env.step(action)
                obs = get_grid_state(self.env)
                # self.video_recorder.record(self.env)
                episode_reward += reward

            average_episode_reward += episode_reward
            # self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)
Example #5
 def evaluate(self):
     average_episode_reward = 0
     for episode in range(self.eval_trials):
         print('Episode Trial ', episode)
         self.video_recorder.init(enabled=True)
         eval_env = self.eval_envs[random.sample(list(self.eval_envs),
                                                 1)[0]]
         obs = eval_env.reset()
         done = False
         episode_reward = 0
         episode_step = 0
         while (episode_step <= eval_env._max_episode_steps - 1):
             with utils.eval_mode(self.agent):
                 action = self.agent.act(obs, sample=False)
             obs, reward, done, _ = eval_env.step(action)  # capture done so the break below can trigger
             self.video_recorder.record(eval_env)
             episode_reward += reward
             episode_step += 1
             self.step += 1
             if done: break
         average_episode_reward += episode_reward
         print('Episode Reward ', episode_reward)
         self.video_recorder.save(f'{self.step}.mp4')
     average_episode_reward /= self.eval_trials
     self.logger.log('eval_standalone/episode_reward',
                     average_episode_reward, self.step)
     self.logger.dump(self.step, ty='eval_standalone')
Example #6
def evaluate_step(env, agent, video, args, num_episodes, L, step,
                  all_ep_rewards):
    start_time = time.time()
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        obs_list = []
        while not done:
            obs = obs / 255.
            with utils.eval_mode(agent):
                if random.random() < args.attack_prob:
                    obs_adv = adversarial_obs(agent, obs,
                                              args.adversarial_iters)
                    action = agent.select_action(obs_adv)

                    if args.save_image:
                        obs_list.append((obs, obs_adv))
                else:
                    action = agent.select_action(obs)

            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        if args.save_image and len(obs_list):
            save_images(obs_list, step, args)

        video.save('%d.mp4' % step)
        L.log('eval/' + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    return time.time() - start_time
Example #7
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel':
                    obs = utils.center_crop_image(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs)
                    else:
                        action = agent.select_action(obs)
                obs, reward, done, _ = env.step(action)
                video.record(env)
                episode_reward += reward

            video.save('%d.mp4' % step)
            L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
        L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
Example #8
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            obs = env.reset()
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel':
                    obs = utils.center_crop_image(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs)
                    else:
                        action = agent.select_action(obs)
                obs, reward, done, _ = env.step(action)
                episode_reward += reward

            all_ep_rewards.append(episode_reward)

        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)

        logger.log({
            'mean_reward': mean_ep_reward,
            'max_reward': best_ep_reward,
        })
Example #9
def evaluate(env, agent, video, num_episodes, eval_mode, adapt=False):
    episode_rewards = []
    for i in tqdm(range(num_episodes)):
        if adapt:
            ep_agent = deepcopy(agent)
            ep_agent.init_pad_optimizer()
        else:
            ep_agent = agent
        obs = env.reset()
        video.init(enabled=True)
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(ep_agent):
                action = ep_agent.select_action(obs)
            next_obs, reward, done, _ = env.step(action)
            video.record(env, eval_mode)
            episode_reward += reward
            if adapt:
                ep_agent.update_inverse_dynamics(
                    *augmentations.prepare_pad_batch(obs, next_obs, action))
            obs = next_obs

        video.save(f'eval_{eval_mode}_{i}.mp4')
        episode_rewards.append(episode_reward)

    return np.mean(episode_rewards)
Example #10
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            obs = env.reset()
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel':
                    obs = utils.center_crop_image(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        print("sample_stochastically")
                        action = random.randint(0, 11)
                    else:
                        print("agent selected")
                        action = agent.select_action(obs)
                obs, reward, done = env.step(action)
                episode_reward += reward

            L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
        L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
Example #11
File: train.py Project: avandekleut/drq
    def evaluate(
        self,
        num_eval_episodes=10,
    ):
        average_episode_reward = 0
        for episode in range(num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)
Example #12
    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            print(episode)
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                # print(action.shape)
                action = action[0]
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(obs)
                episode_reward += reward
                episode_step += 1
                if episode_step > 10000:
                    break

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)
Example #13
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = "stochastic_" if sample_stochastically else ""
        for i in range(num_episodes):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == "pixel" and "crop" in args.data_augs:
                    obs = utils.center_crop_image(obs, args.image_size)
                if args.encoder_type == "pixel" and "translate" in args.data_augs:
                    # first crop the center with pre_image_size
                    obs = utils.center_crop_image(
                        obs, args.pre_transform_image_size)
                    # then translate cropped to center
                    obs = utils.center_translate(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs / 255.0)
                    else:
                        action = agent.select_action(obs / 255.0)
                obs, reward, done, _ = env.step(action)
                video.record(env)
                episode_reward += reward

            video.save("%d.mp4" % step)
            L.log("eval/" + prefix + "episode_reward", episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log("eval/" + prefix + "eval_time", time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        std_ep_reward = np.std(all_ep_rewards)
        L.log("eval/" + prefix + "mean_episode_reward", mean_ep_reward, step)
        L.log("eval/" + prefix + "best_episode_reward", best_ep_reward, step)

        filename = (args.work_dir + "/" + args.domain_name + "--" +
                    args.task_name + "-" + args.data_augs + "--s" +
                    str(args.seed) + "--eval_scores.npy")
        key = args.domain_name + "-" + args.task_name + "-" + args.data_augs
        try:
            log_data = np.load(filename, allow_pickle=True)
            log_data = log_data.item()
        except:
            log_data = {}

        if key not in log_data:
            log_data[key] = {}

        log_data[key][step] = {}
        log_data[key][step]["step"] = step
        log_data[key][step]["mean_ep_reward"] = mean_ep_reward
        log_data[key][step]["max_ep_reward"] = best_ep_reward
        log_data[key][step]["std_ep_reward"] = std_ep_reward
        log_data[key][step]["env_step"] = step * args.action_repeat

        np.save(filename, log_data)
Example #14
File: train.py Project: camall3n/rad
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel' and 'crop' in args.data_augs:
                    obs = utils.center_crop_image(obs, args.image_size)
                if args.encoder_type == 'pixel' and 'translate' in args.data_augs:
                    # first crop the center with pre_image_size
                    obs = utils.center_crop_image(
                        obs, args.pre_transform_image_size)
                    # then translate cropped to center
                    obs = utils.center_translate(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs / 255.)
                    else:
                        action = agent.select_action(obs / 255.)
                obs, reward, done, _ = env.step(action)
                video.record(env)
                episode_reward += reward

            video.save('%d.mp4' % step)
            L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        std_ep_reward = np.std(all_ep_rewards)
        L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
        L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)

        filename = args.work_dir + '/' + args.domain_name + '--' + args.task_name + '-' + args.data_augs + '--s' + str(
            args.seed) + '--eval_scores.npy'
        key = args.domain_name + '-' + args.task_name + '-' + args.data_augs
        try:
            log_data = np.load(filename, allow_pickle=True)
            log_data = log_data.item()
        except:
            log_data = {}

        if key not in log_data:
            log_data[key] = {}

        log_data[key][step] = {}
        log_data[key][step]['step'] = step
        log_data[key][step]['mean_ep_reward'] = mean_ep_reward
        log_data[key][step]['max_ep_reward'] = best_ep_reward
        log_data[key][step]['std_ep_reward'] = std_ep_reward
        log_data[key][step]['env_step'] = step * args.action_repeat

        np.save(filename, log_data)
        return log_data[key][step]
Example #15
File: train.py Project: avandekleut/drq
    def run(self,
            num_train_steps=1000000,
            num_train_iters=1,
            num_seed_steps=1000,
            eval_frequency=5000):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step,
                                     save=(self.step > num_seed_steps))

                # evaluate agent periodically
                if self.step % eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= num_seed_steps:
                for _ in range(num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
Example #16
def evaluate(env, agent, video, num_episodes, L, step):
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(agent):
                action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/episode_reward', episode_reward, step)
    L.dump(step)
Example #17
    def evaluate(self):
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward

            self.video_recorder.save(f'{self.step}.mp4')
            self.logger.log('eval/episode_reward', episode_reward, self.step)
        self.logger.dump(self.step)
Example #18
def evaluate(env, agent, args, video):
    """Evaluate an agent, optionally adapt using PAD"""
    episode_rewards = []
    episode_inv_pred_vars = []

    for i in tqdm(range(args.num_eval_episodes)):
        # ep_agent = deepcopy(agent)  # make a new copy
        video.init(enabled=True)

        obs = env.reset()
        done = False
        episode_reward = 0
        obs_buf = []
        next_obs_buf = []
        action_buf = []
        losses = []
        step = 0
        # ep_agent.train()

        while not done:
            # Take step
            with utils.eval_mode(agent):
                action = agent.act(obs)
            next_obs, reward, done, _ = env.step(action)
            episode_reward += reward

            obs_buf.append(obs)
            next_obs_buf.append(next_obs)
            action_buf.append(action)
            video.record(env, losses)
            obs = next_obs
            step += 1

        video.save('{}_{}.mp4'.format(args.mode, i))
        episode_rewards.append(episode_reward)
        # Compute self-supervised ensemble variance
        if args.use_inv:
            episode_inv_pred_vars.append(
                np.mean(
                    agent.ss_preds_var(
                        np.asarray(obs_buf, dtype=obs.dtype),
                        np.asarray(next_obs_buf, dtype=obs.dtype),
                        np.asarray(action_buf, dtype=action.dtype))))

    return np.mean(episode_rewards), np.mean(episode_inv_pred_vars)
Example #19
    def run_eval_loop2(sample_stochastically=True,
                       cor_func="no_cor",
                       cor_sev=1):
        cor = Corruptor(cor_func=cor_func, severity=cor_sev)

        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''

        all_ep_rewards = []
        for i in range(num_episodes):
            obs = env.reset()
            obs = cor.corrupt_stacked_images(
                obs, args.frame_stack)  # added corruption after env
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel' and 'crop' in args.data_augs:
                    obs = utils.center_crop_image(obs, args.image_size)
                if args.encoder_type == 'pixel' and 'translate' in args.data_augs:
                    # first crop the center with pre_image_size
                    obs = utils.center_crop_image(
                        obs, args.pre_transform_image_size)
                    # then translate cropped to center
                    obs = utils.center_translate(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs / 255.)
                    else:
                        action = agent.select_action(obs / 255.)
                obs, reward, done, _ = env.step(action)
                obs = cor.corrupt_stacked_images(
                    obs, args.frame_stack)  # added corruption after env
                episode_reward += reward

            all_ep_rewards.append(episode_reward)

        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        std_ep_reward = np.std(all_ep_rewards)

        end_time = time.time()

        return step, mean_ep_reward, best_ep_reward, std_ep_reward, end_time - start_time
Example #20
    def evaluate(self, env, train=False):
        for episode in range(self.cfg.num_eval_episodes):
            obs = env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = env.step(action)
                self.video_recorder.record(env)
                episode_reward += reward

            self.video_recorder.save(f"{self.step}.mp4")
            if train:
                self.logger.log("eval/train_episode_reward", episode_reward,
                                self.step[0])
            else:
                self.logger.log("eval/eval_episode_reward", episode_reward,
                                self.step[0])
Example #21
def evaluate(env, agent, video, num_episodes, L, step, test_env=False):
    episode_rewards = []
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(agent):
                action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        if L is not None:
            _test_env = '_test_env' if test_env else ''
            video.save(f'{step}{_test_env}.mp4')
            L.log(f'eval/episode_reward{_test_env}', episode_reward, step)
        episode_rewards.append(episode_reward)

    return np.mean(episode_rewards)
Example #22
    def evaluate(self):
        average_episode_reward = 0
        eps_reward = []

        eps_done = 0

        # while eps_done < self.cfg.num_eval_episodes:
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            # self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)

                # This is unnecessary here...
                self.agent.osl.train(True)

                obs, reward, done, info = self.env.step(action)
                # self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            # if episode_reward > 0:
            #     eps_reward.append(episode_reward)
            #     average_episode_reward += episode_reward
            #     eps_done += 1
            # else:
            #     continue

            average_episode_reward += episode_reward
            # self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        sd_episode_reward = np.std(eps_reward)
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)
        return average_episode_reward, sd_episode_reward
Example #23
def evaluate(env, agent, cfg):
    average_episode_reward = 0
    for episode in range(cfg.num_eval_episodes):
        obs = env.reset()
        agent.reset()
        #self.video_recorder.init(enabled=(episode == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(agent):
                if cfg.attach_state:
                    obs = np.concatenate((obs, get_env_state(env, cfg)),
                                         axis=0)
                action = agent.act(obs, sample=False)
            obs, reward, done, _ = env.step(action)
            #video_recorder.record(self.env)
            episode_reward += reward

        average_episode_reward += episode_reward
        #video_recorder.save(f'{self.step}.mp4')
    average_episode_reward /= cfg.num_eval_episodes
    return average_episode_reward
Example #24
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = "stochastic_" if sample_stochastically else ""
        for i in tqdm(range(num_episodes), desc='eval', unit='ep'):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            episode_info = defaultdict(int)
            while not done:
                # center crop image
                if args.encoder_type == "mixed":
                    state, img = utils.split_obs(obs)
                    img = utils.center_crop_image(img, args.image_size)
                    obs = utils.combine_obs(state, img)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs)
                    else:
                        action = agent.select_action(obs)
                obs, reward, done, info = env.step(action)

                for k in keys_to_monitor:
                    episode_info[k] += info[k]
                video.record(env, yaw=i)
                episode_reward += reward

            for k in keys_to_monitor:
                L.log("eval/" + prefix + k, np.sum(episode_info[k]), step)
            video.save("%d.mp4" % step)
            L.log("eval/" + prefix + "episode_reward", episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log("eval/" + prefix + "eval_time", time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        L.log("eval/" + prefix + "mean_episode_reward", mean_ep_reward, step)
        L.log("eval/" + prefix + "best_episode_reward", best_ep_reward, step)
Example #25
File: train.py Project: JadenTravnik/proto
    def evaluate(self):
        avg_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            time_step = self.eval_env.reset()
            self.eval_video_recorder.init(enabled=(episode == 0))
            episode_reward = 0
            episode_success = 0
            episode_step = 0
            while not time_step.last():
                agent = self.get_agent()
                with utils.eval_mode(agent):
                    obs = time_step.observation['pixels']
                    action = agent.act(obs, sample=False)
                time_step = self.eval_env.step(action)
                self.eval_video_recorder.record(self.eval_env)
                episode_reward += time_step.reward
                episode_step += 1

            avg_episode_reward += episode_reward
            self.eval_video_recorder.save(f'{self.step}.mp4')
        avg_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', avg_episode_reward, self.step)
        self.logger.dump(self.step, ty='eval')
Example #26
def evaluate(env, agent, video, num_episodes, L, step):
    for i in range(num_episodes):
        a_anti_stuck = np.array([0, 0, 0.1, 0, 0, 0])
        env.step(a_anti_stuck)
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        step_counter = 0
        while not done:
            with utils.eval_mode(agent):
                action = agent.select_action(obs)
                action = np.multiply(action, env.action_space.high)
            obs, reward, done, _ = env.step(action)
            print("TE: {}   | TS: {}   | TR: {:.4f} | TER: {:.4f} | TA: {}".
                  format(i, step_counter, round(reward, 4),
                         round(episode_reward, 4), action))
            step_counter += 1
            video.record(env)
            episode_reward += reward
        video.save('%d.mp4' % step)
        L.log('eval/episode_reward', episode_reward, step)
    L.dump(step)
Example #27
    def evaluate(self):
        average_episode_reward = 0
        for trial in range(self.num_eval_episodes):
            # This will send jaco to real home
            obs = self.jaco_real_env.reset()
            sim_obs = self.eval_env_sim.reset()
            obs['state_low_obs'] = sim_obs['state_low_obs']
            # Now lets go to sim home
            self.send_robot_to_sim_home()
            print('Done sending him home')
            self.sim_video_recorder.init(enabled=(trial == 0))
            self.real_video_recorder.init(enabled=(trial == 0))
            # What to do with done? Make sim to indicate done?
            # done = False
            episode_reward = 0
            episode_step = 0
            while (episode_step <= self.episode_max_step):
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                translated_act = self.translate_action_sim_to_real(action)

                obs = self.jaco_real_env.step(translated_act)
                obs['state_low_obs'] = sim_obs['state_low_obs']
                print('Translated Act ', translated_act)
                # Take a sim step with the original action
                sim_obs, reward, done, _ = self.eval_env_sim.step(action)

                self.sim_video_recorder.record(self.eval_env_sim)
                self.real_video_recorder.record(self.jaco_real_env, real_jaco=True)
                episode_reward += reward
                episode_step += 1
                # if done: break
            average_episode_reward += episode_reward
            self.sim_video_recorder.save(f'{trial}.mp4')
            self.real_video_recorder.save(f'{trial}.mp4')
        average_episode_reward /= self.num_eval_episodes
        print('Rewards ', average_episode_reward)
Example #28
    if global_steps % 2500 == 0:
        torch.save(
            agent.critic.state_dict(),
            f'{data_root}/{args.experiment_name}/{args.name}_encoder/{global_steps}.pt'
        )

    s = env.reset()
    s = torch.tensor(s / 255.).float()
    done = False
    steps = 0

    episode_reward = 0

    while not done:
        with eval_mode(agent):
            a = agent.sample_action(s.unsqueeze(0).float().to('cuda:0'))
        # print(a)
        s_, r, done, _ = env.step(a)
        # print(r)
        steps += 1
        episode_reward += r

        s_ = torch.tensor(s_ / 255.).float()

        agent.update(replay_memory, global_steps, True, cpc, noise)

        # Some infinite bootstrapping
        # i.e., never returns the '1.0' flag for done, since there is not target goal state
        done_bool = 0 if steps == env._max_episode_steps else float(done)
Example #29
def main(cfg):
    from omegaconf import OmegaConf
    attach_state = cfg.attach_state
    from_pixels = cfg.from_pixels
    encoder_type = cfg.encoder_type
    if cfg.user_config:
        print("+++++++++++++++++ Using user specified config")
        cfg = OmegaConf.load(cfg.user_config)
        cfg.attach_state = attach_state
        cfg.from_pixels = from_pixels
        cfg.encoder_type = encoder_type

    print("+++++++++++++++++ Configuration : \n", cfg)
    expert_path = home + "/pytorch_sac/expert/" + cfg.env + "_state"
    print("+++++++++++++++++ Expert Path : ", expert_path)
    actor_path = expert_path + "/actor.pt"

    env = utils.make_env(cfg)  # Make env based on cfg.
    #if cfg.frame_stack = True:
    #	self.env = utils.FrameStack(self.env, k=3)
    cfg.agent.params.obs_dim = env.observation_space.shape[0]
    if attach_state:
        cfg.agent.params.obs_dim += get_env_state_dim(cfg)
    cfg.agent.params.action_dim = env.action_space.shape[0]
    cfg.agent.params.action_range = [
        float(env.action_space.low.min()),
        float(env.action_space.high.max())
    ]
    agent = hydra.utils.instantiate(cfg.agent)
    print("Observation Dimension : ", cfg.agent.params.obs_dim)

    conf = OmegaConf.load(expert_path + '/config.yaml')
    assert conf.env == cfg.env
    conf.agent.params.action_dim = env.action_space.shape[0]
    conf.agent.params.action_range = [
        float(env.action_space.low.min()),
        float(env.action_space.high.max())
    ]
    conf.agent.params.obs_dim = get_env_state_dim(conf)

    agent_expert = hydra.utils.instantiate(conf.agent)
    agent_expert.actor.load_state_dict(torch.load(actor_path))
    #video_recorder = VideoRecorder(None)

    data = Dataset((cfg.agent.params.obs_dim, ), (conf.agent.params.obs_dim, ),
                   env.action_space.shape, 1000000, torch.device("cuda"))
    collect_steps = 1000000
    print("DATASET CAPACITY : 1000000, Collecting Steps : ", collect_steps)
    loss = nn.MSELoss()
    #collect_ep = 4000

    step = 0
    ep = 0
    start_time = time.time()

    while (step < collect_steps):
        obs = env.reset()
        state = get_env_state(env, cfg)

        action_expert = None
        done = False
        episode_step = 0
        episode_reward = 0
        ep_start_time = time.time()
        while not done:
            with utils.eval_mode(agent_expert):
                action_expert = agent_expert.act(state, sample=False)
            if ep % 4 == 0:
                action_expert = env.action_space.sample()
            if ep % 4 == 1:
                action_expert += np.random.rand(*action_expert.shape) * 0.1
            if ep % 4 == 2:
                action_expert += np.random.rand(*action_expert.shape) * 1

            action_expert = np.clip(action_expert, -1, 1)

            next_obs, reward, done, extra = env.step(action_expert)
            next_state = get_env_state(env, cfg)
            #print("XXXXXX\n", obs.shape,"\n", state.shape)
            data.add(obs, state, action_expert, reward, done)

            step += 1
            episode_step += 1
            episode_reward += reward
            done_no_max = 0 if episode_step + 1 == env._max_episode_steps else done

            obs = next_obs
            state = next_state
        ep += 1
        if ep % 100 == 0:
            print("Episode : ", ep, " Episode Reward : ", episode_reward,
                  " Time taken by one episode : ",
                  time.time() - ep_start_time)

    print("Total Time taken : ", time.time() - start_time)
    data.save(home + "/pytorch_sac/Data",
              prefix=cfg.env + "_" + cfg.encoder_type)
def evaluate(env, agent, args, video, adapt=False):
    """Evaluate an agent, optionally adapt using PAD"""
    episode_rewards = []

    for i in tqdm(range(args.pad_num_episodes)):
        ep_agent = deepcopy(agent)  # make a new copy

        if args.use_curl:  # initialize replay buffer for CURL
            replay_buffer = utils.ReplayBuffer(
                obs_shape=env.observation_space.shape,
                action_shape=env.action_space.shape,
                capacity=args.train_steps,
                batch_size=args.pad_batch_size)
        video.init(enabled=True)

        obs = env.reset()
        done = False
        episode_reward = 0
        losses = []
        step = 0
        ep_agent.train()

        while not done:
            # Take step
            with utils.eval_mode(ep_agent):
                action = ep_agent.select_action(obs)
            next_obs, reward, done, _ = env.step(action)
            episode_reward += reward

            # Make self-supervised update if flag is true
            if adapt:
                if args.use_rot:  # rotation prediction

                    # Prepare batch of cropped observations
                    batch_next_obs = utils.batch_from_obs(
                        torch.Tensor(next_obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_next_obs = utils.random_crop(batch_next_obs)

                    # Adapt using rotation prediction
                    losses.append(ep_agent.update_rot(batch_next_obs))

                if args.use_inv:  # inverse dynamics model

                    # Prepare batch of observations
                    batch_obs = utils.batch_from_obs(
                        torch.Tensor(obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_next_obs = utils.batch_from_obs(
                        torch.Tensor(next_obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_action = torch.Tensor(action).cuda().unsqueeze(
                        0).repeat(args.pad_batch_size, 1)

                    # Adapt using inverse dynamics prediction
                    losses.append(
                        ep_agent.update_inv(utils.random_crop(batch_obs),
                                            utils.random_crop(batch_next_obs),
                                            batch_action))

                if args.use_curl:  # CURL

                    # Add observation to replay buffer for use as negative samples
                    # (only first argument obs is used, but we store all for convenience)
                    replay_buffer.add(obs, action, reward, next_obs, True)

                    # Prepare positive and negative samples
                    obs_anchor, obs_pos = get_curl_pos_neg(
                        next_obs, replay_buffer)

                    # Adapt using CURL
                    losses.append(
                        ep_agent.update_curl(obs_anchor, obs_pos, ema=True))

            video.record(env, losses)
            obs = next_obs
            step += 1

        video.save(
            f'{args.mode}_pad_{i}.mp4' if adapt else f'{args.mode}_{i}.mp4')
        episode_rewards.append(episode_reward)

    return np.mean(episode_rewards)
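As a closing note, the PAD-style evaluate() above is driven entirely by the args namespace. The sketch below shows one plausible way to assemble such a namespace before calling it; the field names follow the attributes referenced in the function, but the values are illustrative defaults, not the original project's configuration.

# Hypothetical configuration sketch for the PAD-style evaluate() above
# (values are illustrative; the real project builds args with argparse).
import argparse

args = argparse.Namespace(
    mode='color_hard',       # tag used in the saved video filename
    pad_num_episodes=10,     # episodes to average over
    pad_batch_size=32,       # batch size for each self-supervised update
    use_rot=False,           # rotation-prediction adaptation
    use_inv=True,            # inverse-dynamics adaptation
    use_curl=False,          # CURL adaptation (also needs train_steps for the buffer)
    train_steps=100000,
)

# mean_reward = evaluate(env, agent, args, video, adapt=True)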