Example No. 1
 def collect_new_paths(self, max_path_length, num_steps,
                       discard_incomplete_paths):
     paths = []
     num_steps_collected = 0
     while num_steps_collected < num_steps:
         max_path_length_this_loop = min(  # Do not go over num_steps
             max_path_length,
             num_steps - num_steps_collected,
         )
         path = rollout(
             self._env,
             self._policy,
             max_path_length=max_path_length_this_loop,
         )
         path_len = len(path['actions'])
         if (path_len != max_path_length and not path['terminals'][-1]
                 and discard_incomplete_paths):
             break
         path_expert_len = 0
         # if the path did not reach the goal, add expert demonstration
         if not path['rewards'][-1] > 0:
             path_expert = rollout(
                 self._env,
                 self._expert_policy,
                 max_path_length=max_path_length_this_loop)
             # if expert demonstration successfully reached goal, add it to buffer
             if path_expert['rewards'][-1] > 0:
                 paths.append(path_expert)
                 path_expert_len = len(path_expert['actions'])
             else:
                 print('No expert solution found.')
                 # import pickle as pkl
                 # import os
                 # filename_fails = '/sequoia/data1/rstrudel/code/nmp/fails.pkl'
                 # if os.path.exists(filename_fails):
                 #     with open(filename_fails, 'rb') as fpkl:
                 #         file_fails = pkl.load(fpkl)
                 # else:
                 #     file_fails = []
                 # file_fails.append((self._env.idx_env, self._env.start, self._env.goal))
                 # with open(filename_fails, 'wb') as fpkl:
                 #     pkl.dump(file_fails, fpkl)
         num_steps_collected += path_len + path_expert_len
         paths.append(path)
     self._num_paths_total += len(paths)
     self._num_steps_total += num_steps_collected
     self._epoch_paths.extend(paths)
     return paths
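The collectors and scripts in these examples assume an rlkit-style rollout() that returns a path dictionary of per-step arrays. As a reference, here is a minimal sketch of that structure; the key names are taken from the examples on this page, while the shapes and dtypes are assumptions.

import numpy as np

# Hypothetical sketch of the path dict these snippets consume.
def make_dummy_path(path_len, obs_dim, act_dim):
    return dict(
        observations=np.zeros((path_len, obs_dim)),       # s_t
        actions=np.zeros((path_len, act_dim)),            # a_t
        rewards=np.zeros((path_len, 1)),                  # r_t
        next_observations=np.zeros((path_len, obs_dim)),  # s_{t+1}
        terminals=np.zeros((path_len, 1), dtype=bool),    # done flags
        env_infos=[{} for _ in range(path_len)],          # per-step info dicts
    )

Checks such as len(path['actions']) and path['terminals'][-1] in the examples rely on exactly these keys.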
Example No. 2
def simulate_policy(args):
    data = torch.load(args.file)
    fig_dir = os.path.dirname(args.file)
    print(fig_dir)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    paths = []
    while len(paths) < args.num_path:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
        )
        paths.append(path)
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

    if args.visualize:
        plot_problem_paths(env, paths, fig_dir, show_fig=False)
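The simulate_policy scripts in Example No. 2 and several later examples read their settings from an argparse namespace. A minimal sketch of the parser they appear to expect is shown below; the flag names are inferred from the attribute accesses (args.file, args.H, args.gpu, args.num_path, args.visualize), and the defaults are assumptions rather than the original script's values.

import argparse

# Hypothetical parser matching the attributes used above; not taken verbatim
# from the original script.
def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the saved snapshot file')
    parser.add_argument('--H', type=int, default=300, help='max path length per rollout')
    parser.add_argument('--gpu', action='store_true', help='run the policy on GPU')
    parser.add_argument('--num_path', type=int, default=10, help='number of rollouts to collect')
    parser.add_argument('--visualize', action='store_true', help='plot the collected paths')
    return parser

# Usage: simulate_policy(build_parser().parse_args())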
Example No. 3
def simulate_policy(args):
    data = torch.load(str(args.file))
    #data = joblib.load(str(args.file))
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(HalfCheetahEnv())
    #env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if args.collect:
        data = []
    for trial in tqdm(range(100)):
        path = rollout(
            env,
            policy,
            max_path_length=args.H + 1,
            render=not args.collect,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        if args.collect:
            data.append([path['actions'], path['next_observations']])

    if args.collect:
        import pickle
        with open("data/expert.pkl", mode='wb') as f:
            pickle.dump(data, f)
Example No. 4
 def collect_new_paths(
     self,
     max_path_length,
     num_steps,
     discard_incomplete_paths,
 ):
     actions = []
     paths = []
     num_steps_collected = 0
     while num_steps_collected < num_steps:
         max_path_length_this_loop = min(  # Do not go over num_steps
             max_path_length,
             num_steps - num_steps_collected,
         )
         path = rollout(
             self._env,
             self._policy,
             max_path_length=max_path_length_this_loop,
         )
         path_len = len(path['actions'])
         if (path_len != max_path_length and not path['terminals'][-1]
                 and discard_incomplete_paths):
             break
         actions.extend(path['actions'])
         num_steps_collected += path_len
         paths.append(path)
     self._actions = actions
     self._num_paths_total += len(paths)
     self._num_steps_total += num_steps_collected
     self._epoch_paths.extend(paths)
     return paths
Example No. 5
    def collect_new_paths(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
    ):
        paths = []
        num_steps_collected = 0
        while num_steps_collected < num_steps:
            max_path_length_this_loop = min(  # Do not go over num_steps
                max_path_length,
                num_steps - num_steps_collected,
            )
            path = rollout(
                self._env,
                self._policy,
                max_path_length=max_path_length_this_loop,
            )
            path_len = len(path['actions'])

            # calculate advantages and add column to path
            path = self.add_advantages(path, path_len,
                                       self.calculate_advantages)

            if (path_len != max_path_length and not path['terminals'][-1]
                    and discard_incomplete_paths):
                break
            num_steps_collected += path_len
            paths.append(path)
        self._num_paths_total += len(paths)
        self._num_steps_total += num_steps_collected
        self._epoch_paths.extend(paths)
        return paths
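Example No. 5 relies on a self.add_advantages(...) helper that is not shown on this page. The sketch below is one plausible stand-in, assuming calculate_advantages acts as an on/off flag and that a discounted return-to-go minus a mean baseline is an acceptable advantage estimate; the original helper may compute advantages differently (e.g. with a learned value function or GAE).

import numpy as np

def add_advantages(path, path_len, calculate_advantages, discount=0.99):
    """Hypothetical stand-in for the helper used in Example No. 5."""
    if not calculate_advantages:
        path['advantages'] = np.zeros((path_len, 1))
        return path
    rewards = path['rewards'].reshape(-1)
    returns = np.zeros(path_len)
    running = 0.0
    for t in reversed(range(path_len)):  # discounted return-to-go
        running = rewards[t] + discount * running
        returns[t] = running
    baseline = returns.mean()  # crude baseline; a learned value net is more typical
    path['advantages'] = (returns - baseline).reshape(-1, 1)
    return path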
Example No. 6
 def collect_new_paths(self, max_path_length, num_steps,
                       discard_incomplete_paths):
     paths = []
     num_steps_collected = 0
     while num_steps_collected < num_steps:
         max_path_length_this_loop = min(  # Do not go over num_steps
             max_path_length,
             num_steps - num_steps_collected,
         )
         path = rollout(self._env,
                        self._policy,
                        max_path_length=max_path_length_this_loop,
                        render=self._render and len(paths) == 0)
         path_len = len(path['actions'])
          # we don't want to skip incomplete paths, and in fact don't have a meaningful max path length
         # if (
         #         path_len != max_path_length
         #         and not path['terminals'][-1]
         #         and discard_incomplete_paths
         # ):
         #     break
         num_steps_collected += path_len
         paths.append(path)
     self._num_paths_total += len(paths)
     self._num_steps_total += num_steps_collected
     self._epoch_paths.extend(paths)
     return paths
Example No. 7
def simulate_policy(args):
    #data = torch.load(args.file)
    variant, data = doc.load_rklit_file(args.session_name)
    if args.mode == 'eval':
        policy = data['evaluation/policy']
    elif args.mode == 'expl':
        policy = data['exploration/policy']
    else:
        policy = None
    #env = data['evaluation/env']
    environment = stuff.NormalizedActions(
        env.DeepBuilderEnv(args.session_name, 6, 7, 20, 12))
    environment.env.is_simulation = args.simulation == 1
    print("Policy loaded")

    set_gpu_mode(True)
    policy.cuda()

    while True:
        path = rollout(
            environment,
            policy,
            #max_path_length=args.H,
            render=False,
        )
        if hasattr(environment, "log_diagnostics"):
            environment.log_diagnostics([path])
        logger.dump_tabular()
Example No. 8
def validate(policy, envs, horizon):
    """
    Collect list of stats for each validation env as dict of following format:
        'pickup_wood': [0, 15, 20] means you picked up a wood object at timesteps 0, 15, and 20.
    """
    stats = [{} for _ in range(len(envs))]
    for env_idx, env in enumerate(envs):
        path = rollout(env, policy, horizon)
        for typ in env.object_to_idx.keys():
            if typ in TYPE_TO_CLASS_ABS and TYPE_TO_CLASS_ABS[typ]().can_mine(
                    env):
                key = 'pickup_%s' % typ
                last_val = 0
                pickup_idxs = []
                for t, env_info in enumerate(path['env_infos']):
                    count = env_info[key] - last_val
                    pickup_idxs.extend([t for _ in range(count)])
                    last_val = env_info[key]
                stats[env_idx][key] = pickup_idxs
        for typ in env.interactions.values():
            key = 'made_%s' % typ
            last_val = 0
            made_idxs = []
            for t, env_info in enumerate(path['env_infos']):
                count = env_info[key] - last_val
                made_idxs.extend([t for _ in range(count)])
                last_val = env_info[key]
            stats[env_idx][key] = made_idxs
    return stats
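The stats list returned by validate maps each event key to the timesteps at which that event occurred. A small hypothetical helper for summarizing it downstream (not part of the original code):

def summarize_validation(stats):
    """Count how many events of each type occurred in each validation env."""
    return [{key: len(timesteps) for key, timesteps in env_stats.items()}
            for env_stats in stats]

# e.g. summarize_validation(stats)[0] -> {'pickup_wood': 3, ...}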
Example No. 9
def experiment(My_args):
    args = getArgs()

    expl_env = environment(args) 
    # expl_env.render()

    My_args.file = '/home/yujr/rlkit/data/Test/Test_2020_06_08_21_52_33_0000--s-79802/params.pkl'
    data = torch.load(My_args.file)
    print("data loaded", data['evaluation/policy'])
    policy = data['evaluation/policy']
    
    print("Policy loaded")
    if My_args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            expl_env,
            policy,
            max_path_length=My_args.H,
            # render=True,
        )
        print('path')
        # if hasattr(env, "log_diagnostics"):
        #     env.log_diagnostics([path])
        logger.dump_tabular()
Example No. 10
 def collect_new_paths(
     self,
     max_path_length,
     num_steps,
     discard_incomplete_paths,
 ):
     paths = []
     num_steps_collected = 0
     while num_steps_collected < num_steps:
         max_path_length_this_loop = min(  # Do not go over num_steps
             max_path_length,
             num_steps - num_steps_collected,
         )
         path = rollout(
             self._env,
             self._policy,
             max_path_length=max_path_length_this_loop,
          )  # a single trajectory of data
         path_len = len(path['actions'])
          if (path_len != max_path_length  # did this trajectory collect enough steps?
                  and not path['terminals'][-1]  # is the final state terminal?
                  and discard_incomplete_paths  # discard it? the path ended for an unclear reason, possibly an environment bug
              ):
             break
         num_steps_collected += path_len
          paths.append(path)  # accumulate the collected trajectories
     self._num_paths_total += len(paths)
     self._num_steps_total += num_steps_collected
      self._epoch_paths.extend(
          paths)  # the path queue keeps at most _max_num_epoch_paths_saved trajectories
      # return the trajectories generated in this round
     return paths
Example No. 11
    def collect_new_paths(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
        policy_fn=None,
    ):
        paths = []
        num_steps_collected = 0
        while num_steps_collected < num_steps:
            max_path_length_this_loop = min(  # Do not go over num_steps
                max_path_length,
                num_steps - num_steps_collected,
            )
            path = rollout(
                self._env,
                self._policy,
                max_path_length=max_path_length_this_loop,
            )
            path_len = len(path['actions'])
            if (path_len != max_path_length and not path['terminals'][-1]
                    and discard_incomplete_paths):
                break
            num_steps_collected += path_len

            ## Used to sparsify reward
            if self._sparse_reward:
                random_noise = np.random.normal(size=path['rewards'].shape)
                path['rewards'] = path['rewards'] + 1.0 * random_noise

            paths.append(path)
        self._num_paths_total += len(paths)
        self._num_steps_total += num_steps_collected
        self._epoch_paths.extend(paths)
        return paths
Example No. 12
def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']

    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.cuda()
        print("set gpu")
    print(ptu.device)

    config_file = get_config_file(args.config_file)
    env = NormalizedBoxEnv(
        load_env(args, config_file, args.env_mode, ptu.device.index))

    print("Policy loaded")

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example No. 13
def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    paths = []
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
        )
        paths.append(path)
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        if hasattr(env, "get_diagnostics"):
            for k, v in env.get_diagnostics(paths).items():
                logger.record_tabular(k, v)
        else:
            logger.record_dict(
                eval_util.get_generic_path_information(paths),
                prefix="evaluation/",
            )
        logger.dump_tabular()
Example No. 14
 def get_validation_returns(self, snapshot):
     policy = snapshot['evaluation/policy']
     policy = PolicyWrappedWithExplorationStrategy(
         EpsilonGreedy(self.eval_env.action_space, 0.1), policy)
     validation_envs = pickle.load(open(self.validation_envs_pkl, 'rb'))
     returns = np.zeros(len(validation_envs['envs']))
     for env_idx, env in enumerate(validation_envs['envs']):
         path = rollout(env, policy, self.validation_rollout_length)
         returns[env_idx] = path['rewards'].sum()
     return {'returns': returns.mean()}
Example No. 15
def simulate_policy(args):
    data = pickle.load(open(args.file, "rb"))
    policy_key = args.policy_type + '/policy'
    if policy_key in data:
        policy = data[policy_key]
    else:
        raise Exception("No policy found in loaded dict. Keys: {}".format(
            data.keys()))

    env_key = args.env_type + '/env'
    if env_key in data:
        env = data[env_key]
    else:
        raise Exception("No environment found in loaded dict. Keys: {}".format(
            data.keys()))

    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")

    if args.enable_render:
        # some environments need to be reconfigured for visualization
        env.enable_render()
    if args.gpu:
        ptu.set_gpu_mode(True)
    if hasattr(policy, "to"):
        policy.to(ptu.device)
    if hasattr(env, "vae"):
        env.vae.to(ptu.device)

    if args.deterministic:
        policy = MakeDeterministic(policy)

    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    paths = []
    while True:
        paths.append(
            rollout(
                env,
                policy,
                max_path_length=args.H,
                render=not args.hide,
            ))
        if args.log_diagnostics:
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics(paths, logger)
            for k, v in eval_util.get_generic_path_information(paths).items():
                logger.record_tabular(k, v)
            logger.dump_tabular()
Example No. 16
def simulate_policy(args):
    filename = args.file
    data = torch.load(filename)
    filename_token = filename.split('/')
    save_path = os.path.join(
        os.path.join(*filename_token[:-1]),
        'offline_buffer_' + filename_token[-1].split('.')[0] + '.hdf5')
    print(save_path)
    print(data)
    '''
    I don't know why, but they did not save the policy for evaluation.
    Instead, I used trainer/policy.
    '''
    policy = data['trainer/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    n = 0
    traj_list = []
    while n < args.buffer_size:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        n = n + len(path['rewards'])
        print('Saving %d sequences' % n)
        traj_list.append(path)
        # Visualize one trajectory
        if args.visualize:
            states = path['observations']
            states = np.concatenate(
                [states, path['next_observations'][-1:, :]], axis=0)
            gr = 0.1  # goal radius, for visualization purposes
            g = np.array([1.0, 1.0])
            plt.figure(figsize=(8, 8))
            axes = plt.axes()
            axes.set(aspect='equal')
            plt.axis([-0.25, 1.25, -0.25, 1.25])
            circle = plt.Circle((g[0], g[1]), radius=gr)
            axes.add_artist(circle)
            plt.plot(states[:-1, 0], states[:-1, 1], '-o')
            plt.plot(states[-1, 0], states[-1, 1], '-x', markersize=20)
            plt.show()
Example No. 17
def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example No. 18
 def collect_new_paths(self, max_path_length, num_steps):
     paths = []
     num_steps_collected = 0
     while num_steps_collected < num_steps:
         path = rollout(
             self._env,
             self._policy,
             max_path_length=min(  # Do not go over num_steps
                 max_path_length,
                 num_steps - num_steps_collected,
             ),
         )
         num_steps_collected += len(path['actions'])
         paths.append(path)
     self._num_paths_total += len(paths)
     self._num_steps_total += num_steps_collected
     self._epoch_paths.extend(paths)
     return paths
Example No. 19
    def collect_new_paths(
        self,
        max_path_length,
        num_eps
    ):
        paths = []
        ep_collected = 0
        fails = 0
        num_steps_collected = 0
        while ep_collected < num_eps:
            path = rollout(
                self._env,
                self._policy,
                max_path_length=max_path_length
            )
            path_len = len(path['actions'])
            num_steps_collected += path_len
            ep_collected += 1
            paths.append(path)
        self._num_steps_total += num_steps_collected
        self._num_paths_total += len(paths)
        self._epoch_paths.extend(paths)

        last_rewards = [path["rewards"][-1] for path in paths]
        returns = [sum(path["rewards"]) for path in paths]
        lengths = [len(path["actions"]) for path in paths]
        terminals = [path['terminals'][-1] for path in paths]

        # check whether the pass criterion is met
        solved = False

        if self.pass_criterion(returns, lengths, terminals, last_rewards, max_path_length):
            print("Solved")
            solved = True

        # # reach the end
        # def criterion(path):
        #     return path['terminals'] and len(path['rewards']) <= max_path_length and \
        #            path['reward'][-1] != self.terminal_reward
        # finished = sum([1 if criterion(path) else 0 for path in paths])
        # if finished == len(paths):
        #     solved = True
        return paths, solved
Example No. 20
 def collect_new_paths(
     self,
     max_path_length,
     num_paths,
 ):
     paths = []
     num_steps_collected = 0
     for _ in range(num_paths):
         path = rollout(
             self._env,
             self._policy,
             max_path_length=max_path_length,
         )
         path_len = len(path['actions'])
         num_steps_collected += path_len
         paths.append(path)
     self._num_paths_total += len(paths)
     self._num_steps_total += num_steps_collected
     self._epoch_paths.extend(paths)
     return paths
Example No. 21
    def collect_new_paths(
            self,
            max_path_length,
            num_steps,
            discard_incomplete_paths,
    ):
        paths = []
        num_steps_collected = 0
        while num_steps_collected < num_steps:
            max_path_length_this_loop = min(  # Do not go over num_steps
                max_path_length,
                num_steps - num_steps_collected,
            )
            path = rollout(
                self._env,
                self._policy,
                max_path_length=max_path_length_this_loop,
            )
            path_len = len(path['actions'])
            if (
                    path_len != max_path_length
                    and not path['terminals'][-1]
                    and discard_incomplete_paths
            ):
                break
            num_steps_collected += path_len

            ## Used to sparsify reward
            if self._sparse_reward:
                random_noise = np.random.normal(size=path['rewards'].shape)
                path['rewards'] = path['rewards'] + 1.0*random_noise 
                # bins = np.array([-10, -0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
                # temp_rewards = np.cast(path['rewards']/2.0, ) 
                # temp_rewards = (path['rewards'] > 1.0)
                # path['rewards'] = temp_rewards.astype(np.float32)

            paths.append(path)
        self._num_paths_total += len(paths)
        self._num_steps_total += num_steps_collected
        self._epoch_paths.extend(paths)
        return paths
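The commented-out lines in Example No. 21 suggest that an alternative to adding Gaussian noise is to sparsify the reward by thresholding it. A minimal sketch of that variant, assuming the 1.0 threshold from the commented code (the rest is an assumption):

import numpy as np

def sparsify_rewards(path, threshold=1.0):
    """Replace dense rewards with a 0/1 signal above a threshold (sketch)."""
    path['rewards'] = (path['rewards'] > threshold).astype(np.float32)
    return path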
Example No. 22
def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    num_fail = 0
    for _ in range(args.ep):
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=False,
            sleep=args.S,
        )
        if np.any(path['rewards'] == -1):
            num_fail += 1
            if args.de:
                last_obs = np.moveaxis(
                    np.reshape(path['observations'][-1], (3, 33, 33)), 0, -1)
                last_next_obs = np.moveaxis(
                    np.reshape(path['next_observations'][-1], (3, 33, 33)), 0,
                    -1)
                last_obs = (last_obs * 33 + 128).astype(np.uint8)
                last_next_obs = (last_next_obs * 33 + 128).astype(np.uint8)
                fig = plt.figure(figsize=(10, 10))
                fig.add_subplot(2, 1, 1)
                plt.imshow(last_obs)
                fig.add_subplot(2, 1, 2)
                plt.imshow(last_next_obs)
                plt.show()
                plt.close()

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

    print('number of failures:', num_fail)
Example No. 23
    def pretrain(self):
        if (self.num_paths_for_normalization == 0 or
            (self.obs_normalizer is None and self.action_normalizer is None)):
            return

        pretrain_paths = []
        random_policy = RandomPolicy(self.env.action_space)
        while len(pretrain_paths) < self.num_paths_for_normalization:
            path = rollout(self.env, random_policy, self.max_path_length)
            pretrain_paths.append(path)
        ob_mean, ob_std, ac_mean, ac_std = (
            compute_normalization(pretrain_paths))
        if self.obs_normalizer is not None:
            self.obs_normalizer.set_mean(ob_mean)
            self.obs_normalizer.set_std(ob_std)
            self.target_qf.obs_normalizer = self.obs_normalizer
            self.target_policy.obs_normalizer = self.obs_normalizer
        if self.action_normalizer is not None:
            self.action_normalizer.set_mean(ac_mean)
            self.action_normalizer.set_std(ac_std)
            self.target_qf.action_normalizer = self.action_normalizer
            self.target_policy.action_normalizer = self.action_normalizer
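compute_normalization is not shown in Example No. 23. A plausible sketch that returns per-dimension means and standard deviations over the pre-training paths is given below; treat it as an assumption about what the real implementation does.

import numpy as np

def compute_normalization(paths, eps=1e-6):
    """Hypothetical sketch: per-dimension observation/action statistics."""
    obs = np.concatenate([path['observations'] for path in paths], axis=0)
    acts = np.concatenate([path['actions'] for path in paths], axis=0)
    ob_mean, ob_std = obs.mean(axis=0), obs.std(axis=0) + eps
    ac_mean, ac_std = acts.mean(axis=0), acts.std(axis=0) + eps
    return ob_mean, ob_std, ac_mean, ac_std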
Example No. 24
 def collect_new_paths(self,
                       max_path_length,
                       num_steps,
                       discard_incomplete_paths,
                       continuing=False):
     if not continuing:
         # reset held state re: env and obs since we're resetting now
         self.curr_env = self._env
         self.last_obs = None
     path, self.curr_env, self.last_obs = rollout(
         self.curr_env,
         self._policy,
          # this is not a typo
         max_path_length=num_steps,
         render=self._render,
         return_env_obs=True,
         continuing=continuing,
         obs=self.last_obs)
     path_len = len(path['actions'])
     self._num_paths_total += 1
     self._num_steps_total += path_len
     self._epoch_paths.append(path)
     return path
Example No. 25
def simulate_policy(args):
    data = torch.load(args.file)
    print(data)
    # policy = data['evaluation/policy']
    '''
    I don't know why, but they did not save the policy for evaluation.
    Instead, I used trainer/policy.
    '''
    policy = data['trainer/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example No. 26
    def validate(self, snapshot):
        """
        Collect list of stats for each validation env as dict of following format:
            'pickup_wood': [0, 15, 20] means you picked up a wood object at timesteps 0, 15, and 20.
        """
        policy = snapshot['evaluation/policy']
        if hasattr(policy, 'policy'):
            # if it's reset free, strip out the underlying policy from the exploration strategy
            policy = policy.policy
        policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(self.eval_env.action_space, 0.1), policy)

        validation_envs = pickle.load(open(self.validation_envs_pkl, 'rb'))
        stats = [{} for _ in range(len(validation_envs['envs']))]
        for env_idx, env in enumerate(validation_envs['envs']):
            path = rollout(env, policy, self.validation_rollout_length)
            for typ in env.object_to_idx.keys():
                if typ not in ['empty', 'wall', 'tree']:
                    key = 'pickup_%s' % typ
                    last_val = 0
                    pickup_idxs = []
                    for t, env_info in enumerate(path['env_infos']):
                        count = env_info[key] - last_val
                        pickup_idxs.extend([t for _ in range(count)])
                        last_val = env_info[key]
                    stats[env_idx][key] = pickup_idxs
            for typ in env.interactions.values():
                key = 'made_%s' % typ
                last_val = 0
                made_idxs = []
                for t, env_info in enumerate(path['env_infos']):
                    count = env_info[key] - last_val
                    made_idxs.extend([t for _ in range(count)])
                    last_val = env_info[key]
                stats[env_idx][key] = made_idxs
        return stats
Example No. 27
def offpolicy_inference():
    import time
    from gym import wrappers

    filename = str(uuid.uuid4())

    gpu = True

    env, _, _ = prepare_env(args.env_name, args.visionmodel_path, **env_kwargs)

    snapshot = torch.load(args.load_name)
    policy = snapshot['evaluation/policy']
    if args.env_name.find('doorenv') > -1:
        policy.knob_noisy = args.knob_noisy
        policy.nn = env._wrapped_env.nn
        policy.visionnet_input = env_kwargs['visionnet_input']

    epi_counter = 1
    dooropen_counter = 0
    total_time = 0
    test_num = 100

    if evaluation:
        render = False
    else:
        if not args.unity:
            render = True
        else:
            render = False

    start_time = int(time.mktime(time.localtime()))

    if gpu:
        set_gpu_mode(True)
    while True:
        if args.env_name.find('doorenv') > -1:
            path, door_opened, opening_time = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=True,
                render=render,
                evaluate=True,
            )
            print("done first")
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
            if evaluation:
                env, _, _ = prepare_env(args.env_name, args.visionmodel_path,
                                        **env_kwargs)
                if door_opened:
                    dooropen_counter += 1
                    total_time += opening_time
                    eval_print(dooropen_counter, epi_counter, start_time,
                               total_time)

        else:
            path = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=False,
                render=render,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()

        if evaluation:
            print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
            epi_counter += 1

            if args.env_name.find('door') > -1 and epi_counter > test_num:
                eval_print(dooropen_counter, epi_counter, start_time,
                           total_time)
                break
Example No. 28
parser.add_argument('--log_dir', type=str, default='PPO')
parser.add_argument('--file', type=str, default='params')
parser.add_argument('--epoch', type=int, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--neval', type=int, default=100)
args = parser.parse_args()

pre_dir = './Data/' + args.exp_name + args.extra_name
data_path = '{}/{}/seed{}/{}.pkl'.format(pre_dir, args.log_dir, args.seed,
                                         args.file)
data = torch.load(data_path, map_location='cpu')

policy = data['trainer/policy']
policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)

import sys
from traffic.make_env import make_env
import json
with open('{}/{}/seed{}/variant.json'.format(pre_dir, args.log_dir,
                                             args.seed)) as f:
    variant = json.load(f)
env = make_env(args.exp_name, **variant['env_kwargs'])

returns = []
for i in range(args.neval):
    path = rollout(env, policy, max_path_length=200)
    ret = np.sum(path['rewards'])
    returns.append(ret)

print(np.mean(returns), np.std(returns))
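Example No. 28 is excerpted from the middle of a script: the parser object and the --exp_name/--extra_name flags it uses are defined above the excerpt. A minimal sketch of the missing preamble, with flag names inferred from the accesses in the snippet and the import paths marked as assumptions:

import argparse
import numpy as np
import torch
from rlkit.samplers.rollout_functions import rollout  # assumed import path

parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', type=str, required=True)
parser.add_argument('--extra_name', type=str, default='')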
Example No. 29
def offpolicy_inference(seed,
                        env_name,
                        det,
                        load_name,
                        evaluation,
                        render,
                        knob_noisy,
                        visionnet_input,
                        env_kwargs,
                        actor_critic=None,
                        verbose=True,
                        pos_control=True,
                        step_skip=4):

    import time
    from gym import wrappers

    print("evaluatin started!")

    filename = str(uuid.uuid4())

    gpu = True

    env, _, _ = prepare_env(env_name, **env_kwargs)

    if not actor_critic:
        snapshot = torch.load(load_name)
        policy = snapshot['evaluation/policy']
    else:
        policy = actor_critic
    if env_name.find('doorenv') > -1:
        policy.knob_noisy = knob_noisy
        policy.nn = env._wrapped_env.nn
        policy.visionnet_input = env_kwargs['visionnet_input']

    epi_counter = 1
    dooropen_counter = 0
    total_time = 0
    test_num = 100

    start_time = int(time.mktime(time.localtime()))

    if gpu:
        set_gpu_mode(True)
    while True:
        # print("new env")
        if env_name.find('doorenv') > -1:
            if evaluation:
                path, door_opened, opening_time = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()
                # if evaluation:
                # print("1")
                env, _, _ = prepare_env(env_name, **env_kwargs)
                if door_opened:
                    dooropen_counter += 1
                    total_time += opening_time
                    if verbose:
                        print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(
                            epi_counter))
                        eval_print(dooropen_counter, epi_counter, start_time,
                                   total_time)
            else:
                path = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()

        else:
            path = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=False,
                render=render,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()

        if evaluation:
            if verbose:
                print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
                eval_print(dooropen_counter, epi_counter, start_time,
                           total_time)
            epi_counter += 1

            if env_name.find('door') > -1 and epi_counter > test_num:
                if verbose:
                    print("dooropening counter:", dooropen_counter,
                          " epi counter:", epi_counter)
                    eval_print(dooropen_counter, epi_counter, start_time,
                               total_time)
                break

    opening_rate, opening_timeavg = eval_print(dooropen_counter,
                                               epi_counter - 1, start_time,
                                               total_time)
    return opening_rate, opening_timeavg
Example No. 30
def get_gifs_heatmaps(exps_dir_name, seeds, save_dir, titles):
    data_dir = join(get_repo_dir(), 'data')
    exps_dir = join(data_dir, exps_dir_name)
    gifs_dir = join(data_dir, 'gifs')
    heat_dir = join(data_dir, 'heatmaps')

    # load variant and get pickled validation envs
    rand_exp_dir = glob(join(exps_dir, '*'))[0]
    with open(join(rand_exp_dir, 'variant.json'), 'r') as f:
        variant = json.load(f)
    task_obj = variant['env_kwargs']['task'].split()[1]
    val_envs_path = variant['algo_kwargs']['algorithm_kwargs'][
        'validation_envs_pkl']
    val_rollout_len = variant['algo_kwargs']['algorithm_kwargs'][
        'validation_rollout_length']
    val_envs = get_val_envs(val_envs_path)

    # load policy
    for seed_idx, seed in enumerate(seeds):
        val_env_idxs = random.sample(list(range(len(val_envs))), 10)
        exp_dir = glob(join(exps_dir, '*%d' % seed))[0]
        """ Get policy """
        pol_file = max(glob(join(exp_dir, 'itr_*.pkl')),
                       key=lambda pol_path: int(basename(pol_path)[4:-4]))
        # to override policy itr number
        # pol_file = join(exp_dir, 'itr_%d.pkl' % 2990)
        print(pol_file)
        with open(pol_file, 'rb') as f:
            policy = pickle.load(f)['evaluation/policy']
        if hasattr(policy, 'policy'):
            # if it's reset free, strip out the underlying policy from the exploration strategy
            policy = policy.policy
        policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(spaces.Discrete(7), 0.1), policy)

        # re-fetch the val envs each time so that envs are fresh
        # val_envs = get_val_envs(val_envs_path)
        # """ Get gifs """
        # stats = [{} for _ in range(len(val_env_idxs))]
        # for meta_idx, env_idx in enumerate(val_env_idxs):
        #     env = val_envs[env_idx]
        #     path = rollout(env, policy, val_rollout_len, render=True, save=True,
        #                    save_dir=join(gifs_dir, exps_dir_name, save_dir, str(seed), str(env_idx)))
        #     env.render(close=True)
        #     for typ in env.object_to_idx.keys():
        #         if typ not in ['empty', 'wall', 'tree']:
        #             key = 'pickup_%s' % typ
        #             last_val = 0
        #             pickup_idxs = []
        #             for t, env_info in enumerate(path['env_infos']):
        #                 count = env_info[key] - last_val
        #                 pickup_idxs.extend([t for _ in range(count)])
        #                 last_val = env_info[key]
        #             stats[meta_idx][key] = pickup_idxs
        #     for typ in env.interactions.values():
        #         key = 'made_%s' % typ
        #         last_val = 0
        #         made_idxs = []
        #         for t, env_info in enumerate(path['env_infos']):
        #             count = env_info[key] - last_val
        #             made_idxs.extend([t for _ in range(count)])
        #             last_val = env_info[key]
        #         stats[meta_idx][key] = made_idxs
        # solved = [val_env_idxs[i] for i, stat in enumerate(stats) if stat['pickup_%s' % task_obj]]
        # print('seed %d solved %d percent:' % (seed, 100 * len(solved) // len(val_env_idxs)))
        # print(solved)

        # re-fetch the val envs each time so that envs are fresh
        val_envs = get_val_envs(val_envs_path)
        print('refetched envs')
        """ Get heatmaps """
        vcs = []
        for env_idx, env in enumerate(val_envs):
            path = rollout(env, policy, val_rollout_len)
            vcs.append(env.visit_count)
        visit_count_sum = sum(vcs)
        plt.imshow(visit_count_sum)
        plt.title('Validation Tasks State Visitation Count (%s)' %
                  titles[seed_idx])
        plt.axis('off')
        vc_save_path = join(heat_dir, exps_dir_name, save_dir, str(seed))
        os.makedirs(vc_save_path, exist_ok=True)
        plt.savefig(join(vc_save_path, 'map.png'))