Example #1
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']

    # wrapped_env = gym.make('HalfCheetahHolePositions-v{}'.format(1))
    # disc = data['env'].disc
    # reward_params = data['env'].reward_params
    # unsupervised_reward_weight = data['env'].unsupervised_reward_weight
    # reward_weight = data['env'].reward_weight
    # env = DiscriminatorWrappedEnv(wrapped_env=wrapped_env,
    #                              disc=disc,
    #                              reward_params=reward_params,
    #                              unsupervised_reward_weight=unsupervised_reward_weight,
    #                              reward_weight=reward_weight)
    # env = data['env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True, 1)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        skill = np.random.randint(0, args.num_skills)
        path = rollout(env,
                       policy,
                       max_path_length=args.H,
                       animated=True,
                       skill=skill,
                       deterministic=True)
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
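
All of the examples on this page revolve around a rollout helper in the style of rlkit's sampler utilities. As a point of reference, the sketch below shows the basic loop and the dict-of-arrays return value these calls assume; the extra keyword arguments that appear in individual examples (skill, deterministic, accum_context, image_env, and so on) are project-specific additions, so treat this as an approximation rather than any one repository's implementation.

import numpy as np


def rollout(env, agent, max_path_length=np.inf, render=False):
    """Collect a single episode and return it as a dict of stacked arrays."""
    observations, actions, rewards, terminals, agent_infos, env_infos = [], [], [], [], [], []
    o = env.reset()
    path_length = 0
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if render:
            env.render()
        observations.append(o)
        actions.append(a)
        rewards.append(r)
        terminals.append(d)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        o = next_o
        if d:
            break
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        rewards=np.array(rewards).reshape(-1, 1),
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )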
Example #2
    def pretrain(self):
        if (
            self.num_paths_for_normalization == 0
            or (self.obs_normalizer is None and self.action_normalizer is None)
        ):
            return

        pretrain_paths = []
        random_policy = RandomPolicy(self.env.action_space)
        while len(pretrain_paths) < self.num_paths_for_normalization:
            path = rollout(self.env, random_policy, self.max_path_length)
            pretrain_paths.append(path)
        ob_mean, ob_std, ac_mean, ac_std = (
            compute_normalization(pretrain_paths)
        )
        if self.obs_normalizer is not None:
            self.obs_normalizer.set_mean(ob_mean)
            self.obs_normalizer.set_std(ob_std)
            self.target_qf.obs_normalizer = self.obs_normalizer
            self.target_policy.obs_normalizer = self.obs_normalizer
        if self.action_normalizer is not None:
            self.action_normalizer.set_mean(ac_mean)
            self.action_normalizer.set_std(ac_std)
            self.target_qf.action_normalizer = self.action_normalizer
            self.target_policy.action_normalizer = self.action_normalizer
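
compute_normalization is not shown in this snippet. A plausible sketch of a helper matching the call above (an assumption, not necessarily the repository's actual implementation):

import numpy as np


def compute_normalization(paths):
    """Per-dimension mean and std of observations and actions over a list of rollout paths."""
    obs = np.vstack([path['observations'] for path in paths])
    acts = np.vstack([path['actions'] for path in paths])
    return obs.mean(axis=0), obs.std(axis=0), acts.mean(axis=0), acts.std(axis=0)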
Example #3
 def obtain_samples(self,
                    deterministic=False,
                    max_samples=np.inf,
                    max_trajs=np.inf,
                    accum_context=True,
                    resample=1):
     """
     Obtains samples in the environment until we reach either max_samples transitions or
     max_trajs trajectories.
     The resample argument specifies how often (in trajectories) the agent resamples its context.
     """
     assert max_samples < np.inf or max_trajs < np.inf, "either max_samples or max_trajs must be finite"
     policy = MakeDeterministic(
         self.policy) if deterministic else self.policy
     paths = []
     n_steps_total = 0
     n_trajs = 0
     while n_steps_total < max_samples and n_trajs < max_trajs:
         path = rollout(self.env,
                        policy,
                        max_path_length=self.max_path_length,
                        accum_context=accum_context)
         # save the latent context that generated this trajectory
         path['context'] = policy.z.detach().cpu().numpy()
         paths.append(path)
         n_steps_total += len(path['observations'])
         n_trajs += 1
         # don't we also want the option to resample z every transition?
         if n_trajs % resample == 0:
             policy.sample_z()
     return paths, n_steps_total
 def _train(self, policy, accum_context):
     for i in range(self.num_train_steps_per_itr):
         path = rollout(self.env,
                        policy,
                        max_path_length=self.max_path_length,
                        accum_context=accum_context)
         self.model.train(path)
Example #5
 def obtain_samples(self, rollout_type="multitask"):
     paths = []
     n_steps_total = 0
     while n_steps_total + self.max_path_length <= self.max_samples:
         if self.randomize_env:
             self.env, env_name = self.alg.get_new_env()
             print(f"Evaluating {env_name}")
         if rollout_type == "multitask":
             path = multitask_rollout(
                 self.env,
                 self.policy,
                 max_path_length=self.max_path_length,
                 animated=False,
                 observation_key='observation',
                 desired_goal_key='desired_goal',
                 get_action_kwargs=dict(
                     return_stacked_softmax=False,
                     mask=np.ones((1, self.env.unwrapped.num_blocks)),
                     deterministic=True
                 )
             )
         else:
             path = rollout(
                 self.env, self.policy, max_path_length=self.max_path_length
             )
         paths.append(path)
         n_steps_total += len(path['observations'])
     return paths
Example #6
def simulate_policy(args):
    #   data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    import cv2
    video = cv2.VideoWriter('ppo_test.avi',
                            cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 30,
                            (640, 480))
    index = 0
    path = rollout(
        env,
        policy,
        max_path_length=args.H,
        render=True,
    )
    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics([path])
    logger.dump_tabular()

    for i, img in enumerate(path['images']):
        print(i)
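        # path['images'] is assumed to hold RGB frames; [:, :, ::-1] reorders channels to BGR for OpenCV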
        video.write(img[:, :, ::-1].astype(np.uint8))
        cv2.imwrite("frames/ppo_test/%06d.png" % index, img[:, :, ::-1])
        index += 1

    video.release()
    print("wrote video")
Example #7
def simulate_policy(args):
    #   data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(Mani2dEnv())
    # env.reset()
    # print(env.step(env.action_space.sample()))
    # sys.exit()
    #   env = env.wrapped_env.unwrapped
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        # policy.cuda()
    # import cv2
    # video = cv2.VideoWriter('diayn_bipedal_walker_hardcore.avi', cv2.VideoWriter_fourcc('M','J','P','G'), 30, (1200, 800))
    index = 0
    for skill in range(policy.stochastic_policy.skill_dim):
        print(skill)
        for _ in range(3):
            path = rollout(
                env,
                policy,
                skill,
                max_path_length=args.H,
                render=True,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
Example #8
File: in_place.py  Project: ajabri/oyster
    def obtain_samples(self,
                       deterministic=False,
                       num_samples=None,
                       num_rollouts=None,
                       is_online=False):
        policy = MakeDeterministic(
            self.policy) if deterministic else self.policy
        paths = []
        n_steps_total = 0
        max_samp = self.max_samples
        if num_samples is not None:
            max_samp = num_samples

        # import pdb; pdb.set_trace()
        while n_steps_total + self.max_path_length <= max_samp:
            if num_rollouts is not None and num_rollouts <= len(paths):
                break

            path = rollout(self.env,
                           policy,
                           max_path_length=self.max_path_length,
                           is_online=is_online)
            paths.append(path)
            n_steps_total += len(path['observations'])
        return paths
    def obtain_samples(self,
                       deterministic=False,
                       max_samples=np.inf,
                       max_trajs=np.inf,
                       accum_context=True,
                       resample=1,
                       testing=False):
        assert max_samples < np.inf or max_trajs < np.inf, "either max_samples or max_trajs must be finite"
        policy = MakeDeterministic(
            self.policy) if deterministic else self.policy
        paths = []
        n_steps_total = 0
        n_trajs = 0

        if self.itr <= self.num_train_itr:
            if self.tandem_train:
                self._train(policy, accum_context)
                self.itr += 1
            else:
                for _ in range(self.num_train_itr):
                    self._train(policy, accum_context)
                    self.itr += 1

        while n_steps_total < max_samples and n_trajs < max_trajs:
            if testing:
                path = rollout(self.env,
                               policy,
                               max_path_length=self.max_path_length,
                               accum_context=accum_context)
            else:
                path = rollout(self.model,
                               policy,
                               max_path_length=self.max_path_length,
                               accum_context=accum_context)

            # save the latent context that generated this trajectory
            path['context'] = policy.z.detach().cpu().numpy()
            paths.append(path)
            n_steps_total += len(path['observations'])
            n_trajs += 1
            # don't we also want the option to resample z every transition?
            if n_trajs % resample == 0:
                policy.sample_z()

        return paths, n_steps_total
Example #10
def plot_separated_by_task(file):
    file = "./logs/sac-pointmass-multitask-5/sac-pointmass-multitask-5_2019_04_20_18_57_19_0000--s-0/params.pkl"

    data = joblib.load(file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']

    # plt.figure(figsize=(8, 8))
    num_goals = len(env.goals)
    has_circle = np.zeros(num_goals).astype(bool)

    fig, ax = plt.subplots(nrows=5, ncols=num_goals // 5)
    fig.set_size_inches(8, 8)
    print("Number of goals:", num_goals)
    for i in range(200):
        path = rollout(
            env,
            policy,
            max_path_length=100,
            animated=False,
        )

        # print(path)
        obs = path["observations"]
        acts = path["actions"]
        goal_idx = np.argmax(obs[0, 2:])
        plot_row, plot_col = goal_idx // 5, goal_idx % 5
        goal_plot = ax[plot_row, plot_col]

        # Turn off
        goal_plot.set_yticklabels([])
        goal_plot.set_xticklabels([])

        start_x = obs[0, 0]
        start_y = obs[0, 1]

        goal_plot.scatter(start_x, start_y, color="green")
        goal_plot.scatter(obs[1:, 0], obs[1:, 1], color="b")
        goal_plot.quiver(obs[:, 0], obs[:, 1], acts[:, 0], acts[:, 1],
                         angles='xy', scale_units='xy', scale=1, width=.005, headwidth=3, alpha=.9)
        # plt.annotate("start=({0}, {1})".format(start_x.round(4), start_y.round(4)), (start_x, start_y), xytext=(start_x-.5, start_y+.2))

        final_x, final_y = obs[len(obs) - 1, 0], obs[len(obs) - 1, 1]
        # plt.annotate("end=({0}, {1})".format(final_x.round(4), final_y.round(4)), (final_x, final_y), xytext=(final_x-.5, final_y-.2))

        goal = env.goals[goal_idx]
        goal_x, goal_y = goal[0], goal[1]
        # plt.annotate("goal=({0}, {1})".format(goal_x.round(4), goal_y.round(4)), (goal_x, goal_y), xytext=(goal_x-.5, goal_y+.1))
        goal_plot.scatter(goal[0], goal[1], color="r")  # Goal
        goal_plot.set_xlim(-1.5, 1.5)
        goal_plot.set_ylim(-1.5, 1.5)

        if not has_circle[goal_idx]:
            circle = plt.Circle((0, 0), 1, color='black', alpha=.5, fill=False)
            goal_plot.add_artist(circle)
            has_circle[goal_idx] = True
Example #11
    def get_eval_policy(self, task_identifier, mode='meta_test'):
        if task_identifier not in self.context_buffer.task_replay_buffers:
            # generate some rollouts with prior policy
            eval_context_buffer = MetaEnvReplayBuffer(
                self.context_buffer_size_per_task,
                self.training_env,
                policy_uses_pixels=self.policy_uses_pixels,
            )

            n_steps_total = 0
            steps_needed = self.num_context_trajs_for_exploration * self.max_path_length
            task_params = self.training_env.task_id_to_task_params(
                task_identifier)
            obs_task_params = self.training_env.task_id_to_obs_task_params(
                task_identifier)
            while n_steps_total < steps_needed:
                first_obs = self.training_env.reset(
                    task_params=task_params, obs_task_params=obs_task_params)
                task_id = self.training_env.task_identifier

                z = self.prior_dist.sample()
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(
                    self.main_policy, z)

                new_path = rollout(self.training_env,
                                   post_cond_policy,
                                   max_path_length=min(
                                       self.max_path_length + 1,
                                       steps_needed - n_steps_total + 1),
                                   do_not_reset=True,
                                   first_obs=first_obs)
                n_steps_total += len(new_path['observations'])
                eval_context_buffer.add_path(new_path, task_id)

            list_of_trajs = eval_context_buffer.sample_trajs_from_task(
                task_identifier,
                self.num_context_trajs_for_exploration,
                samples_per_traj=self.samples_per_traj)
            mask = None
        else:
            list_of_trajs = self.context_buffer.sample_trajs_from_task(
                task_identifier,
                self.num_context_trajs_for_exploration,
            )
            mask = None

        enc_to_use = self.encoder
        was_training = enc_to_use.training
        enc_to_use.eval()
        post_dist = enc_to_use([list_of_trajs], mask)
        enc_to_use.train(was_training)

        z = post_dist.sample()
        z = z.cpu().data.numpy()[0]
        return PostCondMLPPolicyWrapper(self.main_policy, z)
Example #12
 def obtain_samples(self):
     paths = []
     n_steps_total = 0
     while n_steps_total + self.max_path_length <= self.max_samples:
         path = rollout(self.env,
                        self.policy,
                        max_path_length=self.max_path_length)
         paths.append(path)
         n_steps_total += len(path['observations'])
     return paths
Example #13
 def obtain_samples(self):
     paths = []
     n_steps_total = 0
     while n_steps_total + self.max_path_length <= self.max_samples:
         self.start_new_rollout()
         path = rollout(self.env,
                        self.policy,
                        max_path_length=self.max_path_length)
         self.handle_rollout_ending()
         paths.append(path)
         n_steps_total += len(path['observations'])
     return paths
Example #14
def dump_video(
        env,
        policy,
        filename,
        ROWS=3,
        COLUMNS=6,
        do_timer=True,
        horizon=100,
        image_env=None,
        dirname=None,
        subdirname="rollouts",
):
    policy.train(False) # is this right/necessary?
    paths = []
    num_channels = env.vae.input_channels
    frames = []
    N = ROWS * COLUMNS
    for i in range(N):
        rollout_dir = osp.join(dirname, subdirname, str(i))
        os.makedirs(rollout_dir, exist_ok=True)
        start = time.time()
        paths.append(rollout(
            env,
            policy,
            frames,
            max_path_length=horizon,
            animated=False,
            image_env=image_env,
        ))
        rollout_frames = frames[-101:]
        goal_img = np.flip(rollout_frames[0][:84, :84, :], 0)
        scipy.misc.imsave(rollout_dir+"/goal.png", goal_img)
        goal_img = np.flip(rollout_frames[1][:84, :84, :], 0)
        scipy.misc.imsave(rollout_dir+"/z_goal.png", goal_img)
        for j in range(0, 101, 1):
            img = np.flip(rollout_frames[j][84:, :84, :], 0)
            scipy.misc.imsave(rollout_dir+"/"+str(j)+".png", img)
        if do_timer:
            print(i, time.time() - start)

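    # H and W are assumed to be module-level frame height/width constants defined elsewhere in the original file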
    frames = np.array(frames, dtype=np.uint8).reshape((N, horizon + 1, H, W, num_channels))
    f1 = []
    for k1 in range(COLUMNS):
        f2 = []
        for k2 in range(ROWS):
            k = k1 * ROWS + k2
            f2.append(frames[k:k+1, :, :, :, :].reshape((horizon + 1, H, W, num_channels)))
        f1.append(np.concatenate(f2, axis=1))
    outputdata = np.concatenate(f1, axis=2)
    skvideo.io.vwrite(filename, outputdata)
    print("Saved video to ", filename)

    return paths
Example #15
 def obtain_samples(self, deterministic=False, num_samples=None, is_online=False):
     policy = MakeDeterministic(self.policy) if deterministic else self.policy
     paths = []
     n_steps_total = 0
     max_samp = self.max_samples
     if num_samples is not None:
         max_samp = num_samples
     while n_steps_total + self.max_path_length < max_samp:
         path = rollout(
             self.env, policy, max_path_length=self.max_path_length, is_online=is_online)
         paths.append(path)
         n_steps_total += len(path['observations'])
     return paths
Example #16
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example #17
def simulate_policy(args):
    #   data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(gym.make(str(args.env)))
    #   env = env.wrapped_env.unwrapped
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    import cv2
    video = None
    # index = 0
    for skill in range(policy.stochastic_policy.skill_dim):
        for trial in range(3):
            print("skill-{} rollout-{}".format(skill, trial))
            path = rollout(
                env,
                policy,
                skill,
                max_path_length=args.H,
                render=True,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()

            for i, img in enumerate(path['images']):
                # print(i)
                # print(img.shape)
                if video is None:
                    video = cv2.VideoWriter(
                        '{}.avi'.format(str(args.env)),
                        cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 30,
                        (img.shape[1], img.shape[0]))  # frameSize is (width, height)
                video.write(img[:, :, ::-1].astype(np.uint8))


                # cv2.imwrite("frames/diayn_bipedal_walker_hardcore.avi/%06d.png" % index, img[:, :, ::-1])
                # index += 1

    video.release()
    print("wrote video")
def simulate_policy(args):
    data = joblib.load(args.file)  # Pickle is internally used using joblib
    policy = data['policy']
    env = data['env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example #19
def create_policy(variant):
    bottom_snapshot = joblib.load(variant['bottom_path'])
    column_snapshot = joblib.load(variant['column_path'])
    policy = variant['combiner_class'](
        policy1=bottom_snapshot['naf_policy'],
        policy2=column_snapshot['naf_policy'],
    )
    env = bottom_snapshot['env']
    logger.save_itr_params(0, dict(
        policy=policy,
        env=env,
    ))
    path = rollout(
        env,
        policy,
        max_path_length=variant['max_path_length'],
        animated=variant['render'],
    )
    env.log_diagnostics([path])
    logger.dump_tabular()
Example #20
def simulate_policy(args):
    data = joblib.load(args.file)
    import ipdb; ipdb.set_trace()
    policy = data['exploration_policy']  # ? TODO, eval ?
    env = data['env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode("gpu")
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example #21
def _run_policy(env, policy, num_rollouts):
    """
    Takes in a trained policy, runs the policy for a specified number of
    rollouts, and returns the results of the experiment.
    :param env: The environment to run the policy in.
    :param policy: The trained policy to run.
    :param num_rollouts: The number of rollouts to experience.
    :return: A dict containing the `num_rollouts` recorded paths, together with
             start, final, and goal states and actions, for further analysis.
    """
    start_states, final_states, goal_states, actions, paths = [], [], [], [], []

    for i in range(num_rollouts):
        path = rollout(
            env,
            policy,
            max_path_length=100,
            animated=False,
        )
        obs = path["observations"]
        acts = path["actions"]

        goal_idx = np.argmax(obs[0, 2:])
        start_x, start_y = obs[0, 0], obs[0, 1]
        acts_x, acts_y = acts[:, 0], acts[:, 1]
        final_x, final_y = obs[len(obs) - 1, 0], obs[len(obs) - 1, 1]
        goal = env.goals[goal_idx]
        goal_x, goal_y = goal[0], goal[1]

        start_states.append(np.array([start_x, start_y]))
        final_states.append(np.array([final_x, final_y]))
        goal_states.append(np.array([goal_x, goal_y]))
        actions.append(np.array([acts_x, acts_y]))
        paths.append(path)

    return dict(start_states=np.array(start_states),
                final_states=np.array(final_states),
                goal_states=np.array(goal_states),
                actions=np.array(actions),
                paths=paths,
                env=env)
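
A hypothetical driver for the helper above; the checkpoint keys follow the convention used in the other examples on this page, and the file path is purely illustrative:

import joblib

data = joblib.load('params.pkl')  # illustrative checkpoint path
results = _run_policy(data['evaluation/env'], data['evaluation/policy'], num_rollouts=20)
print(results['final_states'].shape, results['goal_states'].shape)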
Example #22
    def pretrain(self):
        print('Generating initial contexts')

        # fill the contexts
        for task_params, obs_task_params in self.train_task_params_sampler:
            print('task')
            n_steps_total = 0
            # print(n_steps_total)
            while n_steps_total < self.context_buffer_size_per_task:
                # print('------')
                # print(n_steps_total)
                # print(self.context_buffer_size_per_task)
                # print(self.max_path_length)

                first_obs = self.training_env.reset(
                    task_params=task_params, obs_task_params=obs_task_params)
                task_id = self.training_env.task_identifier

                z = self.prior_dist.sample()
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(
                    self.main_policy, z)

                new_path = rollout(
                    self.training_env,
                    post_cond_policy,
                    max_path_length=min(
                        self.max_path_length + 1,
                        self.context_buffer_size_per_task - n_steps_total + 1),
                    do_not_reset=True,
                    first_obs=first_obs)
                # print(len(new_path['observations']))
                n_steps_total += len(new_path['observations'])

                if self.add_context_rollouts_to_replay_buffer:
                    self.replay_buffer.add_path(new_path, task_id)
                self.context_buffer.add_path(new_path, task_id)

        print('Generating initial replay buffer rollouts')
        super().pretrain()
Example #23
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['policy']
    # env = data['env']
    from rlkit.envs.mujoco_manip_env import MujocoManipEnv
    env = MujocoManipEnv("SawyerLiftEnv", render=True)
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example #24
def simulate_policy(args):
    manager_data = torch.load(args.manager_file)
    worker_data = torch.load(args.worker_file)
    policy = manager_data['evaluation/policy']
    worker = worker_data['evaluation/policy']
    env = NormalizedBoxEnv(gym.make(str(args.env)))
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    import cv2
    video = cv2.VideoWriter('ppo_dirichlet_diayn_bipedal_walker_hardcore.avi',
                            cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 30,
                            (1200, 800))
    index = 0

    path = rollout(
        env,
        policy,
        worker,
        continuous=True,
        max_path_length=args.H,
        render=True,
    )
    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics([path])
    logger.dump_tabular()

    for i, img in enumerate(path['images']):
        print(i)
        video.write(img[:, :, ::-1].astype(np.uint8))
        #        cv2.imwrite("frames/ppo_dirichlet_diayn_policy_bipedal_walker_hardcore/%06d.png" % index, img[:,:,::-1])
        index += 1

    video.release()
    print("wrote video")
Example #25
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['mpc_controller']
    env = data['env']
    print("Policy loaded")
    if args.pause:
        import ipdb
        ipdb.set_trace()
    policy.cost_fn = env.cost_fn
    policy.env = env
    if args.T:
        policy.mpc_horizon = args.T
    paths = []
    while True:
        paths.append(
            rollout(
                env,
                policy,
                max_path_length=args.H,
                animated=True,
            ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        logger.dump_tabular()
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']
    print("Policy loaded")

    farmer = Farmer([('0.0.0.0', 1)])
    env_to_sim = farmer.force_acq_env()

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env_to_sim,
            policy,
            max_path_length=args.H,
            animated=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example #27
def simulate_policy(args):
    data = joblib.load(args.file)

    cont = False

    if 'policies' in data:
        policy = data['policies'][0]
    else:
        policy = data['policy']
    env = NormalizedBoxEnv(create_swingup())  #data['env']

    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
        data['qf1'].cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)

    diayn = 'df' in data
    rnd = 'rf' in data

    if diayn:
        skills = len(data['eval_policy'].skill_vec)
        disc = data['df']

        policy = OptionPolicy(policy, skills, cont)
        if args.gpu:
            disc.cuda()
        if isinstance(policy, PyTorchModule):
            disc.train(False)

    if rnd:
        data['rf'].cuda()
        data['pf'].cuda()
        data['qf1'].cuda()

    import cv2
    video = cv2.VideoWriter('video.avi', cv2.VideoWriter_fourcc(*"H264"), 30,
                            (640, 480))
    index = 0

    truth, pred = [], []

    if cont:
        eps = 1
    elif diayn:
        eps = skills * 2
    else:
        eps = 5

    Rs = []

    for ep in range(eps):
        if diayn and not cont:
            z_index = ep // 2
            policy.set_z(z_index)

        path = rollout(
            env,
            policy,
            max_path_length=args.H * skills if cont else args.H,
            animated=True,
        )

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        total_r = 0

        if diayn:
            predictions = F.log_softmax(
                disc(torch.FloatTensor(path['observations']).cuda()),
                1).cpu().detach().numpy()
            probs = predictions.max(1)
            labels = predictions.argmax(1)

            if cont:
                for k in range(skills):
                    truth.extend([k] * 100)
            else:
                truth.extend([z_index] * len(labels))
            pred.extend(labels.tolist())

        if rnd:
            random_feats = data['rf'](torch.FloatTensor(
                path['observations']).cuda())
            pred_feats = data['pf'](torch.FloatTensor(
                path['observations']).cuda())

            i_rewards = ((random_feats -
                          pred_feats)**2.0).sum(1).cpu().data.numpy()

        q_pred = data['qf1'](torch.FloatTensor(path['observations']).cuda(),
                             torch.FloatTensor(
                                 path['actions']).cuda()).cpu().data.numpy()

        for i, (img, r, s) in enumerate(
                zip(path['images'], path['rewards'], path['observations'])):
            #video.write(img[:,:,::-1].astype(np.uint8))
            total_r += r[0]
            img = img.copy()
            img = np.rot90(img, 3).copy()
            col = (255, 0, 255)
            cv2.putText(img, "step: %d" % (i + 1), (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)

            if diayn:
                if cont:
                    cv2.putText(img, "z: %s" % str(truth[i]), (20, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255),
                                2, cv2.LINE_AA)
                else:
                    cv2.putText(img, "z: %s" % str(z_index), (20, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255),
                                2, cv2.LINE_AA)

                cv2.putText(img,
                            "disc_pred: %s (%.3f)" % (labels[i], probs[i]),
                            (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                            (255, 255, 255), 2, cv2.LINE_AA)
                cv2.putText(img, "reward: %.3f" % r[0], (20, 160),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2,
                            cv2.LINE_AA)
                cv2.putText(img, "total reward: %.1f" % total_r, (20, 200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2,
                            cv2.LINE_AA)
                cv2.putText(img, "action: %s" % path['actions'][i], (20, 240),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2,
                            cv2.LINE_AA)
            else:
                cv2.putText(img, "reward: %.1f" % r[0], (20, 80),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
                cv2.putText(img, "total reward: %.1f" % total_r, (20, 120),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
                y = 120

            if rnd:
                cv2.putText(img, "i reward (unscaled): %.3f" % i_rewards[i],
                            (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2,
                            cv2.LINE_AA)
                #cv2.rectangle(img, (20, 180), (20 + int(q_pred[i, 0]), 200), (255, 0, 255), -1)
                cv2.rectangle(img, (20, 200),
                              (20 + int(i_rewards[i] * 10), 220),
                              (255, 255, 0), -1)
                y = 220

            try:
                y += 40
                cv2.putText(img, "Q: %.3f" % q_pred[i], (20, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
            except:
                y += 40
                cv2.putText(img, "Q:" + str([q for q in q_pred[i]]), (20, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
            y += 40
            cv2.putText(img, str(["%.3f" % x
                                  for x in path['observations'][i]]), (20, y),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)

            try:
                cv2.imwrite("frames/%06d.png" % index, img[:, :, ::-1])
            except:
                cv2.imwrite("frames/%06d.png" % index, img[:, :])
            index += 1

        if diayn:
            print(z_index, ":", total_r)
        Rs.append(total_r)

    print("best", np.argmax(Rs))
    print("worst", np.argmin(Rs))

    video.release()
    print("wrote video")

    if diayn:
        import sklearn
        from sklearn.metrics import confusion_matrix
        import matplotlib as mpl
        import itertools
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        normalize = False
        classes = range(skills)
        cm = confusion_matrix(truth, pred)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.colorbar()
        tick_marks = np.arange(skills)
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        """
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        """

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig("confusion.png")
Example #28
    args = parser.parse_args()

    vertical_pos = 'middle'
    horizontal_pos = 'bottom'

    ddpg1_snapshot_path, ddpg2_snapshot_path, x_goal, y_goal = (
        get_snapshots_and_goal(
            vertical_pos=vertical_pos,
            horizontal_pos=horizontal_pos,
        ))
    env_params = dict(goal=(x_goal, y_goal), )
    env = PusherEnv3DOF(**env_params)
    env = normalize(env)
    ddpg1_snapshot_dict = joblib.load(ddpg1_snapshot_path)
    ddpg2_snapshot_dict = joblib.load(ddpg2_snapshot_path)
    policy = AveragerPolicy(
        ddpg1_snapshot_dict['policy'],
        ddpg2_snapshot_dict['policy'],
    )

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        )
        env.log_diagnostics([path])
        policy.log_diagnostics([path])
        logger.dump_tabular()
Example #29
def sim_policy(variant,
               path_to_exp,
               num_trajs=1,
               deterministic=False,
               save_video=False,
               animated=False):
    '''
    simulate a trained policy adapting to a new task
    optionally save videos of the trajectories - requires ffmpeg

    :variant: experiment configuration dict
    :path_to_exp: path to exp folder
    :num_trajs: number of trajectories to simulate per task (default 1)
    :deterministic: if the policy is deterministic (default stochastic)
    :save_video: whether to generate and save a video (default False)
    :animated: whether to render the rollouts on screen (default False)
    '''

    # create multi-task environment and sample tasks
    env = CameraWrapper(
        NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])),
        variant['util_params']['gpu_id'])
    if animated:
        env.render()
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    eval_tasks = list(tasks[-variant['n_eval_tasks']:])
    print('testing on {} test tasks, {} trajectories each'.format(
        len(eval_tasks), num_trajs))

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    reward_dim = 1
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=obs_dim + action_dim + reward_dim,
        output_size=context_encoder,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    # deterministic eval
    if deterministic:
        agent = MakeDeterministic(agent)

    # load trained weights (otherwise simulate random policy)
    context_encoder.load_state_dict(
        torch.load(os.path.join(path_to_exp, 'context_encoder.pth'),
                   map_location=torch.device('cpu')))
    policy.load_state_dict(
        torch.load(os.path.join(path_to_exp, 'policy.pth'),
                   map_location=torch.device('cpu')))

    # loop through tasks collecting rollouts
    all_rets = []
    video_frames = []
    for idx in eval_tasks:
        env.reset_task(idx)
        agent.clear_z()
        paths = []
        for n in range(num_trajs):
            path = rollout(
                env,
                agent,
                max_path_length=variant['algo_params']['num_steps_per_eval'],
                accum_context=True,
                animated=animated,
                save_frames=save_video)
            paths.append(path)
            if save_video:
                video_frames += [t['frame'] for t in path['env_infos']]
            if n >= variant['algo_params']['num_exp_traj_eval']:
                agent.infer_posterior(agent.context)
        all_rets.append([sum(p['rewards']) for p in paths])

    if save_video:
        # save frames to file temporarily
        temp_dir = os.path.join(path_to_exp, 'temp')
        os.makedirs(temp_dir, exist_ok=True)
        for i, frm in enumerate(video_frames):
            frm.save(os.path.join(temp_dir, '%06d.jpg' % i))

        video_filename = os.path.join(path_to_exp, 'video.mp4')
        # run ffmpeg to make the video
        os.system('ffmpeg -i {}/%06d.jpg -vcodec mpeg4 {}'.format(
            temp_dir, video_filename))
        # delete the frames
        shutil.rmtree(temp_dir)

    # compute average returns across tasks
    n = min([len(a) for a in all_rets])
    rets = [a[:n] for a in all_rets]
    rets = np.mean(np.stack(rets), axis=0)
    for i, ret in enumerate(rets):
        print('trajectory {}, avg return: {} \n'.format(i, ret))
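
A hypothetical way to invoke sim_policy, assuming the experiment directory holds a JSON copy of the training configuration (the folder and file names below are assumptions, not guaranteed by the snippet above):

import json
import os

path_to_exp = './output/cheetah-dir/2020_01_01_00_00_00'  # illustrative experiment folder
with open(os.path.join(path_to_exp, 'variant.json')) as f:  # assumed config file name
    variant = json.load(f)
sim_policy(variant, path_to_exp, num_trajs=2, deterministic=True, save_video=False)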
Example #30
    policy = data['evaluation/policy']
    env = data['evaluation/env']

    plt.figure(figsize=(8, 8))
    num_goals = len(env.goals)

    final_states = []
    goals = []

    print("Number of goals:", num_goals)
    num_plotted = 0
    # for i in range(10):
    while num_plotted < 100:
        path = rollout(
            env,
            policy,
            max_path_length=100,
            animated=False,
        )

        # print(path)
        obs = path["observations"]
        acts = path["actions"]
        goal_idx = np.argmax(obs[0, 2:])

        num_plotted += 1

        plot_row, plot_col = goal_idx // 5, goal_idx % 5

        start_x = obs[0, 0]
        start_y = obs[0, 1]