Example #1
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    print("Policy loaded")
    env = data['env']
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
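
Note: the examples on this page call a rollout helper that is not shown. The snippet below is a minimal sketch of such a helper, inferred only from how it is invoked here (env, policy, max_path_length, animated) and from the fields the examples read back (path['rewards'], the path dict passed to env.log_diagnostics). The Gym-style env.step/env.render interface and the policy.get_action method are assumptions, not the original implementation; later examples also pass extra keyword arguments (obs_normalizer, rollout_start_fcn, rollout_end_fcn) that are omitted here.

import numpy as np

def rollout(env, policy, max_path_length=np.inf, animated=False):
    """Minimal sketch of a rollout helper (assumed interface, not the original)."""
    observations, actions, rewards = [], [], []
    obs = env.reset()
    path_length = 0
    while path_length < max_path_length:
        if animated:
            env.render()
        # Assumed policy interface: get_action returns (action, agent_info)
        action, _ = policy.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        observations.append(obs)
        actions.append(action)
        rewards.append(reward)
        path_length += 1
        obs = next_obs
        if done:
            break
    # Return stacked arrays, which is the shape of result the examples
    # consume (e.g. path['rewards'].sum()).
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        rewards=np.array(rewards),
    )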
Example #2
    def pretrain(self, n_pretrain_samples):
        if (
                self.num_paths_for_normalization == 0
                or (self.obs_normalizer is None and self.action_normalizer is None)
        ):
            observation = self.explo_env.reset()
            for ii in range(n_pretrain_samples):
                action = self.explo_env.action_space.sample()
                # Interact with environment
                next_ob, reward, terminal, env_info = (
                    self.explo_env.step(action)
                )
                agent_info = None

                # Increase counter
                self._n_env_steps_total += 1
                # Create np.array of obtained terminal and reward
                terminal = np.array([terminal])
                reward = np.array([reward])
                # Add to replay buffer
                self.replay_buffer.add_sample(
                    observation=observation,
                    action=action,
                    reward=reward,
                    terminal=terminal,
                    next_observation=next_ob,
                    agent_info=agent_info,
                    env_info=env_info,
                )
                observation = next_ob

                if self._obs_normalizer is not None:
                    self._obs_normalizer.update(np.array([observation]))

                if terminal:
                    # Restart the episode and continue from the new initial observation
                    observation = self.explo_env.reset()
        else:
            pretrain_paths = []
            random_policy = RandomPolicy(self.explo_env.action_space)
            while len(pretrain_paths) < self.num_paths_for_normalization:
                path = rollout(self.explo_env, random_policy, self.max_path_length)
                pretrain_paths.append(path)
            ob_mean, ob_std, ac_mean, ac_std = (
                compute_normalization(pretrain_paths)
            )
            if self.obs_normalizer is not None:
                self.obs_normalizer.set_mean(ob_mean)
                self.obs_normalizer.set_std(ob_std)
                self._target_qf.obs_normalizer = self.obs_normalizer
                self._target_policy.obs_normalizer = self.obs_normalizer
            if self.action_normalizer is not None:
                self.action_normalizer.set_mean(ac_mean)
                self.action_normalizer.set_std(ac_std)
                self._target_qf.action_normalizer = self.action_normalizer
                self._target_policy.action_normalizer = self.action_normalizer
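
The pretrain method above depends on a compute_normalization helper that is not shown. Below is a minimal sketch of what it could compute, assuming each path is a dict containing 'observations' and 'actions' arrays (as produced by a rollout); this illustrates the idea and is not the repository's actual code.

import numpy as np

def compute_normalization(paths, std_eps=1e-6):
    # Sketch only: stack observations/actions from every collected path and
    # compute per-dimension statistics for the normalizers.
    obs = np.concatenate([path['observations'] for path in paths], axis=0)
    acts = np.concatenate([path['actions'] for path in paths], axis=0)
    ob_mean, ob_std = obs.mean(axis=0), obs.std(axis=0) + std_eps
    ac_mean, ac_std = acts.mean(axis=0), acts.std(axis=0) + std_eps
    return ob_mean, ob_std, ac_mean, ac_std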
Example #3
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print(
                'Using the deterministic version of the UNintentional policy '
                '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    # MultiPolicySelector(data['u_policy'], args.un))
                    WeightedMultiPolicySelector(data['policy'], args.un))
            else:
                policy = MakeDeterministic(
                    WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            policy = MakeDeterministic(data['policy'])
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
            else:
                # policy = data['u_policies'][args.un]
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env = NormalizedBoxEnv(Navigation2dGoalCompoEnv(**env_params))
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example #4
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    # env = data['env']
    env = NormalizedBoxEnv(gym.make(args.env))
    print("Environment loaded!!")

    # # Load environment
    # with open('variant.json') as json_data:
    #     env_params = json.load(json_data)['env_params']
    # env_params.pop('goal')
    # env_params['is_render'] = True
    # env = NormalizedBoxEnv(args.env(**env_params))
    # print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    # else:
    #     set_gpu_mode(False)
    #     policy.cpu()

    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        if args.record:
            env.start_recording_video('prueba.mp4')
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        print('Accum reward is: ', path['rewards'].sum())
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        if args.record:
            env.stop_recording_video()
            break
Example #5
def simulate_policy(args):

    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional policy '
                  '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                    # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un)
                    )
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])

    env_params.pop('goal', None)
    env_params['is_render'] = True

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('pusher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        # plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
Example #6
def simulate_policy(args):

    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print(
                'Using the deterministic version of the UNintentional policy '
                '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params['is_render'] = True

    if 'obs_mean' in data.keys():
        obs_mean = data['obs_mean']
        print('OBS_MEAN')
        print(repr(obs_mean))
    else:
        obs_mean = None
        # obs_mean = np.array([ 0.07010766,  0.37585765,  0.21402615,  0.24426296,  0.5789634 ,
        #                       0.88510203,  1.6878743 ,  0.02656335,  0.03794186, -1.0241051 ,
        #                       -0.5226027 ,  0.6198239 ,  0.49062446,  0.01197532,  0.7888951 ,
        #                       -0.4857273 ,  0.69160587, -0.00617676,  0.08966777, -0.14694819,
        #                       0.9559917 ,  1.0450271 , -0.40958315,  0.86435956,  0.00609685,
        #                       -0.01115279, -0.21607827,  0.9762933 ,  0.80748135, -0.48661205,
        #                       0.7473679 ,  0.01649722,  0.15451911, -0.17285274,  0.89978695])

    if 'obs_var' in data.keys():
        obs_var = data['obs_var']
        print('OBS_VAR')
        print(repr(obs_var))
    else:
        obs_var = None
        # obs_var = np.array([0.10795759, 0.12807205, 0.9586606 , 0.46407   , 0.8994803 ,
        #                     0.35167143, 0.30286264, 0.34667444, 0.35105848, 1.9919134 ,
        #                     0.9462659 , 2.245269  , 0.84190637, 1.5407104 , 0.1       ,
        #                     0.10330457, 0.1       , 0.1       , 0.1       , 0.1528581 ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.1       ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.12320185,
        #                     0.1       , 0.18369523, 0.200373  , 0.11895574, 0.15118493])
    print(env_params)

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un
    # else:
    #     env_params['subtask'] = None

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('centauro_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
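
Example #6 calls plot_rollout_reward, which is not defined in the snippet. A minimal sketch is given below, assuming path['rewards'] is a 1-D (or squeezable) NumPy array of per-step rewards; the plotting details are an assumption for illustration only.

import matplotlib.pyplot as plt

def plot_rollout_reward(path):
    # Sketch only: plot the per-step reward of a single rollout.
    rewards = path['rewards'].squeeze()
    plt.plot(rewards)
    plt.xlabel('Time step')
    plt.ylabel('Reward')
    plt.title('Return: %.3f' % rewards.sum())
    plt.show()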
Example #7
def simulate_policy(args):

    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        if isinstance(data['policy'], ExplorationPolicy):
            policy = MakeDeterministic(data['policy'])
        else:
            policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']

    env_params['is_render'] = True
    env = NormalizedBoxEnv(
        Reacher2D3DofBulletEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('reacher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
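
All of the simulate_policy examples expect a pre-parsed args namespace. The sketch below shows a hypothetical argparse wrapper covering the attributes the snippets actually read (file, env, H, un, deterministic, gpu, record, subtask); the flag names and defaults are assumptions for illustration and are not taken from the original scripts.

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wrapper; flag names and defaults are assumptions chosen
    # to match the attributes accessed by the simulate_policy examples above.
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='Snapshot file saved with joblib')
    parser.add_argument('--env', type=str, default=None,
                        help='Gym environment id (used in Example #4)')
    parser.add_argument('--H', type=int, default=-1,
                        help='Max path length; -1 keeps the value from variant.json')
    parser.add_argument('--un', type=int, default=-1,
                        help='Index of the unintentional policy; -1 uses the intentional one')
    parser.add_argument('--deterministic', action='store_true')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--record', action='store_true')
    parser.add_argument('--subtask', action='store_true')
    args = parser.parse_args()

    simulate_policy(args)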