import torch
import numpy as np
from collections import deque

# NOTE: project-specific helpers used below (ptu, from_numpy, set_seed,
# env_producer, MlpEncoder, ProbabilisticContextEncoder, FlattenMlp,
# VaeDecoder, PerturbationGenerator) are assumed to be imported from the
# surrounding codebase.


class RemotePathCollectorSingleMdp(object):
    def __init__(self, index, variant, candidate_size=10):
        ptu.set_gpu_mode(True)
        torch.set_num_threads(1)

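        # Clear sys.argv so that modules imported inside this worker do not try
        # to parse the parent process's command-line arguments.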
        import sys
        sys.argv = ['']
        del sys

        env_max_action = variant['env_max_action']
        obs_dim = variant['obs_dim']
        action_dim = variant['action_dim']
        latent_dim = variant['latent_dim']
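        # As in BCQ, the VAE latent dimension is twice the action dimension.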
        vae_latent_dim = 2 * action_dim
        mlp_encoder_input_size = (
            2 * obs_dim + action_dim + 1 if variant['use_next_obs_in_context']
            else obs_dim + action_dim + 1
        )

        mlp_encoder = MlpEncoder(
            hidden_sizes=[200, 200, 200],
            input_size=mlp_encoder_input_size,
            output_size=2 * variant['latent_dim'],
        )
        self.context_encoder = ProbabilisticContextEncoder(
            mlp_encoder, variant['latent_dim'])
        self.Qs = FlattenMlp(
            hidden_sizes=variant['Qs_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=1,
        )
        self.vae_decoder = VaeDecoder(
            max_action=variant['env_max_action'],
            hidden_sizes=variant['vae_hidden_sizes'],
            input_size=obs_dim + vae_latent_dim + latent_dim,
            output_size=action_dim,
        )
        self.perturbation_generator = PerturbationGenerator(
            max_action=env_max_action,
            hidden_sizes=variant['perturbation_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=action_dim,
        )

        self.use_next_obs_in_context = variant['use_next_obs_in_context']

        self.env = env_producer(variant['domain'], variant['seed'])
        self.num_evals = variant['num_evals']
        self.max_path_length = variant['max_path_length']

        self.vae_latent_dim = vae_latent_dim
        self.candidate_size = variant['candidate_size']

        self.env.seed(10 * variant['seed'] + 1234 + index)
        set_seed(10 * variant['seed'] + 1234 + index)

        self.env.action_space.np_random.seed(123 + index)

    def async_evaluate(self, goal):
        self.env.set_goal(goal)

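        # Reset the posterior over the task embedding before evaluating.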
        self.context_encoder.clear_z()

        avg_reward = 0.
        avg_achieved = []
        final_achieved = []

        raw_context = deque()
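        # Context accumulates across episodes. The first two episodes serve as
        # warm-up for task inference: their rewards are excluded from the
        # average (see the `i > 1` checks and the division by num_evals - 2).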
        for i in range(self.num_evals):
            # Sample an MDP identity (task embedding z) from the current posterior.
            self.context_encoder.sample_z()
            inferred_mdp = self.context_encoder.z

            obs = self.env.reset()
            done = False
            path_length = 0

            while not done and path_length < self.max_path_length:
                action = self.select_actions(np.array(obs), inferred_mdp)
                next_obs, reward, done, env_info = self.env.step(action)
                avg_achieved.append(env_info['achieved'])
                if self.use_next_obs_in_context:
                    new_context = np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        next_obs.reshape(1, -1),
                        np.array(reward).reshape(1, -1),
                    ], axis=1)
                else:
                    # This collector always uses next_obs in the context;
                    # the branch below is intentionally unreachable.
                    assert False
                    new_context = np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        np.array(reward).reshape(1, -1),
                    ], axis=1)
                raw_context.append(new_context)
                obs = next_obs.copy()
                if i > 1:
                    avg_reward += reward
                path_length += 1

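            # Re-infer the task posterior from all context gathered so far;
            # the next episode samples z from this updated posterior.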
            context = from_numpy(np.concatenate(raw_context, axis=0))[None]
            self.context_encoder.infer_posterior(context)

            if i > 1:
                final_achieved.append(env_info['achieved'])

        avg_reward /= (self.num_evals - 2)
        if np.isscalar(env_info['achieved']):
            avg_achieved = np.mean(avg_achieved)
            final_achieved = np.mean(final_achieved)
        else:
            avg_achieved = np.stack(avg_achieved)
            avg_achieved = np.mean(avg_achieved, axis=0)

            final_achieved = np.stack(final_achieved)
            final_achieved = np.mean(final_achieved, axis=0)
        print(avg_reward)
        return avg_reward, (final_achieved.tolist(), self.env._goal.tolist())

    def async_evaluate_test(self, goal):
        self.env.set_goal(goal)
        self.context_encoder.clear_z()

        avg_reward_list = []
        online_achieved_list = []

        raw_context = deque()
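        # Context accumulates across episodes, but each episode's return and
        # achieved statistics are recorded separately below.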
        for _ in range(self.num_evals):
            # Sample an MDP identity (task embedding z) from the current posterior.
            self.context_encoder.sample_z()
            inferred_mdp = self.context_encoder.z

            obs = self.env.reset()
            done = False
            path_length = 0
            avg_reward = 0.
            online_achieved = []
            while not done and path_length < self.max_path_length:
                action = self.select_actions(np.array(obs), inferred_mdp)
                next_obs, reward, done, env_info = self.env.step(action)
                achieved = env_info['achieved']
                online_achieved.append(np.arctan(achieved[1] / achieved[0]))
                if self.use_next_obs_in_context:
                    new_context = np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        next_obs.reshape(1, -1),
                        np.array(reward).reshape(1, -1),
                    ], axis=1)
                else:
                    new_context = np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        np.array(reward).reshape(1, -1),
                    ], axis=1)
                raw_context.append(new_context)
                obs = next_obs.copy()
                avg_reward += reward
                path_length += 1

            avg_reward_list.append(avg_reward)
            online_achieved = np.array(online_achieved)
            online_achieved_list.append([
                online_achieved.mean(),
                online_achieved.std(),
                self.env._goal,
            ])

            context = from_numpy(np.concatenate(raw_context, axis=0))[None]
            self.context_encoder.infer_posterior(context)

        return online_achieved_list

    def set_network_params(self, params_list):
        '''
        The shipped params arrive on CPU. This function sets the
        sampler's network parameters from params_list and moves
        the networks to the GPU.
        '''
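        # Expected layout of params_list (matching the unpacking below):
        #   [context encoder MLP params, Q-network params,
        #    VAE decoder params, perturbation generator params]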
        context_encoder_params, Qs_params, vae_params, perturbation_params = params_list

        self.context_encoder.mlp_encoder.set_param_values(
            context_encoder_params)
        self.context_encoder.mlp_encoder.to(ptu.device)

        self.Qs.set_param_values(Qs_params)
        self.Qs.to(ptu.device)

        self.vae_decoder.set_param_values(vae_params)
        self.vae_decoder.to(ptu.device)

        self.perturbation_generator.set_param_values(perturbation_params)
        self.perturbation_generator.to(ptu.device)

    def select_actions(self, obs, inferred_mdp):
        # Tile the observation as in BCQ; candidate_size is the number of
        # candidate actions evaluated per step.
        obs = from_numpy(np.tile(obs.reshape(1, -1), (self.candidate_size, 1)))
        with torch.no_grad():
            inferred_mdp = inferred_mdp.repeat(self.candidate_size, 1)
            # Sample VAE latents and clip them, as in BCQ.
            z = from_numpy(
                np.random.normal(0, 1, size=(obs.size(0), self.vae_latent_dim))
            ).clamp(-0.5, 0.5).to(ptu.device)
            # Decode candidate actions, perturb them, and keep the one with
            # the highest Q-value.
            candidate_actions = self.vae_decoder(obs, z, inferred_mdp)
            perturbed_actions = self.perturbation_generator.get_perturbed_actions(
                obs, candidate_actions, inferred_mdp)
            qv = self.Qs(obs, perturbed_actions, inferred_mdp)
            ind = qv.max(0)[1]
        return ptu.get_numpy(perturbed_actions[ind])


class PathCollectorSingleMdp(object):
    def __init__(self, variant, goal, candidate_size=10):
        ptu.set_gpu_mode(True)

        import sys
        sys.argv = ['']
        del sys

        env_max_action = variant['env_max_action']
        obs_dim = variant['obs_dim']
        action_dim = variant['action_dim']
        latent_dim = variant['latent_dim']
        vae_latent_dim = 2 * action_dim

        self.f = MlpEncoder(
            g_hidden_sizes=variant['g_hidden_sizes'],
            g_input_sizes=obs_dim + action_dim + 1,
            g_latent_dim=variant['g_latent_dim'],
            h_hidden_sizes=variant['h_hidden_sizes'],
            latent_dim=latent_dim,
        )
        self.Qs = FlattenMlp(
            hidden_sizes=variant['Qs_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=1,
        )
        self.vae_decoder = VaeDecoder(
            max_action=variant['env_max_action'],
            hidden_sizes=variant['vae_hidden_sizes'],
            input_size=obs_dim + vae_latent_dim + latent_dim,
            output_size=action_dim,
        )
        self.perturbation_generator = PerturbationGenerator(
            max_action=env_max_action,
            hidden_sizes=variant['perturbation_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=action_dim,
        )

        self.env = env_producer(variant['domain'], variant['seed'], goal)
        self.num_evals = variant['algo_params']['num_evals']
        self.max_path_length = variant['algo_params']['max_path_length']

        self.vae_latent_dim = vae_latent_dim
        self.num_trans_context = variant['num_trans_context']
        self.candidate_size = variant['candidate_size']

    def async_evaluate(self, params_list, goal=None):
        if goal is not None:
            self.env.set_goal(goal)

        self.set_network_params(params_list)
        avg_reward = 0.
        avg_achieved = []
        final_achieved = []
        for _ in range(self.num_evals):
            obs = self.env.reset()
            done = False
            path_length = 0
            raw_context = deque()
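            # Unlike RemotePathCollectorSingleMdp above, the context deque is
            # reset every episode, so the task is re-inferred from scratch
            # within each rollout.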
            while not done and path_length < self.max_path_length:
                action = self.select_actions(np.array(obs), raw_context)
                next_obs, reward, done, env_info = self.env.step(action)
                avg_achieved.append(env_info['achieved'])
                raw_context.append(np.concatenate([
                    obs.reshape(1, -1),
                    action.reshape(1, -1),
                    np.array(reward).reshape(1, -1),
                ], axis=1))
                print(env_info['achieved'])
                obs = next_obs.copy()
                avg_reward += reward
                path_length += 1
            final_achieved.append(env_info['achieved'])

        avg_reward /= self.num_evals
        if np.isscalar(env_info['achieved']):
            avg_achieved = np.mean(avg_achieved)
            final_achieved = np.mean(final_achieved)
        else:
            # avg_achieved = np.stack(avg_achieved)
            # avg_achieved = np.mean(avg_achieved, axis=0)

            final_achieved = np.stack(final_achieved)
            final_achieved = np.mean(final_achieved, axis=0)

        return avg_reward, (final_achieved.tolist(), self.env._goal.tolist())
        # return avg_reward, (avg_achieved, self.env._goal), (final_achieved, self.env._goal)

    def get_rollout(self, goal=None, bcq_policy=None):
        if goal is not None:
            self.env.set_goal(goal)

        obs = self.env.reset()
        done = False
        path_length = 0
        avg_reward = 0.
        traj = []
        raw_context = deque()
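        # Optionally warm-start the first 20 steps with a provided BCQ policy,
        # then switch to the context-conditioned policy.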
        while not done and path_length < self.max_path_length:
            if bcq_policy is not None and path_length < 20:
                # print(obs[:2])
                action = bcq_policy.select_action(obs)
            else:
                # print(obs[:2])
                action = self.select_actions(np.array(obs), raw_context)
            next_obs, reward, done, env_info = self.env.step(action)
            traj.append([obs, next_obs, action, reward, raw_context, env_info])
            raw_context.append(np.concatenate([
                obs.reshape(1, -1),
                action.reshape(1, -1),
                np.array(reward).reshape(1, -1),
            ], axis=1))
            obs = next_obs.copy()
            path_length += 1
            avg_reward += reward

        print(avg_reward)
        return traj

    def set_network_params(self, params_list):
        '''
        The shipped params arrive on CPU. This function sets the
        sampler's network parameters from params_list and moves
        the networks to the GPU.
        '''
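        # Expected layout of params_list (matching the unpacking below):
        #   [f (context encoder) params, Q-network params,
        #    VAE decoder params, perturbation generator params]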
        f_params, Qs_params, vae_params, perturbation_params = params_list

        self.f.set_param_values(f_params)
        self.f.to(ptu.device)

        self.Qs.set_param_values(Qs_params)
        self.Qs.to(ptu.device)

        self.vae_decoder.set_param_values(vae_params)
        self.vae_decoder.to(ptu.device)

        self.perturbation_generator.set_param_values(perturbation_params)
        self.perturbation_generator.to(ptu.device)

    def select_actions(self, obs, raw_context):
        # Tile the observation as in BCQ; candidate_size is the number of
        # candidate actions evaluated per step.
        obs = from_numpy(np.tile(obs.reshape(1, -1), (self.candidate_size, 1)))
        if len(raw_context) == 0:
            # With no context yet, the inferred MDP embedding is a zero vector.
            inferred_mdp = ptu.zeros((1, self.f.latent_dim))
        else:
            # Encode the accumulated raw transitions into a task embedding.
            context = from_numpy(np.concatenate(raw_context, axis=0))[None]
            inferred_mdp = self.f(context)
        with torch.no_grad():
            inferred_mdp = inferred_mdp.repeat(self.candidate_size, 1)
            # Sample VAE latents and clip them, as in BCQ.
            z = from_numpy(
                np.random.normal(0, 1, size=(obs.size(0), self.vae_latent_dim))
            ).clamp(-0.5, 0.5).to(ptu.device)
            # Decode candidate actions, perturb them, and keep the one with
            # the highest Q-value.
            candidate_actions = self.vae_decoder(obs, z, inferred_mdp)
            perturbed_actions = self.perturbation_generator.get_perturbed_actions(
                obs, candidate_actions, inferred_mdp)
            qv = self.Qs(obs, perturbed_actions, inferred_mdp)
            ind = qv.max(0)[1]
        return ptu.get_numpy(perturbed_actions[ind])
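
# --- Hypothetical usage sketch (not part of the original code) ---
# Illustrates the intended call pattern only; the variant dict, goal, and
# params_list are assumed to be built by the surrounding training/launcher code.
#
#   collector = PathCollectorSingleMdp(variant, goal)
#   # params_list holds CPU parameters in the order expected by
#   # set_network_params: [f, Qs, vae_decoder, perturbation_generator].
#   avg_reward, (final_achieved, goal_reached) = collector.async_evaluate(params_list)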