Example #1
    def __init__(self,
                 policy,
                 supervised_model=None,
                 supervised_ground_truth='teacher',
                 name="ppo",
                 learning_rate=1e-3,
                 clip_eps=0.2,
                 max_epochs=5,
                 max_epochs_r=20,
                 entropy_bonus=0.,
                 reward_predictor=None,
                 reward_predictor_type='gaussian',
                 grad_clip_threshold=None,
                 **kwargs):

        # TODO: Check to avoid duplicates of variables and scopes
        self.reward_predictor = reward_predictor
        Serializable.quick_init(self, locals())
        super(PPO, self).__init__(policy)

        self.recurrent = getattr(self.policy, 'recurrent', False)
        self.supervised_model = supervised_model
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs,
                backprop_steps=backprop_steps,
                grad_clip_threshold=grad_clip_threshold)
            if self.reward_predictor is not None:
                self.optimizer_r = RL2FirstOrderOptimizer(
                    learning_rate=learning_rate,
                    max_epochs=max_epochs_r,
                    backprop_steps=backprop_steps,
                    grad_clip_threshold=grad_clip_threshold)
            if self.supervised_model is not None:
                self.optimizer_s = RL2FirstOrderOptimizer(
                    learning_rate=learning_rate,
                    max_epochs=max_epochs_r,
                    backprop_steps=backprop_steps,
                    grad_clip_threshold=grad_clip_threshold)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                                 max_epochs=max_epochs)
        # TODO figure out what this does
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
            'env_infos'
        ]
        self._optimization_r_keys = [
            'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
            'env_infos'
        ]
        self.name = name
        self._clip_eps = clip_eps
        self.entropy_bonus = entropy_bonus
        self.supervised_ground_truth = supervised_ground_truth
        self.reward_predictor_type = reward_predictor_type

        self.build_graph()
Example #2
    def __init__(
        self,
        env,
        policy,
        dynamics_model,
        num_rollouts,
        max_path_length,
        parallel=False,
        deterministic_policy=False,
        optimize_actions=False,
        max_epochs=2,
        learning_rate=1e-4,
        **kwargs,
    ):
        super(BPTTSampler, self).__init__(env, policy, num_rollouts,
                                          max_path_length)
        assert not parallel

        self.env = env
        self.policy = policy
        self.dynamics_model = dynamics_model
        self.max_path_length = max_path_length
        self.total_samples = num_rollouts * max_path_length
        self.num_rollouts = num_rollouts
        self.total_timesteps_sampled = 0
        self.deterministic_policy = deterministic_policy
        self.optimize_actions = optimize_actions
        self.num_models = getattr(dynamics_model, 'num_models', 1)

        self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                             max_epochs=max_epochs)

        self.build_graph()
Example #3
    def __init__(
            self,
            policy,
            name="ppo",
            learning_rate=1e-3,
            clip_eps=0.2,
            max_epochs=5,
            entropy_bonus=0.,
            **kwargs
            ):
        Serializable.quick_init(self, locals())
        super(PPO, self).__init__(policy)

        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(learning_rate=learning_rate, max_epochs=max_epochs,
                                                    backprop_steps=backprop_steps)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate, max_epochs=max_epochs)
        self._optimization_keys = ['observations', 'actions', 'advantages', 'agent_infos']
        self.name = name
        self._clip_eps = clip_eps
        self.entropy_bonus = entropy_bonus

        self.build_graph()
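
The constructors above only wire up an optimizer and build the graph; the actual update is driven from outside through optimize_policy(samples_data). A hypothetical driver loop, purely as a sketch: the sampler and sample_processor objects and their method names are assumed stand-ins for illustration, not the actual meta-mb trainer classes.

def train(algo, sampler, sample_processor, n_itr=100):
    # Sketch of a generic on-policy loop around an algo exposing the
    # optimize_policy(samples_data) interface shown above. The sampler and
    # sample_processor interfaces here are assumptions, not meta-mb's API.
    for itr in range(n_itr):
        paths = sampler.obtain_samples(log=True, log_prefix='train-')  # collect rollouts
        samples_data = sample_processor.process_samples(paths)         # e.g. advantage estimation
        algo.optimize_policy(samples_data, log=True, prefix='train-')  # clipped PPO update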
Example #4
File: svg_inf.py  Project: iclavera/meta-mb
    def __init__(self,
                 policy,
                 dynamics_model,
                 tf_reward,
                 name="svg_inf",
                 learning_rate=1e-3,
                 clip_eps=0.2,
                 max_epochs=5,
                 **kwargs):
        super(SVGInf, self).__init__(policy)
        self.dynamics_model = dynamics_model
        self.tf_reward = tf_reward

        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs,
                backprop_steps=backprop_steps)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                                 max_epochs=max_epochs)
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name
        self._clip_eps = clip_eps

        self.build_graph()
Example #5
class PPO(Algo, Serializable):
    """
    Algorithm for PPO MAML

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        exploration (bool): use exploration / pre-update sampling term / E-MAML term
        inner_type (str): inner optimization objective - either log_likelihood or likelihood_ratio
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether to make the inner step size a trainable variable
    """
    def __init__(self,
                 policy,
                 supervised_model=None,
                 supervised_ground_truth='teacher',
                 name="ppo",
                 learning_rate=1e-3,
                 clip_eps=0.2,
                 max_epochs=5,
                 max_epochs_r=20,
                 entropy_bonus=0.,
                 reward_predictor=None,
                 reward_predictor_type='gaussian',
                 grad_clip_threshold=None,
                 **kwargs):

        # TODO: Check to avoid duplicates of variables and scopes
        self.reward_predictor = reward_predictor
        Serializable.quick_init(self, locals())
        super(PPO, self).__init__(policy)

        self.recurrent = getattr(self.policy, 'recurrent', False)
        self.supervised_model = supervised_model
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs,
                backprop_steps=backprop_steps,
                grad_clip_threshold=grad_clip_threshold)
            if self.reward_predictor is not None:
                self.optimizer_r = RL2FirstOrderOptimizer(
                    learning_rate=learning_rate,
                    max_epochs=max_epochs_r,
                    backprop_steps=backprop_steps,
                    grad_clip_threshold=grad_clip_threshold)
            if self.supervised_model is not None:
                self.optimizer_s = RL2FirstOrderOptimizer(
                    learning_rate=learning_rate,
                    max_epochs=max_epochs_r,
                    backprop_steps=backprop_steps,
                    grad_clip_threshold=grad_clip_threshold)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                                 max_epochs=max_epochs)
        # TODO figure out what this does
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
            'env_infos'
        ]
        self._optimization_r_keys = [
            'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
            'env_infos'
        ]
        self.name = name
        self._clip_eps = clip_eps
        self.entropy_bonus = entropy_bonus
        self.supervised_ground_truth = supervised_ground_truth
        self.reward_predictor_type = reward_predictor_type

        self.build_graph()

    def build_graph(self):
        """
        Creates the computation graph

        Notes:
            Pseudocode:
            for task in meta_batch_size:
                make_vars
                init_init_dist_sym
            for step in num_inner_grad_steps:
                for task in meta_batch_size:
                    make_vars
                    update_init_dist_sym
            set objectives for optimizer
        """
        """ Create Variables """
        """ ----- Build graph for the meta-update ----- """
        self.op_phs_dict = OrderedDict()
        if isinstance(self.policy, DiscreteRNNPolicy):
            discrete = True
        else:
            discrete = False
        obs_ph, action_ph, adv_ph, r_ph, obs_r_ph, dist_info_old_ph, all_phs_dict, ground_truth_action_ph = self._make_input_placeholders(
            'train', recurrent=self.recurrent, discrete=discrete)
        self.op_phs_dict.update(all_phs_dict)

        if self.recurrent:
            distribution_info_vars, hidden_ph, next_hidden_var = self.policy.distribution_info_sym(
                obs_ph)
            # TODO: Check if anything is problematic here, when obs is concatenating previous reward
            if self.reward_predictor is not None:
                distribution_info_vars_r, hidden_ph_r, next_hidden_var_r = self.reward_predictor.distribution_info_sym(
                    obs_r_ph)
                if self.reward_predictor_type == 'gaussian':
                    distribution_info_vars_r[
                        "mean"] = distribution_info_vars_r["mean"][:, :, 0]
                    distribution_info_vars_r[
                        "log_std"] = distribution_info_vars_r[
                            "log_std"][:, 0]  # TODO: uncomment
            if self.supervised_model is not None:
                distribution_info_vars_s, hidden_ph_s, next_hidden_var_s = self.supervised_model.distribution_info_sym(
                    obs_ph)
        else:
            distribution_info_vars = self.policy.distribution_info_sym(obs_ph)
            hidden_ph, next_hidden_var = None, None
        """ Outer objective """
        # TODO: Check if anything changes for discrete
        likelihood_ratio = self.policy.distribution.likelihood_ratio_sym(
            action_ph, dist_info_old_ph, distribution_info_vars)
        # TODO: Check if anything changes for discrete
        clipped_obj = tf.minimum(
            likelihood_ratio * adv_ph,
            tf.clip_by_value(likelihood_ratio, 1 - self._clip_eps,
                             1 + self._clip_eps) * adv_ph)
        # TODO: Check that the discrete entropy looks fine
        mask = tf.reduce_sum(all_phs_dict['train_agent_infos/probs'], axis=2)
        ent = self.policy.distribution.entropy_sym(
            distribution_info_vars) * mask
        self.log_values = [
            likelihood_ratio, adv_ph, clipped_obj, dist_info_old_ph,
            distribution_info_vars, ent
        ]
        self.reward_loss = tf.reduce_mean(clipped_obj)
        self.entropy_loss = self.entropy_bonus * tf.reduce_mean(
            self.policy.distribution.entropy_sym(distribution_info_vars) *
            mask)
        surr_obj = - tf.reduce_mean(clipped_obj) - self.entropy_bonus * \
                        tf.reduce_mean(self.policy.distribution.entropy_sym(distribution_info_vars))
        if self.reward_predictor is not None:
            if self.reward_predictor_type == 'gaussian':
                r_obj = -tf.reduce_mean(
                    self.reward_predictor.distribution.log_likelihood_sym(
                        r_ph, distribution_info_vars_r))
            else:
                r_obj = -tf.reduce_mean(
                    tf.exp(5 * r_ph) *
                    self.reward_predictor.distribution.log_likelihood_sym(
                        tf.cast(r_ph, tf.int32),
                        distribution_info_vars_r))  # TODO: what's this?
            self.optimizer_r.build_graph(loss=r_obj,
                                         target=self.reward_predictor,
                                         input_ph_dict=self.op_phs_dict,
                                         hidden_ph=hidden_ph_r,
                                         next_hidden_var=next_hidden_var_r)
        if self.supervised_model is not None:
            if self.supervised_ground_truth == 'teacher':
                action_logits = tf.log(distribution_info_vars_s['probs'])
                ground_truth = tf.squeeze(tf.one_hot(ground_truth_action_ph,
                                                     action_logits.shape[-1]),
                                          axis=2)
                sup_learning_loss = tf.compat.v1.losses.softmax_cross_entropy(
                    ground_truth,
                    action_logits,
                    weights=mask,
                )
                self.log_values_sup = [
                    sup_learning_loss, action_logits, ground_truth
                ]
            elif self.supervised_ground_truth == 'agent':
                old_prob_var = all_phs_dict['train_agent_infos/probs']
                new_prob_var = distribution_info_vars_s['probs']
                TINY = tf.constant(1e-6)
                # TODO: we could switch to this loss function instead, but for whatever reason it gives errors.
                # diff = new_prob_var - old_prob_var
                mask = tf.expand_dims(tf.reduce_sum(old_prob_var, axis=2),
                                      axis=2)
                sup_learning_loss = tf.reduce_sum(
                    mask * old_prob_var * (tf.log(old_prob_var + TINY) -
                                           tf.log(new_prob_var + TINY)), )
                # diff = diff * mask
                # sup_learning_loss = tf.reduce_mean(diff**2)
                self.log_values_sup = [
                    old_prob_var, new_prob_var, sup_learning_loss, mask
                ]
            else:
                raise NotImplementedError
            # self.log_values_sup = self.[action_logits, distribution_info_vars_s['probs'], ground_truth]
            self.optimizer_s.build_graph(loss=sup_learning_loss,
                                         target=self.supervised_model,
                                         input_ph_dict=self.op_phs_dict,
                                         hidden_ph=hidden_ph_s,
                                         next_hidden_var=next_hidden_var_s)

        self.optimizer.build_graph(loss=surr_obj,
                                   target=self.policy,
                                   input_ph_dict=self.op_phs_dict,
                                   hidden_ph=hidden_ph,
                                   next_hidden_var=next_hidden_var)

    def optimize_policy(self,
                        samples_data,
                        log=True,
                        prefix='',
                        verbose=False):
        """
        Performs MAML outer step

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')
        entropy_loss, reward_loss = self.optimizer.compute_loss_variations(
            input_dict, self.entropy_loss, self.reward_loss, self.log_values)

        if verbose: logger.log("Optimizing")

        # Update model
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if verbose: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'Loss/LossBefore', loss_before)
            logger.logkv(prefix + 'Loss/LossAfter', loss_after)
            logger.logkv(prefix + 'Loss/PartialLossEntropy', entropy_loss)
            logger.logkv(prefix + 'Loss/PartialLossReward', reward_loss)

    def optimize_reward(self,
                        samples_data,
                        log=True,
                        prefix='',
                        verbose=False):
        """
        Performs MAML outer step

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_r_keys,
                                              prefix='train')

        if verbose: logger.log("Optimizing")
        loss_before = self.optimizer_r.optimize(input_val_dict=input_dict)

        if verbose: logger.log("Computing statistics")
        loss_after = self.optimizer_r.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'RewardLossBefore', loss_before)
            logger.logkv(prefix + 'RewardLossAfter', loss_after)

    def optimize_supervised(self,
                            samples_data,
                            log=True,
                            prefix='',
                            verbose=False):
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')
        self.optimizer_s.compute_loss_variations(input_dict, None, None,
                                                 self.log_values_sup)

        if verbose: logger.log("Optimizing Supervised Model")
        loss_before = self.optimizer_s.optimize(input_val_dict=input_dict)

        if verbose: logger.log("Computing statistics")
        loss_after = self.optimizer_s.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'SupervisedLossBefore', loss_before)
            logger.logkv(prefix + 'SupervisedLossAfter', loss_after)

    def __getstate__(self):
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        print('getstate\n')
        print(state['init_args'])
        state['policy'] = self.policy.__getstate__()
        state['optimizer'] = self.optimizer.__getstate__()
        return state

    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
        self.policy.__setstate__(state['policy'])
        self.optimizer.__setstate__(state['optimizer'])
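
The heart of build_graph above is the clipped surrogate applied to the likelihood ratio (the tf.minimum / tf.clip_by_value expression). A minimal NumPy sketch of that objective, illustrative only; the helper name and array shapes are assumptions, not part of the library:

import numpy as np

def clipped_ppo_loss(ratio, adv, clip_eps=0.2, entropy=None, entropy_bonus=0.0):
    # ratio = pi_new(a|s) / pi_old(a|s); adv = advantage estimates.
    clipped = np.minimum(ratio * adv,
                         np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv)
    loss = -clipped.mean()                      # maximize the surrogate -> minimize its negative
    if entropy is not None:
        loss -= entropy_bonus * entropy.mean()  # optional entropy bonus, as in surr_obj above
    return loss

ratio = np.array([0.7, 1.0, 1.6])
adv = np.array([1.0, -0.5, 2.0])
print(clipped_ppo_loss(ratio, adv))  # the 1.6 ratio with positive advantage is clipped at 1.2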
Example #6
class BPTTSampler(BaseSampler):
    """
    Sampler for Meta-RL

    Args:
        env (meta_mb.meta_envs.base.MetaEnv) : environment object
        policy (meta_mb.policies.base.Policy) : policy object
        batch_size (int) : number of trajectories per task
        meta_batch_size (int) : number of meta tasks
        max_path_length (int) : max number of steps per trajectory
        envs_per_task (int) : number of meta_envs to run vectorized for each task (influences the memory usage)
    """
    def __init__(
        self,
        env,
        policy,
        dynamics_model,
        num_rollouts,
        max_path_length,
        parallel=False,
        deterministic_policy=False,
        optimize_actions=False,
        max_epochs=2,
        learning_rate=1e-4,
        **kwargs,
    ):
        super(BPTTSampler, self).__init__(env, policy, num_rollouts,
                                          max_path_length)
        assert not parallel

        self.env = env
        self.policy = policy
        self.dynamics_model = dynamics_model
        self.max_path_length = max_path_length
        self.total_samples = num_rollouts * max_path_length
        self.num_rollouts = num_rollouts
        self.total_timesteps_sampled = 0
        self.deterministic_policy = deterministic_policy
        self.optimize_actions = optimize_actions
        self.num_models = getattr(dynamics_model, 'num_models', 1)

        self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                             max_epochs=max_epochs)

        self.build_graph()

    def build_graph(self):
        self._initial_obs_ph = tf.placeholder(dtype=tf.float32,
                                              shape=(self.num_rollouts,
                                                     self.policy.obs_dim),
                                              name='init_obs')
        obses = []
        acts = []
        rewards = []
        means = []
        log_stds = []
        obs = self._initial_obs_ph
        for t in range(self.max_path_length):
            dist_policy = self.policy.distribution_info_sym(obs)
            act, dist_policy = self.policy.distribution.sample_sym(dist_policy)
            next_obs = self.dynamics_model.predict_sym(obs, act)

            reward = self.env.tf_reward(obs, act, next_obs)

            obses.append(obs)
            acts.append(act)
            rewards.append(reward)
            means.append(dist_policy['mean'])
            log_stds.append(dist_policy['log_std'])

            obs = next_obs

        # rewards = tf.stack(tf.split(tf.transpose(tf.stack(rewards, axis=0)), self.num_models))
        # random_weights = tf.random.uniform(shape=(self.num_models, self.num_rollouts, self.max_path_length))
        # rewards = rewards * random_weights / tf.reduce_sum(random_weights, axis=0)
        self._returns_var = tf.reduce_sum(rewards, axis=0)
        self._rewards_var = rewards
        self._actions_var = acts
        self._observations_var = obses
        self._means_var = means
        self._log_stds_var = log_stds

    def obtain_samples(self, log=False, log_prefix='', buffer=None):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        policy = self.policy
        policy.reset(dones=[True] * self.num_rollouts)

        # initial reset of meta_envs
        init_obses = np.array(
            [self.env.reset() for _ in range(self.num_rollouts)])

        sess = tf.get_default_session()
        observations, actions, means, log_stds, rewards = sess.run(
            [
                self._observations_var, self._actions_var, self._means_var,
                self._log_stds_var, self._rewards_var
            ],
            feed_dict={self._initial_obs_ph: init_obses})

        means = np.array(means).transpose((1, 0, 2))
        log_stds = np.array(log_stds).transpose((1, 0, 2))
        if log_stds.shape[0] == 1:
            log_stds = np.repeat(log_stds, self.num_rollouts, axis=0)
        agent_infos = [
            dict(mean=mean, log_std=log_std)
            for mean, log_std in zip(means, log_stds)
        ]
        observations = np.array(observations).transpose((1, 0, 2))
        actions = np.array(actions).transpose((1, 0, 2))
        rewards = np.array(rewards).T
        dones = [[False for _ in range(self.max_path_length)]
                 for _ in range(self.num_rollouts)]
        env_infos = [dict() for _ in range(self.num_rollouts)]
        paths = [
            dict(observations=obs,
                 actions=act,
                 rewards=rew,
                 dones=done,
                 env_infos=env_info,
                 agent_infos=agent_info)
            for obs, act, rew, done, env_info, agent_info in zip(
                observations, actions, rewards, dones, env_infos, agent_infos)
        ]
        self.total_timesteps_sampled += self.total_samples
        logger.logkv('ModelSampler-n_timesteps', self.total_timesteps_sampled)

        return paths

    def optimize_policy(self, log=True):
        init_obses = np.array(
            [self.env.reset()
             for _ in range(self.num_rollouts)] * self.num_models)
        input_dict = dict(initial_obs=init_obses)
        self.optimizer.optimize(input_dict)
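
build_graph above unrolls the policy through the learned dynamics model for max_path_length steps and sums the per-step rewards symbolically. The same structure in plain NumPy, as a sketch only; the linear policy, dynamics, and reward below are hand-written stand-ins, not the meta-mb models:

import numpy as np

def unroll(policy_fn, dynamics_fn, reward_fn, init_obs, horizon):
    # Roll the policy through the model and accumulate the return,
    # mirroring the loop over max_path_length above.
    obs, total_return = init_obs, 0.0
    for _ in range(horizon):
        act = policy_fn(obs)
        next_obs = dynamics_fn(obs, act)
        total_return += reward_fn(obs, act, next_obs)
        obs = next_obs
    return total_return

rng = np.random.default_rng(0)
A, B, K = rng.normal(size=(3, 3)), rng.normal(size=(3, 2)), rng.normal(size=(2, 3))
ret = unroll(lambda o: K @ o,                    # stand-in linear policy
             lambda o, a: A @ o + B @ a,         # stand-in linear dynamics model
             lambda o, a, o2: -np.sum(o2 ** 2),  # stand-in reward: negative state cost
             init_obs=np.ones(3), horizon=10)
print(ret)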
Example #7
class PPO(Algo, Serializable):
    """
    Algorithm for PPO MAML

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        exploration (bool): use exploration / pre-update sampling term / E-MAML term
        inner_type (str): inner optimization objective - either log_likelihood or likelihood_ratio
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether to make the inner step size a trainable variable
    """
    def __init__(
            self,
            policy,
            name="ppo",
            learning_rate=1e-3,
            clip_eps=0.2,
            max_epochs=5,
            entropy_bonus=0.,
            **kwargs
            ):
        Serializable.quick_init(self, locals())
        super(PPO, self).__init__(policy)

        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(learning_rate=learning_rate, max_epochs=max_epochs,
                                                    backprop_steps=backprop_steps)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate, max_epochs=max_epochs)
        self._optimization_keys = ['observations', 'actions', 'advantages', 'agent_infos']
        self.name = name
        self._clip_eps = clip_eps
        self.entropy_bonus = entropy_bonus

        self.build_graph()

    def build_graph(self):
        """
        Creates the computation graph

        Notes:
            Pseudocode:
            for task in meta_batch_size:
                make_vars
                init_init_dist_sym
            for step in num_inner_grad_steps:
                for task in meta_batch_size:
                    make_vars
                    update_init_dist_sym
            set objectives for optimizer
        """

        """ Create Variables """

        """ ----- Build graph for the meta-update ----- """
        self.op_phs_dict = OrderedDict()
        obs_ph, action_ph, adv_ph, dist_info_old_ph, all_phs_dict = self._make_input_placeholders('train',
                                                                                                  recurrent=self.recurrent)
        self.op_phs_dict.update(all_phs_dict)

        if self.recurrent:
            distribution_info_vars, hidden_ph, next_hidden_var = self.policy.distribution_info_sym(obs_ph)
        else:
            distribution_info_vars = self.policy.distribution_info_sym(obs_ph)
            hidden_ph, next_hidden_var = None, None

        """ Outer objective """
        likelihood_ratio = self.policy.distribution.likelihood_ratio_sym(action_ph, dist_info_old_ph,
                                                                         distribution_info_vars)

        clipped_obj = tf.minimum(likelihood_ratio * adv_ph,
                                 tf.clip_by_value(likelihood_ratio,
                                                  1 - self._clip_eps,
                                                  1 + self._clip_eps ) * adv_ph)
        surr_obj = - tf.reduce_mean(clipped_obj) - self.entropy_bonus * \
                   tf.reduce_mean(self.policy.distribution.entropy_sym(distribution_info_vars))

        self.optimizer.build_graph(
            loss=surr_obj,
            target=self.policy,
            input_ph_dict=self.op_phs_dict,
            hidden_ph=hidden_ph,
            next_hidden_var=next_hidden_var
        )

    def optimize_policy(self, samples_data, log=True, prefix='', verbose=False):
        """
        Performs MAML outer step

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data, self._optimization_keys, prefix='train')

        if verbose: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if verbose: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix+'LossBefore', loss_before)
            logger.logkv(prefix+'LossAfter', loss_after)

    def __getstate__(self):
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        print('getstate\n')
        print(state['init_args'])
        state['policy'] = self.policy.__getstate__()
        state['optimizer'] = self.optimizer.__getstate__()
        return state

    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
        self.policy.__setstate__(state['policy'])
        self.optimizer.__setstate__(state['optimizer'])
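
__getstate__ and __setstate__ above serialize the algorithm as a dict of constructor args plus each sub-component's own state. A self-contained sketch of that pattern; the classes below are illustrative stubs, not the meta-mb ones:

import pickle

class StubOptimizer:
    def __init__(self, learning_rate=1e-3):
        self.learning_rate = learning_rate
    def __getstate__(self):
        return {'learning_rate': self.learning_rate}
    def __setstate__(self, state):
        self.learning_rate = state['learning_rate']

class StubAlgo:
    def __init__(self, learning_rate=1e-3):
        self.learning_rate = learning_rate
        self.optimizer = StubOptimizer(learning_rate=learning_rate)
    def __getstate__(self):
        # store constructor args and the optimizer's own state, as above
        return {'init_args': {'learning_rate': self.learning_rate},
                'optimizer': self.optimizer.__getstate__()}
    def __setstate__(self, state):
        # rebuild from the constructor args, then restore sub-component state
        self.__init__(**state['init_args'])
        self.optimizer.__setstate__(state['optimizer'])

restored = pickle.loads(pickle.dumps(StubAlgo(learning_rate=5e-4)))
print(restored.optimizer.learning_rate)  # 0.0005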
Example #8
File: svg_inf.py  Project: iclavera/meta-mb
class SVGInf(Algo):
    """

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        exploration (bool): use exploration / pre-update sampling term / E-MAML term
        inner_type (str): inner optimization objective - either log_likelihood or likelihood_ratio
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether to make the inner step size a trainable variable
    """
    def __init__(self,
                 policy,
                 dynamics_model,
                 tf_reward,
                 name="svg_inf",
                 learning_rate=1e-3,
                 clip_eps=0.2,
                 max_epochs=5,
                 **kwargs):
        super(SVGInf, self).__init__(policy)
        self.dynamics_model = dynamics_model
        self.tf_reward = tf_reward

        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs,
                backprop_steps=backprop_steps)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                                 max_epochs=max_epochs)
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name
        self._clip_eps = clip_eps

        self.build_graph()

    def build_graph(self):
        """
        Creates the computation graph

        Notes:
            Pseudocode:
            for task in meta_batch_size:
                make_vars
                init_init_dist_sym
            for step in num_inner_grad_steps:
                for task in meta_batch_size:
                    make_vars
                    update_init_dist_sym
            set objectives for optimizer
        """
        """ Create Variables """
        """ ----- Build graph for the meta-update ----- """
        self.op_phs_dict = OrderedDict()
        obs_ph, action_ph, next_obs_ph, adv_ph, dist_info_old_ph, all_phs_dict = \
            self._make_input_placeholders('train', recurrent=False, next_obs=True)

        # TODO: I need the full trajectory here! So I need to reshape or concat the data or sth so the distribution info_vars makes sense
        self.op_phs_dict.update(all_phs_dict)

        distribution_info_vars = self.policy.distribution_info_sym(obs_ph)
        hidden_ph, next_hidden_var = None, None

        for t in range(self.horizon, 0, -1):
            v = self.tf_reward(obs_ph[t - 1], action_ph[t], obs_ph[t])

        surr_obj = -tf.reduce_mean(clipped_obj)

        self.optimizer.build_graph(loss=surr_obj,
                                   target=self.policy,
                                   input_ph_dict=self.op_phs_dict,
                                   hidden_ph=hidden_ph,
                                   next_hidden_var=next_hidden_var)

    def optimize_policy(self, samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if log: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
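
The backward loop over tf_reward in build_graph above is left unfinished in the source (v is unused and clipped_obj is never defined). As a purely illustrative sketch of the backward-in-time value accumulation such a loop typically builds, and not the author's actual SVG(inf) objective:

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # Accumulate values backward in time: V_t = r_t + gamma * V_{t+1}.
    v = 0.0
    returns = np.zeros(len(rewards))
    for t in range(len(rewards) - 1, -1, -1):
        v = rewards[t] + gamma * v
        returns[t] = v
    return returns

print(discounted_returns([1.0, 0.0, 2.0]))  # [2.9602 1.98   2.    ]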