Example #1
class PPO_flat(BatchPolopt):
    """
    Standard clipped-PPO baseline, i.e. the flat (non-hierarchical) counterpart
    of the HiPPO algorithm investigated in this paper.
    """

    # double check this constructor later
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.0003,
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=80,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters,
                                                 **optimizer_args)
        else:
            self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(PPO_flat, self).__init__(**kwargs)  # not sure if this line is correct

        # i hope this is right
        self.debug_fns = []
        # self.old_policy = copy.deepcopy(self.policy)

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time

    def init_opt(self):
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1,
            dtype=theano.config.floatX
        )

        mean_var = ext.new_tensor(
            'mean',
            ndim=2,
            dtype=theano.config.floatX
        )

        log_std_var = ext.new_tensor(
            'log_std',
            ndim=2,
            dtype=theano.config.floatX
        )

        old_dist_info_vars = dict(mean=mean_var, log_std=log_std_var)
        dist_info_vars = self.policy.dist_info_sym(obs_var)
        lr = self.policy.distribution.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)

        surr_loss_vector = TT.minimum(lr * advantage_var,
                                      TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
        surr_loss = -TT.mean(surr_loss_vector)

        input_list = [obs_var, action_var, advantage_var, mean_var, log_std_var]

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            inputs=input_list
        )
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        # unlike the hierarchical variants, no sparse-observation preprocessing is needed here;
        # we only unpack the distribution parameters from agent_infos

        input_values = tuple(ext.extract(
            samples_data,
            "observations", "actions", "advantages", "agent_infos"
        ))
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']

        all_input_values = (input_values[0], input_values[1], input_values[2], mean, log_std)
        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)
        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(
            itr=itr,
            policy=self.policy,
            baseline=self.baseline,
            env=self.env
        )

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
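
For reference, the clipped surrogate objective that init_opt builds symbolically can be stated numerically as follows. This is an illustrative NumPy sketch (the names lr, advantages and epsilon mirror the Theano variables above; the function itself is not part of the class):

import numpy as np

def clipped_ppo_loss(lr, advantages, epsilon=0.1):
    # lr: likelihood ratios pi_new(a|s) / pi_old(a|s), shape (N,)
    # advantages: estimated advantages, shape (N,)
    unclipped = lr * advantages
    clipped = np.clip(lr, 1 - epsilon, 1 + epsilon) * advantages
    # the optimizer minimizes, hence the negated mean
    return -np.mean(np.minimum(unclipped, clipped))

# example: ratios around 1, mixed-sign advantages
print(clipped_ppo_loss(np.array([0.8, 1.05, 1.3]), np.array([1.0, -0.5, 2.0])))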
Example #2
class PG_concurrent(BatchPolopt):
    """
    Trains an SNN that parameterizes the skills and, concurrently, the manager
    that selects among them.

    Note that, as long as the sample approximation of the log-of-sum term is not used,
    the identity of the chosen skill is not needed; only the executed action is.
    """

    # double check this constructor later
    def __init__(
            self,
            manager_optimizer=None,
            optimizer=None,
            snn_optimizer=None,
            optimizer_args=None,
            step_size=1e-6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(
                learning_rate=step_size,
                **optimizer_args)
        else:
            self.optimizer = optimizer
        self.manager_optimizer = manager_optimizer
        self.snn_optimizer = snn_optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(PG_concurrent,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)

        # i hope this is right
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time

    def init_opt(self):
        # obs_var_raw = self.env.observation_space.new_tensor_variable(
        #     'obs',
        #     extra_dims=1,
        # )

        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every self.period timesteps
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)

        obs_var_sparse = ext.new_tensor(
            'sparse_obs',
            ndim=2,
            dtype=theano.config.floatX  # todo: check this with carlos, refer to discrete.py in rllab.spaces
        )

        assert isinstance(self.policy, HierarchicalPolicy)

        # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])
        # obs_var = obs_var_raw

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']

        # get the distribution parameters
        # dist_info_vars = []
        # for latent in self.latents:
        #     self.policy.low_policy.set_latent_train(latent)
        #     dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var))
        # hopefully the above line takes multiple samples, and state_info_vars not needed as input

        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = [
            TT.exp(self.diagonal.log_likelihood_sym(action_var, dist_info))
            for dist_info in dist_info_vars
        ]

        # need to reshape at the end
        reshaped_probs = [
            TT.reshape(prob, [obs_var.shape[0] // self.period, self.period])
            for prob in probs
        ]

        # now, multiply out each row and concatenate
        subtrajectory_probs = TT.stack([
            TT.prod(reshaped_prob, axis=1) for reshaped_prob in reshaped_probs
        ],
                                       axis=1)
        # shape error might come out of here

        # elementwise multiplication, then sum up each individual row and take log
        likelihood = TT.log(TT.sum(subtrajectory_probs * latent_probs, axis=1))

        surr_loss = -TT.mean(likelihood * advantage_var)

        input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))
        # print(input_values[0].shape)

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])
        # obs_raw = input_values[0]

        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        advantage_sparse = np.sum(input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period]),
                                  axis=1)
        all_input_values = (obs_raw, obs_sparse, input_values[1],
                            advantage_sparse)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def optimize_manager(self, itr, samples_data):
        pass

    def optimize_snn(self, itr, samples_data):
        pass

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
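
The likelihood assembled in init_opt marginalizes over the latent chosen by the manager for each sub-trajectory. A minimal NumPy sketch of that computation, assuming a single trajectory whose length is a multiple of period (the argument names are illustrative):

import numpy as np

def marginal_subtrajectory_log_likelihood(step_probs, latent_probs, period):
    # step_probs:   (T, K) array; entry (t, k) is the probability of the executed action
    #               at time t under the low-level policy conditioned on latent k
    # latent_probs: (T // period, K) array of manager probabilities per sub-trajectory
    T, K = step_probs.shape
    assert T % period == 0
    # product of per-step probabilities within each sub-trajectory, for each latent
    subtraj_probs = step_probs.reshape(T // period, period, K).prod(axis=1)
    # marginalize over the latent, then take the log
    return np.log(np.sum(subtraj_probs * latent_probs, axis=1))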
Example #3
class CriticEval:
    def __init__(self,
                 qf,
                 policy,
                 min_pool_size=10000,
                 replay_pool_size=1000000,
                 replacement_prob=1.0,
                 qf_batch_size=32,
                 qf_weight_decay=0.,
                 qf_update_method='adam',
                 qf_learning_rate=1e-3,
                 qf_use_target=True,
                 soft_target_tau=0.001,
                 ):

        self.soft_target_tau = soft_target_tau
        self.min_pool_size = min_pool_size
        self.replay_pool_size = replay_pool_size
        self.replacement_prob = replacement_prob
        self.qf_batch_size = qf_batch_size
        self.qf_weight_decay = qf_weight_decay
        self.qf_update_method = FirstOrderOptimizer(update_method=qf_update_method,
                                                    learning_rate=qf_learning_rate)
        self.qf_use_target = qf_use_target
        self.discount = 0.99
        self.qf = qf
        self.policy = policy

        self.qf_loss_averages = []
        self.q_averages = []
        self.y_averages = []

    def init_opt_critic(self, obs_dim, act_dim):

        target_qf = self.qf
        extra_dims = 1

        obs = tf.placeholder(tf.float32, shape=[None] * extra_dims + [obs_dim], name='qf_obs')
        action = tf.placeholder(tf.float32, shape=[None] * extra_dims + [act_dim], name='qf_action')

        # obs = tf.placeholder(tf.float32, shape=list([obs_dim] + [None] * extra_dims), name='qf_obs')
        # action = tf.placeholder(tf.float32, shape=list([act_dim] + [None] * extra_dims), name='qf_action')

        yvar = tf.placeholder(dtype=tf.float32, shape=[None], name='ys')

        # qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        #                        sum([tf.reduce_sum(tf.square(param)) for param in
        #                             self.qf.get_params(regularizable=True)])

        qval = self.qf.get_qval_sym(obs, action)

        qf_loss = tf.reduce_mean(tf.square(yvar - qval))
        qf_input_list = [yvar, obs, action]
        qf_output_list = [qf_loss, qval]

        # qf_reg_loss = qf_loss + qf_weight_decay_term
        qf_reg_loss = qf_loss

        self.qf_update_method.update_opt(
            loss=qf_reg_loss, target=self.qf, inputs=qf_input_list)
        # qf_output_list += [self.qf_update_method._train_op]

        f_train_qf = compile_function(inputs=qf_input_list, outputs=qf_output_list, sess=tf.get_default_session())

        self.opt_info_critic = dict(
            f_train_qf=f_train_qf,
            target_qf=target_qf,
        )

    def do_critic_training(self, batch):

        obs = batch['states']
        actions = batch['actions']
        rewards = batch['rewards']
        next_obs = batch['states_']
        terminals = batch['terminals']

        target_qf = self.opt_info_critic["target_qf"]

        target_policy = self.policy
        next_qvals = target_qf.get_e_qval(next_obs, target_policy)

        ys = rewards + (1. - terminals) * self.discount * next_qvals
        inputs = (ys, obs, actions)

        qf_outputs = self.opt_info_critic['f_train_qf'](*inputs)
        qf_loss = qf_outputs.pop(0)
        qval = qf_outputs.pop(0)

        if self.qf_use_target:
            target_qf.set_param_values(
                target_qf.get_param_values() * (1.0 - self.soft_target_tau) +
                self.qf.get_param_values() * self.soft_target_tau)

        self.qf_loss_averages.append(qf_loss)
        self.q_averages.append(qval)
        self.y_averages.append(ys)

    def optimize_critic(self, batch, batch_size):
        """ Train the critic for batch sampling-based policy optimization methods
        :param samples:
        :param batch_size:
        :param policy:
        :return:
        """
        qf_updates_ratio = 1
        qf_itrs = float(batch_size) * qf_updates_ratio
        qf_itrs = int(np.ceil(qf_itrs))
        for i in range(qf_itrs):
            # Train critic
            self.do_critic_training(batch)
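
The two numerical pieces of do_critic_training are the Bellman targets and the soft target-network update. A hedged NumPy restatement (the function names are illustrative, not part of the class):

import numpy as np

def bellman_targets(rewards, terminals, next_qvals, discount=0.99):
    # y = r + (1 - done) * gamma * Q'(s', pi(s')), as computed before f_train_qf is called
    return rewards + (1.0 - terminals) * discount * next_qvals

def soft_target_update(target_params, online_params, tau=0.001):
    # Polyak averaging over flat parameter vectors, mirroring the set_param_values call
    return target_params * (1.0 - tau) + online_params * tau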
Example #4
class Hippo(BatchPolopt):
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=0.0003,
            latents=None,  # some sort of iterable of the actual latent vectors
            average_period=10,  # average over all the periods
            truncate_local_is_ratio=None,
            epsilon=0.1,
            train_pi_iters=80,
            use_skill_dependent_baseline=False,
            mlp_skill_dependent_baseline=False,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                                 max_epochs=train_pi_iters,
                                                 **optimizer_args)
        else:
            self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Hippo,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.average_period = average_period

        # import pdb; pdb.set_trace()
        # self.sampler = BatchSampler(self)
        self.sampler = HierBatchSampler(self, period=None)

        # i hope this is right
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicyRandomTime)
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[
                0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi *
                                             (self.num_latents + 1) +
                                             self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(
                    env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(
                    env_spec=skill_dependent_env_spec)

    def init_opt(self):
        obs_var = ext.new_tensor(
            'obs', ndim=2, dtype=theano.config.floatX)  # todo: check the dtype

        manager_obs_var = ext.new_tensor('manager_obs',
                                         ndim=2,
                                         dtype=theano.config.floatX)

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every time the manager makes a decision
        manager_advantage_var = ext.new_tensor('manager_advantage',
                                               ndim=1,
                                               dtype=theano.config.floatX)

        skill_advantage_var = ext.new_tensor('skill_advantage',
                                             ndim=1,
                                             dtype=theano.config.floatX)

        latent_var_sparse = ext.new_tensor('sparse_latent',
                                           ndim=2,
                                           dtype=theano.config.floatX)

        latent_var = ext.new_tensor('latents',
                                    ndim=2,
                                    dtype=theano.config.floatX)

        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)

        log_std_var = ext.new_tensor('log_std',
                                     ndim=2,
                                     dtype=theano.config.floatX)

        manager_prob_var = ext.new_tensor('manager_prob',
                                          ndim=2,
                                          dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            manager_obs_var)['prob']
        # old_latent_probs = self.old_policy.manager.dist_info_sym(manager_obs_var)['prob']

        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(manager_prob_var * latent_var_sparse,
                                         axis=1)
        lr = TT.exp(
            TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs))
        manager_surr_loss_vector = TT.minimum(
            lr * manager_advantage_var,
            TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) *
            manager_advantage_var)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        dist_info_var = self.policy.low_policy.dist_info_sym(
            obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(action_var,
                                                      old_dist_info_var,
                                                      dist_info_var)

        skill_surr_loss_vector = TT.minimum(
            skill_lr * skill_advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            skill_advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = manager_surr_loss / self.average_period + skill_surr_loss

        input_list = [
            obs_var, manager_obs_var, action_var, manager_advantage_var,
            skill_advantage_var, latent_var, latent_var_sparse, mean_var,
            log_std_var, manager_prob_var
        ]

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        # print(len(samples_data['observations']), self.period)
        # assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))

        time_remaining = input_values[3]['time_remaining']
        resampled_period = input_values[3]['resampled_period']
        obs_var = np.insert(input_values[0],
                            self.policy.obs_robot_dim,
                            time_remaining,
                            axis=1)
        manager_obs_var = obs_var[resampled_period]
        action_var = input_values[1]
        manager_adv_var = input_values[2][resampled_period]

        latent_var = input_values[3]['latents']
        latent_var_sparse = latent_var[resampled_period]
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']
        prob = input_values[3]['prob'][resampled_period]
        if self.use_skill_dependent_baseline:
            skill_adv_var = input_values[4]
            all_input_values = (obs_var, manager_obs_var, action_var,
                                manager_adv_var, skill_adv_var, latent_var,
                                latent_var_sparse, mean, log_std, prob)
        else:
            skill_adv_var = input_values[2]
            all_input_values = (obs_var, manager_obs_var, action_var,
                                manager_adv_var, skill_adv_var, latent_var,
                                latent_var_sparse, mean, log_std, prob)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)
        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
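
Hippo.optimize_policy augments each observation with the time remaining for the current skill and then selects the manager's decision steps with the resampled_period mask. A small NumPy sketch of that preprocessing (names are illustrative):

import numpy as np

def split_manager_inputs(observations, time_remaining, resampled_period, obs_robot_dim):
    # observations:     (T, obs_dim) raw observations
    # time_remaining:   (T,) timesteps left before the manager resamples a latent
    # resampled_period: (T,) boolean mask, True where a new latent was sampled
    # obs_robot_dim:    column index at which time_remaining is inserted
    obs = np.insert(observations, obs_robot_dim, time_remaining, axis=1)
    manager_obs = obs[resampled_period]
    return obs, manager_obs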
Example #5
class Concurrent_PPO(BatchPolopt):
    """
    Trains an SNN that parameterizes the skills and, concurrently, the manager
    that selects among them.

    Note that, as long as the sample approximation of the log-of-sum term is not used,
    the identity of the chosen skill is not needed; only the executed action is.
    """

    # double check this constructor later
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.0003,
                 num_latents=6,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 period=10,  # how often I choose a latent
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=80,
                 use_skill_dependent_baseline=False,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters, **optimizer_args)
        else:
            self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Concurrent_PPO, self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # i hope this is right
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        # the policy's own period takes precedence over the constructor argument
        self.period = self.policy.period
        self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            skill_dependent_obs_space_dim = ((curr_env.observation_space.shape[0] + 1) * self.num_latents,)
            skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
            self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time

    def init_opt(self):
        # obs_var_raw = self.env.observation_space.new_tensor_variable(
        #     'obs',
        #     extra_dims=1,
        # )


        obs_var_raw = ext.new_tensor('obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every self.period timesteps
        advantage_var_sparse = ext.new_tensor(
            'sparse_advantage',
            ndim=1,
            dtype=theano.config.floatX
        )

        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1,
            dtype=theano.config.floatX
        )

        obs_var_sparse = ext.new_tensor(
            'sparse_obs',
            ndim=2,
            dtype=theano.config.floatX  # todo: check this with carlos, refer to discrete.py in rllab.spaces
        )

        latent_var_sparse = ext.new_tensor(
            'sparse_latent',
            ndim=2,
            dtype=theano.config.floatX
        )

        latent_var = ext.new_tensor(
            'latents',
            ndim=2,
            dtype=theano.config.floatX
        )

        assert isinstance(self.policy, HierarchicalPolicy)

        # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]])
        # obs_var = obs_var_raw

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(obs_var_sparse)['prob']
        old_latent_probs = self.old_policy.manager.dist_info_sym(obs_var_sparse)['prob']

        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(old_latent_probs * latent_var_sparse, axis=1)
        lr = actual_latent_probs / old_actual_latent_probs
        manager_surr_loss_vector = TT.minimum(lr * advantage_var_sparse,
                                              TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var_sparse)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)
        # manager_surr_loss = - TT.mean(TT.log(actual_latent_probs) * advantage_var_sparse)

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        # get the distribution parameters
        # dist_info_vars = []
        # for latent in self.latents:
        #     self.policy.low_policy.set_latent_train(latent)
        #     dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var))
        # hopefully the above line takes multiple samples, and state_info_vars not needed as input

        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(obs_var)
        probs = TT.stack([self.diagonal.log_likelihood_sym(action_var, dist_info) for dist_info in dist_info_vars],
                         axis=1)
        actual_action_log_probs = TT.sum(probs * latent_var, axis=1)  # todo: verify that dist_info_vars is in order

        # old policy stuff
        old_dist_info_vars = self.old_policy.low_policy.dist_info_sym_all_latents(obs_var)
        old_probs = TT.stack([self.diagonal.log_likelihood_sym(action_var, dist_info) for dist_info in old_dist_info_vars],
                        axis=1)
        old_actual_action_log_probs = TT.sum(old_probs * latent_var, axis=1)
        skill_lr = TT.exp(actual_action_log_probs - old_actual_action_log_probs)

        skill_surr_loss_vector = TT.minimum(skill_lr * advantage_var,
                                            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)
        # skill_surr_loss = - TT.mean(actual_action_log_probs*advantage_var)

        surr_loss = manager_surr_loss/self.period + skill_surr_loss  # so that the relative magnitudes are correct

        input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var, advantage_var_sparse, latent_var,
                      latent_var_sparse]
        # input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            inputs=input_list
        )
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        if self.use_skill_dependent_baseline:
            input_values = tuple(ext.extract(
                samples_data,
                "observations", "actions", "advantages", "agent_infos", "skill_advantages"
            ))
        else:
            input_values = tuple(ext.extract(
                samples_data,
                "observations", "actions", "advantages", "agent_infos"
            ))

        obs_raw = input_values[0].reshape(input_values[0].shape[0] // self.period, self.period,
                                          input_values[0].shape[1])

        obs_sparse = input_values[0].take([i for i in range(0, input_values[0].shape[0], self.period)], axis=0)
        advantage_sparse = input_values[2].reshape([input_values[2].shape[0] // self.period, self.period])[:, 0]
        latents = input_values[3]['latents']
        latents_sparse = latents.take([i for i in range(0, latents.shape[0], self.period)], axis=0)

        if self.use_skill_dependent_baseline:
            all_input_values = (
                obs_raw, obs_sparse, input_values[1], input_values[4], advantage_sparse, latents, latents_sparse)
        else:
            all_input_values = (
                obs_raw, obs_sparse, input_values[1], input_values[2], advantage_sparse, latents, latents_sparse)

        # todo: assign current parameters to old policy; does this work?
        old_param_values = self.policy.get_param_values(trainable=True)
        self.old_policy.set_param_values(old_param_values, trainable=True)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(
            itr=itr,
            policy=self.policy,
            baseline=self.baseline,
            env=self.env
        )

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
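
The sparse inputs built in optimize_policy keep only the timesteps at which the manager picks a new latent (every period steps). A minimal NumPy sketch of that preprocessing, assuming the batch length is a multiple of period:

import numpy as np

def sparsify_samples(observations, advantages, latents, period):
    T = observations.shape[0]
    assert T % period == 0
    idx = np.arange(0, T, period)
    obs_raw = observations.reshape(T // period, period, observations.shape[1])
    obs_sparse = observations.take(idx, axis=0)
    # the manager's advantage is read off at the first step of each sub-trajectory
    advantage_sparse = advantages.reshape(T // period, period)[:, 0]
    latents_sparse = latents.take(idx, axis=0)
    return obs_raw, obs_sparse, advantage_sparse, latents_sparse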
Example #6
class ConcurrentContinuousPPO(BatchPolopt):
    """
    Trains an SNN that parameterizes the skills and, concurrently, the manager
    that selects among them.

    Note that, as long as the sample approximation of the log-of-sum term is not used,
    the identity of the chosen skill is not needed; only the executed action is.
    """

    # double check this constructor later
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=0.003,
            num_latents=6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            epsilon=0.1,
            train_pi_iters=10,
            use_skill_dependent_baseline=False,
            mlp_skill_dependent_baseline=False,
            freeze_manager=False,
            freeze_skills=False,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                                 max_epochs=train_pi_iters,
                                                 **optimizer_args)
        else:
            self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(ConcurrentContinuousPPO,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period
        self.freeze_manager = freeze_manager
        self.freeze_skills = freeze_skills
        assert (not freeze_manager) or (not freeze_skills)

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # i hope this is right
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        # the policy's own period takes precedence over the constructor argument
        self.period = self.policy.period
        self.continuous_latent = self.policy.continuous_latent
        assert self.continuous_latent
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[
                0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi *
                                             (self.num_latents + 1) +
                                             self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(
                    env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(
                    env_spec=skill_dependent_env_spec)

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time
    # assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid
    def init_opt(self):
        assert isinstance(self.policy, HierarchicalPolicy)
        assert not self.freeze_manager and not self.freeze_skills
        manager_surr_loss = 0
        # skill_surr_loss = 0

        obs_var_sparse = ext.new_tensor('sparse_obs',
                                        ndim=2,
                                        dtype=theano.config.floatX)
        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)
        # latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX)
        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)
        log_std_var = ext.new_tensor('log_std',
                                     ndim=2,
                                     dtype=theano.config.floatX)

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################
        latent_var_sparse = self.policy.manager.dist_info_sym(
            obs_var_sparse)['mean']
        latent_var = TT.extra_ops.repeat(latent_var_sparse,
                                         self.period,
                                         axis=0)  #.dimshuffle(0, 'x')
        dist_info_var = self.policy.low_policy.dist_info_sym(
            obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(action_var,
                                                      old_dist_info_var,
                                                      dist_info_var)
        skill_surr_loss_vector = TT.minimum(
            skill_lr * advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = skill_surr_loss  # the manager is trained through the gradient that flows into latent_var_sparse

        if self.freeze_skills and not self.freeze_manager:
            raise NotImplementedError
        elif self.freeze_manager and not self.freeze_skills:
            raise NotImplementedError
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            input_list = [
                obs_var_raw, obs_var_sparse, action_var, advantage_var,
                mean_var, log_std_var
            ]

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])

        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        if not self.continuous_latent:
            advantage_sparse = input_values[2].reshape(
                [input_values[2].shape[0] // self.period, self.period])[:, 0]
            latents = input_values[3]['latents']
            latents_sparse = latents.take(
                [i for i in range(0, latents.shape[0], self.period)], axis=0)
            prob = np.array(list(input_values[3]['prob'].take(
                [i for i in range(0, latents.shape[0], self.period)], axis=0)),
                            dtype=np.float32)
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']

        if self.use_skill_dependent_baseline:
            advantage_var = input_values[4]
        else:
            advantage_var = input_values[2]
        # import ipdb; ipdb.set_trace()
        if self.freeze_skills and not self.freeze_manager:
            raise NotImplementedError
        elif self.freeze_manager and not self.freeze_skills:
            raise NotImplementedError
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                advantage_var, mean, log_std)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values(trainable=True)
        # self.old_policy.set_param_values(old_param_values, trainable=True)
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)
        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
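
In the continuous-latent case the manager's output is broadcast to every low-level timestep via TT.extra_ops.repeat; the NumPy equivalent is np.repeat along axis 0, as in this small illustration:

import numpy as np

latent_sparse = np.array([[0.2, -0.1],
                          [0.5,  0.3]])   # one continuous latent per manager decision
period = 3
latent_per_step = np.repeat(latent_sparse, period, axis=0)
print(latent_per_step.shape)  # (6, 2): each latent repeated for `period` timesteps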