Example #1
    def optimize_policy(self, itr, samples_data):
        all_input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))
        returns = ext.extract(samples_data, "returns")

        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        all_input_values += tuple(state_info_list) + tuple(dist_info_list)
        if self.policy.recurrent:
            all_input_values += (samples_data["valids"], )
        logger.log("Computing loss before")
        # TODO:
        loss_before = self.optimizer.loss(all_input_values)
        logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(all_input_values)
        logger.log("Optimizing")
        self.optimizer.optimize(all_input_values)
        logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(all_input_values)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('MeanKLBefore', mean_kl_before)
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
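All of these snippets lean on an rllab-style ext.extract helper to pull named fields out of samples_data in a fixed order. As a reference point, here is a minimal sketch of the behaviour the examples assume (the project helper may handle additional input types):

    def extract(x, *keys):
        """Return the values stored under `keys`, preserving the key order."""
        if isinstance(x, dict):
            return tuple(x[k] for k in keys)
        if isinstance(x, list):
            # a list of dicts yields one entry per key, each a list over the dicts
            return tuple([xi[k] for xi in x] for k in keys)
        raise NotImplementedError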
Example #2
    def optimize_policy(self, itr, samples_data):
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])

        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        if not self.continuous_latent:
            advantage_sparse = input_values[2].reshape(
                [input_values[2].shape[0] // self.period, self.period])[:, 0]
            latents = input_values[3]['latents']
            latents_sparse = latents.take(
                [i for i in range(0, latents.shape[0], self.period)], axis=0)
            prob = np.array(list(input_values[3]['prob'].take(
                [i for i in range(0, latents.shape[0], self.period)], axis=0)),
                            dtype=np.float32)
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']

        if self.use_skill_dependent_baseline:
            advantage_var = input_values[4]
        else:
            advantage_var = input_values[2]
        # import ipdb; ipdb.set_trace()
        if self.freeze_skills and not self.freeze_manager:
            raise NotImplementedError
        elif self.freeze_manager and not self.freeze_skills:
            raise NotImplementedError
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                advantage_var, mean, log_std)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values(trainable=True)
        # self.old_policy.set_param_values(old_param_values, trainable=True)
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)
        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
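The reshape/take calls above (also used in Examples #26 and #28) downsample a flat rollout into one row per manager period. A small self-contained numpy sketch with hypothetical shapes makes the pattern explicit:

    import numpy as np

    period = 5
    obs = np.arange(20 * 3, dtype=np.float32).reshape(20, 3)  # 20 timesteps, obs_dim = 3

    # group timesteps into blocks of `period` (the batch length must be a multiple of period)
    obs_raw = obs.reshape(obs.shape[0] // period, period, obs.shape[1])  # shape (4, 5, 3)

    # keep only the first timestep of each period (the "sparse" manager observations)
    obs_sparse = obs.take(range(0, obs.shape[0], period), axis=0)        # shape (4, 3)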
Example #3
    def optimize_policy(self, itr, samples_data):
        logger.log('optimizing policy...')
        all_input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages",
                        "weights"))
        if self.safety_constraint:
            all_input_values += tuple(
                ext.extract(samples_data, "safety_values"))
            self.safety_gradient_rescale.set_value(
                samples_data['safety_rescale'])
            logger.record_tabular('SafetyGradientRescale',
                                  self.safety_gradient_rescale.get_value())

        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        all_input_values += tuple(state_info_list) + tuple(dist_info_list)
        if self.policy.recurrent:
            all_input_values += (samples_data["valids"], )
        loss_before = self.optimizer.loss(all_input_values)

        if not (self.safety_constrained_optimizer):
            self.optimizer.optimize(all_input_values)
        else:
            threshold = max(
                self.safety_step_size - samples_data['safety_eval'], 0)
            if 'advantage' in self.safety_key:
                std_adv = np.std(samples_data["safety_values"])
                logger.record_tabular('StdSafetyAdv', std_adv)
                threshold = max(threshold - self.robustness_coeff * std_adv, 0)

            if 'safety_offset' in samples_data:
                logger.record_tabular('SafetyOffset',
                                      samples_data['safety_offset'])

            self.optimizer.optimize(
                all_input_values,
                precomputed_eval=samples_data['safety_eval'],
                precomputed_threshold=threshold,
                diff_threshold=True)

        mean_kl, max_kl = self.opt_info['f_kl'](*all_input_values)
        loss_after = self.optimizer.loss(all_input_values)

        if self.entropy_regularize and self.entropy_coeff_decay != 1:
            current_entropy_coeff = (self.entropy_beta.get_value() *
                                     self.entropy_coeff_decay)
            self.entropy_beta.set_value(current_entropy_coeff)
            logger.record_tabular('EntropyCoeff', current_entropy_coeff)

        if self.learn_safety_tradeoff_coeff:
            delta = samples_data['safety_eval'] - self.safety_step_size
            self.safety_tradeoff_coeff += self.safety_tradeoff_coeff_lr * delta
            self.safety_tradeoff_coeff = max(0, self.safety_tradeoff_coeff)
Example #4
    def do_training(self, itr, batch, offpolicy_batch):

        obs, actions, rewards, next_obs, terminals = ext.extract(
            batch, "observations", "actions", "rewards", "next_observations",
            "terminals")

        obs_off, actions_off, rewards_off, next_obs_off, terminals_off = ext.extract(
            offpolicy_batch, "observations", "actions", "rewards",
            "next_observations", "terminals")

        # compute the on-policy y values
        target_qf = self.opt_info["target_qf"]
        target_policy = self.opt_info["target_policy"]

        next_actions, _ = target_policy.get_actions(next_obs)
        next_qvals = target_qf.get_qval(next_obs, next_actions)

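        # 1-step TD targets: y = r + (1 - done) * gamma * Q'(s', pi'(s'))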
        ys = rewards + (1. -
                        terminals) * self.discount * next_qvals.reshape(-1)

        next_actions_off, _ = target_policy.get_actions(next_obs_off)
        next_qvals_off = target_qf.get_qval(next_obs_off, next_actions_off)

        # off-policy targets are built from the off-policy batch
        ys_off = rewards_off + (
            1. - terminals_off) * self.discount * next_qvals_off.reshape(-1)

        f_train_qf = self.opt_info["f_train_qf"]
        f_train_policy = self.opt_info["f_train_policy"]

        qf_loss, qval, _ = f_train_qf(ys, obs, actions, ys_off, obs_off,
                                      actions_off, self.global_train_step)

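        # soft (Polyak) update of the target Q-function toward the current Q-function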
        target_qf.set_param_values(target_qf.get_param_values() *
                                   (1.0 - self.soft_target_tau) +
                                   self.qf.get_param_values() *
                                   self.soft_target_tau)
        self.qf_loss_averages.append(qf_loss)
        self.q_averages.append(qval)
        self.y_averages.append(ys)  #TODO: also add ys_off

        self.train_policy_itr += self.policy_updates_ratio
        train_policy_itr = 0
        while self.train_policy_itr > 0:
            f_train_policy = self.opt_info["f_train_policy"]
            policy_surr, _ = f_train_policy(obs, obs_off,
                                            self.global_train_step)
            target_policy.set_param_values(target_policy.get_param_values() *
                                           (1.0 - self.soft_target_tau) +
                                           self.policy.get_param_values() *
                                           self.soft_target_tau)
            self.policy_surr_averages.append(policy_surr)
            self.train_policy_itr -= 1
            train_policy_itr += 1

        return 1, train_policy_itr  # number of itrs qf, policy are trained
Example #5
    def optimize_policy(self, itr, samples_data):
        # print(len(samples_data['observations']), self.period)
        # assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))

        time_remaining = input_values[3]['time_remaining']
        resampled_period = input_values[3]['resampled_period']
        obs_var = np.insert(input_values[0],
                            self.policy.obs_robot_dim,
                            time_remaining,
                            axis=1)
        manager_obs_var = obs_var[resampled_period]
        action_var = input_values[1]
        manager_adv_var = input_values[2][resampled_period]

        latent_var = input_values[3]['latents']
        latent_var_sparse = latent_var[resampled_period]
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']
        prob = input_values[3]['prob'][resampled_period]
        if self.use_skill_dependent_baseline:
            skill_adv_var = input_values[4]
            all_input_values = (obs_var, manager_obs_var, action_var,
                                manager_adv_var, skill_adv_var, latent_var,
                                latent_var_sparse, mean, log_std, prob)
        else:
            skill_adv_var = input_values[2]
            all_input_values = (obs_var, manager_obs_var, action_var,
                                manager_adv_var, skill_adv_var, latent_var,
                                latent_var_sparse, mean, log_std, prob)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)
        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #6
    def optimize_policy(
        self, itr, samples_data
    ):  # make sure samples_data comes with latents: see train in batch_polopt
        all_input_values = tuple(
            ext.extract(  # it will be in agent_infos!!! under key "latents"
                samples_data, "observations", "actions", "advantages"))
        agent_infos = samples_data["agent_infos"]
        all_input_values += (
            agent_infos["latents"],
        )  # latents has already been processed and is the concat of all latents, but keeps key "latents"
        info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]  # these are the mean and var used at rollout, corresponding to
        all_input_values += tuple(
            info_list)  # old_dist_info_vars_list as symbolic var
        if self.policy.recurrent:
            all_input_values += (samples_data["valids"], )

        loss_before = self.optimizer.loss(all_input_values)
        # this should always be 0. If it's not, there is a problem.
        mean_kl_before = self.optimizer.constraint_val(all_input_values)
        logger.record_tabular('MeanKL_Before', mean_kl_before)

        with logger.prefix(' PolicyOptimize | '):
            self.optimizer.optimize(all_input_values)

        mean_kl = self.optimizer.constraint_val(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #7
    def optimize_global_policy(self, itr, all_samples_data):

        all_observations = np.concatenate([
            samples_data['observations'] for samples_data in all_samples_data
        ])
        all_actions = np.concatenate([
            samples_data['agent_infos']['mean']
            for samples_data in all_samples_data
        ])

        num_itrs = 1 if itr % self.distillation_period != 0 else 30

        for _ in range(num_itrs):
            self.center_optimizer.optimize([all_observations, all_actions])

        paths = self.global_sampler.obtain_samples(itr)
        samples_data = self.global_sampler.process_samples(itr, paths)

        obs_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))
        dist_info_list = [
            samples_data["agent_infos"][k]
            for k in self.policy.distribution.dist_info_keys
        ]

        all_input_values = obs_values + tuple(dist_info_list)

        self.center_trpo_optimizer.optimize(all_input_values)
        self.env.log_diagnostics(paths)
Example #8
 def optimize_policy(self, itr, samples_data):
     all_input_values = tuple(ext.extract(
         samples_data,
         "observations", "actions", "advantages"
     ))
     agent_infos = samples_data["agent_infos"]
     state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
     dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
     all_input_values += tuple(state_info_list) + tuple(dist_info_list)
     if self.policy.recurrent:
         all_input_values += (samples_data["valids"],)
     logger.log("Computing loss before")
     loss_before = self.optimizer.loss(all_input_values)
     logger.log("Computing KL before")
     mean_kl_before = self.optimizer.constraint_val(all_input_values)
     logger.log("Optimizing")
     self.optimizer.optimize(all_input_values)
     logger.log("Computing KL after")
     mean_kl = self.optimizer.constraint_val(all_input_values)
     logger.log("Computing loss after")
     loss_after = self.optimizer.loss(all_input_values)
     logger.record_tabular('LossBefore', loss_before)
     logger.record_tabular('LossAfter', loss_after)
     logger.record_tabular('MeanKLBefore', mean_kl_before)
     logger.record_tabular('MeanKL', mean_kl)
     logger.record_tabular('dLoss', loss_before - loss_after)
     return dict()
Example #9
    def optimize_policy(self, itr, samples_data):
        inputs = ext.extract(samples_data, "observations", "actions",
                             "advantages")
        agent_info = samples_data["agent_info"]
        state_info_list = [agent_info[k] for k in self.policy.state_info_keys]
        inputs += tuple(state_info_list)
        dist_info_list = [
            agent_info[k] for k in self.policy.distribution.dist_info_keys
        ]
        loss_before = self.optimizer.loss(inputs)
        self.optimizer.optimize(inputs)
        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        rewards = samples_data['rewards']
        entropy_loss = np.mean(self.policy.distribution.entropy(agent_info))
        self.log_summary(itr, loss_after, entropy_loss, np.mean(rewards),
                         np.sum(rewards))

        mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) +
                                                  dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)

        return dict()
Example #10
 def optimize_policy(self, itr, samples_data):
     logger.log('optimizing policy...')
     all_input_values = tuple(
         ext.extract(samples_data, "observations", "actions", "advantages",
                     "weights"))
     agent_infos = samples_data["agent_infos"]
     state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
     dist_info_list = [
         agent_infos[k] for k in self.policy.distribution.dist_info_keys
     ]
     all_input_values += tuple(state_info_list) + tuple(dist_info_list)
     if self.policy.recurrent:
         all_input_values += (samples_data["valids"], )
     loss_before = self.optimizer.loss(all_input_values)
     self.optimizer.optimize(all_input_values)
     mean_kl, max_kl = self.opt_info['f_kl'](*all_input_values)
     loss_after = self.optimizer.loss(all_input_values)
     if self.entropy_regularize and self.entropy_coeff_decay != 1:
         current_entropy_coeff = (self.entropy_beta.get_value() *
                                  self.entropy_coeff_decay)
         self.entropy_beta.set_value(current_entropy_coeff)
         logger.record_tabular('EntropyCoeff', current_entropy_coeff)
     logger.record_tabular('Time', time.time() - self.start_time)
     logger.record_tabular('LossBefore', loss_before)
     logger.record_tabular('LossAfter', loss_after)
     logger.record_tabular('MeanKL', mean_kl)
     logger.record_tabular('MaxKL', max_kl)
     logger.record_tabular('dLoss', loss_before - loss_after)
     logger.log('optimization finished')
Example #11
File: ppo.py Project: Neo-X/GMPS
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(
            samples_data,
            "observations",
            "actions",
            "advantages"  ## GAE R - V(s) 
        )
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        inputs += tuple(state_info_list) + tuple(dist_info_list)
        if self.policy.recurrent:
            inputs += (samples_data["valids"], )
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        loss_before = self.optimizer.loss(inputs)
        ### For PPO this should be more than one step.
        self.optimizer.optimize(inputs)
        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl, clip_frac, log_std = self.opt_info['f_kl'](
            *(list(inputs) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
        logger.record_tabular("ClipFrac", clip_frac)
        logger.record_tabular("AvgStd", np.mean(np.exp(log_std)))

        if self.comet_logger:
            self.comet_logger.log_metric('ClipFrac', clip_frac)
Example #12
    def do_training(self, itr, batch):

        obs, actions, rewards, next_obs, terminals = ext.extract(
            batch, "observations", "actions", "rewards", "next_observations",
            "terminals")

        # compute the on-policy y values
        target_qf = self.opt_info_critic["target_qf"]

        next_actions, next_actions_dict = self.policy.get_actions(next_obs)
        if self.qprop_use_mean_action:
            next_actions = next_actions_dict["mean"]
        next_qvals = target_qf.get_qval(next_obs, next_actions)

        ys = rewards + (1. - terminals) * self.discount * next_qvals

        f_train_qf = self.opt_info_critic["f_train_qf"]

        qf_loss, qval, _ = f_train_qf(ys, obs, actions)

        target_qf.set_param_values(target_qf.get_param_values() *
                                   (1.0 - self.soft_target_tau) +
                                   self.qf.get_param_values() *
                                   self.soft_target_tau)

        self.qf_loss_averages.append(qf_loss)
        self.q_averages.append(qval)
        self.y_averages.append(ys)
Example #13
    def do_phi_training(self, itr, indices=None, samples_data=None):
        
        batch_samples = samples_data
        '''
        dict(
            observations=samples_data["observations"][indices],
            actions=samples_data["actions"][indices],
            origin_advantages=samples_data["origin_advantages"][indices],)
        '''
        inputs = ext.extract(
            batch_samples, 
            "observations", 
            "actions", 
            "origin_advantages", 
            "etas",)

        # the following code is useless
        # FIXME: write a better version of this
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]

        inputs += tuple(state_info_list)

        #TODO: add recurrent
        if self.policy.recurrent:
            inputs += (samples_data["valid"], )
        
        pf_outputs = self.opt_train_phi['f_train_pf'](*inputs)
        pf_loss = pf_outputs.pop(0)
        self.pf_loss_averages.append(pf_loss)
Example #14
    def optimize_policy(self, itr, all_samples_data, particle_idx):
        self.policy = self.policy_list[particle_idx]
        self.optimizer = self.optimizer_list[particle_idx]
        assert len(all_samples_data) == len(self.policy_list)
        assert len(all_samples_data[0]) == self.num_grad_updates + 1

        input_list = []
        for step in range(len(
                all_samples_data[0])):  # these are the gradient steps
            for n in range(len(all_samples_data)):
                obs_list, action_list, adv_list = [], [], []
                for i in range(self.meta_batch_size):
                    inputs = ext.extract(all_samples_data[n][step][i],
                                         "observations", "actions",
                                         "advantages")
                    obs_list.append(inputs[0])
                    action_list.append(inputs[1])
                    adv_list.append(inputs[2])
                input_list += obs_list + action_list + adv_list  # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]

        dist_info_list = []
        for i in range(self.meta_batch_size):
            obs_list = all_samples_data[particle_idx][
                self.kl_constrain_step][i]['observations']
            agent_infos = self.policy.get_mean_logstd(obs_list,
                                                      self.meta_batch_size, i)
            dist_info_list += [
                agent_infos[k] for k in self.policy.distribution.dist_info_keys
            ]
        input_list += tuple(dist_info_list)

        self.optimizer.optimize(input_list)

        return dict()
Example #15
    def optimize_policy(self, itr, all_samples_data, particle_idx):
        logger.log("optimizing policy")
        assert len(all_samples_data) == len(self.policy_list)
        assert len(all_samples_data[0]) == self.num_leader_grad_updates
        
        input_list = []
        for step in range(len(all_samples_data[0])):
            for n in range(len(all_samples_data)):
                obs_list, action_list, adv_list = [], [], []
                for i in range(self.meta_batch_size):
                    inputs = ext.extract(
                        all_samples_data[n][step][i],
                        "observations", "actions", "advantages"
                    )
                    obs_list.append(inputs[0])
                    action_list.append(inputs[1])
                    adv_list.append(inputs[2])
                input_list += obs_list + action_list + adv_list

        if particle_idx == 0 and (self.n_particles > 1):
            sess = tf.get_default_session()
            global_h = sess.run(
                self.global_h,
                feed_dict=dict(zip(self.policy_list[0].input_list_for_grad, input_list)))
            logger.record_tabular('global_h', global_h)
        
        self.optimizer_list[particle_idx].optimize(input_list)
Example #16
    def optimize_policy(self, itr, samples_data):
        inputs = ext.extract(samples_data, "observations", "actions", "target")

        self.optimizer.optimize(inputs)
        self.loss_after = self.optimizer.loss(inputs)

        return dict()
Example #17
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(samples_data, "observations", "actions",
                             "advantages")
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        # state_info_keys is prev_action
        # so agent_infos should include prev_action
        # agent_infos from policy.get_action
        inputs += tuple(state_info_list)
        if self.policy.recurrent:
            inputs += (samples_data["valids"], )
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        loss_before = self.optimizer.loss(inputs)
        self.optimizer.optimize(inputs)
        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) +
                                                  dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Example #18
    def do_training(self, itr, batch):

        obs, actions, rewards, next_obs, terminals = ext.extract(
            batch, "observations", "actions", "rewards", "next_observations",
            "terminals")

        # compute the on-policy y values
        target_qf = self.opt_info["target_qf"]
        target_policy = self.opt_info["target_policy"]

        next_actions, _ = target_policy.get_actions(next_obs)
        next_qvals = target_qf.get_qval(next_obs, next_actions)

        ys = rewards + (1. - terminals) * self.discount * next_qvals

        f_train_qf = self.opt_info["f_train_qf"]
        f_train_policy = self.opt_info["f_train_policy"]

        qf_loss, qval = f_train_qf(ys, obs, actions)

        policy_surr = f_train_policy(obs)

        target_policy.set_param_values(target_policy.get_param_values() *
                                       (1.0 - self.soft_target_tau) +
                                       self.policy.get_param_values() *
                                       self.soft_target_tau)
        target_qf.set_param_values(target_qf.get_param_values() *
                                   (1.0 - self.soft_target_tau) +
                                   self.qf.get_param_values() *
                                   self.soft_target_tau)

        self.qf_loss_averages.append(qf_loss)
        self.policy_surr_averages.append(policy_surr)
        self.q_averages.append(qval)
        self.y_averages.append(ys)
Example #19
    def optimize_policy(self, itr, samples_data, paths):
        sortedPaths = paths

        # select a subset of paths for training.
        if len(sortedPaths) > POWERGradient.numSampledPaths:
            # select a random subset of paths for training.
            # selected_samples = random.sample(range(len(sortedPaths)), 10)
            # sortedPaths = [sortedPaths[x] for x in selected_samples]

            # select the subset of best paths for training.
            sortedPaths = sorted(paths, key=lambda path: np.sum(path["rewards"]), reverse=True)
            sortedPaths = sortedPaths[0:POWERGradient.numSampledPaths]

        processed_samples = self.sampler.process_samples(itr, sortedPaths)

        all_input_values = ext.extract(
            processed_samples,
            "observations", "actions", "path_rewards"
        )

        polGrad = np.array([0.] * len(self.policy.get_param_values()))
        for obv, act, path_rew in zip(*all_input_values):
            polGrad = polGrad + path_rew * np.array(self.polLogGradFunc(obv, act))
        polGrad = polGrad / POWERGradient.numSampledPaths

        # RMSProp update of policy parameters.
        self.gS = 0.9 * self.gS + 0.1 * (polGrad ** 2)
        newPolParams = self.policy.get_param_values() + self.step_size * polGrad / np.sqrt(self.gS + TINY)
        self.policy.set_param_values(newPolParams)

        return dict()
Example #20
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(
            samples_data,
            "observations", "actions", "advantages", "noises", "task_idxs"
        )
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        inputs += tuple(state_info_list)
        if self.policy.recurrent:
            inputs += (samples_data["valids"],)
        dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        loss_before = self.optimizer.loss(inputs)
        
        # assumption: read the latent parameters through the default TF session
        sess = tf.get_default_session()
        curr_mean = sess.run(self.policy.all_params['latent_means'])
        curr_std = np.exp(sess.run(self.policy.all_params['latent_stds']))
        # import ipdb; ipdb.set_trace()

        self.optimizer.optimize(inputs)

        curr_mean = sess.run(self.policy.all_params['latent_means'])
        curr_std = np.exp(sess.run(self.policy.all_params['latent_stds']))
        # import ipdb; ipdb.set_trace()

        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Example #21
    def MPC(self, num_samples):
        all_samples = []
        paths = self.obtain_samples(0, self.goal)

        samples_data = {}
        for key in paths.keys():  # the keys are the tasks
            # don't log because this will spam the console with every task.
            samples_data[key] = self.process_samples(0, paths[key], log=False)

        obs_list, action_list, adv_list = [], [], []
        for i in range(num_samples):
            inputs = ext.extract(samples_data[i], 'observations', 'actions',
                                 'advantages')
            # inputs_list.append(np.concatenate((inputs[0], inputs[1]), axis = 1).astype(np.float32))
            obs_list.append(inputs[0])
            action_list.append(inputs[1].reshape(
                [-1, 20, self.env.action_space.flat_dim]))
            adv_list.append(0)

        for i in range(num_samples):
            new_obs_list = []
            for j in range(action_list[i].shape[0]):
                self.env.reset(init_state=obs_list[i][j])
                action = np.clip(action_list[i][j],
                                 *self.env.action_space.bounds)
                _, reward, _, _ = self.env.step(action)
                adv_list[i] = adv_list[i] + reward
                new_obs_list.append(
                    self.policy.get_state(obs_list[i][j], action_list[i][j]))
            obs_list[i] = new_obs_list

        index = np.argmax(adv_list)
        return action_list[index][0]
Example #22
    def optimize_policy(self, itr, samples_latent):
        logger.log("optimizing policy")
        # inputs = ext.extract(samples,
        #             'observations', 'actions', 'advantages', 'noises', 'task_idxs')

        # obs=inputs[0]
        # actions=inputs[1]
        # advantages=inputs[2]
        # noises=inputs[3]
        # task_idxs = inputs[4]

        latent_inputs = ext.extract(samples_latent, "advantages", "noises",
                                    "task_idxs")
        latent_advantages = latent_inputs[0]
        latent_noises = latent_inputs[1]
        latent_task_idxs = latent_inputs[2]

        sess = tf.get_default_session()

        means = sess.run(
            tf.gather(self.policy.all_params['latent_means'],
                      latent_task_idxs))
        logstds = sess.run(
            tf.gather(self.policy.all_params['latent_stds'], latent_task_idxs))
        #import ipdb
        #ipdb.set_trace()

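        # reparameterized latent samples: z = mean + exp(log_std) * noise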
        zs = means + latent_noises * np.exp(logstds)
        # self.num_top = 10
        # best_indices = advantages.argsort()[-self.num_top:][::-1]
        # good_noises = np.asarray([zs[ind] for ind in best_indices])
        # inputs = [obs,  actions, advantages, noises, task_idxs, latent_advantages, zs, latent_task_idxs]
        inputs = [latent_advantages, zs, latent_task_idxs]

        self.optimize(inputs, sess, itr)
Example #23
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(samples_data, "observations", "actions",
                             "advantages")
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        inputs += tuple(state_info_list)
        if self.policy.recurrent:
            inputs += (samples_data["valids"], )
        if self.qprop:
            inputs += (samples_data["etas"], )
            logger.log("Using Qprop optimizer")
        optimizer = self.optimizer
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        loss_before = optimizer.loss(inputs)
        gc.collect()
        optimizer.optimize(inputs)
        gc.collect()
        loss_after = optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) +
                                                  dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Example #24
 def optimize_policy(self, itr, samples_data):
     all_input_values = tuple(
         ext.extract(samples_data, "observations", "actions", "advantages"))
     agent_infos = samples_data["agent_infos"]
     state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
     dist_info_list = [
         agent_infos[k] for k in self.policy.distribution.dist_info_keys
     ]
     all_input_values += tuple(state_info_list) + tuple(dist_info_list)
     if self.policy.recurrent:
         all_input_values += (samples_data["valids"], )
     loss_before = self.optimizer.loss(all_input_values)
     mean_kl_before = self.optimizer.constraint_val(all_input_values)
     if (itr == 0):
         acceptViolation = True
     else:
         acceptViolation = False
     self.optimizer.optimize(all_input_values,
                             acceptViolation=acceptViolation)
     mean_kl = self.optimizer.constraint_val(all_input_values)
     loss_after = self.optimizer.loss(all_input_values)
     logger.record_tabular('LossBefore', loss_before)
     logger.record_tabular('LossAfter', loss_after)
     logger.record_tabular('MeanKLBefore', mean_kl_before)
     logger.record_tabular('MeanKL', mean_kl)
     logger.record_tabular('dLoss', loss_before - loss_after)
     return dict()
Example #25
    def do_training(self, itr, batch):
        # Update Q Function
        obs, actions, rewards, next_obs, terminals = ext.extract(
            batch, "observations", "actions", "rewards", "next_observations",
            "terminals")

        next_actions, _ = self.target_policy.get_action(next_obs)
        next_qvals = self.target_qf.get_qval(next_obs, next_actions)

        rewards = rewards.reshape(-1, 1)
        terminals_mask = (1.0 - terminals).reshape(-1, 1)
        ys = rewards + terminals_mask * self.discount * next_qvals

        qf_loss = self.train_qf(ys, obs, actions)
        policy_surr = self.train_policy(obs)

        self.target_policy.set_param_values(
            self.target_policy.get_param_values() *
            (1 - self.soft_target_tau) +
            self.policy.get_param_values() * self.soft_target_tau)

        self.target_qf.set_param_values(self.target_qf.get_param_values() *
                                        (1 - self.soft_target_tau) +
                                        self.qf.get_param_values() *
                                        self.soft_target_tau)

        self.qf_loss_averages.append(qf_loss)
        self.policy_surr_averages.append(policy_surr)
Example #26
    def optimize_policy(self, itr, samples_data):
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))
        # print(input_values[0].shape)

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])
        # obs_raw = input_values[0]

        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        advantage_sparse = np.sum(input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period]),
                                  axis=1)
        all_input_values = (obs_raw, obs_sparse, input_values[1],
                            advantage_sparse)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #27
    def do_training(self, itr, batch):

        obs, actions, rewards, next_obs, terminals = ext.extract(
            batch,
            "observations", "actions", "rewards", "next_observations",
            "terminals"
        )

        # compute the on-policy y values
        target_qf = self.opt_info["target_qf"]
        target_policy = self.opt_info["target_policy"]

        next_actions, _ = target_policy.get_actions(next_obs)
        next_qvals = target_qf.get_qval(next_obs, next_actions)

        ys = rewards + (1. - terminals) * self.discount * next_qvals

        f_train_qf = self.opt_info["f_train_qf"]
        f_train_policy = self.opt_info["f_train_policy"]

        qf_loss, qval = f_train_qf(ys, obs, actions)

        policy_surr = f_train_policy(obs)

        target_policy.set_param_values(
            target_policy.get_param_values() * (1.0 - self.soft_target_tau) +
            self.policy.get_param_values() * self.soft_target_tau)
        target_qf.set_param_values(
            target_qf.get_param_values() * (1.0 - self.soft_target_tau) +
            self.qf.get_param_values() * self.soft_target_tau)

        self.qf_loss_averages.append(qf_loss)
        self.policy_surr_averages.append(policy_surr)
        self.q_averages.append(qval)
        self.y_averages.append(ys)
Example #28
    def optimize_policy(self, itr, samples_data):
        # import IPython; IPython.embed()
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))
        # print(input_values[0].shape)

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])
        # obs_raw = input_values[0]

        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        advantage_sparse = input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period])[:, 0]
        latents = input_values[3]['latents']
        latents_sparse = latents.take(
            [i for i in range(0, latents.shape[0], self.period)], axis=0)

        if self.use_skill_dependent_baseline:
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                input_values[4], advantage_sparse, latents,
                                latents_sparse)
        else:
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                input_values[2], advantage_sparse, latents,
                                latents_sparse)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #29
 def __setstate__(self, d):
     super(ReplayPool, self).__setstate__(d)
     self.bottom, self.top, self.size, self.observations, self.actions, \
     self.rewards, self.terminals, self.extras, self.rng = extract(
         d,
         "bottom", "top", "size", "observations", "actions", "rewards",
         "terminals", "extras", "rng"
     )
Example #30
 def __setstate__(self, d):
     super(ReplayPool, self).__setstate__(d)
     self.bottom, self.top, self.size, self.observations, self.actions, \
         self.rewards, self.terminals, self.extras, self.rng = extract(
             d,
             "bottom", "top", "size", "observations", "actions", "rewards",
             "terminals", "extras", "rng"
         )
Example #31
    def optimize_policy(self, itr, samples_data):
        all_input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))

        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        all_input_values += tuple(state_info_list) + tuple(dist_info_list)

        aux_pred_data = self.aux_pred_pool.random_batch(
            int(self.pool_batch_size))

        all_input_values += tuple([np.array(aux_pred_data['inputs'])]) + tuple(
            [aux_pred_data['outputs']])

        if self.policy.recurrent:
            all_input_values += (samples_data["valids"], )
        loss_before = self.optimizer.loss(all_input_values)

        mean_kl_before = self.optimizer.constraint_val(all_input_values)
        self.optimizer.optimize(all_input_values)

        pred_loss = self.policy.aux_loss(aux_pred_data['inputs'],
                                         aux_pred_data['outputs'])

        if itr == 0:
            self.optimize_aux_tasks(epoch=100)
        '''loss_after = self.optimizer.loss(all_input_values)
        param_before = np.copy(self.policy.get_param_values(trainable=True))
        aux_net_param_before = np.copy(self.policy._aux_pred_network.get_param_values(trainable=True))
        if itr == 0:
            auxstep_size = 0
            self.optimize_aux_tasks(epoch=100)
        else:
            self.optimize_aux_tasks(1)
            policy_direction = self.policy.get_param_values(trainable=True) - param_before
            aux_net_direction = self.policy._aux_pred_network.get_param_values(trainable=True) - aux_net_param_before
            auxstep_size = 1
            for line_step in range(20):
                self.policy.set_param_values(param_before + auxstep_size * policy_direction, trainable=True)
                temp_kl = self.optimizer.constraint_val(all_input_values)
                temp_loss = self.optimizer.loss(all_input_values)
                if temp_loss < loss_after+abs(loss_after)*0.001 and temp_kl < self.step_size:
                    break
                auxstep_size *= 0.6
            self.policy._aux_pred_network.set_param_values(aux_net_param_before + auxstep_size * aux_net_direction,trainable=True)'''

        mean_kl = self.optimizer.constraint_val(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('Prediction Loss', pred_loss)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('MeanKLBefore', mean_kl_before)
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #32
    def optimize_policy(self, itr, all_samples_data, particle_idx):
        self.policy = self.policy_list[particle_idx]
        self.optimizer = self.optimizer_list[particle_idx]
        assert len(all_samples_data) == len(self.policy_list)
        assert len(all_samples_data[0]) == self.num_grad_updates + 1  

        input_list = []
        for n in range(len(all_samples_data)):
            for step in range(len(all_samples_data[0]) - 1):  # these are the gradient steps
                obs_list, action_list, adv_list = [], [], []
                for i in range(self.meta_batch_size):
                    inputs = ext.extract(
                        all_samples_data[n][step][i],
                        "observations", "actions", "advantages"
                    )
                    obs_list.append(inputs[0])
                    action_list.append(inputs[1])
                    adv_list.append(inputs[2])
                input_list += obs_list + action_list + adv_list  # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]

        if particle_idx == 0 and (self.n_particles > 1):
            sess = tf.get_default_session()
            global_h = sess.run(
                self.global_h,
                feed_dict=dict(zip(self.policy_list[0].input_list_for_grad, input_list)))
            logger.record_tabular('global_h', global_h)

        obs_list, action_list, adv_list = [], [] , []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(
                all_samples_data[particle_idx][-1][i],
                "observations", "actions", "advantages"
            )
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        input_list += obs_list + action_list + adv_list

        dist_info_list = []
        for i in range(self.meta_batch_size):
            agent_infos = all_samples_data[particle_idx][self.kl_constrain_step][i]['agent_infos']
            dist_info_list += [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        input_list += tuple(dist_info_list)

        self.optimizer.optimize(input_list)

        return dict()
Example #33
    def optimize_policy(self, itr, samples_data):
        # update the weight entropy input list
        ent_input = []
        for i in range(1000):
            ent_input.append(
                np.concatenate([self.base,
                                np.random.random(self.mp_dim)]).tolist())
        self.ent_input = [np.array(ent_input)]

        all_input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))

        ooo = ext.extract(samples_data, "observations", "actions",
                          "advantages")

        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        all_input_values += tuple(state_info_list) + tuple(dist_info_list)

        all_input_values += tuple(self.ent_input)

        if self.policy.recurrent:
            all_input_values += (samples_data["valids"], )
        loss_before = self.optimizer.loss(all_input_values)

        mean_kl_before = self.optimizer.constraint_val(all_input_values)
        self.optimizer.optimize(all_input_values)

        ent_input = tuple(self.ent_input)
        blend_weight_entropy = self.policy._f_weightentropy(ent_input[0])[0]
        blend_choice_entropy = self.policy._f_choiceentropy(ent_input[0])[0]

        mean_kl = self.optimizer.constraint_val(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('Blend Weight Entropy', blend_weight_entropy)
        logger.record_tabular('Blend Choice Entropy', blend_choice_entropy)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('MeanKLBefore', mean_kl_before)
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #34
    def train_from_paths(self, paths, sub_sample=None, path_percentile=[10,15,33,50,66,85,90]):
        
        if sub_sample is not None:
            # Pick the subset of paths whose returns fall in the sub_sample percentile range
            path_returns = [sum(p["rewards"]) for p in paths]
            sub_range = [np.percentile(path_returns, sub_sample[i]) for i in range(2)]
            # Find paths which satisfy the criteria
            idx = [i for i, ret in enumerate(path_returns)
                   if sub_range[0] <= ret <= sub_range[1]]
            chosen_paths = [paths[i] for i in idx]
        else:
            chosen_paths = paths

        self.baseline.fit(paths)
        # concatenate from all the trajectories
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in chosen_paths])
        actions      = tensor_utils.concat_tensor_list([path["actions"] for path in chosen_paths])
        rewards      = tensor_utils.concat_tensor_list([path["rewards"] for path in chosen_paths])
        advantages   = tensor_utils.concat_tensor_list([path["advantages"] for path in chosen_paths])
        env_infos    = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in chosen_paths])
        agent_infos  = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in chosen_paths])

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
        )
        
        all_input_values = tuple(ext.extract(
            samples_data,
            "observations", "actions", "advantages"
        ))
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        all_input_values += tuple(state_info_list) + tuple(dist_info_list)
        
        # Take a step with optimizer
        self.optimizer.optimize(all_input_values)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return  = np.mean(path_returns)
        std_return   = np.std(path_returns)
        min_return   = np.amin(path_returns)
        max_return   = np.amax(path_returns)
        sub_mean     = np.mean([sum(p["rewards"]) for p in chosen_paths])

        base_stats = [mean_return, std_return, min_return, max_return, sub_mean]
        percentile_stats = []
        for p in path_percentile:
            percentile_stats.append(np.percentile(path_returns, p))

        return [base_stats, percentile_stats]
Example #35
    def compute_updated_dists(self, samples):
        """ Compute fast gradients once and pull them out of tensorflow for sampling.
        """
        num_tasks = len(samples)
        param_keys = self.all_params.keys()

        sess = tf.get_default_session()

        obs_list, action_list, adv_list = [], [], []
        for i in range(num_tasks):
            inputs = ext.extract(samples[i],
                    'observations', 'actions', 'advantages')
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])

        inputs = obs_list + action_list + adv_list

        # To do a second update, replace self.all_params below with the params that were used to collect the policy.
        init_param_values = None
        if self.all_param_vals is not None:
            init_param_values = self.get_variable_values(self.all_params)

        step_size = self.step_size
        for i in range(num_tasks):
            if self.all_param_vals is not None:
                self.assign_params(self.all_params, self.all_param_vals[i])

        if 'all_fast_params_tensor' not in dir(self):
            # make computation graph once
            self.all_fast_params_tensor = []
            for i in range(num_tasks):
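                # one inner (fast) gradient step per task: fast_params = params - step_size * grad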
                gradients = dict(zip(param_keys, tf.gradients(self.surr_objs[i], [self.all_params[key] for key in param_keys])))
                fast_params_tensor = dict(zip(param_keys, [self.all_params[key] - step_size*gradients[key] for key in param_keys]))
                self.all_fast_params_tensor.append(fast_params_tensor)

        # pull new param vals out of tensorflow, so gradient computation only done once
        self.all_param_vals = sess.run(self.all_fast_params_tensor, feed_dict=dict(list(zip(self.input_list_for_grad, inputs))))

        if init_param_values is not None:
            self.assign_params(self.all_params, init_param_values)

        outputs = []
        inputs = tf.split(0, num_tasks, self._l_obs)
        for i in range(num_tasks):
            # TODO - use a placeholder to feed in the params, so that we don't have to recompile every time.
            task_inp = inputs[i]
            info, _ = self.dist_info_sym(task_inp, dict(), all_params=self.all_param_vals[i],
                    is_training=False)

            outputs.append([info['prob']])

        self._cur_f_prob = tensor_utils.compile_function(
            inputs = [self._l_obs],
            outputs = outputs,
        )
Example #36
    def optimize_policy(self, itr, all_samples_data):
        assert len(all_samples_data) == self.num_grad_updates + 1  # we collected the rollouts to compute the grads and then the test!

        if not self.use_maml:
            all_samples_data = [all_samples_data[0]]

        input_list = []
        for step in range(len(all_samples_data)):  # these are the gradient steps
            obs_list, action_list, adv_list = [], [], []
            for i in range(self.meta_batch_size):

                inputs = ext.extract(
                    all_samples_data[step][i],
                    "observations", "actions", "advantages"
                )
                obs_list.append(inputs[0])
                action_list.append(inputs[1])
                adv_list.append(inputs[2])
            input_list += obs_list + action_list + adv_list  # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]

            if step == 0:  ##CF not used?
                init_inputs = input_list

        if self.use_maml:
            dist_info_list = []
            for i in range(self.meta_batch_size):
                agent_infos = all_samples_data[self.kl_constrain_step][i]['agent_infos']
                dist_info_list += [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
            input_list += tuple(dist_info_list)
            logger.log("Computing KL before")
            mean_kl_before = self.optimizer.constraint_val(input_list)

        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(input_list)
        logger.log("Optimizing")
        self.optimizer.optimize(input_list)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(input_list)
        if self.use_maml:
            logger.log("Computing KL after")
            mean_kl = self.optimizer.constraint_val(input_list)
            logger.record_tabular('MeanKLBefore', mean_kl_before)  # this now won't be 0!
            logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #37
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(
            samples_data,
            "observations", "actions", "advantages"
        )
        if self.policy.recurrent:
            inputs += (samples_data["valids"],)
        agent_infos = samples_data["agent_infos"]
        dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        loss_before = self.optimizer.loss(inputs)
        self.optimizer.optimize(inputs)
        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Example #38
    def train_from_paths(self, paths):
        
        self.baseline.fit(paths)
        # concatenate from all the trajectories
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions      = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards      = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        advantages   = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos    = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos  = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
        )
        
        all_input_values = tuple(ext.extract(
            samples_data,
            "observations", "actions", "advantages"
        ))
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        all_input_values += tuple(state_info_list)
        
        # Take a step with optimizer
        self.optimizer.optimize(all_input_values)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return  = np.mean(path_returns)
        std_return   = np.std(path_returns)
        min_return   = np.amin(path_returns)
        max_return   = np.amax(path_returns)
        return (mean_return, std_return, min_return, max_return)