Example #1
    def step(self, sess, itr):
        with util.Timer() as t_all:
            with util.Timer() as t_sample:
                if itr == 0:
                    # extra batch to init std
                    trajbatchlist0, _ = self.sampler.sample(sess, itr)
                    for policy, baseline, trajbatch0 in util.safezip(self.policies, self.baselines,
                                                                     trajbatchlist0):
                        policy.update_obsnorm(trajbatch0.obs.stacked, sess=sess)
                        baseline.update_obsnorm(trajbatch0.obs.stacked, sess=sess)
                        self.sampler.rewnorm.update(trajbatch0.r.stacked[:, None], sess=sess)
                trajbatchlist, sampler_info_fields = self.sampler.sample(sess, itr)

            # Baseline
            with util.Timer() as t_base:
                trajbatch_vals_list, base_info_fields_list = [], []
                for agid, trajbatch in enumerate(trajbatchlist):
                    trajbatch_vals, base_info_fields = self.sampler.process(sess, itr, trajbatch,
                                                                            self.discount,
                                                                            self.gae_lambda,
                                                                            self.baselines[agid])
                    trajbatch_vals_list.append(trajbatch_vals)
                    base_info_fields_list += base_info_fields

            # Take policy steps
            with util.Timer() as t_step:
                step_print_fields_list = []
                params0_P_list = []
                for agid, policy in enumerate(self.policies):
                    params0_P = policy.get_params()
                    params0_P_list.append(params0_P)
                    step_print_fields = self.step_func(sess, policy, trajbatchlist[agid],
                                                       trajbatch_vals_list[agid]['advantage'])
                    step_print_fields_list += step_print_fields
                    policy.update_obsnorm(trajbatchlist[agid].obs.stacked, sess=sess)
                    self.sampler.rewnorm.update(trajbatchlist[agid].r.stacked[:, None], sess=sess)

        # LOG
        self.total_time += t_all.dt

        infos = []
        for agid in range(len(self.policies)):
            infos += [
                ('vf_r2_{}'.format(agid), trajbatch_vals_list[agid]['v_r'], float),
                ('tdv_r2_{}'.format(agid), trajbatch_vals_list[agid]['tv_r'], float),
                ('ent_{}'.format(agid), self.policies[agid]._compute_actiondist_entropy(
                    trajbatchlist[agid].adist.stacked).mean(), float),
                ('dx_{}'.format(agid),
                 util.maxnorm(params0_P_list[agid] - self.policies[agid].get_params()), float)
            ]
        fields = [
            ('iter', itr, int)
        ] + sampler_info_fields + infos + base_info_fields_list + step_print_fields_list + [
            ('tsamp', t_sample.dt, float),  # Time for sampling
            ('tbase', t_base.dt, float),  # Time for advantage/baseline computation
            ('tstep', t_step.dt, float),
            ('ttotal', self.total_time, float)
        ]
        return fields
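Every example in this listing relies on util.safezip, whose implementation is not shown. Judging from how it is called, it is presumably a zip() variant that refuses to silently truncate mismatched inputs; the sketch below is only an assumed stand-in, not the actual util module code.

def safezip(*sequences):
    # Assumed behavior: a zip() that fails loudly when input lengths differ,
    # instead of silently truncating to the shortest sequence.
    assert len(set(len(s) for s in sequences)) <= 1, 'safezip: sequence lengths differ'
    return zip(*sequences)

pairs = list(safezip(['policy0', 'policy1'], [0.1, 0.2]))  # OK: equal lengths
# safezip(['policy0'], [0.1, 0.2]) would raise an AssertionError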
Example #2
def unflatten_into_vars(flatparams_P, param_vars, name=None):
    """
    Unflattens a vector produced by flatcat into the original variables
    """
    with tf.op_scope([flatparams_P] + param_vars, name,
                     'unflatten_into_vars') as scope:
        tensors = unflatten_into_tensors(
            flatparams_P, [v.get_shape().as_list() for v in param_vars])
        return tf.group(
            *[v.assign(t) for v, t in util.safezip(param_vars, tensors)],
            name=scope)
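For context, a self-contained sketch of what the op above accomplishes, written with plain TF 1.x calls instead of the tfutil helpers (the variable shapes and names are made up for illustration): it splits a flat parameter vector, reshapes the pieces, and assigns them back into the variables.

import numpy as np
import tensorflow as tf  # TF 1.x API, matching the snippet above

# Two variables whose flat concatenation has 3*2 + 2 = 8 entries
W = tf.Variable(tf.zeros([3, 2]))
b = tf.Variable(tf.zeros([2]))
flatparams = tf.placeholder(tf.float32, [8], name='flatparams')

# Same idea as unflatten_into_vars: split the flat vector, reshape, assign
w_flat, b_flat = tf.split(flatparams, [6, 2])
assign_op = tf.group(W.assign(tf.reshape(w_flat, [3, 2])),
                     b.assign(tf.reshape(b_flat, [2])))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_op, feed_dict={flatparams: np.arange(8, dtype=np.float32)})
    # W now holds [[0, 1], [2, 3], [4, 5]] and b holds [6, 7]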
Example #3
    def train(self, sess, log, save_freq, blend_freq=0, keep_kmax=0, blend_eval_trajs=50):
        for itr in range(self.start_iter, self.n_iter):
            iter_info = self.step(sess, itr)
            log.write(iter_info, print_header=itr % 20 == 0)
            if itr % save_freq == 0 or itr % self.n_iter == 0:
                for policy in self.policies:
                    log.write_snapshot(sess, policy, itr)

            if blend_freq > 0:
                # Blending does not work
                assert self.target_policy is not None
                if itr == 0:
                    params_P_ag = [policy.get_params() for policy in self.policies]
                    weights, evalrewards = self._eval_policy_weights(blend_eval_trajs)
                    weightparams_P = np.sum([w * p for w, p in util.safezip(weights, params_P_ag)],
                                            axis=0)

                    blendparams_P = 0.001 * self.target_policy.get_params() + 0.999 * weightparams_P
                if itr > 0 and (itr % blend_freq == 0 or itr % self.n_iter == 0):
                    params_P_ag = [policy.get_params() for policy in self.policies]
                    weights, evalrewards = self._eval_policy_weights(blend_eval_trajs)
                    weightparams_P = np.sum([w * p for w, p in util.safezip(weights, params_P_ag)],
                                            axis=0)

                    blendparams_P = self.interp_alpha * self.target_policy.get_params() + (
                        1 - self.interp_alpha) * weightparams_P

                self.target_policy.set_params(blendparams_P)
                log.write_snapshot(sess, self.target_policy, itr)
                if keep_kmax:
                    keep_inds = np.argpartition(evalrewards, -keep_kmax)[-keep_kmax:]
                else:
                    keep_inds = []
                for agid, policy in enumerate(self.policies):
                    if agid in keep_inds:
                        continue
                    policy.set_params(blendparams_P)
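The blending arithmetic in this example is a convex combination of flat parameter vectors. A minimal numpy sketch of the same computation, with made-up shapes, weights, and alpha:

import numpy as np

interp_alpha = 0.5
target_params = np.zeros(10)                    # target_policy.get_params()
agent_params = [np.ones(10), 2 * np.ones(10)]   # per-agent policy.get_params()
weights = [0.25, 0.75]                          # from _eval_policy_weights

weightparams = np.sum([w * p for w, p in zip(weights, agent_params)], axis=0)
blendparams = interp_alpha * target_params + (1 - interp_alpha) * weightparams
# Each agent (except any kept top-k) would then receive blendparams via set_params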
Example #4
    def __init__(self, arrays, lengths=None):
        if lengths is None:
            # Without provided lengths, `arrays` is interpreted as a list of arrays
            # and self.lengths is set to the list of lengths for those arrays
            self.arrays = arrays
            self.stacked = np.concatenate(arrays, axis=0)
            self.lengths = np.array([len(a) for a in arrays])
        else:
            # With provided lengths, `arrays` is interpreted as concatenated data
            # and self.lengths is set to the provided lengths.
            self.arrays = np.split(arrays, np.cumsum(lengths)[:-1])
            self.stacked = arrays
            self.lengths = np.asarray(lengths, dtype=int)
            assert all(len(a) == l for a, l in util.safezip(self.arrays, self.lengths))
        self.boundaries = np.concatenate([[0], np.cumsum(self.lengths)])
        assert self.boundaries[-1] == len(self.stacked)
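A minimal numpy sketch of the two construction paths above, showing how arrays, stacked, lengths, and boundaries relate (the shapes are made up for illustration):

import numpy as np

arrays = [np.arange(3), np.arange(4)]

# Path 1: built from a list of arrays (lengths is None)
stacked = np.concatenate(arrays, axis=0)        # shape (7,)
lengths = np.array([len(a) for a in arrays])    # [3, 4]

# Path 2: built from the concatenated data plus lengths
arrays_back = np.split(stacked, np.cumsum(lengths)[:-1])
boundaries = np.concatenate([[0], np.cumsum(lengths)])   # [0, 3, 7]
assert boundaries[-1] == len(stacked)
assert all(len(a) == l for a, l in zip(arrays_back, lengths))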
Example #5
    def save_h5(self, sess, h5file, key, extra_attrs=None):
        with h5py.File(h5file, 'a') as f:
            if key in f:
                util.warn('WARNING: key {} already exists in {}'.format(
                    key, h5file))
                dset = f[key]
            else:
                dset = f.create_group(key)

            vs = self.get_variables()
            vals = sess.run(vs)

            for v, val in util.safezip(vs, vals):
                dset[v.name] = val

            dset[self.varscope.name].attrs['hash'] = self.savehash(sess)
            if extra_attrs is not None:
                for k, v in extra_attrs:
                    if k in dset.attrs:
                        util.warn('Warning: attribute {} already exists in {}'.
                                  format(k, dset.name))
                    dset.attrs[k] = v
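A hedged sketch of reading such a snapshot back with plain h5py. The file name and key below are hypothetical; the only assumption about layout is the one made by save_h5 itself, namely one dataset per variable name nested under the group at key:

import h5py

with h5py.File('snapshots.h5', 'r') as f:   # hypothetical file produced by save_h5
    grp = f['policy/iter_00100']            # hypothetical key passed to save_h5

    def show(name, obj):
        # Variable names like 'scope/layer/W:0' become nested groups/datasets
        if isinstance(obj, h5py.Dataset):
            print(name, obj.shape, obj.dtype)

    grp.visititems(show)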
Example #6
    def with_replaced_adist(self, new_adist):
        new_trajs = [
            Trajectory(traj.obs_T_Do, traj_new_adist, traj.a_T_Da, traj.r_T)
            for traj, traj_new_adist in util.safezip(self.trajs, new_adist)
        ]
        return TrajBatch(new_trajs, self.obs, new_adist, self.a, self.r, self.time)
Example #7
    def with_replaced_reward(self, new_r):
        new_trajs = [
            Trajectory(traj.obs_T_Do, traj.adist_T_Pa, traj.a_T_Da, traj_new_r)
            for traj, traj_new_r in util.safezip(self.trajs, new_r)
        ]
        return TrajBatch(new_trajs, self.obs, self.adist, self.a, new_r, self.time)
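Examples #6 and #7 follow the same pattern: build new per-trajectory objects and return a fresh batch rather than mutating the existing one. A generic sketch of that pattern, where Traj and Batch are hypothetical stand-ins for the real Trajectory and TrajBatch classes:

from collections import namedtuple

Traj = namedtuple('Traj', ['obs', 'adist', 'a', 'r'])

class Batch(object):
    def __init__(self, trajs):
        self.trajs = trajs

    def with_replaced_reward(self, new_r):
        # Build fresh Traj objects; the original batch is left untouched
        new_trajs = [Traj(t.obs, t.adist, t.a, r) for t, r in zip(self.trajs, new_r)]
        return Batch(new_trajs)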
Example #8
    def savehash(self, sess):
        """Hash is based on values of variables"""
        vars_ = self.get_variables()
        vals = sess.run(vars_)
        return self._hash_name2array([(v.name, val)
                                      for v, val in util.safezip(vars_, vals)])
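_hash_name2array is not shown in this listing. A plausible stand-in, assuming it simply digests the (name, value) pairs deterministically so that two policies hash equal exactly when their variable values match:

import hashlib
import numpy as np

def hash_name2array(name2array):
    # Plausible stand-in for _hash_name2array: digest (name, value) pairs in a
    # deterministic order so equal variable values give equal hashes.
    h = hashlib.sha1()
    for name, arr in sorted(name2array, key=lambda kv: kv[0]):
        h.update(name.encode('utf-8'))
        h.update(np.ascontiguousarray(arr).tobytes())
    return h.hexdigest()

print(hash_name2array([('w:0', np.zeros(3)), ('b:0', np.ones(2))]))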
Example #9
    def __init__(self, observation_space, action_space, num_actiondist_params,
                 enable_obsnorm, varscope_name):
        super(StochasticPolicy, self).__init__(observation_space, action_space)

        with tf.variable_scope(varscope_name) as self.varscope:
            batch_size = None
            if isinstance(self.action_space, spaces.Discrete):
                action_type = tf.int32
                if hasattr(self.action_space, 'ndim'):
                    action_dim = self.action_space.ndim
                else:
                    action_dim = 1
            elif isinstance(self.action_space, spaces.Box):
                action_type = tf.float32
                action_dim = self.action_space.shape[0]
            else:
                raise NotImplementedError()

            if self.recurrent:
                obs_shape = list((
                    batch_size,
                    None,
                ) + self.observation_space.shape)
                action_shape = [batch_size, None, action_dim]
                actiondist_shape = [batch_size, None, num_actiondist_params]
                advantage_shape = [batch_size, None]
            else:
                obs_shape = list((batch_size, ) + self.observation_space.shape)
                action_shape = [batch_size, action_dim]
                actiondist_shape = [batch_size, num_actiondist_params]
                advantage_shape = [batch_size]

            # Action distribution for current policy
            self._obs = tf.placeholder(tf.float32, obs_shape, name='obs')
            with tf.variable_scope('obsnorm'):
                self.obsnorm = (nn.Standardizer
                                if enable_obsnorm else nn.NoOpStandardizer)(
                                    self.observation_space.shape)
            self._normalized_obs = self.obsnorm.standardize_expr(self._obs)

            if self.recurrent:
                self._actiondist, self._flatinnet, self.compute_step_actiondist, self._hidden_vec = self._make_actiondist_ops(
                    self._normalized_obs)
            else:
                self._actiondist = self._make_actiondist_ops(
                    self._normalized_obs)

            self._input_action = tf.placeholder(
                action_type, action_shape,
                name='input_actions')  # Action dims FIXME type

            self._logprobs = self._make_actiondist_logprobs_ops(
                self._actiondist, self._input_action)

            # proposal distribution from old policy
            self._proposal_actiondist = tf.placeholder(
                tf.float32, actiondist_shape, name='proposal_actiondist')
            self._proposal_logprobs = self._make_actiondist_logprobs_ops(
                self._proposal_actiondist, self._input_action)

            # Advantage
            self._advantage = tf.placeholder(tf.float32,
                                             advantage_shape,
                                             name='advantage')

            if self.recurrent:
                self._valid = tf.placeholder(tf.float32,
                                             shape=[None, None],
                                             name="valid")
            else:
                self._valid = None

            # Plain pg objective (REINFORCE)
            impweight = tf.exp(self._logprobs - self._proposal_logprobs)
            if self.recurrent:
                self._reinfobj = tf.reduce_sum(
                    impweight * self._advantage * self._valid) / tf.reduce_sum(
                        self._valid)
            else:
                self._reinfobj = tf.reduce_mean(
                    impweight * self._advantage)  # Surrogate loss

            # KL
            self._kl_coeff = tf.placeholder(tf.float32, name='kl_cost_coeff')
            kl = self._make_actiondist_kl_ops(self._proposal_actiondist,
                                              self._actiondist)
            if self.recurrent:
                self._kl = tf.reduce_sum(kl * self._valid) / tf.reduce_sum(
                    self._valid)
            else:
                self._kl = tf.reduce_mean(kl, 0)  # Minimize kl divergence

            # KL Penalty objective for PPO
            self._penobj = self._reinfobj - self._kl_coeff * self._kl

            # All trainable vars done (only _make_* methods)

            # Reading params
            self._param_vars = self.get_variables(trainable=True)
            self._num_params = self.get_num_params(trainable=True)
            self._curr_params_P = tfutil.flatcat(
                self._param_vars)  # Flatten the params and concat

            self._all_param_vars = self.get_variables()
            self._num_all_params = self.get_num_params()
            self._curr_all_params_PA = tfutil.flatcat(self._all_param_vars)

            # Gradients of objective
            self._reinfobj_grad_P = tfutil.flatcat(
                tfutil.fixedgradients(self._reinfobj, self._param_vars))
            self._penobj_grad_P = tfutil.flatcat(
                tfutil.fixedgradients(self._penobj, self._param_vars))

            # KL gradient for TRPO
            self._kl_grad_P = tfutil.flatcat(
                tfutil.fixedgradients(self._kl, self._param_vars))

            ins = [
                self._obs, self._input_action, self._proposal_actiondist,
                self._advantage
            ]
            if self.recurrent:
                ins.append(self._valid)

            self._compute_internal_normalized_obs = tfutil.function(
                [self._obs], self._normalized_obs)
            self.compute_action_logprobs = tfutil.function(
                [self._obs, self._input_action], self._logprobs)
            self.compute_action_dist_params = tfutil.function([self._obs],
                                                              self._actiondist)

            self.compute_kl_cost = tfutil.function(ins, self._kl)
            self.compute_klgrad = tfutil.function(ins, self._kl_grad_P)
            self.compute_reinfobj_kl = tfutil.function(
                ins, [self._reinfobj, self._kl])
            self.compute_reinfobj_kl_with_grad = tfutil.function(
                ins, [self._reinfobj, self._kl, self._reinfobj_grad_P])

            self._ngstep = optim.make_ngstep_func(
                self,
                compute_obj_kl=self.compute_reinfobj_kl,
                compute_obj_kl_with_grad=self.compute_reinfobj_kl_with_grad,
                compute_hvp_helper=self.compute_klgrad)

            # Writing params
            self._flatparams_P = tf.placeholder(tf.float32, [self._num_params],
                                                name='flatparams_P')
            # For updating vars directly, e.g. for PPO
            self._assign_params = tfutil.unflatten_into_vars(
                self._flatparams_P, self._param_vars)

            self._flatallparams_PA = tf.placeholder(tf.float32,
                                                    [self._num_all_params],
                                                    name='flatallparams_PA')
            self._assign_all_params = tfutil.unflatten_into_vars(
                self._flatallparams_PA, self._all_param_vars)

            self.set_params = tfutil.function([self._flatparams_P], [],
                                              [self._assign_params])
            self.get_params = tfutil.function([], self._curr_params_P)
            self.get_state = tfutil.function([], self._curr_all_params_PA)
            self.set_state = tfutil.function([self._flatallparams_PA], [],
                                             [self._assign_all_params])
            # Treats placeholder self._flatparams_P as the gradient for a descent step
            with tf.variable_scope('optimizer'):
                self._learning_rate = tf.placeholder(tf.float32,
                                                     name='learning_rate')
                vargrads = tfutil.unflatten_into_tensors(
                    self._flatparams_P,
                    [v.get_shape().as_list() for v in self._param_vars])
                self._take_descent_step = tf.train.AdamOptimizer(
                    learning_rate=self._learning_rate).apply_gradients(
                        util.safezip(vargrads, self._param_vars))
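For the non-recurrent case, the surrogate and KL-penalty objectives built above reduce to a few lines of arithmetic. A minimal numpy sketch with made-up log-probabilities, advantages, and KL value:

import numpy as np

logprobs = np.array([-1.0, -0.5, -2.0])            # current policy log-probs
proposal_logprobs = np.array([-1.2, -0.6, -1.8])   # old (proposal) policy log-probs
advantage = np.array([0.3, -0.1, 0.8])
kl = 0.02                                          # mean KL, made up for illustration
kl_coeff = 1.0

impweight = np.exp(logprobs - proposal_logprobs)
reinfobj = np.mean(impweight * advantage)          # surrogate (REINFORCE) objective
penobj = reinfobj - kl_coeff * kl                  # KL-penalty objective used for PPO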