def step(self, sess, itr):
    with util.Timer() as t_all:
        with util.Timer() as t_sample:
            if itr == 0:
                # extra batch to init std
                trajbatchlist0, _ = self.sampler.sample(sess, itr)
                for policy, baseline, trajbatch0 in util.safezip(self.policies, self.baselines,
                                                                 trajbatchlist0):
                    policy.update_obsnorm(trajbatch0.obs.stacked, sess=sess)
                    baseline.update_obsnorm(trajbatch0.obs.stacked, sess=sess)
                    self.sampler.rewnorm.update(trajbatch0.r.stacked[:, None], sess=sess)
            trajbatchlist, sampler_info_fields = self.sampler.sample(sess, itr)

        # Baseline
        with util.Timer() as t_base:
            trajbatch_vals_list, base_info_fields_list = [], []
            for agid, trajbatch in enumerate(trajbatchlist):
                trajbatch_vals, base_info_fields = self.sampler.process(
                    sess, itr, trajbatch, self.discount, self.gae_lambda, self.baselines[agid])
                trajbatch_vals_list.append(trajbatch_vals)
                base_info_fields_list += base_info_fields

        # Take policy steps
        with util.Timer() as t_step:
            step_print_fields_list = []
            params0_P_list = []
            for agid, policy in enumerate(self.policies):
                params0_P = policy.get_params()
                params0_P_list.append(params0_P)
                step_print_fields = self.step_func(sess, policy, trajbatchlist[agid],
                                                   trajbatch_vals_list[agid]['advantage'])
                step_print_fields_list += step_print_fields
                policy.update_obsnorm(trajbatchlist[agid].obs.stacked, sess=sess)
                self.sampler.rewnorm.update(trajbatchlist[agid].r.stacked[:, None], sess=sess)

    # LOG
    self.total_time += t_all.dt
    infos = []
    for agid in range(len(self.policies)):
        infos += [
            ('vf_r2_{}'.format(agid), trajbatch_vals_list[agid]['v_r'], float),
            ('tdv_r2_{}'.format(agid), trajbatch_vals_list[agid]['tv_r'], float),
            ('ent_{}'.format(agid),
             self.policies[agid]._compute_actiondist_entropy(
                 trajbatchlist[agid].adist.stacked).mean(), float),
            ('dx_{}'.format(agid),
             util.maxnorm(params0_P_list[agid] - self.policies[agid].get_params()), float)
        ]
    fields = [
        ('iter', itr, int)
    ] + sampler_info_fields + infos + base_info_fields_list + step_print_fields_list + [
        ('tsamp', t_sample.dt, float),  # Time for sampling
        ('tbase', t_base.dt, float),  # Time for advantage/baseline computation
        ('tstep', t_step.dt, float),
        ('ttotal', self.total_time, float)
    ]
    return fields
def unflatten_into_vars(flatparams_P, param_vars, name=None):
    """Unflattens a vector produced by flatcat into the original variables"""
    with tf.op_scope([flatparams_P] + param_vars, name, 'unflatten_into_vars') as scope:
        tensors = unflatten_into_tensors(
            flatparams_P, [v.get_shape().as_list() for v in param_vars])
        return tf.group(
            *[v.assign(t) for v, t in util.safezip(param_vars, tensors)], name=scope)
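# Usage sketch (hypothetical names; assumes a TF1-style graph, an active tf.Session `sess`,
# and the companion helpers `unflatten_into_tensors` / `tfutil.flatcat` from this module):
#
#   param_vars = policy.get_variables(trainable=True)         # list of tf.Variable
#   flat_ph = tf.placeholder(tf.float32, [num_params])        # flat parameter vector
#   assign_op = unflatten_into_vars(flat_ph, param_vars)      # grouped assign ops
#   sess.run(assign_op, feed_dict={flat_ph: new_flat_params})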
def train(self, sess, log, save_freq, blend_freq=0, keep_kmax=0, blend_eval_trajs=50):
    for itr in range(self.start_iter, self.n_iter):
        iter_info = self.step(sess, itr)
        log.write(iter_info, print_header=itr % 20 == 0)
        if itr % save_freq == 0 or itr % self.n_iter == 0:
            for policy in self.policies:
                log.write_snapshot(sess, policy, itr)

        if blend_freq > 0:
            # Blending does not work
            assert self.target_policy is not None
            if itr == 0:
                params_P_ag = [policy.get_params() for policy in self.policies]
                weights, evalrewards = self._eval_policy_weights(blend_eval_trajs)
                weightparams_P = np.sum(
                    [w * p for w, p in util.safezip(weights, params_P_ag)], axis=0)
                blendparams_P = 0.001 * self.target_policy.get_params() + 0.999 * weightparams_P
            if itr > 0 and (itr % blend_freq == 0 or itr % self.n_iter == 0):
                params_P_ag = [policy.get_params() for policy in self.policies]
                weights, evalrewards = self._eval_policy_weights(blend_eval_trajs)
                weightparams_P = np.sum(
                    [w * p for w, p in util.safezip(weights, params_P_ag)], axis=0)
                blendparams_P = self.interp_alpha * self.target_policy.get_params() + (
                    1 - self.interp_alpha) * weightparams_P

            self.target_policy.set_params(blendparams_P)
            log.write_snapshot(sess, self.target_policy, itr)

            if keep_kmax:
                keep_inds = np.argpartition(evalrewards, -keep_kmax)[-keep_kmax:]
            else:
                keep_inds = []

            for agid, policy in enumerate(self.policies):
                if agid in keep_inds:
                    continue
                policy.set_params(blendparams_P)
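# Invocation sketch (`trainer` is an instance of the enclosing trainer class; `log` is any
# object providing the write/write_snapshot interface used above -- names are assumptions):
#
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       trainer.train(sess, log, save_freq=20)   # blend_freq=0 disables the blending path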
def __init__(self, arrays, lengths=None):
    if lengths is None:
        # Without provided lengths, `arrays` is interpreted as a list of arrays
        # and self.lengths is set to the list of lengths for those arrays
        self.arrays = arrays
        self.stacked = np.concatenate(arrays, axis=0)
        self.lengths = np.array([len(a) for a in arrays])
    else:
        # With provided lengths, `arrays` is interpreted as concatenated data
        # and self.lengths is set to the provided lengths.
        self.arrays = np.split(arrays, np.cumsum(lengths)[:-1])
        self.stacked = arrays
        self.lengths = np.asarray(lengths, dtype=int)
        assert all(len(a) == l for a, l in util.safezip(self.arrays, self.lengths))
    self.boundaries = np.concatenate([[0], np.cumsum(self.lengths)])
    assert self.boundaries[-1] == len(self.stacked)
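# Construction sketch for the two modes above (illustrative values; `RaggedArray` is the
# assumed name of the enclosing class):
#
#   a, b = np.arange(3), np.arange(5)
#   ra1 = RaggedArray([a, b])                        # list of arrays -> stacked + lengths
#   ra2 = RaggedArray(ra1.stacked, lengths=[3, 5])   # concatenated data + explicit lengths
#   # Both yield lengths [3, 5] and boundaries [0, 3, 8]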
def save_h5(self, sess, h5file, key, extra_attrs=None):
    with h5py.File(h5file, 'a') as f:
        if key in f:
            util.warn('WARNING: key {} already exists in {}'.format(key, h5file))
            dset = f[key]
        else:
            dset = f.create_group(key)

        vs = self.get_variables()
        vals = sess.run(vs)
        for v, val in util.safezip(vs, vals):
            dset[v.name] = val

        dset[self.varscope.name].attrs['hash'] = self.savehash(sess)

        if extra_attrs is not None:
            for k, v in extra_attrs:
                if k in dset.attrs:
                    util.warn('Warning: attribute {} already exists in {}'.format(k, dset.name))
                dset.attrs[k] = v
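# Saving sketch (hypothetical file and key names; `policy` is assumed to be an instance of
# the enclosing model class and `sess` a tf.Session holding its variables):
#
#   policy.save_h5(sess, 'snapshots.h5', 'snapshots/iter0000100',
#                  extra_attrs=[('iter', 100)])
#
# Each variable is stored under its TF name inside the group, and the subgroup for the
# model's variable scope carries a 'hash' attribute (see savehash) for integrity checks.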
def with_replaced_adist(self, new_adist):
    new_trajs = [
        Trajectory(traj.obs_T_Do, traj_new_adist, traj.a_T_Da, traj.r_T)
        for traj, traj_new_adist in util.safezip(self.trajs, new_adist)
    ]
    return TrajBatch(new_trajs, self.obs, new_adist, self.a, self.r, self.time)
def with_replaced_reward(self, new_r):
    new_trajs = [
        Trajectory(traj.obs_T_Do, traj.adist_T_Pa, traj.a_T_Da, traj_new_r)
        for traj, traj_new_r in util.safezip(self.trajs, new_r)
    ]
    return TrajBatch(new_trajs, self.obs, self.adist, self.a, new_r, self.time)
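# Usage sketch: both helpers return a new TrajBatch sharing observations/actions with the
# original while swapping one field. A typical use (assumed here, not taken from this file)
# is substituting rescaled rewards before advantage estimation:
#
#   new_r = RaggedArray([r / (r.std() + 1e-8) for r in trajbatch.r.arrays])
#   rescaled_batch = trajbatch.with_replaced_reward(new_r)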
def savehash(self, sess):
    """Hash is based on values of variables"""
    vars_ = self.get_variables()
    vals = sess.run(vars_)
    return self._hash_name2array([(v.name, val) for v, val in util.safezip(vars_, vals)])
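# Integrity-check sketch (hypothetical file/key names, matching the save_h5 layout above):
#
#   with h5py.File('snapshots.h5', 'r') as f:
#       stored_hash = f['snapshots/iter0000100'][policy.varscope.name].attrs['hash']
#   assert policy.savehash(sess) == stored_hash  # session variables match the snapshot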
def __init__(self, observation_space, action_space, num_actiondist_params, enable_obsnorm,
             varscope_name):
    super(StochasticPolicy, self).__init__(observation_space, action_space)

    with tf.variable_scope(varscope_name) as self.varscope:
        batch_size = None
        if isinstance(self.action_space, spaces.Discrete):
            action_type = tf.int32
            if hasattr(self.action_space, 'ndim'):
                action_dim = self.action_space.ndim
            else:
                action_dim = 1
        elif isinstance(self.action_space, spaces.Box):
            action_type = tf.float32
            action_dim = self.action_space.shape[0]
        else:
            raise NotImplementedError()

        if self.recurrent:
            obs_shape = list((batch_size, None,) + self.observation_space.shape)
            action_shape = [batch_size, None, action_dim]
            actiondist_shape = [batch_size, None, num_actiondist_params]
            advantage_shape = [batch_size, None]
        else:
            obs_shape = list((batch_size,) + self.observation_space.shape)
            action_shape = [batch_size, action_dim]
            actiondist_shape = [batch_size, num_actiondist_params]
            advantage_shape = [batch_size]

        # Action distribution for current policy
        self._obs = tf.placeholder(tf.float32, obs_shape, name='obs')
        with tf.variable_scope('obsnorm'):
            self.obsnorm = (nn.Standardizer if enable_obsnorm else nn.NoOpStandardizer)(
                self.observation_space.shape)
        self._normalized_obs = self.obsnorm.standardize_expr(self._obs)
        if self.recurrent:
            (self._actiondist, self._flatinnet, self.compute_step_actiondist,
             self._hidden_vec) = self._make_actiondist_ops(self._normalized_obs)
        else:
            self._actiondist = self._make_actiondist_ops(self._normalized_obs)

        self._input_action = tf.placeholder(action_type, action_shape,
                                            name='input_actions')  # Action dims FIXME type
        self._logprobs = self._make_actiondist_logprobs_ops(self._actiondist, self._input_action)

        # Proposal distribution from old policy
        self._proposal_actiondist = tf.placeholder(tf.float32, actiondist_shape,
                                                   name='proposal_actiondist')
        self._proposal_logprobs = self._make_actiondist_logprobs_ops(self._proposal_actiondist,
                                                                     self._input_action)

        # Advantage
        self._advantage = tf.placeholder(tf.float32, advantage_shape, name='advantage')

        if self.recurrent:
            self._valid = tf.placeholder(tf.float32, shape=[None, None], name='valid')
        else:
            self._valid = None

        # Plain pg objective (REINFORCE)
        impweight = tf.exp(self._logprobs - self._proposal_logprobs)
        if self.recurrent:
            self._reinfobj = tf.reduce_sum(
                impweight * self._advantage * self._valid) / tf.reduce_sum(self._valid)
        else:
            self._reinfobj = tf.reduce_mean(impweight * self._advantage)  # Surrogate loss

        # KL
        self._kl_coeff = tf.placeholder(tf.float32, name='kl_cost_coeff')
        kl = self._make_actiondist_kl_ops(self._proposal_actiondist, self._actiondist)
        if self.recurrent:
            self._kl = tf.reduce_sum(kl * self._valid) / tf.reduce_sum(self._valid)
        else:
            self._kl = tf.reduce_mean(kl, 0)  # Minimize KL divergence

        # KL penalty objective for PPO
        self._penobj = self._reinfobj - self._kl_coeff * self._kl

        # All trainable vars done (only _make_* methods)

        # Reading params
        self._param_vars = self.get_variables(trainable=True)
        self._num_params = self.get_num_params(trainable=True)
        self._curr_params_P = tfutil.flatcat(self._param_vars)  # Flatten the params and concat

        self._all_param_vars = self.get_variables()
        self._num_all_params = self.get_num_params()
        self._curr_all_params_PA = tfutil.flatcat(self._all_param_vars)

        # Gradients of objective
        self._reinfobj_grad_P = tfutil.flatcat(
            tfutil.fixedgradients(self._reinfobj, self._param_vars))
        self._penobj_grad_P = tfutil.flatcat(
            tfutil.fixedgradients(self._penobj, self._param_vars))

        # KL gradient for TRPO
        self._kl_grad_P = tfutil.flatcat(tfutil.fixedgradients(self._kl, self._param_vars))

        ins = [self._obs, self._input_action, self._proposal_actiondist, self._advantage]
        if self.recurrent:
            ins.append(self._valid)

        self._compute_internal_normalized_obs = tfutil.function([self._obs],
                                                                self._normalized_obs)
        self.compute_action_logprobs = tfutil.function([self._obs, self._input_action],
                                                       self._logprobs)
        self.compute_action_dist_params = tfutil.function([self._obs], self._actiondist)

        self.compute_kl_cost = tfutil.function(ins, self._kl)
        self.compute_klgrad = tfutil.function(ins, self._kl_grad_P)
        self.compute_reinfobj_kl = tfutil.function(ins, [self._reinfobj, self._kl])
        self.compute_reinfobj_kl_with_grad = tfutil.function(
            ins, [self._reinfobj, self._kl, self._reinfobj_grad_P])

        self._ngstep = optim.make_ngstep_func(
            self,
            compute_obj_kl=self.compute_reinfobj_kl,
            compute_obj_kl_with_grad=self.compute_reinfobj_kl_with_grad,
            compute_hvp_helper=self.compute_klgrad)

        # Writing params
        self._flatparams_P = tf.placeholder(tf.float32, [self._num_params], name='flatparams_P')
        # For updating vars directly, e.g. for PPO
        self._assign_params = tfutil.unflatten_into_vars(self._flatparams_P, self._param_vars)

        self._flatallparams_PA = tf.placeholder(tf.float32, [self._num_all_params],
                                                name='flatallparams_PA')
        self._assign_all_params = tfutil.unflatten_into_vars(self._flatallparams_PA,
                                                             self._all_param_vars)

        self.set_params = tfutil.function([self._flatparams_P], [], [self._assign_params])
        self.get_params = tfutil.function([], self._curr_params_P)
        self.get_state = tfutil.function([], self._curr_all_params_PA)
        self.set_state = tfutil.function([self._flatallparams_PA], [], [self._assign_all_params])

        # Treats placeholder self._flatparams_P as the gradient for a descent step
        with tf.variable_scope('optimizer'):
            self._learning_rate = tf.placeholder(tf.float32, name='learning_rate')
            vargrads = tfutil.unflatten_into_tensors(
                self._flatparams_P, [v.get_shape().as_list() for v in self._param_vars])
            self._take_descent_step = tf.train.AdamOptimizer(
                learning_rate=self._learning_rate).apply_gradients(
                    util.safezip(vargrads, self._param_vars))
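# Parameter-access sketch built from the ops defined above (hypothetical perturbation;
# `policy` is a constructed StochasticPolicy, and, as in the surrounding training code,
# set_params/get_params are assumed to run against the active default session):
#
#   theta = policy.get_params()                               # flat trainable-param vector
#   policy.set_params(theta + 0.01 * np.random.randn(theta.shape[0]))
#   policy.set_params(theta)                                  # restore
#
# The 'optimizer' scope instead treats a fed flat vector as a gradient: feeding it together
# with a learning rate and running self._take_descent_step applies one Adam update.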