class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.save = None self.load = None # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for 
details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action[0].shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.save = functools.partial(save_variables, sess=self.sess)
        self.load = functools.partial(load_variables, sess=self.sess)
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })
        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))
        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}
        return stats

    def adapt_param_noise(self):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
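# Minimal standalone NumPy sketch (illustrative only, names and shapes assumed) of the two
# pieces of arithmetic the DDPG class above builds into its graph: the Polyak ("soft")
# target-network update driven by tau, and the bootstrapped TD target
# target_Q = r + (1 - terminal) * gamma * Q'(s', pi'(s')).
import numpy as np

def soft_update(target_params, online_params, tau=0.001):
    """Polyak average: theta_target <- (1 - tau) * theta_target + tau * theta_online."""
    return [(1.0 - tau) * t + tau * o for t, o in zip(target_params, online_params)]

def td_target(rewards, terminals, q_next, gamma=0.99):
    """One-step bootstrapped return used as the critic regression target."""
    return rewards + (1.0 - terminals) * gamma * q_next

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    online = [rng.normal(size=(4, 4)), rng.normal(size=(4,))]
    target = [np.zeros((4, 4)), np.zeros(4)]
    target = soft_update(target, online, tau=0.001)   # small step toward the online weights
    y = td_target(rewards=np.array([[1.0]]), terminals=np.array([[0.0]]),
                  q_next=np.array([[2.5]]), gamma=0.99)
    print(y)                                          # [[3.475]]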
def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
    rms = RunningMeanStd(shape=x.shape[1:])
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std,
                              min(clip_range), max(clip_range))
    return norm_x, rms
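# Minimal NumPy sketch (illustrative; class name, epsilon, and the batch-moment update rule are
# assumptions, not the real RunningMeanStd) of the streaming normalization-and-clip idea used by
# _normalize_clip_observation above.
import numpy as np

class SimpleRunningMeanStd:
    def __init__(self, shape, epsilon=1e-4):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = epsilon

    def update(self, batch):
        # Merge batch moments into the running moments (parallel-variance formula).
        b_mean, b_var, b_count = batch.mean(axis=0), batch.var(axis=0), batch.shape[0]
        delta = b_mean - self.mean
        tot = self.count + b_count
        m2 = self.var * self.count + b_var * b_count + delta ** 2 * self.count * b_count / tot
        self.mean = self.mean + delta * b_count / tot
        self.var = m2 / tot
        self.count = tot

def normalize_clip(x, rms, clip_range=(-5.0, 5.0)):
    # Whiten with the running statistics, then clip to the observation range.
    return np.clip((x - rms.mean) / np.sqrt(rms.var + 1e-8), *clip_range)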
def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-1., 1.), action_range= [0.2, 0.2, 0.2, 0.2, 0.2, 0.2], return_range=(-np.inf, np.inf), adaptive_param_noise=True, critic_l2_reg=0., adaptive_param_noise_policy_threshold=.1, actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., restore=False): # Inputs. # self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs0 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs0') # self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.obs1 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, action_shape), name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() """Filewriter summary""" monitor_directory = os.path.join("Experiment_data") self.summary_dir = os.path.join(monitor_directory, "summary") # if restore: # dirname = 'run20' # The last name # self.summary_dir = os.path.join(self.summary_dir, dirname) # else: self.summary_dir = utils.new_summary_dir(self.summary_dir) # record the detailed parameters utils.log_params(self.summary_dir, { "actor learning rate": self.actor_lr, "critic learning rate": self.critic_lr, "batch size": self.batch_size, "actor update rate": self.tau, "critic update rate": self.tau, "action noise": self.action_noise, "param noise": self.param_noise, "reward function": 'General reward function', "result_function": 'The second 100' }) self.merged = tf.summary.merge_all()
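# The constructor above calls utils.new_summary_dir and utils.log_params, whose implementations
# are not shown in this section. A plausible stand-alone sketch of such helpers -- create a fresh
# run directory under the summary root and dump the hyperparameters to a text file -- is given
# below. This is only an assumption about what those utilities do, using stdlib calls.
import os
import json

def new_summary_dir(root):
    # Create root if needed, then allocate the next "runN" subdirectory.
    os.makedirs(root, exist_ok=True)
    existing = [d for d in os.listdir(root) if d.startswith("run")]
    run_dir = os.path.join(root, "run{}".format(len(existing) + 1))
    os.makedirs(run_dir)
    return run_dir

def log_params(run_dir, params):
    # Record the experiment configuration alongside the summaries.
    with open(os.path.join(run_dir, "params.txt"), "w") as f:
        json.dump({k: str(v) for k, v in params.items()}, f, indent=2)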
def _init(self, np_random, flavor, dim, hid_size=32, n_hid=2, alpha_sysid=0.1, test=False): print("obs dim:", dim.ob) # inputs & hyperparameters self.flavor = flavor self.dim = dim self.alpha_sysid = alpha_sysid self.ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=(None, dim.ob_concat)) self.ob_traj = U.get_placeholder(name="ob_traj", dtype=tf.float32, shape=[None, dim.window, dim.ob]) self.ac_traj = U.get_placeholder(name="ac_traj", dtype=tf.float32, shape=[None, dim.window, dim.ac]) # regular inputs whitening ob, sysid = tf.split(self.ob, [dim.ob, dim.sysid], axis=1) with tf.variable_scope("ob_filter"): self.ob_rms = RunningMeanStd(shape=(dim.ob_concat)) obz_all = tf.clip_by_value( (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0, name="ob_normalizer") obz, sysidz = tf.split(obz_all, [dim.ob, dim.sysid], axis=1) print("obz dim:", obz.shape, "sysidz dim:", sysidz.shape) with tf.variable_scope("ob_white"): obz = tf.identity(obz) with tf.variable_scope("sysid_white"): self.sysidz = tf.identity(sysidz) # trajectory inputs for SysID # NOTE: the environment should be defined such that # actions are relatively close to Normal(0,1) ob_trajz = tf.clip_by_value( (self.ob_traj - self.ob_rms.mean[:dim.ob]) / self.ob_rms.std[:dim.ob], -5.0, 5.0, name="ob_traj_white") trajs = tf.concat([ob_trajz, self.ac_traj], axis=2) # these rewards will be optimized via direct gradient-based optimization # (not RL reward), in the same place as e.g. the entropy regularization self.extra_rewards = [] self.extra_reward_names = [] with tf.variable_scope("sysid"): if flavor == PLAIN: self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid) elif flavor == EXTRA: self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid) elif flavor == EMBED: self.traj2embed = sysid_convnet(np_random, trajs, dim.embed) EMBED_N_HID = 2 EMBED_HID_SZ = 2 * dim.sysid # policy with tf.variable_scope("pol"): if flavor == BLIND: policy_input = obz self.sysid_err_supervised = tf.constant(0.0) elif flavor == PLAIN: self.sysid_err_supervised = tf.losses.mean_squared_error( tf.stop_gradient(sysidz), self.traj2sysid) policy_input = tf.concat([obz, self.traj2sysid ]) if test else obz_all elif flavor == EXTRA: sysid_processor_input = self.traj2sysid if test else sysidz sysid_processor = MLPModule(np_random, sysid_processor_input, EMBED_N_HID, EMBED_HID_SZ, 1.0, dim.embed, "sysid_processor") policy_input = tf.concat([obz, sysid_processor], axis=1, name="input_concat") self.sysid_err_supervised = tf.losses.mean_squared_error( tf.stop_gradient(sysidz), self.traj2sysid) elif flavor == EMBED: self.embed = MLPModule(np_random, sysidz, EMBED_N_HID, EMBED_HID_SZ, 1.0, dim.embed, "embed") embed_input = self.traj2embed if test else self.embed policy_input = tf.concat([obz, embed_input], axis=1, name="input_concat") self.sysid_err_supervised = tf.losses.mean_squared_error( tf.stop_gradient(self.embed), self.traj2embed) mean, var = tf.nn.moments(self.embed, 0) dist = tf.distributions.Normal(loc=mean, scale=tf.sqrt(var)) std_dist = tf.distributions.Normal(loc=0.0, scale=1.0) embed_KL = tf.reduce_mean( tf.distributions.kl_divergence(dist, std_dist)) self.extra_rewards.append(-0.1 * embed_KL) self.extra_reward_names.append("neg_embed_KL") elif flavor == TRAJ: self.traj_conv = sysid_convnet(np_random, trajs, dim.embed) policy_input = tf.concat([obz, self.traj_conv], axis=1, name="input_concat") self.sysid_err_supervised = tf.constant(0.0) else: raise ValueError("flavor '{}' does not exist".format(flavor)) # main policy MLP. 
outputs mean and logstd of stochastic Gaussian policy with tf.variable_scope("policy"): print("policy input dimensionality:", policy_input.get_shape().as_list()) mean = MLPModule(np_random, policy_input, n_hid, hid_size, 0.01, dim.ac, "pol") logstd = tf.maximum( tf.get_variable(name="logstd", shape=[1, dim.ac], initializer=tf.constant_initializer(-0.3)), -1.0) with tf.variable_scope("policy_to_gaussian"): pdparam = tf.concat([mean, mean * 0.0 + logstd], 1) self.pdtype = DiagGaussianPdType(dim.ac) self.pd = self.pdtype.pdfromflat(pdparam) # value function with tf.variable_scope("vf"): self.vpred = MLPModule(np_random, tf.stop_gradient(policy_input), n_hid, hid_size, 0.1, 1, "vf")[:, 0] # switch between stochastic and deterministic policy with tf.variable_scope("stochastic_switch"): self.stochastic = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic") self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode()) # function we'll call when interacting with environment self._act = U.function([self.stochastic, self.ob], [self.ac, self.vpred]) # for test time, the trajectory is fed in self._act_traj = U.function( [self.stochastic, self.ob, self.ob_traj, self.ac_traj], [self.ac, self.vpred])
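# Minimal NumPy sketch (illustrative; function and argument names are assumed) of what the
# "policy_to_gaussian" block above encodes: a diagonal Gaussian whose mean is state-dependent
# and whose logstd is a clamped free variable. Sampling adds std * N(0, I) noise to the mean;
# the deterministic mode is the mean itself.
import numpy as np

def diag_gaussian_action(mean, logstd, stochastic=True, rng=None):
    rng = rng if rng is not None else np.random.default_rng()
    logstd = np.maximum(logstd, -1.0)   # same lower clamp applied to logstd in the policy above
    if not stochastic:
        return mean                     # pd.mode() of a Gaussian is its mean
    return mean + np.exp(logstd) * rng.standard_normal(mean.shape)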
class TransitionClassifier(object): def __init__(self, ob_size, ac_size, hidden_size=100, log_reward=False, entcoeff=0.001, scope="adversary", dyn_norm=True): self.scope = scope self.ob_size = ob_size self.ac_size = ac_size # self.input_size = ob_size + ac_size self.hidden_size = hidden_size self.log_reward = log_reward self.dyn_norm = dyn_norm self.build_ph() # Build grpah generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph) expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph) # Build accuracy generator_acc = tf.reduce_mean( tf.cast(tf.nn.sigmoid(generator_logits) < 0.5, tf.float32)) expert_acc = tf.reduce_mean( tf.cast(tf.nn.sigmoid(expert_logits) > 0.5, tf.float32)) # Build regression loss # let x = logits, z = targets. # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) generator_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=generator_logits, labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [ generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc ] self.loss_name = [ "generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc" ] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy if log_reward: reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) else: reward_op = tf.nn.sigmoid(generator_logits) self.reward = U.function( [self.generator_obs_ph, self.generator_acs_ph], reward_op) lr = tf.placeholder(tf.float32, None) self.trainer = tf.train.AdamOptimizer(learning_rate=lr) gvs = self.trainer.compute_gradients(self.total_loss, self.get_trainable_variables()) self._train = U.function([ self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph, lr ], self.losses, updates=[self.trainer.apply_gradients(gvs)]) def build_ph(self): self.generator_obs_ph = tf.placeholder(tf.float32, (None, self.ob_size), name="observations_ph") self.generator_acs_ph = tf.placeholder(tf.float32, (None, self.ac_size), name="actions_ph") self.expert_obs_ph = tf.placeholder(tf.float32, (None, self.ob_size), name="expert_observations_ph") self.expert_acs_ph = tf.placeholder(tf.float32, (None, self.ac_size), name="expert_actions_ph") def build_graph(self, obs_ph, acs_ph): with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE): with tf.variable_scope("obfilter"): self.obs_rms = RunningMeanStd(shape=[self.ob_size]) obs = normalize(obs_ph, self.obs_rms) _input = tf.concat( [obs, acs_ph], axis=1) # concatenate the two input -> form a transition p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh) p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh) logits = tf.contrib.layers.fully_connected(p_h2, 1, activation_fn=None) return logits def get_trainable_variables(self): return tf.trainable_variables(self.scope) def get_reward(self, obs, acs): return np.squeeze(self.reward(obs, acs)) def build_reward_op(self, obs_ph, acs_ph): logits = self.build_graph(obs_ph, acs_ph) if self.log_reward: return -tf.log(1 - tf.nn.sigmoid(logits) + 1e-8) return tf.nn.sigmoid(logits) def 
set_expert_data(self, data): self.data = Dataset(data, deterministic=False) def train(self, rl_ob, rl_ac, steps=1, lr=3e-4): n = rl_ob.shape[0] loss_buf = [] batch_size = rl_ob.shape[0] // steps for batch in iterbatches([rl_ob, rl_ac], include_final_partial_batch=False, batch_size=batch_size): exp_ob, exp_ac = self.data.next_batch(batch_size) if self.obs_rms and self.dyn_norm: self.obs_rms.update(np.concatenate([exp_ob, rl_ob], axis=0)) loss_buf.append(self._train(*batch, exp_ob, exp_ac, lr)) logger.info(fmt_row(13, self.loss_name)) logger.info(fmt_row(13, np.mean(loss_buf, axis=0)))
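# Minimal NumPy sketch (illustrative only) of the reward transform TransitionClassifier applies
# to its discriminator logits: either sigmoid(logits), or, when log_reward is set, the
# survival-style -log(1 - sigmoid(logits) + 1e-8) used in GAIL-like setups.
import numpy as np

def discriminator_reward(logits, log_reward=False, eps=1e-8):
    p = 1.0 / (1.0 + np.exp(-logits))   # sigmoid: probability the transition looks "expert"
    if log_reward:
        return -np.log(1.0 - p + eps)   # unbounded above, grows as the discriminator is fooled
    return p                            # bounded in (0, 1)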
class MADDPG(object): def __init__(self, name, actor, critic, memory, obs_space_n, act_space_n, agent_index, obs_rms, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): self.name = name self.num_agents = len(obs_space_n) self.agent_index = agent_index from gym import spaces continuous_ctrl = not isinstance(act_space_n[0], spaces.Discrete) # TODO: remove after testing assert continuous_ctrl # Multi-agent inputs # self.obs0 = [] # self.obs1 = [] self.actions = [] # self.norm_obs0_ph = [] # self.norm_obs1_ph = [] self.obs0 = tf.placeholder(tf.float32, shape=( self.num_agents, None, ) + obs_space_n[self.agent_index].shape, name="obs0") self.obs1 = tf.placeholder(tf.float32, shape=( self.num_agents, None, ) + obs_space_n[self.agent_index].shape, name="obs1") # if continuous_ctrl: # self.actions = tf.placeholder(tf.float32, shape=(self.num_agents, None,) + act_space_n[self.agent_index].shape, name="action") # else: # act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # self.actions = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] # this is required to reshape obs and actions for concatenation obs_shape_list = [self.num_agents] + list( obs_space_n[self.agent_index].shape) act_shape_list = [self.num_agents] + list( act_space_n[self.agent_index].shape) self.obs_shape_prod = np.prod(obs_shape_list) self.act_shape_prod = np.prod(act_shape_list) for i in range(self.num_agents): # each obs in obs0,obs1 contains info about ego agent and relative pos/vel of other agents # self.obs0.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs0_"+str(i))) # self.obs1.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs1_"+str(i))) if continuous_ctrl: self.actions.append( tf.placeholder(tf.float32, shape=[None] + list(act_space_n[i].shape), name="action" + str(i))) else: self.actions.append( make_pdtype(act_space_n[i]).sample_placeholder( [None], name="action" + str(i))) # self.norm_obs0_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs0_"+str(i))) # self.norm_obs1_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs1_"+str(i))) # self.norm_obs0_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs0") # self.norm_obs1_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs1") # we only provide single agent inputs for these placeholders self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. 
self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. # TODO: need to update the replay buffer storage function to account for multiple agents if self.normalize_observations: self.obs_rms = obs_rms else: self.obs_rms = None # Need to transpose observations so we can normalize them # converts tensor to shape (batch_size, num_agents, space_size) # transose on dim 0 and 1, leave dim 2 unchanged obs0_t = tf.transpose(self.obs0, perm=[1, 0, 2]) obs1_t = tf.transpose(self.obs1, perm=[1, 0, 2]) actions_t = tf.transpose(self.actions, perm=[1, 0, 2]) # each entry in obs_t is normalized wrt the agent normalized_obs0 = tf.clip_by_value(normalize(obs0_t, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(obs1_t, self.obs_rms), self.observation_range[0], self.observation_range[1]) # convert the obs to original shape after normalization for convenience normalized_act_obs0 = tf.transpose(normalized_obs0, perm=[1, 0, 2]) normalized_act_obs1 = tf.transpose(normalized_obs1, perm=[1, 0, 2]) # need to specify exact shape, since we dont always pass batch size number of obs/act normalized_obs0_flat = tf.reshape(normalized_obs0, [-1, self.obs_shape_prod]) normalized_obs1_flat = tf.reshape(normalized_obs1, [-1, self.obs_shape_prod]) actions_t_flat = tf.reshape(actions_t, [-1, self.act_shape_prod]) # Return normalization. # TODO: update this to handle multiple agents if required if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. # Each agents gets its own observation self.actor_tf = actor(normalized_act_obs0[self.agent_index]) self.target_actor_tf = target_actor( normalized_act_obs1[self.agent_index]) # Critic gets all observations self.normalized_critic_tf = critic(normalized_obs0_flat, actions_t_flat) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # need to provide critic() with all actions act_input_n = self.actions + [] # copy actions act_input_n[ self. agent_index] = self.actor_tf # update current agent action using its actor act_input_n_t = tf.transpose(act_input_n, perm=[1, 0, 2]) act_input_n_t_flat = tf.reshape(act_input_n_t, [-1, self.act_shape_prod]) self.normalized_critic_with_actor_tf = critic(normalized_obs0_flat, act_input_n_t_flat, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # we need to use actions for all agents target_act_input_n = self.actions + [] # copy actions target_act_input_n[ self. 
agent_index] = self.target_actor_tf # update current agent action using its target actor target_act_input_n_t = tf.transpose(target_act_input_n, perm=[1, 0, 2]) target_act_input_n_t_flat = tf.reshape(target_act_input_n_t, [-1, self.act_shape_prod]) Q_obs1 = denormalize( target_critic(normalized_obs1_flat, target_act_input_n_t_flat), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: # param noise is added to actor; hence obs for current agent is required self.setup_param_noise(normalized_act_obs0[self.agent_index]) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ 
var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names # TODO: need to provide all observations to compute q def step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} # feed_dict={ph: [data] for ph, data in zip(self.obs0, obs)} # feed_dict = {self.obs0: [obs]} # Get the normalized obs first # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action[0].shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action[0], q, None, None # TODO: test this # Computing this every time step may slow things def get_q_value(self, obs_n, act_n): # assuming computing q value for one state; hence need [] around data feed_dict = {ph: [data] for ph, data in zip(self.obs0, obs_n)} act_dict = {ph: [data] for ph, data in zip(self.actions, act_n)} feed_dict.update(act_dict) q = self.sess.run(self.critic_with_actor_tf, 
feed_dict=feed_dict) return q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale # print(action) B = obs0.shape[0] a_idx = self.agent_index for b in range(B): self.memory.append(obs0[b][a_idx], action[b][a_idx], reward[b][a_idx], obs1[b][a_idx], terminal1[b][a_idx]) # NOTE: calling update for each agent is ok, since the mean and std are uneffected # this is because the same obs are repeated num_agent times, which dont affect value if self.normalize_observations: # provide full obs for obs_rms update obs0_shape = (len(obs0[b]), ) + obs0[b][a_idx].shape assert obs0_shape == (self.num_agents, ) + obs0[b][a_idx].shape self.obs_rms.update(np.array([obs0[b]])) # TODO: not using this right now def update_obs_rms(self, obs0): if not self.normalize_observations: return B = obs0.shape[0] for b in range(B): # provide full obs for obs_rms update self.obs_rms.update(np.array([obs0[b]])) return def train(self, agents): # generate indices to access batches from all agents replay_sample_index = self.memory.generate_index(self.batch_size) # collect replay sample from all agents obs0_n = [] obs1_n = [] rewards_n = [] act_n = [] terminals1_n = [] for i in range(self.num_agents): # Get a batch. batch = agents[i].memory.sample(batch_size=self.batch_size, index=replay_sample_index) obs0_n.append(batch['obs0']) obs1_n.append(batch['obs1']) act_n.append(batch['actions']) # rewards_n.append(batch['rewards']) # terminals1_n.append(batch['terminals1']) batch = self.memory.sample(batch_size=self.batch_size, index=replay_sample_index) # fill placeholders in obs1 with corresponding obs from each agent's replay buffer # self.obs1 and obs1_n are lists of size num_agents # feed_dict={ph: data for ph, data in zip(self.obs1, obs1_n)} feed_dict = {self.obs1: obs1_n} # TODO: find a better way to do this # Get the normalized obs first # norm_obs1 = self.sess.run(self.norm_obs1, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {self.norm_obs1_ph: norm_obs1} # feed_dict = {ph: data for ph, data in zip(self.norm_obs1_ph, norm_obs1)} # actions required for critic act_dict = {ph: data for ph, data in zip(self.actions, act_n)} feed_dict.update(act_dict) feed_dict.update({self.rewards: batch['rewards']}) feed_dict.update( {self.terminals1: batch['terminals1'].astype('float32')}) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict=feed_dict) # old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict=feed_dict) # target_Q = self.sess.run(self.target_Q, feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] # generate feed_dict for multiple observations and actions # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)} feed_dict = {self.obs0: obs0_n} # Get the normalized obs first # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {self.norm_obs0_ph: norm_obs0} # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)} # act_dict={ph: data for ph, data in zip(self.actions, act_n)} feed_dict.update(act_dict) feed_dict.update({self.critic_target: target_Q}) actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict=feed_dict) # actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ # self.obs0: batch['obs0'], # self.actions: batch['actions'], # self.critic_target: target_Q, # }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess def agent_initialize(self, sess): self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) # setup saving and loading functions self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self, agents): if self.stats_sample is None: replay_sample_index = self.memory.generate_index(self.batch_size) # collect replay sample from all agents obs0_n, act_n = [], [] for i in range(self.num_agents): batch = agents[i].memory.sample(batch_size=self.batch_size, index=replay_sample_index) obs0_n.append(batch['obs0']) act_n.append(batch['actions']) # generate feed_dict for multiple observations and actions # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)} feed_dict = {self.obs0: obs0_n} # Get the normalized obs first # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {self.norm_obs0_ph: norm_obs0} # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)} actions_dict = {ph: data for ph, data in zip(self.actions, act_n)} feed_dict.update(actions_dict) # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. 
            self.stats_sample = feed_dict

        values = self.sess.run(self.stats_ops, feed_dict=self.stats_sample)
        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))
        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}
        return stats

    def adapt_param_noise(self, agents):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        replay_sample_index = self.memory.generate_index(self.batch_size)
        obs0_n = []
        for i in range(self.num_agents):
            batch = agents[i].memory.sample(batch_size=self.batch_size, index=replay_sample_index)
            obs0_n.append(batch['obs0'])

        # feed_dict = {ph: data for ph, data in zip(self.obs0, obs0_n)}
        feed_dict = {self.obs0: obs0_n}
        # Get the normalized obs first
        # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {self.norm_obs0_ph: norm_obs0}
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}
        feed_dict.update({self.param_noise_stddev: self.param_noise.current_stddev})

        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict=feed_dict)
        # distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
        #     self.obs0: batch['obs0'],
        #     self.param_noise_stddev: self.param_noise.current_stddev,
        # })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
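# Minimal NumPy sketch (illustrative; shapes are toy values) of the centralized-critic input
# construction used by the MADDPG class above: per-agent observations and actions arrive as
# (num_agents, batch, dim), are transposed to (batch, num_agents, dim), and are flattened to
# (batch, num_agents * dim) before being fed to the joint critic.
import numpy as np

num_agents, batch, obs_dim, act_dim = 3, 5, 4, 2
obs_n = np.random.randn(num_agents, batch, obs_dim)          # as fed to self.obs0
act_n = np.random.randn(num_agents, batch, act_dim)

obs_t = np.transpose(obs_n, (1, 0, 2))                       # (batch, num_agents, obs_dim)
act_t = np.transpose(act_n, (1, 0, 2))
critic_obs_in = obs_t.reshape(batch, num_agents * obs_dim)   # joint observation vector
critic_act_in = act_t.reshape(batch, num_agents * act_dim)   # joint action vector
print(critic_obs_in.shape, critic_act_in.shape)              # (5, 12) (5, 6)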
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, obs_name='ob',\ obrms=True, final_std=0.01, init_logstd=0.0, observation_permutation=None,action_permutation=None, soft_mirror=False): assert isinstance(ob_space, gym.spaces.Box) obs_perm_mat = np.zeros( (len(observation_permutation), len(observation_permutation)), dtype=np.float32) self.obs_perm_mat = obs_perm_mat for i, perm in enumerate(observation_permutation): obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm) if isinstance(ac_space, gym.spaces.Box): act_perm_mat = np.zeros( (len(action_permutation), len(action_permutation)), dtype=np.float32) self.act_perm_mat = act_perm_mat for i, perm in enumerate(action_permutation): self.act_perm_mat[i][int(np.abs(perm))] = np.sign(perm) elif isinstance(ac_space, gym.spaces.MultiDiscrete): total_dim = int(np.sum(ac_space.nvec)) dim_index = np.concatenate([[0], np.cumsum(ac_space.nvec)]) act_perm_mat = np.zeros((total_dim, total_dim), dtype=np.float32) self.act_perm_mat = act_perm_mat for i, perm in enumerate(action_permutation): perm_mat = np.identity(ac_space.nvec[i]) if np.sign(perm) < 0: perm_mat = np.flipud(perm_mat) self.act_perm_mat[ dim_index[i]:dim_index[i] + ac_space.nvec[i], dim_index[int(np.abs(perm) )]:dim_index[int(np.abs(perm))] + ac_space.nvec[int(np.abs(perm))]] = perm_mat self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None print(self.pdtype) print([sequence_length] + list(ob_space.shape)) ob = U.get_placeholder(name=obs_name, dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) mirror_ob = tf.matmul(ob, obs_perm_mat) mirror_obz = tf.clip_by_value( (mirror_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) if not obrms: obz = ob last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] if isinstance(ac_space, gym.spaces.Box): pol_net = GenericFF('pol_net', ob_space.shape[0], [], pdtype.param_shape()[0] // 2, hid_size, num_hid_layers) elif isinstance(ac_space, gym.spaces.MultiDiscrete): pol_net = GenericFF('pol_net', ob_space.shape[0], [], pdtype.param_shape()[0], hid_size, num_hid_layers) orig_out = pol_net.get_output_tensor(obz, None, tf.nn.tanh) mirr_out = tf.matmul( pol_net.get_output_tensor(mirror_obz, None, tf.nn.tanh), act_perm_mat) if not soft_mirror: mean = orig_out + mirr_out else: mean = orig_out self.additional_loss = tf.reduce_mean( tf.abs(orig_out - mirr_out)) * 1.0 if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): logstd = tf.get_variable( name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.constant_initializer(init_logstd)) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = mean self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob], [ac, self.vpred])
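# Minimal NumPy sketch (illustrative; the toy permutation is an assumption) of how the
# mirror-symmetric policy above encodes observation_permutation / action_permutation as a signed
# permutation matrix, so the mirrored observation is simply ob @ P.
import numpy as np

def signed_perm_matrix(permutation):
    # index = |perm|, sign = sign(perm), exactly as in the loop above
    n = len(permutation)
    P = np.zeros((n, n), dtype=np.float32)
    for i, perm in enumerate(permutation):
        P[i][int(np.abs(perm))] = np.sign(perm)
    return P

# e.g. negate dim 0 and swap dims 1 and 2 (a tiny magnitude keeps index 0 while np.sign gives -1):
P = signed_perm_matrix([-0.0001, 2, 1])
ob = np.array([[1.0, 2.0, 3.0]])
print(ob @ P)   # [[-1.  3.  2.]]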
def __init__(self, actor, critic, experts, obs_dim, memory, observation_shape, action_shape, expert_is_np=False, param_noise=None, action_noise=None, gamma=0.95, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = copy(actor) self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.experts = experts self.obs_dim = obs_dim # self.critic_obs0 = self.experts[0].obs0 # self.critic_obs1 = self.experts[0].obs1 # self.critic_actor = self.experts[0].use_tf_actor # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(self.actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic expert0_normalize_obs0 = [tf.clip_by_value(normalize(self.obs0[:, :self.obs_dim], self.experts[i].obs_rms), self.observation_range[0], self.observation_range[1]) for i in range(len(self.experts))] expert_qv0 = tf.squeeze(tf.stack([experts[i].critic(expert0_normalize_obs0[i], self.actions)\ for i in range(len(self.experts))]), axis=2) # expert_qv0 = tf.Print(expert_qv0, [expert_qv0], '>>>> qv0 :', summarize=10) expert_qv0 = tf.reduce_sum(self.obs0[:, self.obs_dim:] * tf.transpose(expert_qv0), axis=1) # Create networks and core TF parts that are shared across setup parts. 
self.actor_tf = self.actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions, tf.stop_gradient(expert_qv0)) self.critic_tf = tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]) expert_qv0_with_actor_tf = tf.squeeze(tf.stack([experts[i].critic(expert0_normalize_obs0[i], self.actor_tf) for i in range(len(self.experts))]), axis=2) expert_qv0_with_actor_tf = tf.reduce_sum(self.obs0[:, self.obs_dim:] * tf.transpose(expert_qv0_with_actor_tf), axis=1) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, tf.stop_gradient(expert_qv0_with_actor_tf)) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) action1 = target_actor(normalized_obs1) expert0_normalize_obs1 = [tf.clip_by_value(normalize(self.obs1[:, :self.obs_dim], self.experts[i].obs_rms), self.observation_range[0], self.observation_range[1]) for i in range(len(self.experts))] expert_qv1 = tf.squeeze(tf.stack([(experts[i].critic(expert0_normalize_obs1[i], action1)) for i in range(len(self.experts))]), axis=2) expert_qv1 = tf.reduce_sum(self.obs1[:, self.obs_dim:] * tf.transpose(expert_qv1), axis=1) self.Q_obs1 = target_critic(normalized_obs1, action1, tf.stop_gradient(expert_qv1)) # self.Q_obs1 = tf.Print(self.Q_obs1, [self.Q_obs1], '>>>> Q :', summarize=10) # self.terminals1 = tf.Print(self.terminals1, [self.terminals1], '>>>> terminal :', summarize=10) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * self.Q_obs1 self.expert_qv1 = expert_qv1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet
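# Minimal NumPy sketch (illustrative; the expert critics are stubbed with random outputs) of the
# expert Q-value mixture computed above: the tail of the observation vector holds per-expert
# mixture weights, and the critic is conditioned on the weighted sum of the frozen expert
# critics' Q-values.
import numpy as np

batch, obs_dim, num_experts = 4, 6, 3
obs0 = np.random.randn(batch, obs_dim + num_experts)   # [raw_obs | mixture weights]
expert_qv = np.random.randn(num_experts, batch)        # stacked Q-values, one row per expert

weights = obs0[:, obs_dim:]                            # (batch, num_experts)
mixed_q = np.sum(weights * expert_qv.T, axis=1)        # same reduce_sum as in the graph above
print(mixed_q.shape)                                   # (4,)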
class MlpPolicy(object): recurrent = False def __init__(self, name, *args, **kwargs): self.scope = name with tf.variable_scope(name, reuse=tf.AUTO_REUSE): self._init(*args, **kwargs) def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=False, popart=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope("popart"): self.v_rms = RunningMeanStd(shape=[1]) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) self.norm_vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] if popart: self.vpred = denormalize(self.norm_vpred, self.v_rms) else: self.vpred = self.norm_vpred last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] # change for BC stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self.ac = ac self._act = U.function([stochastic, ob], [ac, self.vpred]) self.use_popart = popart if popart: self.init_popart() ret = tf.placeholder(tf.float32, [None]) vferr = tf.reduce_mean(tf.square(self.vpred - ret)) self.vlossandgrad = U.function([ob, ret], U.flatgrad(vferr, self.get_vf_variable())) def init_popart(self): old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.v_rms.std old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.v_rms.mean renormalize_Q_outputs_op = [] vs = self.output_vars M, b = vs renormalize_Q_outputs_op += [M.assign(M * old_std / new_std)] renormalize_Q_outputs_op += [ b.assign((b * old_std + old_mean - new_mean) / new_std) ] self.renorm_v = U.function([old_std, old_mean], [], updates=renormalize_Q_outputs_op) def act(self, stochastic, ob): ac1, vpred1 = self._act(stochastic, ob[None]) return ac1[0], vpred1[0] def get_variables(self): return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) def get_trainable_variables(self): return tf.trainable_variables(self.scope) def get_initial_state(self): return [] def get_vf_variable(self): return tf.trainable_variables(self.scope + "/vf") def update_popart(self, v_targets): old_mean, old_std = U.get_session().run( [self.v_rms.mean, self.v_rms.std]) self.v_rms.update(v_targets) self.renorm_v(old_std, old_mean) @property def output_vars(self): output_vars = [ var for var in self.get_vf_variable() if 'vffinal' in var.name ] return output_vars def save_policy(self, name): U.save_variables(name, variables=self.get_variables()) def load_policy(self, name): U.load_variables(name, variables=self.get_variables())
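# --- Editor's illustrative sketch (not part of the original code). init_popart/update_popart
# above rescale the kernel M and bias b of the final value layer so that denormalized predictions
# are unchanged when the return statistics (v_rms) move, following Pop-Art
# (https://arxiv.org/abs/1602.07714). A small numpy check of that invariant; popart_rescale is a
# hypothetical helper name.
import numpy as np

def popart_rescale(M, b, old_mean, old_std, new_mean, new_std):
    """Return (M', b') with new_std * (h @ M' + b') + new_mean == old_std * (h @ M + b) + old_mean."""
    M_new = M * old_std / new_std
    b_new = (b * old_std + old_mean - new_mean) / new_std
    return M_new, b_new

_rng = np.random.default_rng(1)
_h = _rng.normal(size=(3, 8))                    # last hidden features
_M, _b = _rng.normal(size=(8, 1)), _rng.normal(size=(1,))
_old_mean, _old_std, _new_mean, _new_std = 0.5, 2.0, 1.5, 3.0
_before = _old_std * (_h @ _M + _b) + _old_mean
_M2, _b2 = popart_rescale(_M, _b, _old_mean, _old_std, _new_mean, _new_std)
_after = _new_std * (_h @ _M2 + _b2) + _new_mean
assert np.allclose(_before, _after)              # denormalized values are preserved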
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None # None: no recurrent network, Int num: MLP ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) share_layer = obz for i in range(num_hid_layers): share_layer = tf.nn.tanh(tf.layers.dense(share_layer, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.lstm_input = tf.expand_dims(share_layer, [0]) step_size = tf.shape(obz)[:1] # add the recurrent layers or combined with one Mlp lstm_cell = tf.contrib.rnn.BasicLSTMCell(hid_size, state_is_tuple=True) c_init = np.zeros((1, lstm_cell.state_size.c), np.float32) h_init = np.zeros((1, lstm_cell.state_size.h), np.float32) self.lstm_state_init = [c_init, h_init] c_in = U.get_placeholder(name="cin", dtype=tf.float32, shape=[1, lstm_cell.state_size.c]) h_in = U.get_placeholder(name="hin", dtype=tf.float32, shape=[1, lstm_cell.state_size.h]) self.state_in = [c_in, h_in] lstm_state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in) self.lstm_out1, lstm_state_out = tf.nn.dynamic_rnn(lstm_cell, self.lstm_input, initial_state=lstm_state_in, sequence_length=step_size, time_major=False) lstm_c, lstm_h = lstm_state_out self.state_out = [lstm_c[:1, :], lstm_h[:1, :]] self.lstm_out = tf.reshape(self.lstm_out1, [-1, hid_size]) with tf.variable_scope('vf'): self.last_out = self.lstm_out # for i in range(num_hid_layers): # self.last_out = tf.nn.tanh(tf.layers.dense(self.last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense(self.last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0] with tf.variable_scope('pol'): self.last_out_pol = self.lstm_out # for i in range(num_hid_layers): # self.last_out_pol = tf.nn.tanh(tf.layers.dense(self.last_out_pol, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = tf.layers.dense(self.last_out_pol, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) self.pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: self.pdparam = tf.layers.dense(self.last_out_pol, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(self.pdparam) # self.state_in = [] # self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob, self.state_in[0], self.state_in[1]], [ac, self.vpred, self.state_out])
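# --- Editor's illustrative sketch (not part of the original code). The recurrent policy above
# returns state_out from self._act and expects it to be fed back as (cin, hin) on the next call.
# The rollout below shows that threading pattern with stand-in env/act functions; all names here
# (_DummyEnv, _dummy_act, rollout_recurrent_policy) are hypothetical.
import numpy as np

class _DummyEnv:
    def reset(self):
        return np.zeros(4, dtype=np.float32)
    def step(self, action):
        return np.zeros(4, dtype=np.float32), 0.0, False, {}

def _dummy_act(stochastic, ob, c_in, h_in):
    # Stand-in for self._act: returns an action, a value, and the new LSTM state.
    return np.zeros(2, dtype=np.float32), 0.0, (c_in, h_in)

def rollout_recurrent_policy(act_fn, env, lstm_state_init, n_steps=10, stochastic=True):
    ob = env.reset()
    c, h = lstm_state_init                        # e.g. self.lstm_state_init above
    for _ in range(n_steps):
        action, value, (c, h) = act_fn(stochastic, ob, c, h)
        ob, reward, done, _ = env.step(action)
        if done:
            ob = env.reset()
            c, h = lstm_state_init                # drop the recurrent state at episode end
    return ob

rollout_recurrent_policy(_dummy_act, _DummyEnv(),
                         lstm_state_init=(np.zeros((1, 64), np.float32), np.zeros((1, 64), np.float32)))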
class DDPG(tf.Module): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.observation_shape = observation_shape self.critic = critic self.actor = actor self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.actor_lr = tf.constant(actor_lr) self.critic_lr = tf.constant(critic_lr) # Observation normalization. if self.normalize_observations: with tf.name_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None # Return normalization. if self.normalize_returns: with tf.name_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs) self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs) # Set up parts. if self.param_noise is not None: self.setup_param_noise() if MPI is not None: comm = MPI.COMM_WORLD self.actor_optimizer = MpiAdamOptimizer( comm, self.actor.trainable_variables) self.critic_optimizer = MpiAdamOptimizer( comm, self.critic.trainable_variables) else: self.actor_optimizer = tf.keras.optimizers.Adam( learning_rate=actor_lr) self.critic_optimizer = tf.keras.optimizers.Adam( learning_rate=critic_lr) logger.info('setting up actor optimizer') actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_variables ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) logger.info('setting up critic optimizer') critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_variables ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) if self.critic_l2_reg > 0.: critic_reg_vars = [] for layer in self.critic.network_builder.layers[1:]: critic_reg_vars.append(layer.kernel) for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) logger.info('setting up critic target updates ...') for var, target_var in zip(self.critic.variables, self.target_critic.variables): logger.info(' {} <- {}'.format(target_var.name, var.name)) logger.info('setting up actor target updates ...') for var, target_var in zip(self.actor.variables, self.target_actor.variables): logger.info(' {} <- {}'.format(target_var.name, var.name)) if self.param_noise: logger.info('setting up param noise') for var, 
perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format( perturbed_var.name, var.name)) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) for var, perturbed_var in zip( self.actor.variables, self.perturbed_adaptive_actor.variables): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format( perturbed_var.name, var.name)) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) if self.normalize_returns and self.enable_popart: self.setup_popart() self.initial_state = None # recurrent architectures not supported yet def setup_param_noise(self): assert self.param_noise is not None # Configure perturbed actor. self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape, name='param_noise_actor', network=self.actor.network, **self.actor.network_kwargs) # Configure separate copy for stddev adoption. self.perturbed_adaptive_actor = Actor( self.actor.nb_actions, self.observation_shape, name='adaptive_param_noise_actor', network=self.actor.network, **self.actor.network_kwargs) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 @tf.function def step(self, obs, apply_noise=True, compute_Q=True): normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms), self.observation_range[0], self.observation_range[1]) actor_tf = self.actor(normalized_obs) if self.param_noise is not None and apply_noise: action = self.perturbed_actor(normalized_obs) else: action = actor_tf if compute_Q: normalized_critic_with_actor_tf = self.critic( normalized_obs, actor_tf) q = denormalize( tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) else: q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() action += noise action = tf.clip_by_value(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self): batch = self.memory.sample(batch_size=self.batch_size) obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1']) actions, rewards, terminals1 = tf.constant( batch['actions']), tf.constant(batch['rewards']), tf.constant( batch['terminals1'], dtype=tf.float32) normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q( obs0, obs1, rewards, terminals1) if self.normalize_returns and self.enable_popart: old_mean = self.ret_rms.mean old_std = self.ret_rms.std self.ret_rms.update(target_Q.flatten()) # renormalize Q outputs new_mean = self.ret_rms.mean new_std = self.ret_rms.std for vs in [ self.critic.output_vars, self.target_critic.output_vars ]: kernel, bias = vs kernel.assign(kernel * old_std / new_std) bias.assign((bias * old_std + old_mean - new_mean) / new_std) actor_grads, actor_loss = self.get_actor_grads(normalized_obs0) critic_grads, critic_loss = self.get_critic_grads( normalized_obs0, actions, target_Q) if MPI is not None: self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr) 
self.critic_optimizer.apply_gradients(critic_grads, self.critic_lr) else: self.actor_optimizer.apply_gradients( zip(actor_grads, self.actor.trainable_variables)) self.critic_optimizer.apply_gradients( zip(critic_grads, self.critic.trainable_variables)) return critic_loss, actor_loss @tf.function def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards, terminals1): normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) Q_obs1 = denormalize( self.target_critic(normalized_obs1, self.target_actor(normalized_obs1)), self.ret_rms) target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1 return normalized_obs0, target_Q @tf.function def get_actor_grads(self, normalized_obs0): with tf.GradientTape() as tape: actor_tf = self.actor(normalized_obs0) normalized_critic_with_actor_tf = self.critic( normalized_obs0, actor_tf) critic_with_actor_tf = denormalize( tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) actor_loss = -tf.reduce_mean(critic_with_actor_tf) actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables) if self.clip_norm: actor_grads = [ tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in actor_grads ] if MPI is not None: actor_grads = tf.concat( [tf.reshape(g, (-1, )) for g in actor_grads], axis=0) return actor_grads, actor_loss @tf.function def get_critic_grads(self, normalized_obs0, actions, target_Q): with tf.GradientTape() as tape: normalized_critic_tf = self.critic(normalized_obs0, actions) normalized_critic_target_tf = tf.clip_by_value( normalize(target_Q, self.ret_rms), self.return_range[0], self.return_range[1]) critic_loss = tf.reduce_mean( tf.square(normalized_critic_tf - normalized_critic_target_tf)) # The first is input layer, which is ignored here. if self.critic_l2_reg > 0.: # Ignore the first input layer. for layer in self.critic.network_builder.layers[1:]: # The original l2_regularizer takes half of sum square. critic_loss += (self.critic_l2_reg / 2.) * tf.reduce_sum( tf.square(layer.kernel)) critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables) if self.clip_norm: critic_grads = [ tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in critic_grads ] if MPI is not None: critic_grads = tf.concat( [tf.reshape(g, (-1, )) for g in critic_grads], axis=0) return critic_grads, critic_loss def initialize(self): if MPI is not None: sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables) self.target_actor.set_weights(self.actor.get_weights()) self.target_critic.set_weights(self.critic.get_weights()) @tf.function def update_target_net(self): for var, target_var in zip(self.actor.variables, self.target_actor.variables): target_var.assign((1. - self.tau) * target_var + self.tau * var) for var, target_var in zip(self.critic.variables, self.target_critic.variables): target_var.assign((1. - self.tau) * target_var + self.tau * var) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. 
self.stats_sample = self.memory.sample(batch_size=self.batch_size) obs0 = self.stats_sample['obs0'] actions = self.stats_sample['actions'] normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_critic_tf = self.critic(normalized_obs0, actions) critic_tf = denormalize( tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) actor_tf = self.actor(normalized_obs0) normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf) critic_with_actor_tf = denormalize( tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) stats = {} if self.normalize_returns: stats['ret_rms_mean'] = self.ret_rms.mean stats['ret_rms_std'] = self.ret_rms.std if self.normalize_observations: stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean) stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std) stats['reference_Q_mean'] = tf.reduce_mean(critic_tf) stats['reference_Q_std'] = reduce_std(critic_tf) stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf) stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf) stats['reference_action_mean'] = tf.reduce_mean(actor_tf) stats['reference_action_std'] = reduce_std(actor_tf) if self.param_noise: perturbed_actor_tf = self.perturbed_actor(normalized_obs0) stats['reference_perturbed_action_mean'] = tf.reduce_mean( perturbed_actor_tf) stats['reference_perturbed_action_std'] = reduce_std( perturbed_actor_tf) stats.update(self.param_noise.get_stats()) return stats def adapt_param_noise(self, obs0): try: from mpi4py import MPI except ImportError: MPI = None if self.param_noise is None: return 0. mean_distance = self.get_mean_distance(obs0).numpy() if MPI is not None: mean_distance = MPI.COMM_WORLD.allreduce( mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance @tf.function def get_mean_distance(self, obs0): # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. update_perturbed_actor(self.actor, self.perturbed_adaptive_actor, self.param_noise.current_stddev) normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) actor_tf = self.actor(normalized_obs0) adaptive_actor_tf = self.perturbed_adaptive_actor(normalized_obs0) mean_distance = tf.sqrt( tf.reduce_mean(tf.square(actor_tf - adaptive_actor_tf))) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: update_perturbed_actor(self.actor, self.perturbed_actor, self.param_noise.current_stddev)
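# --- Editor's illustrative sketch (not part of the original code). adapt_param_noise /
# get_mean_distance above perturb a throwaway copy of the actor and then adjust the noise stddev
# from the resulting action-space distance. The class below shows one assumed adaptation rule in
# plain numpy; SimpleAdaptiveParamNoise mimics the usual adaptive-param-noise behaviour but is not
# the object used above.
import numpy as np

class SimpleAdaptiveParamNoise:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, adoption_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient

    def adapt(self, distance):
        # Shrink the parameter noise if it moved the actions too much, grow it otherwise.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adoption_coefficient
        else:
            self.current_stddev *= self.adoption_coefficient

def action_distance(actor_actions, perturbed_actions):
    # Same metric as get_mean_distance: RMS distance in action space.
    return np.sqrt(np.mean(np.square(actor_actions - perturbed_actions)))

_rng = np.random.default_rng(2)
_noise = SimpleAdaptiveParamNoise()
_a_clean = _rng.uniform(-1, 1, size=(128, 4))
_a_perturbed = _a_clean + _rng.normal(scale=0.05, size=_a_clean.shape)
_noise.adapt(action_distance(_a_clean, _a_perturbed))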
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., td3_variant=False, td3_policy_freq=1, td3_policy_noise=0.0, td3_noise_clip=0.5): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Added content: parameters for using the TD3 variant of DDPG # https://arxiv.org/abs/1802.09477 self.td3_variant = td3_variant self.td3_policy_freq = td3_policy_freq self.td3_policy_noise = td3_policy_noise self.td3_noise_clip = td3_noise_clip # TODO: reuse the normalization code from HER. # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts.
self.actor_tf = actor(normalized_obs0) if self.td3_variant: logger.info('using TD3 variant model') self.normalized_critic_tf, self.normalized_critic_tf2 = critic( normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf, _ = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) out_q1, out_q2 = target_critic(normalized_obs1, target_actor(normalized_obs1)) min_q1 = tf.minimum(out_q1, out_q2) # take the smaller of the two Q estimates Q_obs1 = denormalize(min_q1, self.ret_rms) # Unlike vanilla DDPG, two copies of the critic network are trained and the smaller Q value is used. --> Both Q functions share a single target, computed with the smaller of the two Q estimates. # Both critics then learn by regressing to this target, which prevents overestimation of the Q function. # The result is denormalized with the return RMS and passed on as Q_obs1. # --> This follows the Clipped Double-Q Learning algorithm. else: self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] # self.target_soft_updates = [actor_soft_updates, critic_soft_updates] self.actor_target_soft_updates = actor_soft_updates self.critic_target_soft_updates = critic_soft_updates def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption.
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) if self.td3_variant: logger.info('using TD3 variant loss') self.critic_loss = tf.losses.mean_squared_error(normalized_critic_target_tf,self.normalized_critic_tf) \ + tf.losses.mean_squared_error(normalized_critic_target_tf,self.normalized_critic_tf2) else: self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = { self.obs0: U.adjust_shape(self.obs0, [obs]) } # only obs0 is fed with the observation here if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None # add noise to the action for exploration if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action[0].shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self, train_iter): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably.
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) if self.td3_policy_noise > 0: noise = np.random.normal(loc=0.0, scale=self.td3_policy_noise, size=np.shape(batch['actions'])) noise = np.clip(noise, -self.td3_noise_clip, self.td3_noise_clip) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: np.clip(batch['actions'] + noise, self.action_range[0], self.action_range[1]), self.critic_target: target_Q, }) else: # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) # TD3 has a hyperparameter for how frequently to update the actor policy and target networks if train_iter % self.td3_policy_freq == 0: self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self, train_iter): # TD3 has a hyperparameter for how frequently to update the actor policy and target networks if train_iter % self.td3_policy_freq == 0: self.sess.run(self.actor_target_soft_updates) self.sess.run(self.critic_target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats # Why is noise added to the parameters? def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete.
if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
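# --- Editor's illustrative sketch (not part of the original class). The TD3 options above
# (https://arxiv.org/abs/1802.09477) combine three ideas: clipped double-Q targets, clipped
# Gaussian noise on the actions used for the critic update (td3_policy_noise / td3_noise_clip),
# and delayed actor/target updates every td3_policy_freq iterations. The numpy helpers below
# restate those rules; the function names are hypothetical.
import numpy as np

def td3_target(rewards, terminals, q1_next, q2_next, gamma=0.99):
    """Clipped double-Q target: both critics regress to the same target built from min(Q1, Q2)."""
    return rewards + (1.0 - terminals) * gamma * np.minimum(q1_next, q2_next)

def noisy_actions(actions, policy_noise=0.2, noise_clip=0.5, action_range=(-1.0, 1.0)):
    # Clipped Gaussian noise on the actions, as done on batch['actions'] in train() above.
    noise = np.clip(np.random.normal(scale=policy_noise, size=np.shape(actions)),
                    -noise_clip, noise_clip)
    return np.clip(actions + noise, action_range[0], action_range[1])

def should_update_actor_and_targets(train_iter, policy_freq=2):
    # Delayed updates: the actor and the target networks only move every policy_freq steps.
    return train_iter % policy_freq == 0

_r = np.zeros((4, 1)); _d = np.zeros((4, 1))
_q1 = np.ones((4, 1)); _q2 = 2.0 * np.ones((4, 1))
assert np.allclose(td3_target(_r, _d, _q1, _q2), 0.99)   # bootstraps from min(q1, q2) == 1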
def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates()
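# --- Editor's illustrative sketch (not part of the original code). The constructor above wires
# the standard DDPG pieces: a bootstrap target Q = r + (1 - done) * gamma * Q'(s', mu'(s')) and
# Polyak-averaged target networks updated with coefficient tau. The framework-free helpers below
# restate those two updates; soft_update and ddpg_target are hypothetical names.
import numpy as np

def ddpg_target(rewards, terminals, q_next, gamma=0.99):
    return rewards + (1.0 - terminals) * gamma * q_next

def soft_update(target_params, source_params, tau=0.001):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]

_target = [np.zeros((3, 3)), np.zeros(3)]
_source = [np.ones((3, 3)), np.ones(3)]
_target = soft_update(_target, _source, tau=0.1)
assert np.allclose(_target[1], 0.1)              # every entry moved 10% toward the source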
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz last_out = tf.nn.tanh( U.conv2d(last_out, 64, 'vfconv1', (7, 7), (3, 3), pad='VALID')) last_out = tf.nn.tanh( U.conv2d(last_out, 64, 'vfconv2', (5, 5), (2, 2), pad='VALID')) last_out = tf.nn.tanh( U.conv2d(last_out, 64, 'vfconv3', (3, 3), (1, 1), pad='VALID')) last_out = tf.nn.tanh( U.conv2d(last_out, 64, 'vfconv4', (3, 3), (1, 1), pad='VALID')) last_out = tf.reshape(last_out, tf.convert_to_tensor([-1, 784 * 4])) last_out = tf.nn.tanh( tf.layers.dense(last_out, 512, kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, kernel_initializer=U.normc_initializer(1.0))[:, 0] last_out = obz last_out = tf.nn.tanh( U.conv2d(last_out, 64, 'polconv1', (7, 7), (3, 3), pad='VALID')) last_out = tf.nn.tanh( U.conv2d(last_out, 64, 'polconv2', (5, 5), (2, 2), pad='VALID')) last_out = tf.nn.tanh( U.conv2d(last_out, 64, 'polconv3', (3, 3), (1, 1), pad='VALID')) last_out = tf.nn.tanh( U.conv2d(last_out, 64, 'polconv4', (3, 3), (1, 1), pad='VALID')) last_out = tf.reshape(last_out, tf.convert_to_tensor([-1, 784 * 4])) last_out = tf.nn.tanh( tf.layers.dense(last_out, 512, kernel_initializer=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, kernel_initializer=U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] # change for BC stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self.ac = ac self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope('vf'): obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] with tf.variable_scope('pol'): last_out = obz num_policy = 1 state_embedding = tf.tile(tf.expand_dims(obz, axis=1), [1, num_policy, 1]) rnn_cell = rnn.BasicLSTMCell(num_units=pdtype.param_shape()[0]) self.sub_policies, states = tf.nn.dynamic_rnn( cell=rnn_cell, inputs=state_embedding, dtype=tf.float32, scope='subpolicy') lstm_cell = rnn.BasicLSTMCell(num_units=num_policy) concatenated = tf.concat([self.sub_policies, state_embedding], axis=2) self.out, states = tf.nn.dynamic_rnn(cell=lstm_cell, inputs=concatenated, dtype=tf.float32, scope='master') last_output = self.out[:, -1, :] self.chosen_index = tf.argmax(last_output, axis=1) # self.weights = tf.nn.softmax(logits=last_output, dim= self.weights = tf.one_hot(indices=self.chosen_index, depth=num_policy) last_out = tf.reduce_sum(tf.expand_dims(self.weights, axis=2) * self.sub_policies, axis=1) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01)) logstd = tf.get_variable( name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob], [ac, self.vpred])
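# --- Editor's illustrative sketch (not part of the original code). In the 'pol' scope above, a
# master LSTM scores num_policy sub-policies and the argmax is turned into a one-hot mask that
# selects exactly one sub-policy's parameters per state. The numpy helper below reproduces that
# gating step; gate_sub_policies is a hypothetical name.
import numpy as np

def gate_sub_policies(master_logits, sub_policies):
    """master_logits: (batch, num_policy); sub_policies: (batch, num_policy, param_dim)."""
    chosen = np.argmax(master_logits, axis=1)                   # index of the winning sub-policy
    weights = np.eye(master_logits.shape[1])[chosen]            # one-hot mask, (batch, num_policy)
    return np.sum(weights[:, :, None] * sub_policies, axis=1)   # (batch, param_dim)

_logits = np.array([[0.1, 2.0], [3.0, -1.0]])
_subs = np.arange(2 * 2 * 4, dtype=float).reshape(2, 2, 4)
_out = gate_sub_policies(_logits, _subs)
assert np.allclose(_out[0], _subs[0, 1]) and np.allclose(_out[1], _subs[1, 0])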
def _init(self, ob_space, sensor_space, ac_space, hid_size, num_hid_layers, kind, elm_mode): assert isinstance(ob_space, gym.spaces.Box) assert isinstance(sensor_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) ob_sensor = U.get_placeholder(name="ob_sensor", dtype=tf.float32, shape=[sequence_length] + list(sensor_space.shape)) x = ob / 255.0 if kind == 'small': # from A3C paper x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) x = ODEBlock(32, (3, 3))(x) x = U.flattenallbut0(x) x = tf.nn.relu( tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) ## Obfilter on sensor output with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=sensor_space.shape) obz_sensor = tf.clip_by_value( (ob_sensor - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) # x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) last_out = obz_sensor if not elm_mode: ## Adapted from mlp_policy for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="vffc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) y = tf.layers.dense(last_out, 64, name="vffinal", kernel_initializer=U.normc_initializer(1.0)) else: last_out = tf.nn.tanh( tf.layers.dense(last_out, hid_size, name="vffc1", kernel_initializer=U.normc_initializer(1.0), trainable=False)) y = tf.layers.dense(last_out, 64, name="vffinal", kernel_initializer=U.normc_initializer(1.0)) x = tf.concat([x, y], 1) logits = tf.layers.dense(x, pdtype.param_shape()[0], name="logits", kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(logits) self.vpred = tf.layers.dense( x, 1, name="value", kernel_initializer=U.normc_initializer(1.0))[:, 0] # self.session.run(logits.kernel) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = self.pd.sample() # XXX self._act = U.function([stochastic, ob, ob_sensor], [ac, self.vpred, logits])
def __init__(self, name, actor, critic, memory, obs_space_n, act_space_n, agent_index, obs_rms, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): self.name = name self.num_agents = len(obs_space_n) self.agent_index = agent_index from gym import spaces continuous_ctrl = not isinstance(act_space_n[0], spaces.Discrete) # TODO: remove after testing assert continuous_ctrl # Multi-agent inputs # self.obs0 = [] # self.obs1 = [] self.actions = [] # self.norm_obs0_ph = [] # self.norm_obs1_ph = [] self.obs0 = tf.placeholder(tf.float32, shape=( self.num_agents, None, ) + obs_space_n[self.agent_index].shape, name="obs0") self.obs1 = tf.placeholder(tf.float32, shape=( self.num_agents, None, ) + obs_space_n[self.agent_index].shape, name="obs1") # if continuous_ctrl: # self.actions = tf.placeholder(tf.float32, shape=(self.num_agents, None,) + act_space_n[self.agent_index].shape, name="action") # else: # act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # self.actions = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] # this is required to reshape obs and actions for concatenation obs_shape_list = [self.num_agents] + list( obs_space_n[self.agent_index].shape) act_shape_list = [self.num_agents] + list( act_space_n[self.agent_index].shape) self.obs_shape_prod = np.prod(obs_shape_list) self.act_shape_prod = np.prod(act_shape_list) for i in range(self.num_agents): # each obs in obs0,obs1 contains info about ego agent and relative pos/vel of other agents # self.obs0.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs0_"+str(i))) # self.obs1.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs1_"+str(i))) if continuous_ctrl: self.actions.append( tf.placeholder(tf.float32, shape=[None] + list(act_space_n[i].shape), name="action" + str(i))) else: self.actions.append( make_pdtype(act_space_n[i]).sample_placeholder( [None], name="action" + str(i))) # self.norm_obs0_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs0_"+str(i))) # self.norm_obs1_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs1_"+str(i))) # self.norm_obs0_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs0") # self.norm_obs1_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs1") # we only provide single agent inputs for these placeholders self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. 
self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. # TODO: need to update the replay buffer storage function to account for multiple agents if self.normalize_observations: self.obs_rms = obs_rms else: self.obs_rms = None # Need to transpose observations so we can normalize them # converts tensor to shape (batch_size, num_agents, space_size) # transpose dims 0 and 1, leave dim 2 unchanged obs0_t = tf.transpose(self.obs0, perm=[1, 0, 2]) obs1_t = tf.transpose(self.obs1, perm=[1, 0, 2]) actions_t = tf.transpose(self.actions, perm=[1, 0, 2]) # each entry in obs_t is normalized wrt the agent normalized_obs0 = tf.clip_by_value(normalize(obs0_t, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(obs1_t, self.obs_rms), self.observation_range[0], self.observation_range[1]) # convert the obs to original shape after normalization for convenience normalized_act_obs0 = tf.transpose(normalized_obs0, perm=[1, 0, 2]) normalized_act_obs1 = tf.transpose(normalized_obs1, perm=[1, 0, 2]) # need to specify the exact shape, since we don't always pass a full batch of obs/act normalized_obs0_flat = tf.reshape(normalized_obs0, [-1, self.obs_shape_prod]) normalized_obs1_flat = tf.reshape(normalized_obs1, [-1, self.obs_shape_prod]) actions_t_flat = tf.reshape(actions_t, [-1, self.act_shape_prod]) # Return normalization. # TODO: update this to handle multiple agents if required if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. # Each agent gets its own observation self.actor_tf = actor(normalized_act_obs0[self.agent_index]) self.target_actor_tf = target_actor( normalized_act_obs1[self.agent_index]) # Critic gets all observations self.normalized_critic_tf = critic(normalized_obs0_flat, actions_t_flat) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # need to provide critic() with all actions act_input_n = self.actions + [] # copy actions act_input_n[ self. agent_index] = self.actor_tf # update current agent action using its actor act_input_n_t = tf.transpose(act_input_n, perm=[1, 0, 2]) act_input_n_t_flat = tf.reshape(act_input_n_t, [-1, self.act_shape_prod]) self.normalized_critic_with_actor_tf = critic(normalized_obs0_flat, act_input_n_t_flat, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # we need to use actions for all agents target_act_input_n = self.actions + [] # copy actions target_act_input_n[ self.
agent_index] = self.target_actor_tf # update current agent action using its target actor target_act_input_n_t = tf.transpose(target_act_input_n, perm=[1, 0, 2]) target_act_input_n_t_flat = tf.reshape(target_act_input_n_t, [-1, self.act_shape_prod]) Q_obs1 = denormalize( target_critic(normalized_obs1_flat, target_act_input_n_t_flat), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: # param noise is added to actor; hence obs for current agent is required self.setup_param_noise(normalized_act_obs0[self.agent_index]) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet
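# Numpy sketch of the Bellman target built above for the centralized critic:
# target_Q = r + (1 - done) * gamma * Q'(flattened joint next obs, joint next actions).
# The numbers below are illustrative only.
import numpy as np

gamma = 0.99
rewards    = np.array([[1.0], [0.5]])    # shape (batch, 1)
terminals1 = np.array([[0.0], [1.0]])    # 1.0 where the episode ended
Q_obs1     = np.array([[10.0], [7.0]])   # target critic evaluated on next obs/actions

target_Q = rewards + (1.0 - terminals1) * gamma * Q_obs1
# -> [[10.9], [0.5]]; terminal transitions do not bootstrap from Q_obs1.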
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2,dc=0): assert isinstance(ob_space, gym.spaces.Box) # determine the dimensions of the state space and observation space self.ac_space_dim = ac_space.shape[0] self.ob_space_dim = ob_space.shape[0] self.dc = dc self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32) self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32) self.num_options = num_options self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None]) # create a filter for the pure shape, meaning excluding u[k-1] obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim),) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope("obfilter_pure"): self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) obz_pure = tf.clip_by_value((ob[:,:-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std, -5.0, 5.0) # define the Q-function network here last_out0 = obz_pure # for option 0 last_out1 = obz_pure # for option 1 for i in range(num_hid_layers): last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "vffc0%i"%(i+1), weight_init=U.normc_initializer(1.0))) last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "vffc1%i"%(i+1), weight_init=U.normc_initializer(1.0))) last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0)) last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0)) # return the Q function value Q(s,o) self.vpred = U.switch(option[0], last_out1, last_out0)[:,0] # define the policy over options here last_out0 = obz_pure # for option 0 last_out1 = obz_pure # for option 1 for i in range(num_hid_layers): last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "oppi0%i"%(i+1), weight_init=U.normc_initializer(1.0))) last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "oppi1%i"%(i+1), weight_init=U.normc_initializer(1.0))) last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0)) last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0)) last_out = tf.concat([last_out0, last_out1], 1) # return the probabilities for executing the options self.op_pi = tf.nn.softmax(last_out) self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:,0] # we always terminate termination_sample = tf.constant([True]) # implement the intra option policy last_out = obz_pure for i in range(num_hid_layers): last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense3D2(last_out, pdtype.param_shape()[0]//2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01),bias=False) mean = tf.nn.tanh(mean) logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1) else: pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), 
weight_init=U.normc_initializer(1.0))) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] # now we never perform the ZOH, both policies are fully functional stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) ac = tf.clip_by_value(ac,-1.0,1.0) self.last_action = tf.stop_gradient(ac) self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd]) self._get_v = U.function([ob, option], [self.vpred]) self.get_term = U.function([ob, option], [termination_sample]) self.get_tpred = U.function([ob, option], [self.tpred]) self.get_vpred = U.function([ob, option], [self.vpred]) self._get_op = U.function([ob], [self.op_pi])
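# Sketch of how a caller could use this two-option policy: because termination_sample
# is the constant [True], the option is re-drawn from op_pi at every step. The helper
# below is illustrative; op_probs stands in for the output of self._get_op(ob).
import numpy as np

rng = np.random.RandomState(0)

def choose_option(op_probs):
    # op_probs: (1, num_options) softmax output
    return int(rng.choice(op_probs.shape[1], p=op_probs[0]))

op_probs = np.array([[0.7, 0.3]])
option = choose_option(op_probs)   # re-sampled every step, since the option always terminates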
class ActorLearner(object): def __init__(self, name, actor, memory, observation_shape, action_shape, gamma=0.95, tau=0.001, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), return_range=(-np.inf, np.inf), actor_l2_reg=0., actor_lr=5e-5, clip_norm=None, ): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='expert_actor_obs0') self.action_target = tf.placeholder(tf.float32, shape=(None,) + action_shape, name=name+'action_target') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.return_range = return_range self.observation_range = observation_range self.clip_norm = clip_norm self.batch_size = batch_size self.stats_sample = None self.actor_l2_reg = actor_l2_reg self.actor = actor self.actor_lr = actor_lr # Observation normalization. if self.normalize_observations: with tf.variable_scope(name + 'obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) # Set up parts. self.setup_actor_optimizer() self.setup_stats() self.initial_state = None # recurrent architectures not supported yet def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = tf.reduce_mean(tf.square(self.actor_tf - self.action_target)) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr) self.optimize_expr = self.actor_optimizer.minimize(self.actor_loss, var_list=self.actor.trainable_vars) def setup_stats(self): ops = [] names = [] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] self.stats_ops = ops self.stats_names = names def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss] actor_grads, actor_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.action_target: batch['actions'], }) # with self.graph.as_default(): self.optimize_expr.run(session=self.sess, feed_dict={ self.obs0: batch['obs0'], self.action_target: batch['actions'], } ) return actor_loss def initialize(self, sess): self.sess = sess def save(self, path): save_variables(path) def load(self, path): load_variables(path) def store_transition(self, obs0, action): # B = obs0.shape[0] # for b in range(B): self.memory.append(obs0, action) if self.normalize_observations: self.obs_rms.update(obs0) print("Stored ", obs0.shape) def __call__(self, obs): # with self.graph.as_default(): print("Expert Actor call") feed_dict = {self.obs0: U.adjust_shape(self.obs0, obs)} # import IPython; IPython.embed() action = self.sess.run([self.actor_tf], feed_dict=feed_dict) print("Expert Actor return") return action
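# Numpy sketch of the ActorLearner objective: the actor is regressed onto expert actions
# with a mean-squared error, which is exactly the actor_loss above. The linear "actor"
# and toy data below are stand-ins, not the network used by the class.
import numpy as np

rng = np.random.RandomState(0)
obs      = rng.randn(128, 4)       # batch of observations
act_star = rng.randn(128, 2)       # expert actions sampled from memory
W = np.zeros((4, 2))               # toy linear actor
lr = 5e-5                          # same order as the default actor_lr

for _ in range(10):
    err = obs @ W - act_star
    loss = np.mean(err ** 2)                 # == tf.reduce_mean(tf.square(...)) above
    grad = 2.0 * obs.T @ err / err.size      # gradient of the mean-squared error
    W -= lr * grad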
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): # assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope('vf'): obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz last_out = tf.one_hot(indices=tf.cast(last_out, dtype=tf.int32), depth=ob_space.n) print(last_out) for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] with tf.variable_scope('pol'): last_out = obz last_out = tf.one_hot(indices=tf.cast(last_out, dtype=tf.int32), depth=ob_space.n) for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01)) logstd = tf.get_variable( name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob], [ac, self.vpred])
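# The tf.one_hot call above turns an integer observation index into a one-hot vector
# before the MLP. Numpy equivalent for illustration (depth plays the role of ob_space.n):
import numpy as np

depth = 5
idx = np.array([2, 0])
one_hot = np.eye(depth)[idx]   # shape (2, 5): rows [0,0,1,0,0] and [1,0,0,0,0]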
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, bound_by_sigmoid=False, sigmoid_coef=1., activation='tanh', normalize_obs=True, actions='gaussian', avg_norm_symmetry=False, symmetric_interpretation=False, stdclip=5.0, gaussian_bias=False, gaussian_from_binary=False, parallel_value=False, pv_layers=2, pv_hid_size=512, three=False): assert isinstance(ob_space, gym.spaces.Box) if actions == 'binary': self.pdtype = pdtype = MultiCategoricalPdType( low=np.zeros_like(ac_space.low, dtype=np.int32), high=np.ones_like(ac_space.high, dtype=np.int32)) elif actions == 'beta': self.pdtype = pdtype = BetaPdType( low=np.zeros_like(ac_space.low, dtype=np.int32), high=np.ones_like(ac_space.high, dtype=np.int32)) elif actions == 'bernoulli': self.pdtype = pdtype = BernoulliPdType(ac_space.low.size) elif actions == 'gaussian': self.pdtype = pdtype = make_pdtype(ac_space) elif actions == 'cat_3': self.pdtype = pdtype = MultiCategoricalPdType( low=np.zeros_like(ac_space.low, dtype=np.int32), high=np.ones_like(ac_space.high, dtype=np.int32) * 2) elif actions == 'cat_5': self.pdtype = pdtype = MultiCategoricalPdType( low=np.zeros_like(ac_space.low, dtype=np.int32), high=np.ones_like(ac_space.high, dtype=np.int32) * 4) else: assert False sequence_length = None self.ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) self.st = U.get_placeholder(name="st", dtype=tf.int32, shape=[None]) if normalize_obs: with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) if avg_norm_symmetry: # Warning works only for normal observations (41 numbers) ob_mean = (tf.gather(self.ob_rms.mean, ORIG_SYMMETRIC_IDS) + self.ob_rms.mean) / 2 ob_std = (tf.gather(self.ob_rms.std, ORIG_SYMMETRIC_IDS) + self.ob_rms.std) / 2 # Pretty crude else: ob_mean = self.ob_rms.mean ob_std = self.ob_rms.std obz = tf.clip_by_value((self.ob - ob_mean) / ob_std, -stdclip, stdclip) #obz = tf.Print(obz, [self.ob_rms.mean], message='rms_mean', summarize=41) #obz = tf.Print(obz, [self.ob_rms.std], message='rms_std', summarize=41) else: obz = self.ob vpreds = [] pparams = [] for part in range(1 if not three else 3): part_prefix = "" if part == 0 else "part_" + str(part) # Predicted value last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( U.dense(last_out, hid_size, part_prefix + "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) vpreds.append( U.dense(last_out, 1, part_prefix + "vffinal", weight_init=U.normc_initializer(1.0))) vpreds[-1] = vpreds[-1][:, 0] if parallel_value: last_out_2 = obz for i in range(pv_layers): last_out_2 = tf.nn.tanh( U.dense(last_out_2, pv_hid_size, part_prefix + "pv_vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) last_out_2 = U.dense(last_out_2, 1, part_prefix + "pv_vffinal", weight_init=U.normc_initializer(1.0)) vpreds[-1] += last_out_2[:, 0] last_out = obz if activation == 'tanh': activation = tf.nn.tanh elif activation == 'relu': activation = tf.nn.relu for i in range(num_hid_layers): dense = U.dense(last_out, hid_size, part_prefix + "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)) last_out = activation(dense) if actions == 'gaussian': if gaussian_fixed_var: mean = U.dense(last_out, pdtype.param_shape()[0] // 2, part_prefix + "polfinal", U.normc_initializer(0.01)) if bound_by_sigmoid: mean = tf.nn.sigmoid(mean * sigmoid_coef) logstd = tf.get_variable( name=part_prefix + "logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) logstd = mean * 
0.0 + logstd else: mean = U.dense(last_out, pdtype.param_shape()[0] // 2, part_prefix + "polfinal", U.normc_initializer(0.01)) logstd = U.dense(last_out, pdtype.param_shape()[0] // 2, part_prefix + "polfinal_2", U.normc_initializer(0.01)) if gaussian_bias: mean = mean + 0.5 pdparam = U.concatenate([mean, logstd], axis=1) elif actions == 'beta': pdparam = U.dense(last_out, pdtype.param_shape()[0], part_prefix + "beta_lastlayer", U.normc_initializer(0.01)) pdparam = tf.nn.softplus(pdparam) elif actions in ['bernoulli', 'binary']: if bound_by_sigmoid: raise NotImplementedError( "bound by sigmoid not implemented here") pdparam = U.dense(last_out, pdtype.param_shape()[0], part_prefix + "polfinal", U.normc_initializer(0.01)) elif actions in ['cat_3']: pdparam = U.dense(last_out, pdtype.param_shape()[0], part_prefix + "cat3_lastlayer", U.normc_initializer(0.01)) # prob = tf.reshape(pdparam, [18, -1]) # prob = tf.nn.softmax(prob) # elogit = tf.exp(pdparam) # pdparam = tf.Print(pdparam, [prob], summarize=18) elif actions in ['cat_5']: pdparam = U.dense(last_out, pdtype.param_shape()[0], part_prefix + "cat5_lastlayer", U.normc_initializer(0.01)) # prob = tf.reshape(pdparam, [18, -1]) # prob = tf.nn.softmax(prob) # elogit = tf.exp(pdparam) # pdparam = tf.Print(pdparam, [prob], summarize=18) else: assert False pparams.append(pdparam) pparams = tf.stack(pparams) vpreds = tf.stack(vpreds) pparams = tf.transpose(pparams, perm=(1, 0, 2)) # [batchsize, networks, values] vpreds = tf.transpose(vpreds, perm=(1, 0)) # [batchsize, networks, values] self.stochastic = tf.placeholder(name="stochastic", dtype=tf.bool, shape=()) if three: batchsize = tf.shape(pdparam)[0] NO_OBSTACLES_ID = 5 OBST_DIST = [278, 279, 280, 281, 282, 283, 284, 285] # TODO: Alternative approach distances = [self.ob[:, i] for i in OBST_DIST] distances = tf.stack(distances, axis=1) no_obstacles = tf.cast(tf.equal(self.ob[:, NO_OBSTACLES_ID], 1.0), tf.int32) distances = tf.cast(tf.reduce_all(tf.equal(distances, 3), axis=1), tf.int32) no_obstacles_ahead = distances * no_obstacles # 0 if obstacles, 1 if no obstacles begin = tf.cast(tf.less(self.st, 75), tf.int32) take_id = (1 - begin) * ( 1 + no_obstacles_ahead ) # begin==1 => 0, begin==0 => 1 + no_obstacles_ahead take_id = tf.stack((tf.range(batchsize), take_id), axis=1) pdparam = tf.gather_nd(pparams, take_id) self.vpred = tf.gather_nd(vpreds, take_id) #self.vpred = tf.Print(self.vpred, [take_id]) else: self.vpred = vpreds[:, 0] pdparam = pparams[:, 0] self.pd = pdtype.pdfromflat(pdparam) if hasattr(self.pd, 'real_mean'): real_mean = self.pd.real_mean() ac = U.switch(self.stochastic, self.pd.sample(), real_mean) else: ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([self.stochastic, self.ob, self.st], [ac, self.vpred, ob_mean, ob_std]) if actions == 'binary': self._binary_f = U.function([self.stochastic, self.ob, self.st], [ac, self.pd.flat, self.vpred])
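# Plain-python sketch of the gating used when three=True: sub-policy 0 for the first
# 75 steps, then sub-policy 1 when obstacles are ahead and sub-policy 2 when the
# obstacle sensors all read clear. This re-derives take_id above for a single sample.
def take_id(step, no_obstacles_ahead):
    begin = 1 if step < 75 else 0
    return (1 - begin) * (1 + int(no_obstacles_ahead))

assert take_id(10, True) == 0    # early in the episode: first head
assert take_id(100, False) == 1  # later, obstacles ahead: second head
assert take_id(100, True) == 2   # later, path clear: third head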
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2,dc=0, kind='small'): assert isinstance(ob_space, gym.spaces.Box) self.dc = dc self.num_options = num_options self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None]) x = ob / 255.0 if kind == 'small': # from A3C paper x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) x = U.flattenallbut0(x) x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0))) elif kind == 'large': # Nature DQN x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) x = U.flattenallbut0(x) x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0))) else: raise NotImplementedError # Network to compute value function and termination probabilities with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) obz = x last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0))) self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0] self.vpred_ent = dense3D2(last_out, 1, "vffinal_ent", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0] self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:,0] termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.)) # Network to compute policy over options and intra_option policies last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0))) # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Discrete): # mean = dense3D2(last_out, pdtype.param_shape()[0]//2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01)) # logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) # pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1) # else: pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0))) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob, option], [ac, self.vpred, self.vpred_ent, last_out]) self._get_logits = U.function([stochastic, ob, option], [self.pd.logits] ) self._get_v = U.function([ob, option], [self.vpred]) self._get_v_ent = U.function([ob, option], [self.vpred_ent]) # Entropy value estimate self.get_term = U.function([ob, option], [termination_sample]) self.get_tpred = U.function([ob, option], [self.tpred]) self.get_vpred = U.function([ob, option], [self.vpred]) self.get_vpred_ent = U.function([ob, option], [self.vpred_ent]) # Entropy value estimate self._get_op = 
U.function([ob], [self.op_pi])
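# Sketch of the termination rule above: the current option terminates when the predicted
# termination probability exceeds a fresh uniform draw, i.e. a Bernoulli(tpred) sample.
import numpy as np

rng = np.random.RandomState(0)

def sample_termination(tpred):
    return tpred > rng.uniform(0.0, 1.0, size=np.shape(tpred))

terminate = sample_termination(np.array([0.9, 0.1]))   # mostly [True, False]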
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] # change for BC stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self.ac = ac self._act = U.function([stochastic, ob], [ac, self.vpred])
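# Sketch of what pdparam encodes when gaussian_fixed_var=True: a state-dependent mean
# and a state-independent log-std concatenated along the last axis; acting stochastically
# samples around the mean, otherwise the mode (the mean) is returned. Values are toy.
import numpy as np

rng = np.random.RandomState(0)
mean = np.array([[0.1, -0.2]])       # output of the policy head
logstd = np.array([[-0.5, -0.5]])    # learned, shared across states
pdparam = np.concatenate([mean, logstd], axis=1)

def act(stochastic):
    noise = np.exp(logstd) * rng.randn(*mean.shape)
    return mean + noise if stochastic else mean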
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., expert=None, save_networks=False, supervise=False, actor_only=False, critic_only=False, both_ours_sup=False, gail=False, pofd=False): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.expert = expert self.save_networks = save_networks self.supervise = supervise self.actor_only = actor_only self.critic_only = critic_only self.both_ours_sup = both_ours_sup self.gail=gail self.pofd=pofd # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' print(target_actor.vars) self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. 
- self.terminals1) * gamma * Q_obs1 if self.expert is not None: self.expert.set_tf(actor=actor, critic=critic, obs0=normalized_obs0, actions=self.actions, obs_rms=self.obs_rms, ret_rms=self.ret_rms, observation_range=self.observation_range, return_range=self.return_range, supervise=self.supervise, actor_only=self.actor_only, critic_only=self.critic_only, both_ours_sup=self.both_ours_sup, gail=self.gail, pofd=self.pofd) training_step = tf.get_variable('training_step', shape=[1], initializer=tf.ones_initializer) self.training_step_run = training_step.assign(training_step + 1) self.training_step = 0 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) if self.expert is not None: self.expert_actor_loss = self.expert.actor_loss + self.actor_loss if self.gail: self.expert_actor_loss = self.expert.actor_loss if self.pofd: self.expert_actor_loss = self.expert.actor_loss + self.actor_loss # self.expert_actor_loss = self.actor_loss actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) if self.expert is not None: self.expert_actor_grads = U.flatgrad(self.expert_actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) else: self.expert_actor_grads = None self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: 
critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg if self.expert is not None: self.expert_critic_loss = self.expert.critic_loss + self.critic_loss if self.gail: self.expert_critic_loss = self.expert.discriminator_loss if self.pofd: self.expert_critic_loss = self.expert.discriminator_loss + self.critic_loss critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) if self.expert is not None: self.expert_critic_grads = U.flatgrad(self.expert_critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) else: self.expert_critic_grads = None self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == 
action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self, pretrain): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std : np.array([old_std]), self.old_mean : np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. if self.expert is not None and pretrain: # self.training_step < self.expert_steps: expert_batch = self.expert.sample(batch_size=self.batch_size) ops = [self.training_step_run, self.expert_actor_grads, self.expert_actor_loss, self.expert_critic_grads, self.expert_critic_loss] training_step, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, self.expert.expert_state: expert_batch['obs0'], self.expert.expert_action: expert_batch['actions'] }) else: ops = [self.training_step_run, self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] training_step, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) self.training_step = training_step[0] if self.save_networks: if self.training_step.astype(int) % self.save_steps == 0: logger.info('Saved network with {} training steps'.format(self.training_step.astype(int))) self.saver.save(self.sess, self.ckp_dir, global_step=self.training_step.astype(int)) return critic_loss, actor_loss def initialize(self, sess, saver, ckp_dir, save_steps, expert_steps): self.sess = sess self.saver = saver self.ckp_dir = ckp_dir self.save_steps = save_steps self.expert_steps = expert_steps self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) checkpoint = tf.train.get_checkpoint_state(self.ckp_dir) if self.saver and checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) logger.info('Successfully loaded {}'.format(checkpoint.model_checkpoint_path)) else: logger.info('Could not find old network weights') 
def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = mpi_mean(distance) self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
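# Sketch of what param_noise.adapt(mean_distance) typically does (the spec class is not
# shown here, so this is an assumption): shrink the perturbation stddev when the perturbed
# policy drifted further than desired from the unperturbed one, and grow it otherwise.
class AdaptiveStddevSketch(object):
    def __init__(self, initial_stddev=0.1, desired_distance=0.1, coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_distance = desired_distance
        self.coefficient = coefficient

    def adapt(self, distance):
        if distance > self.desired_distance:
            self.current_stddev /= self.coefficient   # perturbations too strong
        else:
            self.current_stddev *= self.coefficient   # perturbations too weak

noise = AdaptiveStddevSketch()
noise.adapt(0.5)   # distance above the threshold, so current_stddev shrinks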
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-1., 1.), action_range= [0.2, 0.2, 0.2, 0.2, 0.2, 0.2], return_range=(-np.inf, np.inf), adaptive_param_noise=True, critic_l2_reg=0., adaptive_param_noise_policy_threshold=.1, actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., restore=False): # Inputs. # self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs0 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs0') # self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.obs1 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, action_shape), name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() """Filewriter summary""" monitor_directory = os.path.join("Experiment_data") self.summary_dir = os.path.join(monitor_directory, "summary") # if restore: # dirname = 'run20' # The last name # self.summary_dir = os.path.join(self.summary_dir, dirname) # else: self.summary_dir = utils.new_summary_dir(self.summary_dir) # record the detailed parameters utils.log_params(self.summary_dir, { "actor learning rate": self.actor_lr, "critic learning rate": self.critic_lr, "batch size": self.batch_size, "actor update rate": self.tau, "critic update rate": self.tau, "action noise": self.action_noise, "param noise": self.param_noise, "reward function": 'General reward function', "result_function": 'The second 100' }) self.merged = tf.summary.merge_all() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None ## make sure this assumption actually holds # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = mpi_mean(distance) self.param_noise.adapt(mean_distance) return mean_distance def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.multiply(action, self.action_range) # action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def save_data(self): self.memory.save_data() def train(self, dec_actor_lr, dec_critic_lr): # change the learning rate self.actor_lr = dec_actor_lr self.critic_lr = dec_critic_lr # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std : np.array([old_std]), self.old_mean : np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) ## wirte the graph self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.sess.graph) def restore_model(self, model_directory, saver, sess): ckpt = tf.train.get_checkpoint_state(model_directory) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) self.sess = sess logger.info('Load the saved model from the directory!!!') self.summary_writer = tf.summary.FileWriter(self.summary_dir) def update_target_net(self): self.sess.run(self.target_soft_updates) def feedback_adptive_explore(self): self.param_noise.adapt_variance() def ou_adaptive_explore(self): self.action_noise.adapt_decrease() def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) def log_scalar(self, name, value, index): summary_value = summary_pb2.Summary.Value(tag=name, simple_value=value) summary_2 = summary_pb2.Summary(value=[summary_value]) self.summary_writer.add_summary(summary_2, global_step=index)
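# Numpy sketch of the soft update run by update_target_net(): every target variable moves
# a fraction tau toward its online counterpart (Polyak averaging), which is what
# get_target_updates builds per variable pair.
import numpy as np

tau = 0.001
online = np.array([1.0, 2.0, 3.0])       # stands in for an actor/critic variable
target = np.zeros(3)                     # its target-network counterpart
target = (1.0 - tau) * target + tau * online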
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.,
                 expert=None, save_networks=False, supervise=False, actor_only=False, critic_only=False,
                 both_ours_sup=False, gail=False, pofd=False):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.expert = expert
        self.save_networks = save_networks
        self.supervise = supervise
        self.actor_only = actor_only
        self.critic_only = critic_only
        self.both_ours_sup = both_ours_sup
        self.gail = gail
        self.pofd = pofd

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        print(target_actor.vars)
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Hand the shared tensors to the expert/imitation module, if one is provided.
        if self.expert is not None:
            self.expert.set_tf(actor=actor, critic=critic, obs0=normalized_obs0, actions=self.actions,
                               obs_rms=self.obs_rms, ret_rms=self.ret_rms,
                               observation_range=self.observation_range, return_range=self.return_range,
                               supervise=self.supervise, actor_only=self.actor_only, critic_only=self.critic_only,
                               both_ours_sup=self.both_ours_sup, gail=self.gail, pofd=self.pofd)

        # A persistent counter of training steps, stored in the graph.
        training_step = tf.get_variable('training_step', shape=[1], initializer=tf.ones_initializer)
        self.training_step_run = training_step.assign(training_step + 1)
        self.training_step = 0

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
def _mlpPolicy(hiddens, ob, ob_space, ac_space, scope, gaussian_fixed_var=True, reuse=False):
    assert isinstance(ob_space, gym.spaces.Box)
    with tf.variable_scope(scope, reuse=reuse):
        pdtype = make_pdtype(ac_space)
        sequence_length = None

        with tf.variable_scope("obfilter"):
            ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Value function head.
        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - ob_rms.mean) / ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i, hidden in enumerate(hiddens):
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out, hidden, name="fc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            vpred = tf.layers.dense(last_out, 1, name='final',
                                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # Policy head.
        with tf.variable_scope('pol'):
            last_out = obz
            for i, hidden in enumerate(hiddens):
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out, hidden, name='fc%i' % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final',
                                       kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                         initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final',
                                          kernel_initializer=U.normc_initializer(0.01))

        pd = pdtype.pdfromflat(pdparam)

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, pd.sample(), pd.mode())
        _act = U.function([stochastic, ob], [ac, vpred])
        return pd.logits, _act
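# Illustrative usage sketch (not part of the original source). It assumes a
# Gym environment `env` and an initialized TF session; the helper name
# `make_act_fn` and the [64, 64] hidden sizes are hypothetical.
def make_act_fn(env, scope='pi'):
    # Build an observation placeholder and compile the policy's act function.
    ob = U.get_placeholder(name='ob', dtype=tf.float32,
                           shape=[None] + list(env.observation_space.shape))
    _, act_fn = _mlpPolicy([64, 64], ob, env.observation_space, env.action_space, scope=scope)
    return act_fn

# With variables initialized, a stochastic action and value estimate for a
# single observation `obs` would then be obtained as:
#   action, value = act_fn(True, obs[None])   # stochastic=True samples; False takes the mode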
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # Value function head.
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        # Policy head: a Beta distribution parameterized by alpha and beta (both > 1).
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            alpha = tf.nn.softplus(
                U.dense(last_out, ac_space.high.size, 'polfc_alpha', weight_init=U.normc_initializer(0.001))) + 1.0
            beta = tf.nn.softplus(
                U.dense(last_out, ac_space.high.size, 'polfc_beta', weight_init=U.normc_initializer(0.001))) + 1.0
        else:
            raise NotImplementedError
        self.pd = tfp.distributions.Beta(alpha, beta)

        self.state_in = []
        self.state_out = []

        # Sample an action, or take the distribution mode when running deterministically.
        sampled_action = self.pd.sample()
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, sampled_action, self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
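# The Beta policy above produces actions in [0, 1] per dimension (alpha, beta > 1,
# so the distribution is unimodal with a well-defined mode). Before stepping the
# environment, those samples are typically rescaled to the Box bounds. A minimal
# sketch of that rescaling; the helper name `scale_beta_action` is hypothetical
# and not part of the original source:
def scale_beta_action(ac_space, action_01):
    # Map a [0, 1]-valued Beta sample to the environment's [low, high] range.
    return ac_space.low + (ac_space.high - ac_space.low) * action_01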
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars,
                                                                      self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adoption.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor,
                                                                       self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars
                               if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run(
            #     [self.target_Q, self.ret_rms.mean, self.ret_rms.std],
            #     feed_dict={
            #         self.obs1: batch['obs1'],
            #         self.rewards: batch['rewards'],
            #         self.terminals1: batch['terminals1'].astype('float32'),
            #     })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
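# adapt_param_noise() above delegates the actual stddev update to
# self.param_noise.adapt(distance). A minimal sketch of such an adaptive scheme
# (multiplicative increase/decrease of the parameter-noise stddev around a target
# action-space distance); the class name and fields below are illustrative and may
# differ from the noise spec actually used with this agent:
class AdaptiveParamNoiseSketch(object):
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # If the perturbed policy drifted too far in action space, shrink the
        # parameter-noise stddev; otherwise grow it.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adoption_coefficient
        else:
            self.current_stddev *= self.adoption_coefficient

    def get_stats(self):
        return {'param_noise_stddev': self.current_stddev}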