def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)
def get_target_updates(vars, target_vars, tau): logger.info('setting up target updates ...') soft_updates = [] init_updates = [] assert len(vars) == len(target_vars) for var, target_var in zip(vars, target_vars): logger.info(' {} <- {}'.format(target_var.name, var.name)) init_updates.append(tf.assign(target_var, var)) soft_updates.append( tf.assign(target_var, (1. - tau) * target_var + tau * var)) assert len(init_updates) == len(vars) assert len(soft_updates) == len(vars) return tf.group(*init_updates), tf.group(*soft_updates)
def display_var_info(vars): from test.baselines import logger count_params = 0 for v in vars: name = v.name if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue v_params = np.prod(v.shape.as_list()) count_params += v_params if "/b:" in name or "/biases" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print logger.info(" %s%s %i params %s" % (name, " " * (55 - len(name)), v_params, str(v.shape))) logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6))
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev): assert len(actor.vars) == len(perturbed_actor.vars) assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars) updates = [] for var, perturbed_var in zip(actor.vars, perturbed_actor.vars): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) updates.append( tf.assign( perturbed_var, var + tf.random_normal( tf.shape(var), mean=0., stddev=param_noise_stddev))) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) updates.append(tf.assign(perturbed_var, var)) assert len(updates) == len(actor.vars) return tf.group(*updates)
def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
def train(policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, save_policies, **kwargs): rank = MPI.COMM_WORLD.Get_rank() latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl') best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl') periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl') logger.info("Training...") best_success_rate = -1 for epoch in range(n_epochs): # train rollout_worker.clear_history() for _ in range(n_cycles): episode = rollout_worker.generate_rollouts() policy.store_episode(episode) for _ in range(n_batches): policy.train() policy.update_target_net() # test evaluator.clear_history() for _ in range(n_test_rollouts): evaluator.generate_rollouts() # record logs logger.record_tabular('epoch', epoch) for key, val in evaluator.logs('test'): logger.record_tabular(key, mpi_average(val)) for key, val in rollout_worker.logs('train'): logger.record_tabular(key, mpi_average(val)) for key, val in policy.logs(): logger.record_tabular(key, mpi_average(val)) if rank == 0: logger.dump_tabular() # save the policy if it's better than the previous ones success_rate = mpi_average(evaluator.current_success_rate()) if rank == 0 and success_rate >= best_success_rate and save_policies: best_success_rate = success_rate logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path)) evaluator.save_policy(best_policy_path) evaluator.save_policy(latest_policy_path) if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies: policy_path = periodic_policy_path.format(epoch) logger.info('Saving periodic policy to {} ...'.format(policy_path)) evaluator.save_policy(policy_path) # make sure that different threads have different seeds local_uniform = np.random.uniform(size=(1,)) root_uniform = local_uniform.copy() MPI.COMM_WORLD.Bcast(root_uniform, root=0) if rank != 0: assert local_uniform[0] != root_uniform[0]
def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) # Create envs. env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) if evaluation and rank==0: eval_env = gym.make(env_id) eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = bench.Monitor(env, None) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = 2 #env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # Configure components. memory = Memory(limit=int(1e6), action_shape=(nb_actions,), observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() training.test(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() if rank == 0: logger.info('total runtime: {}s'.format(time.time() - start_time))
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
parser.add_argument('--action-dim', type=int, default=2) boolean_flag(parser, 'evaluation', default=True) args = parser.parse_args() print(args) # we don't directly specify timesteps for this script, so make sure that if we do specify them # they agree with the other parameters if args.num_timesteps is not None: assert (args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps) dict_args = vars(args) del dict_args['num_timesteps'] return dict_args import os.path as osp import json import time import datetime if __name__ == '__main__': args = parse_args() if MPI.COMM_WORLD.Get_rank() == 0: dir = osp.join( 'log_files', datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) logger.configure(dir) logger.info(args) # Run actual script. run(**args)
def test(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, model_path=None, action_dim=2): rank = MPI.COMM_WORLD.Get_rank() action_dim = action_dim assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = 1 logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, (action_dim, ), gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.sess = sess #agent.sess.run(tf.global_variables_initializer()) saver.restore(sess, model_path) agent.actor_optimizer.sync() agent.critic_optimizer.sync() agent.sess.run(agent.target_init_updates) #agent.initialize(sess) #sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) #print(eval_action) # eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if render_eval: eval_env.render() eval_episode_reward += eval_r epoch_actions.append(eval_action) eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = {} combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) # combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) # combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) # combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = np.mean(eval_episode_rewards) combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) #combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) #combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: # if epoch %5 ==0: saver.save(sess, os.path.join( logdir, 'trained_variables{}.ckpt'.format(epoch)), write_meta_graph=False) if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
def log_params(params, logger=logger): for key in sorted(params.keys()): logger.info('{}: {}'.format(key, params[key]))