def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.single_threaded_session(graph=self.graph)

            # Construct network for new policy
            self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            # Network for old policy
            with tf.variable_scope("oldpi", reuse=False):
                old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                     None, reuse=False, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False):
                # Target advantage function (if applicable)
                atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                # Empirical return
                ret = tf.placeholder(dtype=tf.float32, shape=[None])

                # Learning rate multiplier, updated with schedule
                lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])

                # Annealed clipping parameter epsilon
                clip_param = self.clip_param * lrmult

                obs_ph = self.policy_pi.obs_ph
                action_ph = self.policy_pi.pdtype.sample_placeholder([None])

                kloldnew = old_pi.proba_distribution.kl(self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(kloldnew)
                meanent = tf.reduce_mean(ent)
                pol_entpen = (-self.entcoeff) * meanent

                # pnew / pold
                ratio = tf.exp(self.policy_pi.proba_distribution.logp(action_ph) -
                               old_pi.proba_distribution.logp(action_ph))

                # surrogate from conservative policy iteration
                surr1 = ratio * atarg
                surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg

                # PPO's pessimistic surrogate (L^CLIP)
                pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                vf_loss = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))
                total_loss = pol_surr + pol_entpen + vf_loss
                losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

                tf.summary.scalar('entropy_loss', pol_entpen)
                tf.summary.scalar('policy_gradient_loss', pol_surr)
                tf.summary.scalar('value_function_loss', vf_loss)
                tf.summary.scalar('approximate_kullback-leibler', meankl)
                tf.summary.scalar('clip_factor', clip_param)
                tf.summary.scalar('loss', total_loss)

                self.params = tf_util.get_trainable_vars("model")

                self.assign_old_eq_new = tf_util.function(
                    [], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                     zipsame(tf_util.get_globals_vars("oldpi"),
                                             tf_util.get_globals_vars("model"))])

            with tf.variable_scope("Adam_mpi", reuse=False):
                self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess)

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.optim_stepsize))
                tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_param))

                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', ret)
                    tf.summary.histogram('learning_rate', self.optim_stepsize)
                    tf.summary.histogram('advantage', atarg)
                    tf.summary.histogram('clip_range', self.clip_param)
                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', obs_ph)
                    else:
                        tf.summary.histogram('observation', obs_ph)

            self.step = self.policy_pi.step
            self.proba_step = self.policy_pi.proba_step
            self.initial_state = self.policy_pi.initial_state

            tf_util.initialize(sess=self.sess)

            self.summary = tf.summary.merge_all()

            self.lossandgrad = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                                                [self.summary, tf_util.flatgrad(total_loss, self.params)] + losses)
            self.compute_losses = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses)
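
# --- Illustrative sketch (not part of the original file) ---------------------
# A minimal NumPy version of the clipped surrogate assembled above, assuming
# `ratio`, `atarg`, and `clip_param` are plain arrays/floats rather than TF
# tensors. It mirrors pol_surr = -mean(min(surr1, surr2)).
import numpy as np

def clipped_surrogate(ratio, atarg, clip_param):
    """Pessimistic PPO objective L^CLIP, negated so it is a loss to minimize."""
    surr1 = ratio * atarg
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    return -np.mean(np.minimum(surr1, surr2))

# Example: a ratio far above 1 + clip_param contributes only its clipped value:
# clipped_surrogate(np.array([1.5]), np.array([2.0]), 0.2) == -2.4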
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log, expert_path, pretrain,
          pretrain_epochs, mdpo_update_steps, num_trajectories, expert_model, exploration_bonus, bonus_coef,
          random_action_len, is_action_features, dir_name, neural, lipschitz, args):
    """
    Train an expert policy or an imitation-learning model on a MuJoCo environment.

    :param env_id: (str) Environment ID
    :param algo: (str) One of 'Train', 'Evaluate', 'MDAL', 'MDAL_ON_POLICY', 'MDAL_TRPO', 'GAIL', 'GAIL_MDPO_OFF'
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/' \
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)
        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
            file.write("Experiment Arguments:")
            for key, val in args.items():
                print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)
        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        env = DummyVecEnv([make_env])  # the branches below all expect a vectorized env instance
        # env = VecNormalize(env)

        train = (algo == 'Train')
        eval = (algo == 'Evaluate')

        if train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy', env_id, verbose=1, buffer_size=1000000, batch_size=256, ent_coef='auto',
                            train_freq=1, tau=0.01, gradient_steps=1, learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps, n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif eval:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps, n_episodes=10, evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10, verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/mdal/", seed=seed,
                                      buffer_size=1000000, ent_coef=0.0, learning_starts=10000, batch_size=256,
                                      tau=0.01, gamma=0.99, gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0, train_freq=1, d_step=10,
                                      tsallis_q=1, reparameterize=True, t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus, bonus_coef=bonus_coef,
                                      is_action_features=is_action_features, neural=neural, lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy', env, dataset, verbose=1, timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" + env_name + "/mdal_mdpo_on/", seed=seed,
                                     max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0,
                                     adversary_entcoeff=0.001, gamma=0.99, lam=0.95, vf_iters=5, vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps, klcoeff=1.0, method="multistep-SGD", tsallis_q=1.0,
                                     t_pi=t_pi, t_c=t_c, exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef, is_action_features=is_action_features, neural=neural)
            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy', env, dataset, verbose=1,
                                  tensorboard_log="./experiments/" + env_name + "/mdal_trpo/", seed=seed,
                                  gamma=0.99, g_step=3, d_step=5, sgd_steps=1, d_stepsize=9e-5,
                                  entcoeff=0.0, adversary_entcoeff=0.001, max_kl=t_pi, t_pi=t_pi, t_c=t_c,
                                  exploration_bonus=exploration_bonus, bonus_coef=bonus_coef,
                                  is_action_features=is_action_features, neural=neural,
                                  lam=0.98, timesteps_per_batch=2000, lipschitz=lipschitz)
            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL
                model = GAIL('MlpPolicy', env, dataset, verbose=1,
                             tensorboard_log="./experiments/" + env_name + "/gail/", seed=seed,
                             entcoeff=0.0, adversary_entcoeff=0.001, lipschitz=lipschitz)
            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF
                model = GAIL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/gail_mdpo_off/", seed=seed,
                                      ent_coef=0.0, adversary_entcoeff=0.001, buffer_size=1000000,
                                      learning_starts=10000, batch_size=256, tau=0.01, gamma=0.99,
                                      gradient_steps=sgd_steps, mdpo_update_steps=mdpo_update_steps, lam=0.0,
                                      train_freq=1, tsallis_q=1, reparameterize=True, t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus, bonus_coef=bonus_coef,
                                      is_action_features=is_action_features, lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
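
# --- Illustrative sketch (not part of the original file) ---------------------
# A hypothetical call to train(); every value below is an assumption chosen
# only to show the argument shapes. The real entry point builds `args` with
# argparse elsewhere in the repository.
#
#   train(env_id='Hopper-v3', algo='MDAL', num_timesteps=1e6, seed=0,
#         sgd_steps=10, t_pi=0.5, t_c=0.01, lam=0.0, log=True,
#         expert_path='hopper_expert', pretrain=False, pretrain_epochs=0,
#         mdpo_update_steps=10, num_trajectories=10, expert_model=None,
#         exploration_bonus=False, bonus_coef=0.0, random_action_len=0,
#         is_action_features=True, dir_name='run0', neural=False,
#         lipschitz=1.0, args=parsed_args)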
def setup_model(self):
    # prevent import loops
    from stable_baselines.gail.adversary import TransitionClassifier

    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
        np.set_printoptions(precision=3)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.single_threaded_session(graph=self.graph)

            if self.using_gail:
                self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                         self.hidden_size_adversary,
                                                         entcoeff=self.adversary_entcoeff)

            # Construct network for new policy
            self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            # Network for old policy
            with tf.variable_scope("oldpi", reuse=False):
                old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False):
                atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

                observation = self.policy_pi.obs_ph
                action = self.policy_pi.pdtype.sample_placeholder([None])

                kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(kloldnew)
                meanent = tf.reduce_mean(ent)
                entbonus = self.entcoeff * meanent

                vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))

                # advantage * pnew / pold
                ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                               old_policy.proba_distribution.logp(action))
                surrgain = tf.reduce_mean(ratio * atarg)

                optimgain = surrgain + entbonus
                losses = [optimgain, meankl, entbonus, surrgain, meanent]
                self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                dist = meankl

                all_var_list = tf_util.get_trainable_vars("model")
                var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]

                self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                klgrads = tf.gradients(dist, var_list)
                flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                shapes = [var.get_shape().as_list() for var in var_list]
                start = 0
                tangents = []
                for shape in shapes:
                    var_size = tf_util.intprod(shape)
                    tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                    start += var_size
                gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                fvp = tf_util.flatgrad(gvp, var_list)

                tf.summary.scalar('entropy_loss', meanent)
                tf.summary.scalar('policy_gradient_loss', optimgain)
                tf.summary.scalar('value_function_loss', surrgain)
                tf.summary.scalar('approximate_kullback-leibler', meankl)
                tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                self.assign_old_eq_new = \
                    tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                      zipsame(tf_util.get_globals_vars("oldpi"),
                                                              tf_util.get_globals_vars("model"))])
                self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg], losses)
                self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg],
                                                    fvp)
                self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                                                              tf_util.flatgrad(vferr, vf_var_list))

                @contextmanager
                def timed(msg):
                    if self.rank == 0 and self.verbose >= 1:
                        print(colorize(msg, color='magenta'))
                        start_time = time.time()
                        yield
                        print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                       color='magenta'))
                    else:
                        yield

                def allmean(arr):
                    assert isinstance(arr, np.ndarray)
                    out = np.empty_like(arr)
                    MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                    out /= self.nworkers
                    return out

                tf_util.initialize(sess=self.sess)

                th_init = self.get_flat()
                MPI.COMM_WORLD.Bcast(th_init, root=0)
                self.set_from_flat(th_init)

            with tf.variable_scope("Adam_mpi", reuse=False):
                self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                if self.using_gail:
                    self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                    self.d_adam.sync()
                self.vfadam.sync()

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', ret)
                    tf.summary.histogram('learning_rate', self.vf_stepsize)
                    tf.summary.histogram('advantage', atarg)
                    tf.summary.histogram('kl_clip_range', self.max_kl)

                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', observation)
                    else:
                        tf.summary.histogram('observation', observation)

            self.timed = timed
            self.allmean = allmean

            self.step = self.policy_pi.step
            self.proba_step = self.policy_pi.proba_step
            self.initial_state = self.policy_pi.initial_state

            self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
            if self.using_gail:
                self.params.extend(self.reward_giver.get_trainable_variables())

            self.summary = tf.summary.merge_all()

            self.compute_lossandgrad = \
                tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                 [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
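
# --- Illustrative sketch (not part of the original file) ---------------------
# TRPO solves F x = g with conjugate gradient, where F x is supplied by a
# Fisher-vector-product function such as self.compute_fvp above. A minimal
# NumPy version, assuming `fvp` is any callable returning F @ x:
import numpy as np

def conjugate_gradient(fvp, g, cg_iters=10, residual_tol=1e-10):
    """Approximately solve F x = g given only matrix-vector products F @ x."""
    x = np.zeros_like(g)
    r = g.copy()   # residual g - F x (x starts at zero)
    p = g.copy()   # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = fvp(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x

# Example with an explicit SPD matrix standing in for the Fisher matrix:
# F = np.array([[4., 1.], [1., 3.]]); g = np.array([1., 2.])
# conjugate_gradient(lambda v: F @ v, g)  # ~= np.linalg.solve(F, g)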
def setup_model(self):
    with SetVerbosity(self.verbose):
        assert isinstance(self.action_space, gym.spaces.Box), \
            "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space)
        assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \
                                                    "an instance of DDPGPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.single_threaded_session(graph=self.graph)

            self.memory = self.memory_policy(limit=self.memory_limit, action_shape=self.action_space.shape,
                                             observation_shape=self.observation_space.shape)

            with tf.variable_scope("input", reuse=False):
                # Observation normalization.
                if self.normalize_observations:
                    with tf.variable_scope('obs_rms'):
                        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
                else:
                    self.obs_rms = None

                # Return normalization.
                if self.normalize_returns:
                    with tf.variable_scope('ret_rms'):
                        self.ret_rms = RunningMeanStd()
                else:
                    self.ret_rms = None

                self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None)

                # Create target networks.
                self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None)
                self.obs_target = self.target_policy.obs_ph
                self.action_target = self.target_policy.action_ph

                normalized_obs0 = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms),
                                                   self.observation_range[0], self.observation_range[1])
                normalized_obs1 = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms),
                                                   self.observation_range[0], self.observation_range[1])

                if self.param_noise is not None:
                    # Configure perturbed actor.
                    self.param_noise_actor = self.policy(self.sess, self.observation_space, self.action_space,
                                                         1, 1, None)
                    self.obs_noise = self.param_noise_actor.obs_ph
                    self.action_noise_ph = self.param_noise_actor.action_ph

                    # Configure separate copy for stddev adaptation.
                    self.adaptive_param_noise_actor = self.policy(self.sess, self.observation_space,
                                                                  self.action_space, 1, 1, None)
                    self.obs_adapt_noise = self.adaptive_param_noise_actor.obs_ph
                    self.action_adapt_noise = self.adaptive_param_noise_actor.action_ph

                # Inputs.
                self.obs_train = self.policy_tf.obs_ph
                self.action_train_ph = self.policy_tf.action_ph
                self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
                self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                self.actions = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions')
                self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
                self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

            # Create networks and core TF parts that are shared across setup parts.
            with tf.variable_scope("model", reuse=False):
                self.actor_tf = self.policy_tf.make_actor(normalized_obs0)
                self.normalized_critic_tf = self.policy_tf.make_critic(normalized_obs0, self.actions)
                self.normalized_critic_with_actor_tf = self.policy_tf.make_critic(normalized_obs0, self.actor_tf,
                                                                                  reuse=True)

            # Noise setup
            if self.param_noise is not None:
                self._setup_param_noise(normalized_obs0)

            with tf.variable_scope("target", reuse=False):
                critic_target = self.target_policy.make_critic(normalized_obs1,
                                                               self.target_policy.make_actor(normalized_obs1))

            with tf.variable_scope("loss", reuse=False):
                self.critic_tf = denormalize(
                    tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]),
                    self.ret_rms)

                self.critic_with_actor_tf = denormalize(
                    tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                     self.return_range[0], self.return_range[1]),
                    self.ret_rms)

                q_obs1 = denormalize(critic_target, self.ret_rms)
                self.target_q = self.rewards + (1. - self.terminals1) * self.gamma * q_obs1

                tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target))
                tf.summary.histogram('critic_target', self.critic_target)

            # Set up parts.
            if self.normalize_returns and self.enable_popart:
                self._setup_popart()
            self._setup_stats()
            self._setup_target_network_updates()

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('rewards', tf.reduce_mean(self.rewards))
                tf.summary.histogram('rewards', self.rewards)
                tf.summary.scalar('param_noise_stddev', tf.reduce_mean(self.param_noise_stddev))
                tf.summary.histogram('param_noise_stddev', self.param_noise_stddev)

                if len(self.observation_space.shape) == 3 and self.observation_space.shape[0] in [1, 3, 4]:
                    tf.summary.image('observation', self.obs_train)
                else:
                    tf.summary.histogram('observation', self.obs_train)

            with tf.variable_scope("Adam_mpi", reuse=False):
                self._setup_actor_optimizer()
                self._setup_critic_optimizer()
                tf.summary.scalar('actor_loss', self.actor_loss)
                tf.summary.scalar('critic_loss', self.critic_loss)

            self.params = find_trainable_variables("model")
            self.target_params = find_trainable_variables("target")

            with self.sess.as_default():
                self._initialize(self.sess)

            self.summary = tf.summary.merge_all()
def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.single_threaded_session(graph=self.graph)

            # Construct network for new policy
            with tf.variable_scope("pi", reuse=False):
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False)

            # Network for old policy
            with tf.variable_scope("oldpi", reuse=False):
                old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                     None, reuse=False)

            # Target advantage function (if applicable)
            atarg = tf.placeholder(dtype=tf.float32, shape=[None])

            # Empirical return
            ret = tf.placeholder(dtype=tf.float32, shape=[None])

            # Learning rate multiplier, updated with schedule
            lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])

            # Annealed clipping parameter epsilon
            clip_param = self.clip_param * lrmult

            obs_ph = self.policy_pi.obs_ph
            action_ph = self.policy_pi.pdtype.sample_placeholder([None])

            kloldnew = old_pi.proba_distribution.kl(self.policy_pi.proba_distribution)
            ent = self.policy_pi.proba_distribution.entropy()
            meankl = tf.reduce_mean(kloldnew)
            meanent = tf.reduce_mean(ent)
            pol_entpen = (-self.entcoeff) * meanent

            # pnew / pold
            ratio = tf.exp(self.policy_pi.proba_distribution.logp(action_ph) -
                           old_pi.proba_distribution.logp(action_ph))

            # surrogate from conservative policy iteration
            surr1 = ratio * atarg
            surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg

            # PPO's pessimistic surrogate (L^CLIP)
            pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
            vf_loss = tf.reduce_mean(tf.square(self.policy_pi.value_fn[:, 0] - ret))
            total_loss = pol_surr + pol_entpen + vf_loss
            losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
            self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

            self.params = tf_util.get_trainable_vars("pi")
            self.lossandgrad = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                                                losses + [tf_util.flatgrad(total_loss, self.params)])
            self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess)

            self.assign_old_eq_new = tf_util.function(
                [], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                 zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("pi"))])
            self.compute_losses = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses)

            self.step = self.policy_pi.step
            self.proba_step = self.policy_pi.proba_step
            self.initial_state = self.policy_pi.initial_state

            tf_util.initialize(sess=self.sess)
def setup_model(self):
    # prevent import loops
    from stable_baselines.gail.adversary import TransitionClassifier
    from stable_baselines.mdal.adversary import TabularAdversaryTF, NeuralAdversaryTRPO

    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the MDPO model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
        np.set_printoptions(precision=3)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.single_threaded_session(graph=self.graph)
            # self._setup_learn(self.seed)
            self._setup_learn()

            if self.using_gail:
                self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                         self.hidden_size_adversary,
                                                         entcoeff=self.adversary_entcoeff)
            elif self.using_mdal:
                if self.neural:
                    self.reward_giver = NeuralAdversaryTRPO(self.sess, self.observation_space, self.action_space,
                                                            self.hidden_size_adversary,
                                                            entcoeff=self.adversary_entcoeff)
                else:
                    self.reward_giver = TabularAdversaryTF(self.sess, self.observation_space, self.action_space,
                                                           self.hidden_size_adversary,
                                                           entcoeff=self.adversary_entcoeff,
                                                           expert_features=self.expert_dataset.successor_features,
                                                           exploration_bonus=self.exploration_bonus,
                                                           bonus_coef=self.bonus_coef, t_c=self.t_c,
                                                           is_action_features=self.is_action_features)

            # Construct network for new policy
            self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            # Network for old policy
            with tf.variable_scope("oldpi", reuse=False):
                self.old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                              None, reuse=False, **self.policy_kwargs)

            # Network for fitting closed form
            with tf.variable_scope("closedpi", reuse=False):
                self.closed_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                                 1, None, reuse=False, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False):
                self.atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                self.vtarg = tf.placeholder(dtype=tf.float32, shape=[None])
                self.ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="learning_rate_ph")
                self.outer_learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="outer_learning_rate_ph")
                self.old_vpred_ph = tf.placeholder(dtype=tf.float32, shape=[None], name="old_vpred_ph")
                self.clip_range_vf_ph = tf.placeholder(dtype=tf.float32, shape=[], name="clip_range_ph")

                observation = self.policy_pi.obs_ph
                self.action = self.policy_pi.pdtype.sample_placeholder([None])

                if self.tsallis_q == 1.0:
                    kloldnew = self.policy_pi.proba_distribution.kl(self.old_policy.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                else:
                    logp_pi = self.policy_pi.proba_distribution.logp(self.action)
                    logp_pi_old = self.old_policy.proba_distribution.logp(self.action)
                    ent = self.policy_pi.proba_distribution.entropy()
                    # kloldnew = self.policy_pi.proba_distribution.kl_tsallis(self.old_policy.proba_distribution, self.tsallis_q)
                    tsallis_q = 2.0 - self.tsallis_q
                    meankl = tf.reduce_mean(tf_log_q(tf.exp(logp_pi), tsallis_q) -
                                            tf_log_q(tf.exp(logp_pi_old), tsallis_q))  # tf.reduce_mean(kloldnew)

                meanent = tf.reduce_mean(ent)
                entbonus = self.entcoeff * meanent

                if self.cliprange_vf is None:
                    vpred_clipped = self.policy_pi.value_flat
                else:
                    vpred_clipped = self.old_vpred_ph + \
                        tf.clip_by_value(self.policy_pi.value_flat - self.old_vpred_ph,
                                         - self.clip_range_vf_ph, self.clip_range_vf_ph)

                vf_losses1 = tf.square(self.policy_pi.value_flat - self.ret)
                vf_losses2 = tf.square(vpred_clipped - self.ret)
                vferr = tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

                # advantage * pnew / pold
                ratio = tf.exp(self.policy_pi.proba_distribution.logp(self.action) -
                               self.old_policy.proba_distribution.logp(self.action))

                if self.method == "multistep-SGD":
                    surrgain = tf.reduce_mean(ratio * self.atarg) - meankl / self.learning_rate_ph
                elif self.method == "closedreverse-KL":
                    surrgain = tf.reduce_mean(tf.exp(self.atarg) * self.policy_pi.proba_distribution.logp(self.action))
                else:
                    policygain = tf.reduce_mean(tf.exp(self.atarg) * tf.log(self.closed_policy.proba_distribution.mean))
                    surrgain = tf.reduce_mean(ratio * self.atarg) - \
                        tf.reduce_mean(self.learning_rate_ph * ratio *
                                       self.policy_pi.proba_distribution.logp(self.action))

                optimgain = surrgain  # + entbonus - self.learning_rate_ph * meankl
                losses = [optimgain, meankl, entbonus, surrgain, meanent]
                self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                dist = meankl

                all_var_list = tf_util.get_trainable_vars("model")
                var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]
                print("policy vars", var_list)

                all_closed_var_list = tf_util.get_trainable_vars("closedpi")
                closed_var_list = [v for v in all_closed_var_list if "/vf" not in v.name and "/q" not in v.name]

                self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                klgrads = tf.gradients(dist, var_list)
                flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                shapes = [var.get_shape().as_list() for var in var_list]
                start = 0
                tangents = []
                for shape in shapes:
                    var_size = tf_util.intprod(shape)
                    tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                    start += var_size
                gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                fvp = tf_util.flatgrad(gvp, var_list)

                # tf.summary.scalar('entropy_loss', meanent)
                # tf.summary.scalar('policy_gradient_loss', optimgain)
                # tf.summary.scalar('value_function_loss', surrgain)
                # tf.summary.scalar('approximate_kullback-leibler', meankl)
                # tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                self.assign_old_eq_new = \
                    tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                      zipsame(tf_util.get_globals_vars("oldpi"),
                                                              tf_util.get_globals_vars("model"))])
                self.compute_losses = tf_util.function([observation, self.old_policy.obs_ph, self.action,
                                                        self.atarg, self.learning_rate_ph, self.vtarg], losses)
                self.compute_fvp = tf_util.function([flat_tangent, observation, self.old_policy.obs_ph,
                                                     self.action, self.atarg], fvp)
                self.compute_vflossandgrad = tf_util.function([observation, self.old_policy.obs_ph, self.ret,
                                                               self.old_vpred_ph, self.clip_range_vf_ph],
                                                              tf_util.flatgrad(vferr, vf_var_list))

                grads = tf.gradients(-optimgain, var_list)
                grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
                trainer = tf.train.AdamOptimizer(learning_rate=self.outer_learning_rate_ph, epsilon=1e-5)
                # trainer = tf.train.AdamOptimizer(learning_rate=3e-4, epsilon=1e-5)
                grads = list(zip(grads, var_list))
                self._train = trainer.apply_gradients(grads)

                @contextmanager
                def timed(msg):
                    if self.rank == 0 and self.verbose >= 1:
                        # print(colorize(msg, color='magenta'))
                        # start_time = time.time()
                        yield
                        # print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                        #                color='magenta'))
                    else:
                        yield

                def allmean(arr):
                    assert isinstance(arr, np.ndarray)
                    out = np.empty_like(arr)
                    MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                    out /= self.nworkers
                    return out

                tf_util.initialize(sess=self.sess)

                th_init = self.get_flat()
                MPI.COMM_WORLD.Bcast(th_init, root=0)
                self.set_from_flat(th_init)

            with tf.variable_scope("Adam_mpi", reuse=False):
                self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                if self.using_gail or self.using_mdal:
                    self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                    self.d_adam.sync()
                self.vfadam.sync()

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.ret))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                tf.summary.scalar('advantage', tf.reduce_mean(self.atarg))
                tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', self.ret)
                    tf.summary.histogram('learning_rate', self.vf_stepsize)
                    tf.summary.histogram('advantage', self.atarg)
                    tf.summary.histogram('kl_clip_range', self.max_kl)

                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', observation)
                    else:
                        tf.summary.histogram('observation', observation)

            self.timed = timed
            self.allmean = allmean

            self.step = self.policy_pi.step
            self.proba_step = self.policy_pi.proba_step
            self.initial_state = self.policy_pi.initial_state

            self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")

            if self.using_gail:
                self.params.extend(self.reward_giver.get_trainable_variables())

            self.summary = tf.summary.merge_all()

            self.compute_lossandgrad = \
                tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.ret,
                                  self.learning_rate_ph, self.vtarg, self.closed_policy.obs_ph],
                                 [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
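
# --- Illustrative sketch (not part of the original file) ---------------------
# The clipped value loss assembled above (vf_losses1 / vf_losses2 / vferr),
# written in NumPy, assuming plain arrays for the predictions, the old
# predictions, and the empirical returns:
import numpy as np

def clipped_value_loss(vpred, old_vpred, returns, clip_range_vf):
    """Mean of the elementwise max of clipped and unclipped squared value errors."""
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -clip_range_vf, clip_range_vf)
    vf_losses1 = np.square(vpred - returns)
    vf_losses2 = np.square(vpred_clipped - returns)
    return np.mean(np.maximum(vf_losses1, vf_losses2))

# With clip_range_vf=0.2, a prediction that moved 1.0 away from old_vpred is
# also scored as if it had moved only 0.2, and the larger of the two errors
# is the one that counts.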
def _setup_model(self, rank, memory_size, alpha, obs_space, action_space, full_state_space,
                 noise_target_action, **kwargs):
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf_util.single_threaded_session(graph=self.graph)
        if self.use_prioritiy:
            from algorithm.priority_memory import PrioritizedMemory
            self.memory = PrioritizedMemory(capacity=memory_size, alpha=alpha)
        else:
            from algorithm.memory import Memory
            self.memory = Memory(limit=memory_size, action_shape=action_space.shape,
                                 observation_shape=obs_space.shape,
                                 full_state_shape=full_state_space.shape)

        # Define placeholders
        self.observe_Input = tf.placeholder(tf.float32, [None] + list(obs_space.shape), name='observe_Input')
        self.observe_Input_ = tf.placeholder(tf.float32, [None] + list(obs_space.shape), name='observe_Input_')
        self.f_s = tf.placeholder(tf.float32, [None] + list(full_state_space.shape), name='full_state_Input')
        self.f_s_ = tf.placeholder(tf.float32, [None] + list(full_state_space.shape), name='full_state_Input_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
        self.n_step_steps = tf.placeholder(tf.float32, shape=(None, 1), name='n_step_reached')
        self.q_demo = tf.placeholder(tf.float32, [None, 1], name='Q_of_actions_from_memory')
        self.come_from_demo = tf.placeholder(tf.float32, [None, 1], name='Demo_index')
        self.action_memory = tf.placeholder(tf.float32, [None] + list(action_space.shape),
                                            name='actions_from_memory')

        with tf.variable_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=obs_space.shape)
        with tf.variable_scope('state_rms'):
            self.state_rms = RunningMeanStd(shape=full_state_space.shape)

        with tf.name_scope('obs_preprocess'):
            self.normalized_observe_Input = tf.clip_by_value(normalize(self.observe_Input, self.obs_rms), -5., 5.)
            self.normalized_observe_Input_ = tf.clip_by_value(normalize(self.observe_Input_, self.obs_rms), -5., 5.)
        with tf.name_scope('state_preprocess'):
            self.normalized_f_s0 = normalize(self.f_s, self.state_rms)
            self.normalized_f_s1 = normalize(self.f_s_, self.state_rms)

        with tf.variable_scope('Actor'):
            self.action, f_s_predict = self.build_actor(self.normalized_observe_Input, scope='eval',
                                                        trainable=True,
                                                        full_state_dim=full_state_space.shape[0])
            self.action_, _ = self.build_actor(self.normalized_observe_Input_, scope='target', trainable=False,
                                               full_state_dim=full_state_space.shape[0])

            # Target policy smoothing, by adding clipped noise to target actions
            if noise_target_action:
                epsilon = tf.random_normal(tf.shape(self.action_), stddev=0.007)
                epsilon = tf.clip_by_value(epsilon, -0.01, 0.01)
                a2 = self.action_ + epsilon
                noised_action_ = tf.clip_by_value(a2, -1, 1)
            else:
                noised_action_ = self.action_

        with tf.variable_scope('Critic'):
            # Clip the Q values to prevent overestimation.
            self.q_1 = tf.clip_by_value(
                self.build_critic(self.normalized_f_s0, self.action, scope='eval_1', trainable=True),
                self.Q_value_range[0], self.Q_value_range[1])
            q_1_ = self.build_critic(self.normalized_f_s1, noised_action_, scope='target_1', trainable=False)
            if self.use_TD3:
                q_2 = tf.clip_by_value(
                    self.build_critic(self.normalized_f_s0, self.action, scope='eval_2', trainable=True),
                    self.Q_value_range[0], self.Q_value_range[1])
                q_2_ = self.build_critic(self.normalized_f_s1, noised_action_, scope='target_2', trainable=False)

        # Collect network parameters to make them easier to manage.
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_1')
        self.ct1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_1')
        if self.use_TD3:
            self.ce2_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_2')
            self.ct2_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_2')

        with tf.variable_scope('Soft_Update'):
            self.soft_replace_a = [tf.assign(t, (1 - TAU) * t + TAU * e)
                                   for t, e in zip(self.at_params, self.ae_params)]
            self.soft_replace_c = [tf.assign(t, (1 - TAU) * t + TAU * e)
                                   for t, e in zip(self.ct1_params, self.ce1_params)]
            if self.use_TD3:
                self.soft_replace_c += [tf.assign(t, (1 - TAU) * t + TAU * e)
                                        for t, e in zip(self.ct2_params, self.ce2_params)]

        # The critic loss is (one-step TD error + n-step TD error + L2 penalty on the online critic).
        # With TD3 there are four critics in total, so two sets of critic losses are computed.
        with tf.variable_scope('Critic_Lose'):
            if self.use_TD3:
                min_q_ = tf.minimum(q_1_, q_2_)
            else:
                min_q_ = q_1_

            self.q_target = self.R + (1. - self.terminals1) * GAMMA * min_q_
            if self.use_n_step:
                self.n_step_target_q = self.R + (1. - self.terminals1) * tf.pow(GAMMA, self.n_step_steps) * min_q_
                cliped_n_step_target_q = tf.clip_by_value(self.n_step_target_q,
                                                          self.Q_value_range[0], self.Q_value_range[1])

            cliped_q_target = tf.clip_by_value(self.q_target, self.Q_value_range[0], self.Q_value_range[1])

            self.td_error_1 = tf.abs(cliped_q_target - self.q_1)
            if self.use_TD3:
                self.td_error_2 = tf.abs(cliped_q_target - q_2)
            if self.use_n_step:
                self.nstep_td_error_1 = tf.abs(cliped_n_step_target_q - self.q_1)
                if self.use_TD3:
                    self.nstep_td_error_2 = tf.abs(cliped_n_step_target_q - q_2)

            L2_regular_1 = tf.contrib.layers.apply_regularization(tf.contrib.layers.l2_regularizer(0.001),
                                                                  weights_list=self.ce1_params)
            if self.use_TD3:
                L2_regular_2 = tf.contrib.layers.apply_regularization(tf.contrib.layers.l2_regularizer(0.001),
                                                                      weights_list=self.ce2_params)

            one_step_losse_1 = tf.reduce_mean(
                tf.multiply(self.ISWeights, tf.square(self.td_error_1))) * self.lambda_1_step
            if self.use_TD3:
                one_step_losse_2 = tf.reduce_mean(
                    tf.multiply(self.ISWeights, tf.square(self.td_error_2))) * self.lambda_1_step

            if self.use_n_step:
                n_step_td_losses_1 = tf.reduce_mean(
                    tf.multiply(self.ISWeights, tf.square(self.nstep_td_error_1))) * self.lambda_n_step
                c_loss_1 = one_step_losse_1 + n_step_td_losses_1 + L2_regular_1
                if self.use_TD3:
                    n_step_td_losses_2 = tf.reduce_mean(
                        tf.multiply(self.ISWeights, tf.square(self.nstep_td_error_2))) * self.lambda_n_step
                    c_loss_2 = one_step_losse_2 + n_step_td_losses_2 + L2_regular_2
            else:
                c_loss_1 = one_step_losse_1 + L2_regular_1
                if self.use_TD3:
                    c_loss_2 = one_step_losse_2 + L2_regular_2

        # The actor loss maximizes q(s, a) and minimizes the behavior-cloning error.
        # (The cloning loss applies only to demo transitions where the demo action gets a higher
        # q_1(s, a) than the action produced by the actor.)
        with tf.variable_scope('Actor_lose'):
            Is_worse_than_demo = self.q_1 < self.q_demo
            Is_worse_than_demo = tf.cast(Is_worse_than_demo, tf.float32)
            worse_than_demo = tf.cast(tf.reduce_sum(Is_worse_than_demo), tf.int8)

            # For the action error a sum of squares is used; a mean (reduce_mean) would also work,
            # since the actions are already small numbers.
            action_diffs = Is_worse_than_demo * tf.reduce_sum(
                self.come_from_demo * tf.square(self.action - self.action_memory), 1, keepdims=True)

            L_BC = self.LAMBDA_BC * tf.reduce_sum(action_diffs)
            auxiliary_predict_loss = self.LAMBDA_predict * tf.reduce_mean(tf.square(f_s_predict - self.f_s))
            a_loss = -tf.reduce_mean(self.q_1) + L_BC + auxiliary_predict_loss

        # Set up optimizers for the actor and the critics.
        with tf.variable_scope('Critic_Optimizer'):
            if self.use_TD3:
                self.critic_grads_1 = tf_util.flatgrad(loss=c_loss_1, var_list=self.ce1_params)
                self.critic_grads_2 = tf_util.flatgrad(loss=c_loss_2, var_list=self.ce2_params)
                self.critic_optimizer_1 = MpiAdam(var_list=self.ce1_params, beta1=0.9, beta2=0.999, epsilon=1e-08)
                self.critic_optimizer_2 = MpiAdam(var_list=self.ce2_params, beta1=0.9, beta2=0.999, epsilon=1e-08)
            else:
                self.critic_grads = tf_util.flatgrad(loss=c_loss_1, var_list=self.ce1_params)
                self.critic_optimizer = MpiAdam(var_list=self.ce1_params, beta1=0.9, beta2=0.999, epsilon=1e-08)
        with tf.variable_scope('Actor_Optimizer'):
            self.actor_grads = tf_util.flatgrad(a_loss, self.ae_params)
            self.actor_optimizer = MpiAdam(var_list=self.ae_params, beta1=0.9, beta2=0.999, epsilon=1e-08)

        with self.sess.as_default():
            self._initialize(self.sess)

        # Saving the model
        var_list = tf.global_variables()
        print("var_list:")
        for v in var_list:
            print(v)
        self.saver = tf.train.Saver(var_list=var_list, max_to_keep=1)
        self.writer = tf.summary.FileWriter("logs/" + self.experiment_name + "/DDPG_" + str(rank), self.graph)

        # TensorBoard summary
        self.a_summary = tf.summary.merge([
            tf.summary.scalar('a_loss', a_loss, family='actor'),
            tf.summary.scalar('L_BC', L_BC, family='actor'),
            tf.summary.scalar('worse_than_demo', worse_than_demo, family='actor'),
            tf.summary.scalar('auxiliary_predict_loss', auxiliary_predict_loss, family='actor')
        ])

        if self.use_TD3:
            self.c_summary = tf.summary.merge([
                tf.summary.scalar('c_loss_1', c_loss_1, family='critic'),
                tf.summary.scalar('c_loss_2', c_loss_2, family='critic')
            ])
        else:
            self.c_summary = tf.summary.merge([tf.summary.scalar('c_loss_1', c_loss_1, family='critic')])

        # episode summary
        self.episode_cumulate_reward = tf.placeholder(tf.float32, name='episode_cumulate_reward')
        self.episoed_length = tf.placeholder(tf.int16, name='episode_length')
        self.success_or_not = tf.placeholder(tf.int8, name='success_or_not')
        self.eval_episode_cumulate_reward = tf.placeholder(tf.float32, name='eval_episode_cumulate_reward')
        self.eval_episoed_length = tf.placeholder(tf.int16, name='eval_episode_length')
        self.eval_success_or_not = tf.placeholder(tf.int8, name='eval_success_or_not')

        self.episode_summary = tf.summary.merge([
            tf.summary.scalar('episode_cumulate_reward', self.episode_cumulate_reward, family='episoed_result'),
            tf.summary.scalar('episoed_length', self.episoed_length, family='episoed_result'),
            tf.summary.scalar('success_or_not', self.success_or_not, family='episoed_result'),
        ])
        self.eval_episode_summary = tf.summary.merge([
            tf.summary.scalar('eval_episode_cumulate_reward', self.eval_episode_cumulate_reward,
                              family='Eval_episoed_result'),
            tf.summary.scalar('eval_episoed_length', self.eval_episoed_length, family='Eval_episoed_result'),
            tf.summary.scalar('eval_success_or_not', self.eval_success_or_not, family='Eval_episoed_result'),
        ])
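
# --- Illustrative sketch (not part of the original file) ---------------------
# The Soft_Update scope above implements Polyak averaging,
# target <- (1 - TAU) * target + TAU * online. A NumPy equivalent, assuming
# parameters are given as lists of arrays:
import numpy as np

def soft_update(target_params, online_params, tau=0.005):
    """Move each target tensor a small step toward its online counterpart."""
    return [(1.0 - tau) * t + tau * e for t, e in zip(target_params, online_params)]

# soft_update([np.zeros(2)], [np.ones(2)], tau=0.1) -> [array([0.1, 0.1])]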