Example #1
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    # Target advantage function (if applicable)
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                    # Empirical return
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.placeholder(name='lrmult',
                                            dtype=tf.float32,
                                            shape=[])

                    # Annealed clipping parameter epsilon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action_ph) -
                        old_pi.proba_distribution.logp(action_ph))

                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.summary.scalar('entropy_loss', pol_entpen)
                    tf.summary.scalar('policy_gradient_loss', pol_surr)
                    tf.summary.scalar('value_function_loss', vf_loss)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('clip_factor', clip_param)
                    tf.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.optim_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_param))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate',
                                             self.optim_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('clip_range', self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', obs_ph)
                        else:
                            tf.summary.histogram('observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
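
The loss scope above builds PPO's pessimistic clipped surrogate (L^CLIP) from the probability ratio, the advantage placeholder atarg, and the annealed clip_param. For reference, a minimal NumPy sketch of the same objective, with made-up inputs that are not taken from the example:

import numpy as np

def ppo_clip_objective(logp_new, logp_old, advantages, clip_param):
    """Pessimistic clipped surrogate: mean(min(r * A, clip(r, 1 - eps, 1 + eps) * A))."""
    ratio = np.exp(logp_new - logp_old)  # pnew / pold
    surr1 = ratio * advantages
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return np.mean(np.minimum(surr1, surr2))  # the graph negates this to obtain pol_surr

# Illustrative values only.
logp_new = np.array([-0.9, -1.2, -0.5])
logp_old = np.array([-1.0, -1.0, -1.0])
advantages = np.array([0.5, -0.3, 1.1])
print(ppo_clip_objective(logp_new, logp_old, advantages, clip_param=0.2))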
Example #2
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log,
          expert_path, pretrain, pretrain_epochs, mdpo_update_steps,
          num_trajectories, expert_model, exploration_bonus, bonus_coef,
          random_action_len, is_action_features, dir_name, neural, lipschitz,
          args):
    """
    Train TRPO model for the mujoco environment, for testing purposes
    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/'\
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)

        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
                file.write("Experiment Arguments:")
                for key, val in args.items():
                    print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        # env = make_mujoco_env(env_id, workerseed)
        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        env = DummyVecEnv([make_env])
        # env = VecNormalize(env)

        train = (algo == 'Train')
        eval = (algo == 'Evaluate')

        if train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy',
                            env_id,
                            verbose=1,
                            buffer_size=1000000,
                            batch_size=256,
                            ent_coef='auto',
                            train_freq=1,
                            tau=0.01,
                            gradient_steps=1,
                            learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif eval:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=10,
                                 evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path,
                                    traj_limitation=10,
                                    verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/mdal/",
                                      seed=seed,
                                      buffer_size=1000000,
                                      ent_coef=0.0,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      d_step=10,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      neural=neural,
                                      lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy',
                                     env,
                                     dataset,
                                     verbose=1,
                                     timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" +
                                     env_name + "/mdal_mdpo_on/",
                                     seed=seed,
                                     max_kl=0.01,
                                     cg_iters=10,
                                     cg_damping=0.1,
                                     entcoeff=0.0,
                                     adversary_entcoeff=0.001,
                                     gamma=0.99,
                                     lam=0.95,
                                     vf_iters=5,
                                     vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps,
                                     klcoeff=1.0,
                                     method="multistep-SGD",
                                     tsallis_q=1.0,
                                     t_pi=t_pi,
                                     t_c=t_c,
                                     exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef,
                                     is_action_features=is_action_features,
                                     neural=neural)

            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy',
                                  env,
                                  dataset,
                                  verbose=1,
                                  tensorboard_log="./experiments/" + env_name +
                                  "/mdal_trpo/",
                                  seed=seed,
                                  gamma=0.99,
                                  g_step=3,
                                  d_step=5,
                                  sgd_steps=1,
                                  d_stepsize=9e-5,
                                  entcoeff=0.0,
                                  adversary_entcoeff=0.001,
                                  max_kl=t_pi,
                                  t_pi=t_pi,
                                  t_c=t_c,
                                  exploration_bonus=exploration_bonus,
                                  bonus_coef=bonus_coef,
                                  is_action_features=is_action_features,
                                  neural=neural,
                                  lam=0.98,
                                  timesteps_per_batch=2000,
                                  lipschitz=lipschitz)

            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL

                model = GAIL('MlpPolicy',
                             env,
                             dataset,
                             verbose=1,
                             tensorboard_log="./experiments/" + env_name +
                             "/gail/",
                             seed=seed,
                             entcoeff=0.0,
                             adversary_entcoeff=0.001,
                             lipschitz=lipschitz)

            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF

                model = GAIL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/gail_mdpo_off/",
                                      seed=seed,
                                      ent_coef=0.0,
                                      adversary_entcoeff=0.001,
                                      buffer_size=1000000,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
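
For reference, the log directory and run name at the top of train() are built by plain string concatenation of the CLI arguments. A short sketch with hypothetical argument values (the optional exploration, pretrain, and states-only suffixes are omitted) shows the resulting path:

# Hypothetical argument values, for illustration only.
env_id, algo, t_pi, t_c, lam = 'Hopper-v2', 'MDAL', 0.5, 0.1, 0.0
mdpo_update_steps, seed, dir_name = 10, 0, 'run1'

env_name = env_id[:-3].lower()  # 'hopper'
log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/' \
          + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
log_dir += '_' + dir_name + '/'
log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps) + '_s' + str(seed)

print(log_dir + log_name)
# ./experiments/hopper/mdal/tpi0.5_tc0.1_lam0.0_run1/MDAL_updateSteps10_s0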
Example #3
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))

                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                                   old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg],
                                                        fvp)
                    self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                                                                  tf_util.flatgrad(vferr, vf_var_list))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                           color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
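
The fvp tensor above uses the usual Hessian-vector-product trick: rather than materializing the KL Hessian (the Fisher matrix at the old policy), it differentiates the inner product of the KL gradient with a tangent vector. A standalone NumPy sketch on a toy quadratic, where the Hessian is known exactly, illustrates the identity; it is independent of the TRPO code itself:

import numpy as np

# f(x) = 0.5 * x^T A x, so grad f = A x and the Hessian is A.
A = np.array([[2.0, 0.3],
              [0.3, 1.0]])
x = np.array([0.5, -1.0])
v = np.array([1.0, 2.0])  # tangent vector, the analogue of flat_tangent

grad = A @ x              # analogue of klgrads
gvp = grad @ v            # analogue of gvp = sum(grad * tangent)

# Differentiating gvp w.r.t. x gives A v, i.e. the Hessian-vector product,
# without ever forming the Hessian explicitly.
hvp_analytic = A @ v

# Finite-difference check of d(gvp)/dx:
eps = 1e-6
hvp_numeric = np.array([((A @ (x + eps * e)) @ v - gvp) / eps for e in np.eye(2)])
print(hvp_analytic, hvp_numeric)  # both approximately [2.6, 2.3]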
Example #4
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert isinstance(self.action_space, gym.spaces.Box), \
                "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space)
            assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \
                                                        "an instance of DDPGPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                self.memory = self.memory_policy(limit=self.memory_limit, action_shape=self.action_space.shape,
                                                 observation_shape=self.observation_space.shape)

                with tf.variable_scope("input", reuse=False):
                    # Observation normalization.
                    if self.normalize_observations:
                        with tf.variable_scope('obs_rms'):
                            self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
                    else:
                        self.obs_rms = None

                    # Return normalization.
                    if self.normalize_returns:
                        with tf.variable_scope('ret_rms'):
                            self.ret_rms = RunningMeanStd()
                    else:
                        self.ret_rms = None

                    self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None)

                    # Create target networks.
                    self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None)
                    self.obs_target = self.target_policy.obs_ph
                    self.action_target = self.target_policy.action_ph

                    normalized_obs0 = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms),
                                                       self.observation_range[0], self.observation_range[1])
                    normalized_obs1 = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms),
                                                       self.observation_range[0], self.observation_range[1])

                    if self.param_noise is not None:
                        # Configure perturbed actor.
                        self.param_noise_actor = self.policy(self.sess, self.observation_space, self.action_space, 1, 1,
                                                             None)
                        self.obs_noise = self.param_noise_actor.obs_ph
                        self.action_noise_ph = self.param_noise_actor.action_ph

                        # Configure separate copy for stddev adaptation.
                        self.adaptive_param_noise_actor = self.policy(self.sess, self.observation_space,
                                                                      self.action_space, 1, 1, None)
                        self.obs_adapt_noise = self.adaptive_param_noise_actor.obs_ph
                        self.action_adapt_noise = self.adaptive_param_noise_actor.action_ph

                    # Inputs.
                    self.obs_train = self.policy_tf.obs_ph
                    self.action_train_ph = self.policy_tf.action_ph
                    self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
                    self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions')
                    self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
                    self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

                # Create networks and core TF parts that are shared across setup parts.
                with tf.variable_scope("model", reuse=False):
                    self.actor_tf = self.policy_tf.make_actor(normalized_obs0)
                    self.normalized_critic_tf = self.policy_tf.make_critic(normalized_obs0, self.actions)
                    self.normalized_critic_with_actor_tf = self.policy_tf.make_critic(normalized_obs0,
                                                                                      self.actor_tf,
                                                                                      reuse=True)
                # Noise setup
                if self.param_noise is not None:
                    self._setup_param_noise(normalized_obs0)

                with tf.variable_scope("target", reuse=False):
                    critic_target = self.target_policy.make_critic(normalized_obs1,
                                                                   self.target_policy.make_actor(normalized_obs1))

                with tf.variable_scope("loss", reuse=False):
                    self.critic_tf = denormalize(
                        tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]),
                        self.ret_rms)

                    self.critic_with_actor_tf = denormalize(
                        tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                         self.return_range[0], self.return_range[1]),
                        self.ret_rms)

                    q_obs1 = denormalize(critic_target, self.ret_rms)
                    self.target_q = self.rewards + (1. - self.terminals1) * self.gamma * q_obs1

                    tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target))
                    tf.summary.histogram('critic_target', self.critic_target)

                    # Set up parts.
                    if self.normalize_returns and self.enable_popart:
                        self._setup_popart()
                    self._setup_stats()
                    self._setup_target_network_updates()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('rewards', tf.reduce_mean(self.rewards))
                    tf.summary.histogram('rewards', self.rewards)
                    tf.summary.scalar('param_noise_stddev', tf.reduce_mean(self.param_noise_stddev))
                    tf.summary.histogram('param_noise_stddev', self.param_noise_stddev)
                    if len(self.observation_space.shape) == 3 and self.observation_space.shape[0] in [1, 3, 4]:
                        tf.summary.image('observation', self.obs_train)
                    else:
                        tf.summary.histogram('observation', self.obs_train)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self._setup_actor_optimizer()
                    self._setup_critic_optimizer()
                    tf.summary.scalar('actor_loss', self.actor_loss)
                    tf.summary.scalar('critic_loss', self.critic_loss)

                self.params = find_trainable_variables("model")
                self.target_params = find_trainable_variables("target")

                with self.sess.as_default():
                    self._initialize(self.sess)

                self.summary = tf.summary.merge_all()
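
The target_q tensor above is the standard one-step Bellman target r + (1 - done) * gamma * Q'(s', mu'(s')), built from the target critic and target actor. A minimal NumPy sketch with made-up batch values, not tied to the DDPG class:

import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.0], [0.5]])     # shape (batch, 1), like self.rewards
terminals1 = np.array([[0.0], [1.0], [0.0]])  # 1.0 where the episode ended
q_obs1 = np.array([[10.0], [3.0], [-2.0]])    # target critic at (s', mu_target(s'))

target_q = rewards + (1.0 - terminals1) * gamma * q_obs1
print(target_q)  # terminal transitions do not bootstrap: the second row stays 0.0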
Example #5
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                # Construct network for new policy
                with tf.variable_scope("pi", reuse=False):
                    self.policy_pi = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 self.n_envs,
                                                 1,
                                                 None,
                                                 reuse=False)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False)

                # Target advantage function (if applicable)
                atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                # Empirical return
                ret = tf.placeholder(dtype=tf.float32, shape=[None])

                # learning rate multiplier, updated with schedule
                lrmult = tf.placeholder(name='lrmult',
                                        dtype=tf.float32,
                                        shape=[])

                # Annealed clipping parameter epsilon
                clip_param = self.clip_param * lrmult

                obs_ph = self.policy_pi.obs_ph
                action_ph = self.policy_pi.pdtype.sample_placeholder([None])

                kloldnew = old_pi.proba_distribution.kl(
                    self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(kloldnew)
                meanent = tf.reduce_mean(ent)
                pol_entpen = (-self.entcoeff) * meanent

                # pnew / pold
                ratio = tf.exp(
                    self.policy_pi.proba_distribution.logp(action_ph) -
                    old_pi.proba_distribution.logp(action_ph))

                # surrogate from conservative policy iteration
                surr1 = ratio * atarg
                surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                         1.0 + clip_param) * atarg

                # PPO's pessimistic surrogate (L^CLIP)
                pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                vf_loss = tf.reduce_mean(
                    tf.square(self.policy_pi.value_fn[:, 0] - ret))
                total_loss = pol_surr + pol_entpen + vf_loss
                losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                self.loss_names = [
                    "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                ]

                self.params = tf_util.get_trainable_vars("pi")
                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses + [tf_util.flatgrad(total_loss, self.params)])
                self.adam = MpiAdam(self.params,
                                    epsilon=self.adam_epsilon,
                                    sess=self.sess)

                self.assign_old_eq_new = tf_util.function(
                    [], [],
                    updates=[
                        tf.assign(oldv, newv) for (
                            oldv,
                            newv) in zipsame(tf_util.get_globals_vars("oldpi"),
                                             tf_util.get_globals_vars("pi"))
                    ])
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)
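
Both PPO graphs above take a scalar lrmult placeholder, so the effective clip range clip_param * lrmult is annealed together with the learning rate. The schedule itself is defined outside this method; the sketch below assumes a simple linear decay purely for illustration:

# Illustrative linear annealing of the lrmult fed into the graphs above (assumed schedule).
clip_param0 = 0.2
total_timesteps = 1_000_000

def lrmult(timesteps_so_far):
    """Decay linearly from 1.0 to 0.0 over training."""
    return max(1.0 - float(timesteps_so_far) / total_timesteps, 0.0)

for t in (0, 250_000, 500_000, 1_000_000):
    print(t, clip_param0 * lrmult(t))  # 0.2, 0.15, 0.1, 0.0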
Example #6
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier
        from stable_baselines.mdal.adversary import TabularAdversaryTF, NeuralAdversaryTRPO


        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the MDPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)
                # self._setup_learn(self.seed)
                self._setup_learn()

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)
                elif self.using_mdal:
                    if self.neural:
                        self.reward_giver = NeuralAdversaryTRPO(self.sess, self.observation_space, self.action_space,
                                                                self.hidden_size_adversary,
                                                                entcoeff=self.adversary_entcoeff)
                    else:
                        self.reward_giver = TabularAdversaryTF(self.sess, self.observation_space, self.action_space,
                                                                 self.hidden_size_adversary,
                                                                 entcoeff=self.adversary_entcoeff,
                                                                 expert_features=self.expert_dataset.successor_features,
                                                                 exploration_bonus=self.exploration_bonus,
                                                                 bonus_coef=self.bonus_coef,
                                                                 t_c=self.t_c,
                                                                 is_action_features=self.is_action_features)
                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    self.old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for fitting closed form
                with tf.variable_scope("closedpi", reuse=False):
                    self.closed_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    self.vtarg = tf.placeholder(dtype=tf.float32, shape=[None])
                    self.ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="learning_rate_ph")
                    self.outer_learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="outer_learning_rate_ph")
                    self.old_vpred_ph = tf.placeholder(dtype=tf.float32, shape=[None], name="old_vpred_ph")
                    self.clip_range_vf_ph = tf.placeholder(dtype=tf.float32, shape=[], name="clip_range_ph")

                    observation = self.policy_pi.obs_ph
                    self.action = self.policy_pi.pdtype.sample_placeholder([None])

                    if self.tsallis_q == 1.0:
                        kloldnew = self.policy_pi.proba_distribution.kl(self.old_policy.proba_distribution)
                        ent = self.policy_pi.proba_distribution.entropy()
                        meankl = tf.reduce_mean(kloldnew)

                    else:
                        logp_pi = self.policy_pi.proba_distribution.logp(self.action)
                        logp_pi_old =  self.old_policy.proba_distribution.logp(self.action)
                        ent = self.policy_pi.proba_distribution.entropy()
                        #kloldnew = self.policy_pi.proba_distribution.kl_tsallis(self.old_policy.proba_distribution, self.tsallis_q)
                        tsallis_q = 2.0 - self.tsallis_q
                        meankl = tf.reduce_mean(tf_log_q(tf.exp(logp_pi), tsallis_q) - tf_log_q(tf.exp(logp_pi_old), tsallis_q)) #tf.reduce_mean(kloldnew)

                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    if self.cliprange_vf is None:
                        vpred_clipped = self.policy_pi.value_flat
                    else:
                        vpred_clipped = self.old_vpred_ph + \
                            tf.clip_by_value(self.policy_pi.value_flat - self.old_vpred_ph,
                                             - self.clip_range_vf_ph, self.clip_range_vf_ph)

                    vf_losses1 = tf.square(self.policy_pi.value_flat - self.ret)
                    vf_losses2 = tf.square(vpred_clipped - self.ret)
                    vferr = tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(self.action) -
                                   self.old_policy.proba_distribution.logp(self.action))

                    if self.method == "multistep-SGD":
                        surrgain = tf.reduce_mean(ratio * self.atarg) - meankl / self.learning_rate_ph
                    elif self.method == "closedreverse-KL":
                        surrgain = tf.reduce_mean(tf.exp(self.atarg) * self.policy_pi.proba_distribution.logp(self.action))
                    else:
                        policygain = tf.reduce_mean(tf.exp(self.atarg) * tf.log(self.closed_policy.proba_distribution.mean))
                        surrgain = tf.reduce_mean(ratio * self.atarg) - tf.reduce_mean(self.learning_rate_ph * ratio * self.policy_pi.proba_distribution.logp(self.action))

                    optimgain = surrgain #+ entbonus - self.learning_rate_ph * meankl
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]
                    print("policy vars", var_list)

                    all_closed_var_list = tf_util.get_trainable_vars("closedpi")
                    closed_var_list = [v for v in all_closed_var_list if "/vf" not in v.name and "/q" not in v.name]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    # tf.summary.scalar('entropy_loss', meanent)
                    # tf.summary.scalar('policy_gradient_loss', optimgain)
                    # tf.summary.scalar('value_function_loss', surrgain)
                    # tf.summary.scalar('approximate_kullback-leibler', meankl)
                    # tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.learning_rate_ph, self.vtarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, self.old_policy.obs_ph, self.action, self.atarg],
                                                        fvp)
                    self.compute_vflossandgrad = tf_util.function([observation, self.old_policy.obs_ph, self.ret, self.old_vpred_ph, self.clip_range_vf_ph],
                                                                  tf_util.flatgrad(vferr, vf_var_list))

                    grads = tf.gradients(-optimgain, var_list)
                    grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
                    trainer = tf.train.AdamOptimizer(learning_rate=self.outer_learning_rate_ph, epsilon=1e-5)
                    # trainer = tf.train.AdamOptimizer(learning_rate=3e-4, epsilon=1e-5)
                    grads = list(zip(grads, var_list))
                    self._train = trainer.apply_gradients(grads)

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            # print(colorize(msg, color='magenta'))
                            # start_time = time.time()
                            yield
                            # print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                            #                color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail or self.using_mdal:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.ret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(self.atarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', self.ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', self.atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.ret, self.learning_rate_ph, self.vtarg, self.closed_policy.obs_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
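
When cliprange_vf is set, the value loss above clips the new value prediction to stay within clip_range_vf of the old prediction and takes the larger of the two squared errors. A standalone NumPy sketch of that computation, with sample numbers only:

import numpy as np

def clipped_value_loss(vpred, old_vpred, returns, clip_range_vf):
    """max of the unclipped and clipped squared errors, as in vf_losses1/vf_losses2 above."""
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -clip_range_vf, clip_range_vf)
    vf_losses1 = np.square(vpred - returns)
    vf_losses2 = np.square(vpred_clipped - returns)
    return np.mean(np.maximum(vf_losses1, vf_losses2))

vpred = np.array([1.5, 0.2, -0.4])
old_vpred = np.array([1.0, 0.0, 0.0])
returns = np.array([1.2, 0.1, 0.3])
print(clipped_value_loss(vpred, old_vpred, returns, clip_range_vf=0.2))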
Example #7
    def _setup_model(self, rank, memory_size, alpha, obs_space, action_space,
                     full_state_space, noise_target_action, **kwargs):

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.single_threaded_session(graph=self.graph)
            if self.use_prioritiy:
                from algorithm.priority_memory import PrioritizedMemory
                self.memory = PrioritizedMemory(capacity=memory_size,
                                                alpha=alpha)
            else:
                from algorithm.memory import Memory
                self.memory = Memory(limit=memory_size,
                                     action_shape=action_space.shape,
                                     observation_shape=obs_space.shape,
                                     full_state_shape=full_state_space.shape)
            # Define placeholders
            self.observe_Input = tf.placeholder(tf.float32,
                                                [None] + list(obs_space.shape),
                                                name='observe_Input')
            self.observe_Input_ = tf.placeholder(tf.float32, [None] +
                                                 list(obs_space.shape),
                                                 name='observe_Input_')
            self.f_s = tf.placeholder(tf.float32,
                                      [None] + list(full_state_space.shape),
                                      name='full_state_Input')
            self.f_s_ = tf.placeholder(tf.float32,
                                       [None] + list(full_state_space.shape),
                                       name='full_state_Input_')
            self.R = tf.placeholder(tf.float32, [None, 1], 'r')
            self.terminals1 = tf.placeholder(tf.float32,
                                             shape=(None, 1),
                                             name='terminals1')
            self.ISWeights = tf.placeholder(tf.float32, [None, 1],
                                            name='IS_weights')
            self.n_step_steps = tf.placeholder(tf.float32,
                                               shape=(None, 1),
                                               name='n_step_reached')
            self.q_demo = tf.placeholder(tf.float32, [None, 1],
                                         name='Q_of_actions_from_memory')
            self.come_from_demo = tf.placeholder(tf.float32, [None, 1],
                                                 name='Demo_index')
            self.action_memory = tf.placeholder(tf.float32, [None] +
                                                list(action_space.shape),
                                                name='actions_from_memory')

            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=obs_space.shape)

            with tf.variable_scope('state_rms'):
                self.state_rms = RunningMeanStd(shape=full_state_space.shape)

            with tf.name_scope('obs_preprocess'):
                self.normalized_observe_Input = tf.clip_by_value(
                    normalize(self.observe_Input, self.obs_rms), -5., 5.)
                self.normalized_observe_Input_ = tf.clip_by_value(
                    normalize(self.observe_Input_, self.obs_rms), -5., 5.)

            with tf.name_scope('state_preprocess'):
                self.normalized_f_s0 = normalize(self.f_s, self.state_rms)
                self.normalized_f_s1 = normalize(self.f_s_, self.state_rms)

            with tf.variable_scope('Actor'):
                self.action, f_s_predict = self.build_actor(
                    self.normalized_observe_Input,
                    scope='eval',
                    trainable=True,
                    full_state_dim=full_state_space.shape[0])
                self.action_, _ = self.build_actor(
                    self.normalized_observe_Input_,
                    scope='target',
                    trainable=False,
                    full_state_dim=full_state_space.shape[0])

                # Target policy smoothing, by adding clipped noise to target actions
                if noise_target_action:
                    epsilon = tf.random_normal(tf.shape(self.action_),
                                               stddev=0.007)
                    epsilon = tf.clip_by_value(epsilon, -0.01, 0.01)
                    a2 = self.action_ + epsilon
                    noised_action_ = tf.clip_by_value(a2, -1, 1)
                else:
                    noised_action_ = self.action_

            with tf.variable_scope('Critic'):
                # Clip the Q values to prevent overestimation.
                self.q_1 = tf.clip_by_value(
                    self.build_critic(self.normalized_f_s0,
                                      self.action,
                                      scope='eval_1',
                                      trainable=True), self.Q_value_range[0],
                    self.Q_value_range[1])

                q_1_ = self.build_critic(self.normalized_f_s1,
                                         noised_action_,
                                         scope='target_1',
                                         trainable=False)

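                # TD3: build a second online/target critic pair; the Bellman target
                # below takes the minimum of the two target critics to curb overestimation.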
                if self.use_TD3:
                    q_2 = tf.clip_by_value(
                        self.build_critic(self.normalized_f_s0,
                                          self.action,
                                          scope='eval_2',
                                          trainable=True),
                        self.Q_value_range[0], self.Q_value_range[1])

                    q_2_ = self.build_critic(self.normalized_f_s1,
                                             noised_action_,
                                             scope='target_2',
                                             trainable=False)

            # Collect network parameters so they are easier to manage.
            self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope='Actor/eval')
            self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope='Actor/target')
            self.ce1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope='Critic/eval_1')
            self.ct1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope='Critic/target_1')

            if self.use_TD3:
                self.ce2_params = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_2')
                self.ct2_params = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_2')

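            # Polyak (soft) target updates: target <- (1 - TAU) * target + TAU * online.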
            with tf.variable_scope('Soft_Update'):
                self.soft_replace_a = [
                    tf.assign(t, (1 - TAU) * t + TAU * e)
                    for t, e in zip(self.at_params, self.ae_params)
                ]
                self.soft_replace_c = [
                    tf.assign(t, (1 - TAU) * t + TAU * e)
                    for t, e in zip(self.ct1_params, self.ce1_params)
                ]
                if self.use_TD3:
                    self.soft_replace_c += [
                        tf.assign(t, (1 - TAU) * t + TAU * e)
                        for t, e in zip(self.ct2_params, self.ce2_params)
                    ]

            # The critic loss is the sum of the one-step TD loss, the n-step TD loss
            # (if enabled), and an L2 penalty on the online critic weights.
            # With TD3 there are four critics in total, so two critic losses are computed.
            with tf.variable_scope('Critic_Loss'):
                if self.use_TD3:
                    min_q_ = tf.minimum(q_1_, q_2_)
                else:
                    min_q_ = q_1_

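                # One-step TD target: r + (1 - done) * GAMMA * Q'(s', a')
                # (min over the two target critics when TD3 is enabled).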
                self.q_target = self.R + (1. -
                                          self.terminals1) * GAMMA * min_q_
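                # N-step TD target: bootstrap with GAMMA ** n, where n is the number of
                # environment steps actually taken before the bootstrap state.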
                if self.use_n_step:
                    self.n_step_target_q = self.R + (
                        1. - self.terminals1) * tf.pow(
                            GAMMA, self.n_step_steps) * min_q_
                    clipped_n_step_target_q = tf.clip_by_value(
                        self.n_step_target_q, self.Q_value_range[0],
                        self.Q_value_range[1])

                clipped_q_target = tf.clip_by_value(self.q_target,
                                                    self.Q_value_range[0],
                                                    self.Q_value_range[1])

                self.td_error_1 = tf.abs(clipped_q_target - self.q_1)
                if self.use_TD3:
                    self.td_error_2 = tf.abs(clipped_q_target - q_2)

                if self.use_n_step:
                    self.nstep_td_error_1 = tf.abs(clipped_n_step_target_q -
                                                   self.q_1)
                    if self.use_TD3:
                        self.nstep_td_error_2 = tf.abs(clipped_n_step_target_q -
                                                       q_2)

                L2_regular_1 = tf.contrib.layers.apply_regularization(
                    tf.contrib.layers.l2_regularizer(0.001),
                    weights_list=self.ce1_params)
                if self.use_TD3:
                    L2_regular_2 = tf.contrib.layers.apply_regularization(
                        tf.contrib.layers.l2_regularizer(0.001),
                        weights_list=self.ce2_params)

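                # One-step and n-step critic losses, weighted per sample by ISWeights
                # and scaled by lambda_1_step / lambda_n_step respectively.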
                one_step_loss_1 = tf.reduce_mean(
                    tf.multiply(self.ISWeights, tf.square(
                        self.td_error_1))) * self.lambda_1_step
                if self.use_TD3:
                    one_step_loss_2 = tf.reduce_mean(
                        tf.multiply(self.ISWeights, tf.square(
                            self.td_error_2))) * self.lambda_1_step

                if self.use_n_step:
                    n_step_td_losses_1 = tf.reduce_mean(
                        tf.multiply(
                            self.ISWeights, tf.square(
                                self.nstep_td_error_1))) * self.lambda_n_step
                    c_loss_1 = one_step_loss_1 + n_step_td_losses_1 + L2_regular_1

                    if self.use_TD3:
                        n_step_td_losses_2 = tf.reduce_mean(
                            tf.multiply(self.ISWeights,
                                        tf.square(self.nstep_td_error_2))
                        ) * self.lambda_n_step
                        c_loss_2 = one_step_loss_2 + n_step_td_losses_2 + L2_regular_2
                else:
                    c_loss_1 = one_step_loss_1 + L2_regular_1

                    if self.use_TD3:
                        c_loss_2 = one_step_loss_2 + L2_regular_2

            # The actor loss maximizes q_1(s, a) while minimizing a behavior-cloning error.
            # (The cloning error applies only to demo transitions, and only when the demo
            # action gets a higher q_1(s, a) than the action produced by the actor.)
            with tf.variable_scope('Actor_loss'):
                Is_worse_than_demo = self.q_1 < self.q_demo
                Is_worse_than_demo = tf.cast(Is_worse_than_demo, tf.float32)
                worse_than_demo = tf.cast(tf.reduce_sum(Is_worse_than_demo),
                                          tf.int8)

                # The action error is a sum of squares; a mean (reduce_mean) would also
                # work, since the actions here are already small values.
                action_diffs = Is_worse_than_demo * tf.reduce_sum(
                    self.come_from_demo *
                    tf.square(self.action - self.action_memory),
                    1,
                    keepdims=True)

                L_BC = self.LAMBDA_BC * tf.reduce_sum(action_diffs)
                auxiliary_predict_loss = self.LAMBDA_predict * tf.reduce_mean(
                    tf.square(f_s_predict - self.f_s))
                a_loss = -tf.reduce_mean(
                    self.q_1) + L_BC + auxiliary_predict_loss

            # Setting optimizer for Actor and Critic
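            # tf_util.flatgrad returns the gradient of the loss w.r.t. the given
            # variables flattened into a single vector; MpiAdam applies Adam updates
            # to that flat gradient (averaged across workers when running under MPI).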
            with tf.variable_scope('Critic_Optimizer'):
                if self.use_TD3:
                    self.critic_grads_1 = tf_util.flatgrad(
                        loss=c_loss_1, var_list=self.ce1_params)
                    self.critic_grads_2 = tf_util.flatgrad(
                        loss=c_loss_2, var_list=self.ce2_params)

                    self.critic_optimizer_1 = MpiAdam(var_list=self.ce1_params,
                                                      beta1=0.9,
                                                      beta2=0.999,
                                                      epsilon=1e-08)
                    self.critic_optimizer_2 = MpiAdam(var_list=self.ce2_params,
                                                      beta1=0.9,
                                                      beta2=0.999,
                                                      epsilon=1e-08)
                else:
                    self.critic_grads = tf_util.flatgrad(
                        loss=c_loss_1, var_list=self.ce1_params)
                    self.critic_optimizer = MpiAdam(var_list=self.ce1_params,
                                                    beta1=0.9,
                                                    beta2=0.999,
                                                    epsilon=1e-08)
            with tf.variable_scope('Actor_Optimizer'):
                self.actor_grads = tf_util.flatgrad(a_loss, self.ae_params)
                self.actor_optimizer = MpiAdam(var_list=self.ae_params,
                                               beta1=0.9,
                                               beta2=0.999,
                                               epsilon=1e-08)
            with self.sess.as_default():
                self._initialize(self.sess)

            # Save the model
            var_list = tf.global_variables()
            print("Variables tracked by the saver:\n")
            for v in var_list:
                print(v)
            self.saver = tf.train.Saver(var_list=var_list, max_to_keep=1)
            self.writer = tf.summary.FileWriter(
                "logs/" + self.experiment_name + "/DDPG_" + str(rank),
                self.graph)
            # TensorBoard summary
            self.a_summary = tf.summary.merge([
                tf.summary.scalar('a_loss', a_loss, family='actor'),
                tf.summary.scalar('L_BC', L_BC, family='actor'),
                tf.summary.scalar('worse_than_demo',
                                  worse_than_demo,
                                  family='actor'),
                tf.summary.scalar('auxiliary_predict_loss',
                                  auxiliary_predict_loss,
                                  family='actor')
            ])

            if self.use_TD3:
                self.c_summary = tf.summary.merge([
                    tf.summary.scalar('c_loss_1', c_loss_1, family='critic'),
                    tf.summary.scalar('c_loss_2', c_loss_2, family='critic')
                ])
            else:
                self.c_summary = tf.summary.merge(
                    [tf.summary.scalar('c_loss_1', c_loss_1, family='critic')])

            # episode summary
            self.episode_cumulate_reward = tf.placeholder(
                tf.float32, name='episode_cumulate_reward')
            self.episoed_length = tf.placeholder(
                tf.int16, name='episode_length')
            self.success_or_not = tf.placeholder(
                tf.int8, name='success_or_not')

            self.eval_episode_cumulate_reward = tf.placeholder(
                tf.float32, name='eval_episode_cumulate_reward')
            self.eval_episoed_length = tf.placeholder(
                tf.int16, name='eval_episode_length')
            self.eval_success_or_not = tf.placeholder(
                tf.int8, name='eval_success_or_not')

            self.episode_summary = tf.summary.merge([
                tf.summary.scalar('episode_cumulate_reward',
                                  self.episode_cumulate_reward,
                                  family='episode_result'),
                tf.summary.scalar('episode_length',
                                  self.episoed_length,
                                  family='episode_result'),
                tf.summary.scalar('success_or_not',
                                  self.success_or_not,
                                  family='episode_result'),
            ])

            self.eval_episode_summary = tf.summary.merge([
                tf.summary.scalar('eval_episode_cumulate_reward',
                                  self.eval_episode_cumulate_reward,
                                  family='Eval_episode_result'),
                tf.summary.scalar('eval_episode_length',
                                  self.eval_episoed_length,
                                  family='Eval_episode_result'),
                tf.summary.scalar('eval_success_or_not',
                                  self.eval_success_or_not,
                                  family='Eval_episode_result'),
            ])