Example #1
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verbose=0):
    """
    Get the operation that updates the perturbed actor's parameters from the actor, adding parameter noise.

    :param actor: (str) the actor scope
    :param perturbed_actor: (str) the perturbed actor scope
    :param param_noise_stddev: (float) the std of the parameter noise
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :return: (TensorFlow Operation) the update function
    """
    # TODO: simplify this to this:
    # assert len(actor.vars) == len(perturbed_actor.vars)
    # assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    assert len(tf_util.get_globals_vars(actor)) == len(tf_util.get_globals_vars(perturbed_actor))
    assert len([var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]) == \
        len([var for var in tf_util.get_trainable_vars(perturbed_actor) if 'LayerNorm' not in var.name])

    updates = []
    for var, perturbed_var in zip(tf_util.get_globals_vars(actor), tf_util.get_globals_vars(perturbed_actor)):
        if var in [var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]:
            if verbose >= 2:
                logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var,
                                     var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
        else:
            if verbose >= 2:
                logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(tf_util.get_globals_vars(actor))
    return tf.group(*updates)
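
# A minimal standalone sketch of the same assign-with-noise pattern, assuming
# plain TF 1.x; the scope names are hypothetical, not from the source. Unlike
# the function above, this perturbs every variable (the original copies
# non-perturbable LayerNorm variables without noise).
import tensorflow as tf

def perturb_vars(source_scope, target_scope, stddev):
    src = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, source_scope)
    dst = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, target_scope)
    assert len(src) == len(dst)
    # copy each source variable into the target scope, adding Gaussian noise
    updates = [tf.assign(p, v + tf.random_normal(tf.shape(v), mean=0., stddev=stddev))
               for v, p in zip(src, dst)]
    return tf.group(*updates)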
Example #2
    def _setup_critic_optimizer(self):
        """
        setup the optimizer for the critic
        """
        if self.verbose >= 2:
            logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/')
                               if 'bias' not in var.name and 'output' not in var.name and 'b' not in var.name]
            if self.verbose >= 2:
                for var in critic_reg_vars:
                    logger.info('  regularizing: {}'.format(var.name))
                logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        if self.verbose >= 2:
            logger.info('  critic shapes: {}'.format(critic_shapes))
            logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'),
                                             clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999,
                                        epsilon=1e-08)
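
# Hedged sketch of what a flatgrad-style helper generally computes, assuming
# TF 1.x: per-variable gradients, optional per-gradient norm clipping, then a
# single flat vector (zeros substituted where a gradient is disconnected).
import tensorflow as tf

def flatgrad_sketch(loss, var_list, clip_norm=None):
    grads = tf.gradients(loss, var_list)
    if clip_norm is not None:
        grads = [tf.clip_by_norm(g, clip_norm=clip_norm) if g is not None else None
                 for g in grads]
    return tf.concat(axis=0, values=[
        tf.reshape(g if g is not None else tf.zeros_like(v), [-1])
        for v, g in zip(var_list, grads)])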
Example #3
    def _setup_critic_optimizer(self):
        """
        setup the optimizer for the critic
        """
        if self.verbose >= 2:
            logger.info('setting up critic optimizer')

        ### BSS LOSS ###
        all_vars = [v for v in tf.global_variables()]
        self.l2_loss = 0.0
        for var in all_vars:
            if 'qf' in var.name:
                self.l2_loss += tf.losses.mean_squared_error(
                    tf.zeros(var.shape), var)

        _, qf_features = self.policy_tf.feature_matrices()
        singular_qf = tf.linalg.svd(qf_features, compute_uv=False)
        self.bss_loss = tf.reduce_sum(tf.square(singular_qf[-1]))
        ### BSS LOSS ###

        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) + \
            self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in tf_util.get_trainable_vars('model/qf/')
                if 'bias' not in var.name and 'qf_output' not in var.name
                and 'b' not in var.name
            ]
            if self.verbose >= 2:
                for var in critic_reg_vars:
                    logger.info('  regularizing: {}'.format(var.name))
                logger.info('  applying l2 regularization with {}'.format(
                    self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/qf/')
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        if self.verbose >= 2:
            logger.info('  critic shapes: {}'.format(critic_shapes))
            logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = tf_util.flatgrad(
            self.critic_loss,
            tf_util.get_trainable_vars('model/qf/'),
            clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(
            var_list=tf_util.get_trainable_vars('model/qf/'),
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)
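
# The bss_loss above penalizes the smallest singular value of the critic
# feature matrix (BSS presumably stands for Batch Spectral Shrinkage):
# tf.linalg.svd returns singular values in descending order, so index -1 is
# the smallest. A minimal sketch with a hypothetical feature placeholder:
import tensorflow as tf

features = tf.placeholder(tf.float32, [None, 64], name="features")
singular_values = tf.linalg.svd(features, compute_uv=False)  # descending order
bss_loss = tf.reduce_sum(tf.square(singular_values[-1]))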
Example #4
    def _setup_target_network_updates(self):
        """
        set the target update operations
        """
        init_updates, soft_updates = get_target_updates(
            tf_util.get_trainable_vars('model/'),
            tf_util.get_trainable_vars('target/'), self.tau, self.verbose)
        self.target_init_updates = init_updates
        self.target_soft_updates = soft_updates
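
# get_target_updates is not shown here; a sketch of the usual pair of ops,
# assuming Polyak averaging with rate tau: a hard copy for initialization and
# target <- (1 - tau) * target + tau * online for the soft update.
import tensorflow as tf

def target_updates_sketch(model_vars, target_vars, tau):
    init_updates, soft_updates = [], []
    for var, target_var in zip(model_vars, target_vars):
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    return tf.group(*init_updates), tf.group(*soft_updates)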
Example #5
    def _setup_actor_optimizer(self):
        """
        setup the optimizer for the actor
        """
        if self.verbose >= 2:
            logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        if self.verbose >= 2:
            logger.info('  actor shapes: {}'.format(actor_shapes))
            logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
                                            clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999,
                                       epsilon=1e-08)
Example #6
def get_vars(scope):
    """
    Alias for get_trainable_vars
    :param scope: (str)
    :return: [tf Variable]
    """
    return tf_util.get_trainable_vars(scope)
Example #7
    def setup_model(self):

        with SetVerbosity(self.verbose):
            for i in range(self.num_agents):
                assert not isinstance(self.action_space, gym.spaces.Box), \
                    "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            # print(test_policy.type)
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)
                self.params = []

                print("AC SPC", self.action_space)
                for i in range(self.num_agents):
                    with tf.variable_scope("agent" + str(i)):
                        optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate)
                        act, _train_step, update_target, step_model = build_train(
                            q_func=partial(self.policy, **self.policy_kwargs),
                            ob_space=self.observation_space,
                            ac_space=self.action_space,
                            optimizer=optimizer,
                            gamma=self.gamma,
                            grad_norm_clipping=10,
                            param_noise=self.param_noise,
                            sess=self.sess,
                            full_tensorboard_log=False,  # self.full_tensorboard_log
                            double_q=self.double_q)
                        self.act.append(act)
                        self._train_step.append(_train_step)
                        self.step_model.append(step_model)
                        self.proba_step.append(step_model.proba_step)
                        self.update_target.append(update_target)
                        self.params.extend(
                            tf_util.get_trainable_vars("agent" + str(i) +
                                                       "/deepq"))

                print(self.params)

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)  # TODO: copy this file, make two versions of the algorithm.
                for i in range(self.num_agents):
                    # TODO: not sure; it seems best to try each agent's own target first.
                    self.update_target[i](sess=self.sess)
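
# Hypothetical illustration (not from the source) of the per-agent scoping
# used above: each agent's parameters live under "agent{i}/deepq", so they can
# be fetched, saved, or synced independently by scope name. Assumes the same
# tf_util helper as the snippet.
def params_by_agent(num_agents):
    return {i: tf_util.get_trainable_vars("agent" + str(i) + "/deepq")
            for i in range(num_agents)}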
Example #8
def get_vars(scope):
    """
    Alias for get_trainable_vars

    :param scope: (str)
    :return: [tf Variable]
    """
    # prefix = tf.get_variable_scope().name.split('/')[0] + '/'
    # return tf_util.get_trainable_vars(prefix + scope)
    return tf_util.get_trainable_vars(scope)
Example #9
    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)
                self._setup_learn(self.seed)

                optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
                #optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, momentum=0.95, epsilon=0.01)

                self.act, self._train_step, self.update_target, self._train_phi_step, self.step_model, _ = deepq_kpi.build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    kappa=self.kappa,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess
                )
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                @contextmanager
                def timed(msg):
                    if self.verbose >= 1:
                        print(colorize(msg, color='magenta'))
                        start_time = time.time()
                        yield
                        print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                           color='magenta'))
                    else:
                        yield 

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.timed = timed
                self.summary = tf.summary.merge_all()
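
# Hypothetical usage of the timed helper defined above: at verbose >= 1 it
# prints the message and the elapsed wall-clock time in magenta, otherwise it
# is a transparent no-op.
#
#     with self.timed("sampling replay batch"):
#         batch = replay_buffer.sample(batch_size)  # illustrative names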
Example #10
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.num_action_streams = self.action_space.shape[0]
            self.num_actions = self.num_actions_pad * self.num_action_streams  # total number of network outputs for action branching, with one action dimension per branch
            self.low = self.action_space.low 
            self.high = self.action_space.high 
            self.actions_range = np.subtract(self.high, self.low)

            if issubclass(self.policy, ActionBranching): self.bdq = True

            # BDQ allows continuous output
            assert isinstance(self.action_space, gym.spaces.Box), \
                "Error: BDQ requires a gym.spaces.Box (continuous) action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, BDQPolicy), "Error: the input policy for the BDQ model must be " \
                                                       "an instance of BDQPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                # optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    num_actions=self.num_actions,
                    num_action_streams=self.num_action_streams,
                    batch_size=self.batch_size,
                    gamma=self.gamma,
                    grad_norm_clipping=self.grad_norm_clipping,
                    optimizer_name="Adam",
                    learning_rate=self.learning_rate,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log,
                    double_q=self.double_q
                )
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("bdq")
                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()
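
# Hedged sketch (not from the source) of how BDQ-style branch indices map back
# to continuous actions, given the fields computed above: num_actions_pad bins
# per action dimension (branch), spanning [low, high].
import numpy as np

def indices_to_continuous(indices, low, actions_range, num_actions_pad):
    # one discrete bin index per action dimension
    return low + np.asarray(indices) * (actions_range / (num_actions_pad - 1))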
Example #11
    def _setup_actor_optimizer(self):
        """
        setup the optimizer for the actor
        """
        if self.verbose >= 2:
            logger.info('setting up actor optimizer')

        ### BSS LOSS ###
        all_vars = [v for v in tf.global_variables()]
        self.l2_loss = 0.0
        for var in all_vars:
            if 'pi' in var.name:
                self.l2_loss += tf.losses.mean_squared_error(
                    tf.zeros(var.shape), var)

        pi_features, _ = self.policy_tf.feature_matrices()
        singular_pi = tf.linalg.svd(pi_features, compute_uv=False)
        self.bss_loss = tf.reduce_sum(tf.square(singular_pi[-1]))
        ### BSS LOSS ###

        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) + \
                          self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
        actor_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/pi/')
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        if self.verbose >= 2:
            logger.info('  actor shapes: {}'.format(actor_shapes))
            logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = tf_util.flatgrad(
            self.actor_loss,
            tf_util.get_trainable_vars('model/pi/'),
            clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(
            var_list=tf_util.get_trainable_vars('model/pi/'),
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)
Example #12
    def _setup_popart(self):
        """
        setup pop-art normalization of the critic output

        See https://arxiv.org/pdf/1602.07714.pdf for details:
        "Preserving Outputs Precisely, while Adaptively Rescaling Targets".
        """
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_q_outputs_op = []
        for out_vars in [[var for var in tf_util.get_trainable_vars('model/qf/') if 'output' in var.name],
                         [var for var in tf_util.get_trainable_vars('target/qf/') if 'output' in var.name]]:
            assert len(out_vars) == 2
            # weight and bias of the last layer
            weight, bias = out_vars
            assert 'kernel' in weight.name
            assert 'bias' in bias.name
            assert weight.get_shape()[-1] == 1
            assert bias.get_shape()[-1] == 1
            self.renormalize_q_outputs_op += [weight.assign(weight * self.old_std / new_std)]
            self.renormalize_q_outputs_op += [bias.assign((bias * self.old_std + self.old_mean - new_mean) / new_std)]
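
# Numeric sanity check of the Pop-Art rescaling above: with
# w' = w * old_std / new_std and b' = (b * old_std + old_mean - new_mean) / new_std,
# the denormalized output mean + std * (w * x + b) is unchanged.
import numpy as np

w, b, x = 0.3, -0.7, 1.9
old_mean, old_std, new_mean, new_std = 0.5, 2.0, 1.5, 3.0
before = old_mean + old_std * (w * x + b)
w2 = w * old_std / new_std
b2 = (b * old_std + old_mean - new_mean) / new_std
after = new_mean + new_std * (w2 * x + b2)
assert np.isclose(before, after)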
Example #13
    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                if self.use_rmsprop:
                    optimizer = tf.train.RMSPropOptimizer(
                        learning_rate=self.learning_rate, decay=self.rmsprop_alpha, epsilon=self.rmsprop_epsilon,
                        centered=True
                    )
                else:
                    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log,
                    double_q=self.double_q
                )
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()
Example #14
    def init_network_continuous(self, input, name):
        with tf.variable_scope(name):
            model = tf.layers.dense(input, 8, activation=tf.nn.relu)
            model = tf.layers.dense(model,
                                    self.action_space.shape[0],
                                    activation=tf.nn.sigmoid)

        self._proba_distribution, _, _ = \
            self._pdtype.proba_distribution_from_latent(model, model, init_scale=0.01)

        self.action_ph = self._pdtype.sample_placeholder([None],
                                                         name='action_ph')
        self._policy_proba = [
            self._proba_distribution.mean, self._proba_distribution.std
        ]
        self.params = tf_util.get_trainable_vars('net')
        self.pg_loss = tf.gradients(
            self._proba_distribution.neglogp(self.action_ph), self.params)
        return model
Example #15
    def setup_model(self):

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."
            self.n_batch = self.n_envs * self.n_steps
            n_cpu = multiprocessing.cpu_count()
            if sys.platform == 'darwin':
                n_cpu //= 2
            self.graph = tf.Graph()

            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=n_cpu,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None

                if self.retrain_victim:
                    # assert is mlp policy
                    if self.env_name in ['multicomp/YouShallNotPassHumans-v0']:
                        act_model = MlpPolicyValue(
                            scope="victim_policy",
                            reuse=False,
                            ob_space=self.observation_space,
                            ac_space=self.action_space,
                            sess=self.sess,
                            hiddens=[64, 64],
                            normalize=self.norm_victim)
                        with tf.variable_scope(
                                "train_model",
                                reuse=True,
                                custom_getter=tf_util.outer_scope_getter(
                                    "train_model")):
                            train_model = MlpPolicyValue(
                                scope="victim_policy",
                                reuse=True,
                                ob_space=self.observation_space,
                                ac_space=self.action_space,
                                sess=self.sess,
                                hiddens=[64, 64],
                                normalize=self.norm_victim)
                else:
                    if issubclass(self.policy, RecurrentActorCriticPolicy):
                        assert self.n_envs % self.nminibatches == 0, "For recurrent policies, " \
                                                                     "the number of environments run in parallel should be a multiple of nminibatches."
                        n_batch_step = self.n_envs
                        n_batch_train = self.n_batch // self.nminibatches

                    act_model = self.policy(self.sess,
                                            self.observation_space,
                                            self.action_space,
                                            self.n_envs,
                                            1,
                                            n_batch_step,
                                            reuse=False,
                                            **self.policy_kwargs)
                    with tf.variable_scope(
                            "train_model",
                            reuse=True,
                            custom_getter=tf_util.outer_scope_getter(
                                "train_model")):
                        train_model = self.policy(self.sess,
                                                  self.observation_space,
                                                  self.action_space,
                                                  self.n_envs //
                                                  self.nminibatches,
                                                  self.n_steps,
                                                  n_batch_train,
                                                  reuse=True,
                                                  **self.policy_kwargs)
                if self.black_box_att:
                    with tf.variable_scope("mimic_model", reuse=False):
                        self.mimic_model = RL_model(input_shape=self.observation_space.shape, \
                                                 out_shape=self.action_space.shape)
                        self.mimic_model.load(self.mimic_model_path)

                with tf.variable_scope("loss", reuse=False):

                    if self.retrain_victim:
                        self.action_ph = tf.placeholder(
                            shape=[None, self.action_space.shape[0]],
                            dtype=tf.float32,
                            name="action_ph")
                    else:
                        self.action_ph = train_model.pdtype.sample_placeholder(
                            [None], name="action_ph")

                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.old_neglog_pac_ph = tf.placeholder(
                        tf.float32, [None], name="old_neglog_pac_ph")
                    self.old_vpred_ph = tf.placeholder(tf.float32, [None],
                                                       name="old_vpred_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")
                    self.clip_range_ph = tf.placeholder(tf.float32, [],
                                                        name="clip_range_ph")

                    # Xian added
                    self.action_opp_next_ph = tf.placeholder(
                        dtype=tf.float32,
                        shape=self.action_ph.shape,
                        name="action_opp_next_ph")
                    self.obs_opp_next_ph = tf.placeholder(
                        dtype=tf.float32,
                        shape=train_model.obs_ph.shape,
                        name="obs_opp_next_ph")
                    self.stochastic_ph = tf.placeholder(tf.bool, (),
                                                        name="stochastic")
                    self.ratio_ph = tf.placeholder(
                        tf.float32, [], name="change_action_state_ratio_ph")

                    action_ph_noise = train_model.deterministic_action

                    with tf.variable_scope("statem", reuse=True):
                        obs_oppo_predict, obs_oppo_noise_predict = modeling_state(
                            self.action_ph, action_ph_noise,
                            train_model.obs_ph)
                    if not self.masking_attention:
                        self.attention = tf.placeholder(dtype=tf.float32,
                                                        shape=[None],
                                                        name="attention_ph")
                    else:
                        self.attention = tf.placeholder(
                            dtype=tf.float32,
                            shape=[None, train_model.obs_ph.shape[1]],
                            name="attention_ph")
                        obs_oppo_noise_predict = tf.multiply(
                            obs_oppo_noise_predict, self.attention)

                    if not self.black_box_att:
                        with tf.variable_scope("victim_param",
                                               reuse=tf.AUTO_REUSE):
                            action_opp_mal_noise, _ = mlp_policy(obs_oppo_noise_predict, self.stochastic_ph, self.env.observation_space, \
                                                    self.env.action_space, [64, 64], True)
                    else:
                        # load the pretrained victim model
                        with tf.variable_scope("victim_param",
                                               reuse=tf.AUTO_REUSE):
                            victim_model = RL_func(
                                self.observation_space.shape[0],
                                self.action_space.shape[0])
                            action_opp_mal_noise = victim_model(
                                obs_oppo_noise_predict)
                    if not self.masking_attention:
                        # update 2019/07/19: if not masking, attention only weights the
                        # loss on actions over time; the opponent's action change is
                        # measured with the L-infinity norm
                        self.change_opp_action_mse = tf.reduce_mean(
                            tf.abs(
                                tf.multiply(
                                    action_opp_mal_noise -
                                    self.action_opp_next_ph,
                                    tf.expand_dims(self.attention, axis=-1))))
                    else:
                        self.change_opp_action_mse = tf.reduce_mean(
                            tf.abs(action_opp_mal_noise -
                                   self.action_opp_next_ph))
                    # add change_state_mse
                    self.change_state_mse = self.ratio_ph * tf.reduce_mean(
                        tf.abs(obs_oppo_noise_predict - self.obs_opp_next_ph))
                    self.change_mse = self.change_opp_action_mse - self.change_state_mse
                    # Prediction error on the opponent's next observation,
                    # using an L-infinity hinge: max(0, ||l1 - l2|| - c)^2
                    self.state_modeling_mse = tf.reduce_mean(
                        tf.square(
                            tf.math.maximum(
                                tf.abs(obs_oppo_predict - self.obs_opp_next_ph)
                                - 1, 0)))

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.action_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())

                    if self.retrain_victim:
                        vpred = tf.reshape(train_model.value_flat, [-1])
                    else:
                        vpred = train_model.value_flat
                    vpredclipped = self.old_vpred_ph + tf.clip_by_value(
                        train_model.value_flat - self.old_vpred_ph,
                        -self.clip_range_ph, self.clip_range_ph)
                    vf_losses1 = tf.square(vpred - self.rewards_ph)
                    vf_losses2 = tf.square(vpredclipped - self.rewards_ph)
                    self.vf_loss = .5 * tf.reduce_mean(
                        tf.maximum(vf_losses1, vf_losses2))
                    ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                    pg_losses = -self.advs_ph * ratio
                    pg_losses2 = -self.advs_ph * tf.clip_by_value(
                        ratio, 1.0 - self.clip_range_ph,
                        1.0 + self.clip_range_ph)
                    self.pg_loss = tf.reduce_mean(
                        tf.maximum(pg_losses, pg_losses2))
                    self.approxkl = .5 * tf.reduce_mean(
                        tf.square(neglogpac - self.old_neglog_pac_ph))
                    self.clipfrac = tf.reduce_mean(
                        tf.cast(
                            tf.greater(tf.abs(ratio - 1.0),
                                       self.clip_range_ph), tf.float32))
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef + \
                           self.hyper_weights[1] * self.change_mse

                    if self.black_box_att:
                        if not self.pretrained_mimic:
                            loss_mimic = self.hyper_weights[
                                3] * self.state_modeling_mse
                        else:
                            loss_mimic = self.hyper_weights[
                                3] * self.state_modeling_mse
                    else:  # if it's a white-box attack, do not model the action output
                        loss_mimic = self.hyper_weights[
                            3] * self.state_modeling_mse

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('approximate_kullback-leibler',
                                      self.approxkl)
                    tf.summary.scalar('clip_factor', self.clipfrac)
                    tf.summary.scalar('loss', loss)
                    tf.summary.scalar('loss_mimic', loss_mimic)
                    tf.summary.scalar(
                        '_change oppo action mse',
                        self.hyper_weights[1] * self.change_opp_action_mse)
                    tf.summary.scalar('_predict state mse',
                                      self.state_modeling_mse)

                    # add ppo loss
                    tf.summary.scalar(
                        '_PPO loss', loss -
                        self.hyper_weights[1] * self.change_opp_action_mse)

                    if self.retrain_victim:
                        params = tf_util.get_trainable_vars("victim_policy")
                    else:
                        params = tf_util.get_trainable_vars("model")
                    if self.full_tensorboard_log:
                        for var in params:
                            tf.summary.histogram(var.name, var)

                    self.params = [
                        params,
                        tf_util.get_trainable_vars("loss/statem")
                    ]

                    grads = tf.gradients(loss, self.params[0])
                    if self.max_grad_norm is not None:
                        grads, _grad_norm = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params[0]))

                    grads_mimic = tf.gradients(loss_mimic, self.params[1])
                    if self.max_grad_norm is not None:
                        grads_mimic, _grad_norm_mimic = tf.clip_by_global_norm(
                            grads_mimic, self.max_grad_norm)
                    grads_mimic = list(zip(grads_mimic, self.params[1]))

                trainer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate_ph, epsilon=1e-5)
                self._train = trainer.apply_gradients(grads)

                trainer_mimic = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate_ph, epsilon=1e-5)
                self._train_mimic = trainer_mimic.apply_gradients(grads_mimic)

                self.loss_names = [
                    'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                    'clipfrac', '_change_opp_a_loss', '_s_modeling_loss',
                    '_a_modeling_loss'
                ]

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_range_ph))
                    tf.summary.scalar('old_neglog_action_probabilty',
                                      tf.reduce_mean(self.old_neglog_pac_ph))
                    tf.summary.scalar('old_value_pred',
                                      tf.reduce_mean(self.old_vpred_ph))
                    # add attention onto the final results
                    tf.summary.scalar(
                        'att_hyp',
                        self.hyper_weights[1] * tf.reduce_mean(self.attention))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        tf.summary.histogram('clip_range', self.clip_range_ph)
                        tf.summary.histogram('old_neglog_action_probabilty',
                                             self.old_neglog_pac_ph)
                        tf.summary.histogram('old_value_pred',
                                             self.old_vpred_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)
                self.train_model = train_model
                self.act_model = act_model
                self.step = act_model.step
                self.proba_step = act_model.proba_step
                self.value = act_model.value
                self.initial_state = act_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)  # pylint: disable=E1101

                # load the pretrained_value

                if self.retrain_victim:
                    env_path = get_zoo_path(self.env_name, tag=2)
                    param = load_from_file(param_pkl_path=env_path)
                    ret_variable = tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope="victim_policy/retfilter")
                    obs_variable = tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope="victim_policy/obsfilter")
                    variables = ret_variable + obs_variable + tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES,
                        scope="victim_policy")
                    setFromFlat(variables, param, self.sess)

                if True:
                    victim_variable = tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES, "loss/victim_param")
                    param = load_from_file(param_pkl_path=self.env_path)
                    setFromFlat(victim_variable, param, sess=self.sess)

                self.summary = tf.summary.merge_all()
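
# Hedged NumPy sketch of the clipped PPO surrogate computed in the loss above:
# ratio = exp(old_neglogp - neglogp); the pessimistic maximum of the unclipped
# and clipped terms is averaged into the policy-gradient loss.
import numpy as np

def ppo_clip_loss(neglogpac, old_neglogpac, advs, clip_range):
    ratio = np.exp(old_neglogpac - neglogpac)
    pg_losses = -advs * ratio
    pg_losses2 = -advs * np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range)
    return np.mean(np.maximum(pg_losses, pg_losses2))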
Example #16
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACER model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            if isinstance(self.action_space, Discrete):
                self.n_act = self.action_space.n
                continuous = False
            elif isinstance(self.action_space, Box):
                # self.n_act = self.action_space.shape[-1]
                # continuous = True
                raise NotImplementedError("WIP: Acer does not support Continuous actions yet.")
            else:
                raise ValueError("Error: ACER does not work with {} actions space.".format(self.action_space))

            self.n_batch = self.n_envs * self.n_steps

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)
                self.set_random_seed(self.seed)
                n_batch_step = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                n_batch_train = self.n_envs * (self.n_steps + 1)

                step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         n_batch_step, reuse=False, **self.policy_kwargs)

                self.params = tf_util.get_trainable_vars("model")

                with tf.variable_scope("train_model", reuse=True,
                                       custom_getter=tf_util.outer_scope_getter("train_model")):
                    train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                              self.n_steps + 1, n_batch_train, reuse=True, **self.policy_kwargs)

                with tf.variable_scope("moving_average"):
                    # create averaged model
                    ema = tf.train.ExponentialMovingAverage(self.alpha)
                    ema_apply_op = ema.apply(self.params)

                    def custom_getter(getter, name, *args, **kwargs):
                        name = name.replace("polyak_model/", "")
                        val = ema.average(getter(name, *args, **kwargs))
                        return val

                with tf.variable_scope("polyak_model", reuse=True, custom_getter=custom_getter):
                    self.polyak_model = polyak_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                                   self.n_envs, self.n_steps + 1,
                                                                   self.n_envs * (self.n_steps + 1), reuse=True,
                                                                   **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.done_ph = tf.placeholder(tf.float32, [self.n_batch])  # dones
                    self.reward_ph = tf.placeholder(tf.float32, [self.n_batch])  # rewards, not returns
                    self.mu_ph = tf.placeholder(tf.float32, [self.n_batch, self.n_act])  # mu's
                    self.action_ph = train_model.pdtype.sample_placeholder([self.n_batch])
                    self.learning_rate_ph = tf.placeholder(tf.float32, [])
                    eps = 1e-6

                    # Notation: (var) = batch variable, (var)s = sequence variable,
                    # (var)_i = variable index by action at step i
                    # shape is [n_envs * (n_steps + 1)]
                    if continuous:
                        value = train_model.value_flat
                    else:
                        value = tf.reduce_sum(train_model.policy_proba * train_model.q_value, axis=-1)

                    rho, rho_i_ = None, None
                    if continuous:
                        action_ = strip(train_model.proba_distribution.sample(), self.n_envs, self.n_steps)
                        distribution_f = tf.contrib.distributions.MultivariateNormalDiag(
                            loc=strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps),
                            scale_diag=strip(train_model.proba_distribution.logstd, self.n_envs, self.n_steps))
                        f_polyak = tf.contrib.distributions.MultivariateNormalDiag(
                            loc=strip(polyak_model.proba_distribution.mean, self.n_envs, self.n_steps),
                            scale_diag=strip(polyak_model.proba_distribution.logstd, self.n_envs, self.n_steps))

                        f_i = distribution_f.prob(self.action_ph)
                        f_i_ = distribution_f.prob(action_)
                        f_polyak_i = f_polyak.prob(self.action_ph)
                        phi_i = strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps)

                        q_value = strip(train_model.value_fn, self.n_envs, self.n_steps)
                        q_i = q_value[:, 0]

                        rho_i = tf.reshape(f_i, [-1, 1]) / (self.mu_ph + eps)
                        rho_i_ = tf.reshape(f_i_, [-1, 1]) / (self.mu_ph + eps)

                        qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, tf.pow(rho_i, 1 / self.n_act),
                                         self.n_envs, self.n_steps, self.gamma)
                    else:
                        # strip off last step
                        # f is a distribution, chosen to be Gaussian distributions
                        # with fixed diagonal covariance and mean \phi(x)
                        # in the paper
                        distribution_f, f_polyak, q_value = \
                            map(lambda variables: strip(variables, self.n_envs, self.n_steps),
                                [train_model.policy_proba, polyak_model.policy_proba, train_model.q_value])

                        # Get pi and q values for actions taken
                        f_i = get_by_index(distribution_f, self.action_ph)
                        f_i_ = distribution_f
                        phi_i = distribution_f
                        f_polyak_i = f_polyak

                        q_i = get_by_index(q_value, self.action_ph)

                        # Compute ratios for importance truncation
                        rho = distribution_f / (self.mu_ph + eps)
                        rho_i = get_by_index(rho, self.action_ph)

                        # Calculate Q_retrace targets
                        qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, rho_i, self.n_envs, self.n_steps,
                                         self.gamma)

                    # Calculate losses
                    # Entropy
                    entropy = tf.reduce_sum(train_model.proba_distribution.entropy())

                    # Policy Gradient loss, with truncated importance sampling & bias correction
                    value = strip(value, self.n_envs, self.n_steps, True)
                    # check_shape([qret, value, rho_i, f_i], [[self.n_envs * self.n_steps]] * 4)
                    # check_shape([rho, distribution_f, q_value], [[self.n_envs * self.n_steps, self.n_act]] * 2)

                    # Truncated importance sampling
                    adv = qret - value
                    log_f = tf.log(f_i + eps)
                    # [n_envs * n_steps]
                    gain_f = log_f * tf.stop_gradient(adv * tf.minimum(self.correction_term, rho_i))
                    loss_f = -tf.reduce_mean(gain_f)

                    # Bias correction for the truncation
                    adv_bc = (q_value - tf.reshape(value, [self.n_envs * self.n_steps, 1]))  # [n_envs * n_steps, n_act]

                    # check_shape([adv_bc, log_f_bc], [[self.n_envs * self.n_steps, self.n_act]] * 2)
                    if continuous:
                        gain_bc = tf.stop_gradient(adv_bc *
                                                   tf.nn.relu(1.0 - (self.correction_term / (rho_i_ + eps))) *
                                                   f_i_)
                    else:
                        log_f_bc = tf.log(f_i_ + eps)  # / (f_old + eps)
                        gain_bc = tf.reduce_sum(log_f_bc *
                                                tf.stop_gradient(
                                                    adv_bc *
                                                    tf.nn.relu(1.0 - (self.correction_term / (rho + eps))) *
                                                    f_i_),
                                                axis=1)
                    # IMP: This is sum, as expectation wrt f
                    loss_bc = -tf.reduce_mean(gain_bc)

                    loss_policy = loss_f + loss_bc

                    # Value/Q function loss, and explained variance
                    check_shape([qret, q_i], [[self.n_envs * self.n_steps]] * 2)
                    explained_variance = q_explained_variance(tf.reshape(q_i, [self.n_envs, self.n_steps]),
                                                              tf.reshape(qret, [self.n_envs, self.n_steps]))
                    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

                    # Net loss
                    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
                    loss = loss_policy + self.q_coef * loss_q - self.ent_coef * entropy

                    tf.summary.scalar('entropy_loss', entropy)
                    tf.summary.scalar('policy_gradient_loss', loss_policy)
                    tf.summary.scalar('value_function_loss', loss_q)
                    tf.summary.scalar('loss', loss)

                    norm_grads_q, norm_grads_policy, avg_norm_grads_f = None, None, None
                    avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj = None, None, None, None
                    if self.trust_region:
                        # [n_envs * n_steps, n_act]
                        grad = tf.gradients(- (loss_policy - self.ent_coef * entropy) * self.n_steps * self.n_envs,
                                            phi_i)
                        # [n_envs * n_steps, n_act] # Directly computed gradient of KL divergence wrt f
                        kl_grad = - f_polyak_i / (f_i_ + eps)
                        k_dot_g = tf.reduce_sum(kl_grad * grad, axis=-1)
                        adj = tf.maximum(0.0, (tf.reduce_sum(kl_grad * grad, axis=-1) - self.delta) / (
                                tf.reduce_sum(tf.square(kl_grad), axis=-1) + eps))  # [n_envs * n_steps]

                        # Calculate stats (before doing adjustment) for logging.
                        avg_norm_k = avg_norm(kl_grad)
                        avg_norm_g = avg_norm(grad)
                        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
                        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

                        grad = grad - tf.reshape(adj, [self.n_envs * self.n_steps, 1]) * kl_grad
                        # These are trust-region adjusted gradients wrt f, i.e. the statistics of policy pi
                        grads_f = -grad / (self.n_envs * self.n_steps)
                        grads_policy = tf.gradients(f_i_, self.params, grads_f)
                        grads_q = tf.gradients(loss_q * self.q_coef, self.params)
                        grads = [gradient_add(g1, g2, param, verbose=self.verbose)
                                 for (g1, g2, param) in zip(grads_policy, grads_q, self.params)]

                        avg_norm_grads_f = avg_norm(grads_f) * (self.n_steps * self.n_envs)
                        norm_grads_q = tf.global_norm(grads_q)
                        norm_grads_policy = tf.global_norm(grads_policy)
                    else:
                        grads = tf.gradients(loss, self.params)

                    norm_grads = None
                    if self.max_grad_norm is not None:
                        grads, norm_grads = tf.clip_by_global_norm(grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('rewards', tf.reduce_mean(self.reward_ph))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate))
                    tf.summary.scalar('advantage', tf.reduce_mean(adv))
                    tf.summary.scalar('action_probability', tf.reduce_mean(self.mu_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('rewards', self.reward_ph)
                        tf.summary.histogram('learning_rate', self.learning_rate)
                        tf.summary.histogram('advantage', adv)
                        tf.summary.histogram('action_probability', self.mu_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation', train_model.obs_ph)

                trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.rprop_alpha,
                                                    epsilon=self.rprop_epsilon)
                _opt_op = trainer.apply_gradients(grads)

                # so when you call _train, you first do the gradient step, then you apply ema
                with tf.control_dependencies([_opt_op]):
                    _train = tf.group(ema_apply_op)

                # Ops/Summaries to run, and their names for logging
                assert norm_grads is not None
                run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, explained_variance, norm_grads]
                names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
                             'norm_grads']
                if self.trust_region:
                    self.run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g,
                                              avg_norm_k_dot_g, avg_norm_adj]
                    self.names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k',
                                                  'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.initial_state = step_model.initial_state

                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
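
# Hedged NumPy sketch (not from the source) of the backward recursion that a
# q_retrace-style helper computes: bootstrap from the final state value and
# correct each step with truncated importance weights rho_bar = min(1, rho).
import numpy as np

def q_retrace_sketch(rewards, dones, q_i, values, rho_i, gamma):
    # values has one extra entry for the bootstrap state
    rho_bar = np.minimum(1.0, rho_i)
    qret = values[-1]
    qrets = np.zeros_like(rewards)
    for t in reversed(range(len(rewards))):
        qret = rewards[t] + gamma * qret * (1.0 - dones[t])
        qrets[t] = qret
        qret = rho_bar[t] * (qret - q_i[t]) + values[t]
    return qrets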
Example #17
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)

                # Penalty related variable
                with tf.variable_scope('penalty'):
                    cur_cost_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # episodic cost

                    param_init = np.log(max(np.exp(self.penalty_init) - 1, 1e-8))
                    penalty_param = tf.get_variable('penalty_param',
                                                    initializer=float(param_init),
                                                    trainable=True,
                                                    dtype=tf.float32)
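                # softplus maps the unconstrained penalty_param to a strictly
                # positive Lagrange multiplier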
                penalty = tf.nn.softplus(penalty_param)
                penalty_loss = tf.reduce_mean(-penalty_param * (cur_cost_ph - self.cost_lim))

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)
                
                # # Network for safety value function
                # with tf.variable_scope("vc", reuse=False):
                #     self.cost_value = MLPValue(self.sess, self.observation_space, self.n_envs, 1, None)
                
                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                    catarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target cost advantage function
                    cret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical cost

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))
                    vcerr = tf.reduce_mean(tf.square(self.policy_pi.vcf_flat - cret))
                    
                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                                   old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)
                    # Surrogate for cost function
                    surrcost = tf.reduce_mean(ratio * catarg)

                    optimgain = surrgain + entbonus
                    # Include surr_cost in pi_objective
                    optimgain -= penalty * surrcost
                    optimgain /= (1 + penalty)
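                    # dividing by (1 + penalty) keeps the combined objective on a
                    # roughly constant scale as the multiplier grows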
                    # # Loss function for pi is negative of pi_objective
                    # optimgain = -optimgain # Should we??
                    
                    losses = [optimgain, meankl, entbonus, surrgain, meanent, surrcost]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy", "surrcost"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name and "/vcf" not in v.name] # policy parameters
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vcf" not in v.name] # value parameters
                    vcf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vf" not in v.name] # cost value parameters

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
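                    # unflatten the tangent vector into per-variable blocks matching
                    # each variable's shape, so it can be dotted with the KL gradients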
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    # Fisher vector products
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('penalty_loss', penalty_loss)
                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('constraint_cost_function_loss', surrcost)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent + surrcost + penalty_loss)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg, catarg],
                                                        fvp)  # all inputs are accepted for a uniform call signature, even though fvp does not use them all
                    # self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                    #                                               tf_util.flatgrad(vferr, vf_var_list))  # old_policy.obs_ph does not appear to be used here
                    # self.compute_vcflossandgrad = tf_util.function([observation, old_policy.obs_ph, cret],
                    #                                                tf_util.flatgrad(vcerr, vcf_var_list))
                    self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret, cret],
                                                                  [tf_util.flatgrad(vferr, vf_var_list), tf_util.flatgrad(vcerr, vcf_var_list)])
                    self.compute_lagrangiangrad = tf_util.function([cur_cost_ph],
                                                                   tf_util.flatgrad(penalty_loss, [penalty_param]))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                           color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()
                    # optimizer for constraint costs value function
                    self.vcadam = MpiAdam(vcf_var_list, sess=self.sess)
                    self.vcadam.sync()
                    # optimizer for the Lagrangian multiplier used in safe RL
                    self.penaltyadam = MpiAdam([penalty_param], sess=self.sess)
                    self.penaltyadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                    tf.summary.scalar('discounted_costs', tf.reduce_mean(cret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('cost_advantage', tf.reduce_mean(catarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('discounted_costs', cret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('penalty_learning_rate', self.penalty_lr)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('cost_advantage', catarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg, ret, cret, cur_cost_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
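
Note: the penalty variables above implement dual gradient ascent on a Lagrange multiplier. A standalone NumPy sketch of the same update rule follows; the constants and the cost sequence are illustrative, not taken from the code above.

import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

penalty_init, cost_lim, lr = 1.0, 25.0, 5e-2
# initialised so that softplus(param) == penalty_init, mirroring param_init above
param = np.log(max(np.exp(penalty_init) - 1, 1e-8))

for episodic_cost in [30.0, 30.0, 30.0, 20.0]:
    # penalty_loss = mean(-param * (cost - cost_lim)), so d(loss)/d(param)
    # is -(cost - cost_lim)
    grad = -(episodic_cost - cost_lim)
    param -= lr * grad  # descending the loss grows the multiplier while cost > cost_lim
    print(softplus(param))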
Example #18
    def setup_model(self):

        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess, self.observation_space, self.action_space,
                                                     **self.policy_kwargs)
                    self.support = tf.constant(np.arange(self.v_min, self.v_max + 1e-6, self.delta), dtype=tf.float32)
                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")
                    self.projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="v_projection")
                    self.q_projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="q_projection")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1_distr, qf2_distr, value_fn_distr = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                                     create_qf=True, create_vf=True)
                    qf1_pi_distr, qf2_pi_distr, _ = self.policy_tf.make_critics(self.processed_obs_ph,
                                                                    policy_out, create_qf=True, create_vf=False,
                                                                    reuse=True)

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                                                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target_distr = self.target_policy.make_critics(self.processed_next_obs_ph,
                                                                         create_qf=False, create_vf=True)
                    self.value_target_distr = value_target_distr

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    # compute qf_pi, qf2_pi with pdf
                    min_qf_pi_distr = tf.where(tf.less(tf.reduce_mean(qf1_pi_distr * self.support),
                                                       tf.reduce_mean(qf2_pi_distr * self.support)),
                                               qf1_pi_distr, qf2_pi_distr)

                    min_qf_pi = tf.reduce_mean(tf.reduce_sum(min_qf_pi_distr * self.support, axis=-1))
                    self.min_qf_pi = min_qf_pi
                    q_backup_op = tf.stop_gradient(
                        self.rewards_ph +
                        (1 - self.terminals_ph) * self.gamma * self.support
                    )
                    q_backup_op = tf.clip_by_value(q_backup_op, self.v_min, self.v_max)
                    self.q_backup_op = q_backup_op
                    qf1_loss = -tf.reduce_mean(tf.log(qf1_distr + 1e-12) * tf.stop_gradient(self.projection_ph))
                    qf2_loss = -tf.reduce_mean(tf.log(qf2_distr + 1e-12) * tf.stop_gradient(self.projection_ph))

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy))
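                        # when average entropy drops below target_entropy, the stopped
                        # gradient E[logp_pi + target_entropy] is positive, so this loss
                        # pushes log_ent_coef (and hence ent_coef) upward, and vice versa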
                        entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    # to clip policy loss
                    qf_pi = tf.reduce_mean(self.support * min_qf_pi_distr, axis=-1, keepdims=True)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.

                    value_loss = -tf.reduce_mean(tf.log(value_fn_distr + 1e-12) * tf.stop_gradient(min_qf_pi_distr)) \
                                                 - tf.stop_gradient(tf.reduce_mean(self.ent_coef * logp_pi))
                    value_fn = tf.reduce_sum(value_fn_distr * self.support, axis=-1)
                    # value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    values_params = tf_util.get_trainable_vars('model/values_fn')

                    source_params = tf_util.get_trainable_vars("model/values_fn")
                    target_params = tf_util.get_trainable_vars("target/values_fn")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target, (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    qf1 = tf.reduce_mean(tf.reduce_sum(self.support * qf1_distr, axis=-1))
                    qf2 = tf.reduce_mean(tf.reduce_sum(self.support * qf2_distr, axis=-1))
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(values_losses, var_list=values_params)

                        self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy']
                        # All ops to call during one training step
                        self.step_ops = [policy_loss, qf1_loss, qf2_loss,
                                         value_loss, qf1, qf2, value_fn, logp_pi,
                                         self.entropy, policy_train_op, train_values_op]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += ['ent_coef_loss', 'ent_coef']
                                self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                        tf.summary.scalar('ent_coef', self.ent_coef)

                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/values_fn")

                # Initialize Variables and target network
                self.projection_op = Projection(self.sess, self.graph, self.n_spt, self.v_min, self.v_max, self.delta,
                                                self.batch_size)
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
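
Note: the Projection helper instantiated above is not shown in this excerpt. What it has to produce is the standard categorical (C51-style) projection of the backed-up atoms onto the fixed support; the following standalone NumPy sketch illustrates that projection and is not the actual Projection class.

import numpy as np

def project_distribution(rewards, dones, probs, support, gamma):
    """Project r + gamma * (1 - done) * z onto the fixed support."""
    v_min, v_max = support[0], support[-1]
    delta = support[1] - support[0]
    out = np.zeros_like(probs)
    tz = np.clip(rewards[:, None] + gamma * (1.0 - dones[:, None]) * support[None, :],
                 v_min, v_max)
    b = (tz - v_min) / delta                      # fractional atom index
    low, up = np.floor(b).astype(int), np.ceil(b).astype(int)
    for i in range(probs.shape[0]):
        for j in range(len(support)):
            if low[i, j] == up[i, j]:             # lands exactly on an atom
                out[i, low[i, j]] += probs[i, j]
            else:                                 # split mass between neighbours
                out[i, low[i, j]] += probs[i, j] * (up[i, j] - b[i, j])
                out[i, up[i, j]] += probs[i, j] * (b[i, j] - low[i, j])
    return out

support = np.arange(-10.0, 10.0 + 1e-6, 0.5)
probs = np.full((2, len(support)), 1.0 / len(support))
projected = project_distribution(np.array([1.0, -2.0]), np.array([0.0, 1.0]),
                                 probs, support, gamma=0.99)
print(projected.sum(axis=1))  # each row still sums to 1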
Example #19
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(
                        self.observation_space,
                        self.action_space,
                        self.hidden_size_adversary,
                        entcoeff=self.adversary_entcoeff)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_fn[:, 0] - ret))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(
                            self.reward_giver.get_trainable_variables(),
                            sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = find_trainable_variables("model")
                if self.using_gail:
                    self.params.extend(
                        self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
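
Note: fvp above relies on the double-backprop trick: differentiate the KL once, dot the gradient with a tangent vector, and differentiate the scalar result again, which yields the Hessian-vector (Fisher-vector) product without ever materialising the matrix. A minimal standalone check under TF1 semantics, using a toy quadratic in place of the KL:

import numpy as np
import tensorflow as tf

theta = tf.Variable([0.5, -0.3], dtype=tf.float32)
curv = tf.constant([1.0, 2.0])
kl = tf.reduce_sum(curv * tf.square(theta))       # Hessian is diag([2., 4.])

tangent = tf.placeholder(tf.float32, shape=[2])
klgrad = tf.gradients(kl, [theta])[0]
gvp = tf.reduce_sum(klgrad * tangent)             # scalar: grad(kl) . tangent
fvp = tf.gradients(gvp, [theta])[0]               # = Hessian @ tangent

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(fvp, {tangent: np.array([1.0, 1.0], np.float32)}))  # ~[2. 4.]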
Example #20
    def _setup_actor_optimizer(self):
        """
        setup the optimizer for the actor
        """
        if self.verbose >= 2:
            logger.info('setting up actor optimizer')

        if self.ro:
            split_group_action_raw = tf.split(self.augmented_action_raw,
                                              self.batch_size,
                                              axis=0)
            split_group_action = tf.split(self.augmented_action,
                                          self.batch_size,
                                          axis=0)
            split_group_q = tf.split(self.augmented_critic_with_actor_tf,
                                     self.batch_size,
                                     axis=0)

            self.actor_loss = 0
            q_stds = []
            for idx in range(self.batch_size):
                # softmax = tf.nn.softmax(split_group_q[idx] -
                #                         tf.reduce_max(split_group_q[idx], axis=0, keepdims=True), axis=0)
                # self.actor_loss = self.actor_loss + tf.reduce_sum(
                #     tf.reduce_sum(tf.square(split_group_action_raw[idx] -
                #                             tf.stop_gradient(split_group_action[idx])),
                #                   axis=1)
                #     * tf.stop_gradient(softmax))

                max_index = tf.argmax(split_group_q[idx], axis=0)
                q_std = tf.math.reduce_std(split_group_q[idx]) * 20
                target_action = split_group_action[idx][max_index, :]
                if self.adjust_lr:
                    self.actor_loss = self.actor_loss + \
                        tf.reduce_mean(tf.square(self.actor_tf[idx, :] - tf.stop_gradient(target_action))) \
                        / tf.stop_gradient(q_std)
                else:
                    self.actor_loss = self.actor_loss + \
                                      tf.reduce_mean(tf.square(self.actor_tf[idx, :] - tf.stop_gradient(target_action)))
                q_stds.append(q_std)
            # tf.summary.histogram("q_std", tf.stack(q_stds, axis=0))
        else:
            self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)

        actor_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/pi/')
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        if self.verbose >= 2:
            logger.info('  actor shapes: {}'.format(actor_shapes))
            logger.info('  actor params: {}'.format(actor_nb_params))
        # self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
        #                                     clip_norm=self.clip_norm)
        # self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999,
        #                                epsilon=1e-08)
        self.actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.actor_lr)
        self.actor_gradients = self.actor_optimizer.compute_gradients(
            self.actor_loss, var_list=tf_util.get_trainable_vars("model/pi/"))
        hist_summary = []
        for gradient, variable in self.actor_gradients:
            if gradient is not None:
                hist_summary.append(
                    tf.summary.histogram("gradients/" + variable.name,
                                         gradient))
                hist_summary.append(
                    tf.summary.histogram("variables/" + variable.name,
                                         variable))
        self.actor_gradient_summary = tf.summary.merge(hist_summary)
        self.actor_optimize_op = self.actor_optimizer.apply_gradients(
            self.actor_gradients)
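
Note: the loop above regresses the actor toward the highest-scoring sampled action in each group. A standalone NumPy sketch of that target construction follows; the batch size, noise range, and the random stand-in for the critic are illustrative.

import numpy as np

rng = np.random.default_rng(0)
batch_size, sample_number, act_dim = 4, 8, 2

actions = rng.uniform(-1, 1, size=(batch_size, act_dim))      # actor output
candidates = np.repeat(actions, sample_number, axis=0)
noise = rng.uniform(-0.1, 0.1, size=candidates.shape)
noise[sample_number - 1::sample_number] = 0.0                 # keep one unperturbed copy per group
candidates = np.clip(candidates + noise, -1, 1)

q = rng.normal(size=batch_size * sample_number)               # stand-in for critic scores
best_idx = q.reshape(batch_size, sample_number).argmax(axis=1)
target = candidates.reshape(batch_size, sample_number, act_dim)[
    np.arange(batch_size), best_idx]                          # treated as a constant target
loss = np.mean((actions - target) ** 2)
print(loss)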
Example #21
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert isinstance(self.action_space, gym.spaces.Box), \
                "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space)
            assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \
                                                        "an instance of DDPGPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self._setup_learn(self.seed)
                # self.sess = tf_util.single_threaded_session(graph=self.graph)
                self.sess = tf_util.make_session()

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Observation normalization.
                    # if self.normalize_observations:
                    #     with tf.variable_scope('obs_rms'):
                    #         self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
                    # else:
                    #     self.obs_rms = None

                    # Return normalization.
                    # if self.normalize_returns:
                    #     with tf.variable_scope('ret_rms'):
                    #         self.ret_rms = RunningMeanStd()
                    # else:
                    #     self.ret_rms = None

                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space, 1, 1, None,
                                                 **self.policy_kwargs)

                    # Create target networks.
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space, 1, 1,
                                                     None,
                                                     **self.policy_kwargs)
                    self.obs_target = self.target_policy.obs_ph
                    self.action_target = self.target_policy.action_ph

                    # normalized_obs0 = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms),
                    #                                    self.observation_range[0], self.observation_range[1])
                    # normalized_obs1 = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms),
                    #                                    self.observation_range[0], self.observation_range[1])

                    # Inputs.
                    self.obs_train_ph = self.policy_tf.obs_ph
                    self.action_train_ph = self.policy_tf.action_ph
                    self.terminals1 = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='terminals1')
                    self.rewards = tf.placeholder(tf.float32,
                                                  shape=(None, 1),
                                                  name='rewards')
                    self.critic_target = tf.placeholder(tf.float32,
                                                        shape=(None, 1),
                                                        name='critic_target')

                # Create networks and core TF parts that are shared across setup parts.
                with tf.variable_scope("model", reuse=False):
                    self.actor_tf = self.policy_tf.make_actor(
                        self.policy_tf.processed_obs)
                    self.critic_tf = self.policy_tf.make_critic(
                        self.policy_tf.processed_obs, self.action_train_ph)
                    self.critic_with_actor_tf = self.policy_tf.make_critic(
                        self.policy_tf.processed_obs,
                        self.actor_tf,
                        reuse=True)

                    if self.ro:

                        def tf_repeat(tensor_to_repeat, repeat_num):
                            tiled = tf.tile(tensor_to_repeat, [1, repeat_num])
                            repeated = tf.reshape(
                                tiled,
                                shape=[
                                    self.batch_size * repeat_num,
                                    tensor_to_repeat.shape[1]
                                ])
                            return repeated

                        self.augmented_obs0 = tf_repeat(
                            self.policy_tf.processed_obs, self.sample_number)
                        self.augmented_action_raw = tf_repeat(
                            self.actor_tf, self.sample_number)
                        noises = []
                        for b_index in range(self.batch_size):
                            noises.append(
                                tf.random_uniform((self.sample_number - 1, ) +
                                                  self.action_space.shape,
                                                  -0.1, 0.1))
                            noises.append(
                                tf.zeros((1, ) + self.action_space.shape))
                        noises = tf.concat(noises, axis=0)
                        self.augmented_action = self.augmented_action_raw + noises
                        self.augmented_action = tf.clip_by_value(
                            self.augmented_action, -1, 1)
                        self.augmented_critic_with_actor_tf = self.policy_tf.make_critic(
                            self.augmented_obs0,
                            self.augmented_action,
                            reuse=True)[:, 0]

                with tf.variable_scope("target", reuse=False):
                    critic_target = \
                        self.target_policy.make_critic(self.target_policy.processed_obs,
                                                       self.target_policy.make_actor(self.target_policy.processed_obs))

                with tf.variable_scope("loss", reuse=False):
                    # self.critic_tf = denormalize(
                    #     tf.clip_by_value(self.critic_tf, self.return_range[0], self.return_range[1]),
                    #     self.ret_rms)
                    #
                    # self.critic_with_actor_tf = denormalize(
                    #     tf.clip_by_value(self.critic_with_actor_tf,
                    #                      self.return_range[0], self.return_range[1]),
                    #     self.ret_rms)
                    #
                    # q_obs1 = denormalize(critic_target, self.ret_rms)
                    self.target_q = self.rewards + (
                        1. - self.terminals1) * self.gamma * critic_target

                    # tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target))
                    if self.full_tensorboard_log:
                        tf.summary.histogram('critic_target',
                                             self.critic_target)

                    # Set up parts.
                    self._setup_stats()
                    self._setup_target_network_updates()

                with tf.variable_scope("input_info", reuse=False):
                    self.reward_summary = tf.summary.scalar(
                        'rewards', tf.reduce_mean(self.rewards))
                    self.obs_summary = tf.summary.scalar(
                        'obs', tf.reduce_mean(self.obs_train_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('rewards', self.rewards)
                        if len(self.observation_space.shape) == 3 and \
                                self.observation_space.shape[0] in [1, 3, 4]:
                            tf.summary.image('observation', self.obs_train_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 self.obs_train_ph)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self._setup_actor_optimizer()
                    self._setup_critic_optimizer()
                    self.actor_loss_summary = tf.summary.scalar(
                        'actor_loss', self.actor_loss)
                    self.critic_loss_summary = tf.summary.scalar(
                        'critic_loss', self.critic_loss)

                self.params = tf_util.get_trainable_vars("model")

                self.target_params = tf_util.get_trainable_vars("target")
                self.obs_rms_params = [
                    var for var in tf.global_variables()
                    if "obs_rms" in var.name
                ]
                self.ret_rms_params = [
                    var for var in tf.global_variables()
                    if "ret_rms" in var.name
                ]

                with self.sess.as_default():
                    self._initialize(self.sess)
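
Note: _setup_target_network_updates is not shown in this excerpt; in DDPG it conventionally installs soft ("Polyak") target updates. A standalone NumPy sketch of that assumed update rule:

import numpy as np

tau = 0.005
source = np.array([1.0, -2.0, 0.5])   # stand-in for the trainable parameters
target = np.zeros_like(source)        # stand-in for the target-network parameters

for _ in range(2000):
    # target <- (1 - tau) * target + tau * source, applied once per train step
    target = (1 - tau) * target + tau * source

print(target)  # approaches source geometrically, at rate (1 - tau) per step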
Example #22
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                # self.replay_buffer = DiscrepancyReplayBuffer(self.buffer_size, scorer=self.policy_tf.get_q_discrepancy)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    if self.recurrent_policy:
                        import inspect
                        policy_tf_args = inspect.signature(self.policy).parameters

                        policy_tf_kwargs = {}
                        if "my_size" in policy_tf_args:
                            policy_tf_kwargs["my_size"] = len(self._get_env_parameters())
                        if "goal_size" in policy_tf_args:
                            policy_tf_kwargs["goal_size"] = self.env.goal_dim  # TODO: need to get this some other way or save it

                        if self.buffer_kwargs is not None:
                            sequence_length = self.buffer_kwargs.get("sequence_length", 1)
                        else:
                            sequence_length = 1

                        self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                     n_batch=self.batch_size,
                                                     n_steps=sequence_length,
                                                     **policy_tf_kwargs, **self.policy_kwargs)
                        self.policy_tf_act = self.policy(self.sess, self.observation_space, self.action_space,
                                                         n_batch=1, **policy_tf_kwargs,
                                                         **self.policy_kwargs)
                        self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                            n_batch=self.batch_size,
                                                            n_steps=sequence_length, **policy_tf_kwargs,
                                                            **self.policy_kwargs)

                        self.dones_ph = self.policy_tf.dones_ph
                    else:
                        self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                     **self.policy_kwargs)
                        self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                            **self.policy_kwargs)

                    if hasattr(self.policy_tf, "extra_phs"):
                        for ph_name in self.policy_tf.extra_phs:
                            if "target_" in ph_name:
                                self.train_extra_phs[ph_name] = getattr(self.target_policy_tf,
                                                                        ph_name.replace("target_", "") + "_ph")
                            else:
                                self.train_extra_phs[ph_name] = getattr(self.policy_tf, ph_name + "_ph")

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                self.buffer_is_prioritized = self.buffer_type.__name__ in ["PrioritizedReplayBuffer",
                                                                           "RankPrioritizedReplayBuffer"]

                if self.replay_buffer is None:
                    if self.buffer_is_prioritized:
                        if (self.num_timesteps is not None and self.prioritization_starts > self.num_timesteps) or self.prioritization_starts > 0:
                            self.replay_buffer = ReplayBuffer(self.buffer_size)
                        else:
                            buffer_kw = {"size": self.buffer_size, "alpha": 0.7}
                            if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer":
                                buffer_kw.update(
                                    {"learning_starts": self.prioritization_starts, "batch_size": self.batch_size})
                            self.replay_buffer = self.buffer_type(**buffer_kw)
                    else:
                        replay_buffer_kw = {"size": self.buffer_size}
                        if self.buffer_kwargs is not None:
                            replay_buffer_kw.update(self.buffer_kwargs)
                        if self.recurrent_policy:
                            replay_buffer_kw["rnn_inputs"] = self.policy_tf.rnn_inputs
                        if hasattr(self.policy_tf, "extra_data_names"):
                            replay_buffer_kw["extra_data_names"] = self.policy_tf.extra_data_names
                        self.replay_buffer = self.buffer_type(**replay_buffer_kw)

                if self.recurrent_policy:
                    self.sequence_length = self.replay_buffer.sequence_length
                    self.scan_length = self.replay_buffer.scan_length
                    assert self.scan_length % self.sequence_length == 0

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    if self.recurrent_policy:
                        actor_args = inspect.signature(self.policy_tf.make_actor).parameters
                        critic_args = inspect.signature(self.policy_tf.make_critics).parameters
                        actor_kws = {k: v for k, v in self.train_extra_phs.items() if k in actor_args}
                        critic_kws = {k: v for k, v in self.train_extra_phs.items() if k in critic_args}
                        self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph, **actor_kws)
                        self.policy_act = policy_act = self.policy_tf_act.make_actor(reuse=True)
                        # Use two Q-functions to improve performance by reducing overestimation bias
                        qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, **critic_kws)
                        _, _ = self.policy_tf_act.make_critics(None, self.actions_ph, reuse=True)
                        # Q value when following the current policy
                        qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, **critic_kws,
                                                                     reuse=True)

                        train_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" not in var.name]
                        act_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" in var.name]
                        self.act_ops = [
                            tf.assign(act, train)
                            for act, train in zip(act_params, train_params)
                        ]
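                        # presumably run to copy the training network's weights into
                        # the n_batch=1 "act" network used for action selection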

                    else:
                        self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph)
                        # Use two Q-functions to improve performance by reducing overestimation bias
                        qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph)
                        # Q value when following the current policy
                        qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph,
                                                                     policy_out, reuse=True)

                with tf.variable_scope("target", reuse=False):
                    if self.recurrent_policy:
                        # Create target networks
                        target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph,
                                                                             **actor_kws,
                                                                             dones=self.dones_ph)
                        # Target policy smoothing, by adding clipped noise to target actions
                        target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise)
                        target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip)
                        # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                        noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1)
                        # Q values when following the target policy
                        qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph,
                                                                                    noisy_target_action,
                                                                                    dones=self.dones_ph,
                                                                                    **critic_kws)
                    else:
                        # Create target networks
                        target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph)
                        # Target policy smoothing, by adding clipped noise to target actions
                        target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise)
                        target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip)
                        # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                        noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1)
                        # Q values when following the target policy
                        qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph,
                                                                                    noisy_target_action)

                policy_pre_activation = self.policy_tf.policy_pre_activation

                if self.full_tensorboard_log:
                    for var in tf_util.get_trainable_vars("model"):
                        tf.summary.histogram(var.name, var)
                if self.recurrent_policy and self.policy_tf.keras_reuse:
                    tf.summary.histogram("rnn/PI state", self.policy_tf.pi_state)
                    tf.summary.histogram("rnn/QF1 state", self.policy_tf.qf1_state)
                    tf.summary.histogram("rnn/QF2 state", self.policy_tf.qf2_state)

                # TODO: introduce somewhere here the placeholder for the history that updates the internal state?
                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two target Q-Values (clipped Double-Q Learning)
                    min_qf_target = tf.minimum(qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(
                        self.rewards_ph +
                        (1 - self.terminals_ph) * self.gamma * min_qf_target
                    )

                    if self.clip_q_target is not None:
                        q_backup = tf.clip_by_value(q_backup, self.clip_q_target[0], self.clip_q_target[1], name="q_backup_clipped")

                    # Compute Q-Function loss
                    if self.buffer_is_prioritized:
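                        # Importance-sampling weights from the prioritized replay buffer reweight each squared TD error to correct the sampling bias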
                        self.train_extra_phs["is_weights"] = tf.placeholder(tf.float32, shape=(None, 1), name="is_weights")
                        qf1_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf1) ** 2)
                        qf2_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf2) ** 2)
                    else:
                        qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2)
                        qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2)

                    qvalues_losses = qf1_loss + qf2_loss

                    rew_loss = tf.reduce_mean(qf1_pi)
                    action_loss = self.action_l2_scale * tf.nn.l2_loss(policy_pre_activation)

                    # Policy loss: maximise the Q value
                    self.policy_loss = policy_loss = -rew_loss + action_loss
                    if hasattr(self.policy_tf, "policy_loss"):
                        tf.summary.scalar("custom_policy_loss", self.policy_tf.policy_loss)
                        self.policy_loss += self.policy_tf.policy_loss
                        policy_loss = self.policy_loss

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    policy_vars = tf_util.get_trainable_vars("model/pi") + tf_util.get_trainable_vars("model/shared")
                    policy_train_op = policy_optimizer.minimize(policy_loss, var_list=policy_vars)
                    self.policy_train_op = policy_train_op
                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    qvalues_params = tf_util.get_trainable_vars('model/values_fn/') + tf_util.get_trainable_vars("model/shared/")

                    # Q Values and policy target params
                    source_params = tf_util.get_trainable_vars("model/")
                    target_params = tf_util.get_trainable_vars("target/")

                    if self.recurrent_policy:
                        source_params = [var for var in tf_util.get_trainable_vars("model/") if "act" not in var.name]

                    # Polyak averaging for target variables
                    self.target_ops = [
                        tf.assign(target, (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params)

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [qf1_loss, qf2_loss,
                                     qf1, qf2, train_values_op]
                    if hasattr(self.policy_tf, "step_ops"):
                        self.step_ops.extend(self.policy_tf.step_ops)
                    self.policy_step_ops = [self.policy_train_op, self.target_ops, self.policy_loss]
                    if hasattr(self.policy_tf, "policy_step_ops"):
                        self.policy_step_ops.extend(self.policy_tf.policy_step_ops)

                    if self.recurrent_policy and self.policy_tf.save_state:
                        if self.policy_tf.share_lstm:
                            state_objects = [self.policy_tf.state]
                            if self.target_policy_tf.save_target_state:
                                state_objects.append(self.target_policy_tf.state)
                        else:
                            state_objects = [self.policy_tf.pi_state, self.policy_tf.qf1_state, self.policy_tf.qf2_state]
                            if self.target_policy_tf.save_target_state:
                                state_objects.extend([self.target_policy_tf.pi_state, self.target_policy_tf.qf1_state,
                                                      self.target_policy_tf.qf2_state])
                        self.step_ops.extend(state_objects)

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar("rew_loss", rew_loss)
                    tf.summary.scalar("action_loss", action_loss)
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/")

                if self.full_tensorboard_log:
                    policy_grads = policy_optimizer.compute_gradients(policy_loss)
                    for g in policy_grads:
                        if g[0] is not None and g[1] in policy_vars:
                            tf.summary.histogram("grad-policy/{}".format(g[1].name), g[0])

                    qf_grads = qvalues_optimizer.compute_gradients(qvalues_losses)
                    for g in qf_grads:
                        if g[0] is not None and g[1] in qvalues_params:
                            tf.summary.histogram("grad-qf/{}".format(g[1].name), g[0])

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
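For reference, a minimal NumPy sketch (not part of the original source) of the critic target assembled in the example above: target policy smoothing with clipped Gaussian noise, followed by the clipped double-Q backup. The callables q1_target_fn and q2_target_fn are hypothetical stand-ins for the target critics.

import numpy as np

def td3_q_backup(rewards, terminals, target_actions, q1_target_fn, q2_target_fn,
                 gamma=0.99, noise_std=0.2, noise_clip=0.5):
    """Regression target shared by both critics (arrays of shape (batch, 1))."""
    # Target policy smoothing: clipped Gaussian noise on the target actions
    noise = np.clip(np.random.normal(0.0, noise_std, size=target_actions.shape),
                    -noise_clip, noise_clip)
    noisy_actions = np.clip(target_actions + noise, -1.0, 1.0)  # tanh bounds
    # Clipped double-Q learning: take the minimum of the two target critics
    min_qf_target = np.minimum(q1_target_fn(noisy_actions), q2_target_fn(noisy_actions))
    return rewards + (1.0 - terminals) * gamma * min_qf_target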
Example #23
#!/usr/bin/python
Example #24
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            # Enable continuous actions tricks (normalized advantage)
            self.continuous_actions = isinstance(self.action_space, Box)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                self.params = params = tf_util.get_trainable_vars("model")

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope(
                        "loss",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("loss")):
                    self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                    self.rewards_ph = rewards_ph = tf.placeholder(
                        tf.float32, [None])
                    self.learning_rate_ph = learning_rate_ph = tf.placeholder(
                        tf.float32, [])
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None])

                    neg_log_prob = train_model.proba_distribution.neglogp(
                        self.actions_ph)

                    # training loss
                    pg_loss = tf.reduce_mean(advs_ph * neg_log_prob)
                    self.entropy = entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())
                    self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                    self.vf_loss = vf_loss = mse(
                        tf.squeeze(train_model.value_fn), rewards_ph)
                    train_loss = pg_loss + self.vf_coef * vf_loss

                    # Fisher loss construction
                    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(
                        neg_log_prob)
                    sample_net = train_model.value_fn + tf.random_normal(
                        tf.shape(train_model.value_fn))
                    self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                        tf.pow(
                            train_model.value_fn -
                            tf.stop_gradient(sample_net), 2))
                    self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', pg_loss)
                    tf.summary.scalar('policy_gradient_fisher_loss',
                                      pg_fisher_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('value_function_fisher_loss',
                                      vf_fisher_loss)
                    tf.summary.scalar('loss', train_loss)

                    self.grads_check = tf.gradients(train_loss, params)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                with tf.variable_scope(
                        "kfac",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("kfac")):
                    with tf.device('/gpu:0'):
                        self.optim = optim = kfac.KfacOptimizer(
                            learning_rate=learning_rate_ph,
                            clip_kl=self.kfac_clip,
                            momentum=0.9,
                            kfac_update=self.kfac_update,
                            epsilon=0.01,
                            stats_decay=0.99,
                            async_eigen_decomp=self.async_eigen_decomp,
                            cold_iter=10,
                            max_grad_norm=self.max_grad_norm,
                            verbose=self.verbose)

                        optim.compute_and_apply_stats(self.joint_fisher,
                                                      var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
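Example #25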
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.compat.v1.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    # Target advantage function (if applicable)
                    atarg = tf.compat.v1.placeholder(dtype=tf.float32,
                                                     shape=[None])

                    # Empirical return
                    ret = tf.compat.v1.placeholder(dtype=tf.float32,
                                                   shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.compat.v1.placeholder(name='lrmult',
                                                      dtype=tf.float32,
                                                      shape=[])

                    # Annealed clipping parameter epsilon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(input_tensor=kloldnew)
                    meanent = tf.reduce_mean(input_tensor=ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action_ph) -
                        old_pi.proba_distribution.logp(action_ph))

                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(
                        input_tensor=tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        input_tensor=tf.square(self.policy_pi.value_flat -
                                               ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.compat.v1.summary.scalar('entropy_loss', pol_entpen)
                    tf.compat.v1.summary.scalar('policy_gradient_loss',
                                                pol_surr)
                    tf.compat.v1.summary.scalar('value_function_loss', vf_loss)
                    tf.compat.v1.summary.scalar('approximate_kullback-leibler',
                                                meankl)
                    tf.compat.v1.summary.scalar('clip_factor', clip_param)
                    tf.compat.v1.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.compat.v1.assign(oldv, newv)
                            for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.compat.v1.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.compat.v1.variable_scope("input_info", reuse=False):
                    tf.compat.v1.summary.scalar(
                        'discounted_rewards', tf.reduce_mean(input_tensor=ret))
                    tf.compat.v1.summary.scalar(
                        'learning_rate',
                        tf.reduce_mean(input_tensor=self.optim_stepsize))
                    tf.compat.v1.summary.scalar(
                        'advantage', tf.reduce_mean(input_tensor=atarg))
                    tf.compat.v1.summary.scalar(
                        'clip_range',
                        tf.reduce_mean(input_tensor=self.clip_param))

                    if self.full_tensorboard_log:
                        tf.compat.v1.summary.histogram('discounted_rewards',
                                                       ret)
                        tf.compat.v1.summary.histogram('learning_rate',
                                                       self.optim_stepsize)
                        tf.compat.v1.summary.histogram('advantage', atarg)
                        tf.compat.v1.summary.histogram('clip_range',
                                                       self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.compat.v1.summary.image('observation', obs_ph)
                        else:
                            tf.compat.v1.summary.histogram(
                                'observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.compat.v1.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
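A minimal NumPy sketch (not part of the original source) of the pessimistic clipped surrogate assembled above; ratio, adv and clip_param are plain arrays/floats standing in for the graph tensors.

import numpy as np

def ppo_clip_surrogate(ratio, adv, clip_param):
    """PPO's L^CLIP objective, negated so that it can be minimized."""
    surr1 = ratio * adv                                                # conservative policy iteration surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv  # clipped surrogate
    return -np.mean(np.minimum(surr1, surr2))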
Example #26
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                if self.replay_buffer and len(self.replay_buffer) > 0:
                    # TODO: maybe substitute with a prioritized buffer to give preference to the transitions added
                    # during continual learning
                    pass
                else:
                    self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name="terminals")
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name="rewards")
                    self.actions_ph = tf.placeholder(
                        tf.float32,
                        shape=(None, ) + self.action_space.shape,
                        name="actions",
                    )
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    (
                        self.deterministic_action,
                        policy_out,
                        logp_pi,
                    ) = self.policy_tf.make_actor(self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True,
                    )
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True,
                    )

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == "auto":
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(
                            self.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef,
                                  str) and self.ent_coef.startswith("auto"):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if "_" in self.ent_coef:
                            init_value = float(self.ent_coef.split("_")[1])
                            assert init_value > 0.0, "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable(
                            "log_ent_coef",
                            dtype=tf.float32,
                            initializer=np.log(init_value).astype(np.float32),
                        )
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef *
                            tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi -
                                                    qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss,
                        var_list=tf_util.get_trainable_vars("model/pi"))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = tf_util.get_trainable_vars(
                        "model/values_fn")

                    source_params = tf_util.get_trainable_vars(
                        "model/values_fn/vf")
                    target_params = tf_util.get_trainable_vars(
                        "target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            "policy_loss",
                            "qf1_loss",
                            "qf2_loss",
                            "value_loss",
                            "entropy",
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss,
                            qf1_loss,
                            qf2_loss,
                            value_loss,
                            qf1,
                            qf2,
                            value_fn,
                            logp_pi,
                            self.entropy,
                            policy_train_op,
                            train_values_op,
                        ]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(
                                    ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += [
                                    "ent_coef_loss", "ent_coef"
                                ]
                                self.step_ops += [
                                    ent_coef_op,
                                    ent_coef_loss,
                                    self.ent_coef,
                                ]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar("policy_loss", policy_loss)
                    tf.summary.scalar("qf1_loss", qf1_loss)
                    tf.summary.scalar("qf2_loss", qf2_loss)
                    tf.summary.scalar("value_loss", value_loss)
                    tf.summary.scalar("entropy", self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar("ent_coef_loss", ent_coef_loss)
                        tf.summary.scalar("ent_coef", self.ent_coef)

                    tf.summary.scalar("learning_rate",
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars(
                    "target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
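For orientation, a small NumPy sketch (not part of the original source) of the soft targets and temperature loss built above; all arguments are plain arrays standing in for the graph tensors.

import numpy as np

def sac_targets(rewards, terminals, value_target, min_qf_pi, logp_pi, ent_coef, gamma=0.99):
    """Regression targets for the two critics and for the value function."""
    q_backup = rewards + (1.0 - terminals) * gamma * value_target  # critic target
    v_backup = min_qf_pi - ent_coef * logp_pi                      # value-function target
    return q_backup, v_backup

def ent_coef_loss(log_ent_coef, logp_pi, target_entropy):
    """Temperature loss when the entropy coefficient is learned (ent_coef = exp(log_ent_coef))."""
    return -np.mean(log_ent_coef * (logp_pi + target_entropy))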
Example #27
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                "instance of common.policies.ActorCriticPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.n_batch = self.n_envs * self.n_steps

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.actions_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())
                    self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                    self.vf_loss = mse(tf.squeeze(train_model.value_flat),
                                       self.rewards_ph)
                    # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4
                    # and https://github.com/dennybritz/reinforcement-learning/issues/34
                    # suggest adding an entropy component in order to improve exploration.
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('loss', loss)

                    self.params = tf_util.get_trainable_vars("model")
                    grads = tf.gradients(loss, self.params)
                    if self.max_grad_norm is not None:
                        grads, _ = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))
                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                trainer = tf.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate_ph,
                    decay=self.alpha,
                    epsilon=self.epsilon)
                self.apply_backprop = trainer.apply_gradients(grads)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                self.attention = step_model.attention
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
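A compact NumPy sketch (not part of the original source) of the combined A2C loss built above: the advantage-weighted policy-gradient term, minus an entropy bonus, plus a weighted value loss. The default coefficients are purely illustrative.

import numpy as np

def a2c_loss(advs, neglogp, values, returns, entropy, ent_coef=0.01, vf_coef=0.25):
    """Mirrors: loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef."""
    pg_loss = np.mean(advs * neglogp)           # policy gradient loss
    vf_loss = np.mean((values - returns) ** 2)  # value function (MSE) loss
    return pg_loss - ent_coef * entropy + vf_coef * vf_loss

Example #28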
    def setup_model(self):
        # prevent import loops

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            print("number of workers are", self.nworkers)
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)
                self._setup_learn(self.seed)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)
                # Network for phi
                with tf.variable_scope("phi", reuse=False):
                    self.policy_phi = self.policy(self.sess,
                                                  self.observation_space,
                                                  self.action_space,
                                                  self.n_envs,
                                                  1,
                                                  None,
                                                  reuse=False,
                                                  **self.policy_kwargs)
                # Network for phi old
                with tf.variable_scope("oldphi", reuse=False):
                    self.policy_phi_old = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      None,
                                                      reuse=False,
                                                      **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[
                        None
                    ])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32,
                                         shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    #kloldnew = self.policy_pi.proba_distribution.kl(old_policy.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    vf_phi_err = tf.reduce_mean(
                        tf.square(self.policy_phi.value_flat - ret))
                    vf_phi_old_err = tf.reduce_mean(
                        tf.square(self.policy_phi_old.value_flat))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]
                    all_var_oldpi_list = tf_util.get_trainable_vars("oldpi")
                    var_oldpi_list = [
                        v for v in all_var_oldpi_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]

                    all_var_phi_list = tf_util.get_trainable_vars("phi")
                    vf_phi_var_list = [
                        v for v in all_var_phi_list if "/pi" not in v.name
                        and "/logstd" not in v.name and "/q" not in v.name
                    ]
                    all_var_phi_old_list = tf_util.get_trainable_vars("oldphi")
                    vf_phi_old_var_list = [
                        v for v in all_var_phi_old_list if "/pi" not in v.name
                        and "/logstd" not in v.name and "/q" not in v.name
                    ]
                    #print("vars", vf_var_list)
                    self.policy_vars = all_var_list
                    self.oldpolicy_vars = all_var_oldpi_list
                    print("all var list", all_var_list)
                    print("phi vars", vf_phi_var_list)
                    print("phi old vars", vf_phi_old_var_list)

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))
                    self.compute_vf_phi_lossandgrad = tf_util.function(
                        [observation, self.policy_phi.obs_ph, ret],
                        tf_util.flatgrad(vf_phi_err, vf_phi_var_list))
                    self.compute_vf_loss = tf_util.function(
                        [observation, old_policy.obs_ph, ret], vferr)
                    self.compute_vf_phi_loss = tf_util.function(
                        [observation, self.policy_phi.obs_ph, ret], vf_phi_err)
                    #self.compute_vf_phi_old_loss = tf_util.function([self.policy_phi_old.obs_ph], vf_phi_old_err)
                    #self.phi_old_obs = np.array([-0.012815  , -0.02076313,  0.07524705,  0.09407324,  0.0901745 , -0.09339058,  0.03544853, -0.03297224])
                    #self.phi_old_obs = self.phi_old_obs.reshape((1, 8))

                    update_phi_old_expr = []
                    for var, var_target in zip(
                            sorted(vf_phi_var_list, key=lambda v: v.name),
                            sorted(vf_phi_old_var_list, key=lambda v: v.name)):
                        update_phi_old_expr.append(var_target.assign(var))
                    update_phi_old_expr = tf.group(*update_phi_old_expr)

                    self.update_phi_old = tf_util.function(
                        [], [], updates=[update_phi_old_expr])

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    @contextmanager
                    def temp_seed(seed):
                        state = np.random.get_state()
                        np.random.seed(seed)
                        try:
                            yield
                        finally:
                            np.random.set_state(state)

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    self.vf_phi_adam = MpiAdam(vf_phi_var_list, sess=self.sess)
                    self.vfadam.sync()
                    self.vf_phi_adam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                self.timed = timed
                self.allmean = allmean
                self.temp_seed = temp_seed

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars(
                    "model") + tf_util.get_trainable_vars("oldpi")

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
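The Fisher-vector product above (self.compute_fvp) is typically fed to a conjugate-gradient solver to approximate the natural-gradient step; the routine below is a generic NumPy sketch of such a solver, not code from this source.

import numpy as np

def conjugate_gradient(fvp, b, iters=10, residual_tol=1e-10):
    """Approximately solve F x = b using only Fisher-vector products fvp(v) = F @ v."""
    x = np.zeros_like(b)
    r = b.copy()
    p = b.copy()
    r_dot_r = r.dot(r)
    for _ in range(iters):
        f_p = fvp(p)
        alpha = r_dot_r / p.dot(f_p)
        x += alpha * p
        r -= alpha * f_p
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < residual_tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x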
Example #29
File: td3.py  Project: vjg28/AnimalAI
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy_tf = self.policy(
                        self.sess, self.observation_space, self.action_space,
                        **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    self.policy_out = policy_out = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Use two Q-functions to improve performance by reducing overestimation bias
                    qf1, qf2 = self.policy_tf.make_critics(
                        self.processed_obs_ph, self.actions_ph)
                    # Q value when following the current policy
                    qf1_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph, policy_out, reuse=True)

                with tf.variable_scope("target", reuse=False):
                    # Create target networks
                    target_policy_out = self.target_policy_tf.make_actor(
                        self.processed_next_obs_ph)
                    # Target policy smoothing, by adding clipped noise to target actions
                    target_noise = tf.random_normal(
                        tf.shape(target_policy_out),
                        stddev=self.target_policy_noise)
                    target_noise = tf.clip_by_value(target_noise,
                                                    -self.target_noise_clip,
                                                    self.target_noise_clip)
                    # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                    noisy_target_action = tf.clip_by_value(
                        target_policy_out + target_noise, -1, 1)
                    # Q values when following the target policy
                    qf1_target, qf2_target = self.target_policy_tf.make_critics(
                        self.processed_next_obs_ph, noisy_target_action)

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two target Q-Values (clipped Double-Q Learning)
                    min_qf_target = tf.minimum(qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * min_qf_target)

                    # Compute Q-Function loss
                    qf1_loss = tf.reduce_mean((q_backup - qf1)**2)
                    qf2_loss = tf.reduce_mean((q_backup - qf2)**2)

                    qvalues_losses = qf1_loss + qf2_loss

                    # Policy loss: maximise q value
                    self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi)

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss,
                        var_list=tf_util.get_trainable_vars('model/pi'))
                    self.policy_train_op = policy_train_op

                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    qvalues_params = tf_util.get_trainable_vars(
                        'model/values_fn/')

                    # Q Values and policy target params
                    source_params = tf_util.get_trainable_vars("model/")
                    target_params = tf_util.get_trainable_vars("target/")

                    # Polyak averaging for target variables
                    self.target_ops = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    train_values_op = qvalues_optimizer.minimize(
                        qvalues_losses, var_list=qvalues_params)

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [
                        qf1_loss, qf2_loss, qf1, qf2, train_values_op
                    ]

                    # Monitor losses in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
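
Before the next example: the two target-side tricks in the TD3 graph above (clipped-noise target policy smoothing and the Polyak soft update) reduce to a few lines of array math. A minimal NumPy sketch for reference; the default values for noise_std, noise_clip, and tau are illustrative assumptions, not values taken from this file:

import numpy as np

rng = np.random.default_rng(0)

def smoothed_target_action(target_action, noise_std=0.2, noise_clip=0.5):
    # Target policy smoothing: add clipped Gaussian noise, then clip the
    # result back into the tanh action bounds [-1, 1].
    noise = np.clip(rng.normal(0.0, noise_std, size=target_action.shape),
                    -noise_clip, noise_clip)
    return np.clip(target_action + noise, -1.0, 1.0)

def polyak_update(target, source, tau=0.005):
    # Soft target update: target <- (1 - tau) * target + tau * source
    return (1.0 - tau) * target + tau * source
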
Example #30
0
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy_tf = self.policy(
                        self.sess, self.observation_space, self.action_space,
                        **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")
                    self.risk_factor_ph = tf.placeholder(tf.float32, [],
                                                         name='risk_factor_ph')

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    self.policy_out = policy_out = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Use two Q-functions to improve performance by reducing overestimation bias
                    if self.model_type == "QR":
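                        # QR: quantile fractions are fixed at the bin midpoints
                        # (2i + 1) / (2 * n_support), tiled across the batch.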
                        self.qrtau = tf.tile(
                            tf.reshape(
                                tf.range(0.5 / self.n_support, 1,
                                         1 / self.n_support),
                                [1, self.n_support]),
                            [tf.shape(self.processed_obs_ph)[0], 1])
                        qf1, qf2 = self.policy_tf.make_critics(
                            self.processed_obs_ph,
                            self.actions_ph,
                            n_support=self.n_support)
                        # Q value when following the current policy
                        qrtau_pi = self.qrtau
                        qf1_pi, _ = self.policy_tf.make_critics(
                            self.processed_obs_ph,
                            policy_out,
                            reuse=True,
                            n_support=self.n_support)
                    elif self.model_type == "IQN":
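                        # IQN: quantile fractions are sampled uniformly per
                        # batch element, clamped to [tau_clamp, 1 - tau_clamp].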
                        self.qrtau = tf.random_uniform([
                            tf.shape(self.processed_obs_ph)[0], self.n_support
                        ],
                                                       minval=self.tau_clamp,
                                                       maxval=1.0 -
                                                       self.tau_clamp)
                        qf1, qf2 = self.policy_tf.make_critics(
                            self.processed_obs_ph,
                            self.actions_ph,
                            model_type=self.model_type,
                            iqn_tau=self.qrtau,
                            n_support=self.n_support)
                        # Q value when following the current policy
                        qrtau_pi = tf.random_uniform([
                            tf.shape(self.processed_obs_ph)[0], self.n_support
                        ],
                                                     minval=self.tau_clamp,
                                                     maxval=1.0 -
                                                     self.tau_clamp)
                        qf1_pi, _ = self.policy_tf.make_critics(
                            self.processed_obs_ph,
                            policy_out,
                            model_type=self.model_type,
                            iqn_tau=qrtau_pi,
                            reuse=True,
                            n_support=self.n_support)

                with tf.variable_scope("target", reuse=False):
                    # Create target networks
                    target_policy_out = self.target_policy_tf.make_actor(
                        self.processed_next_obs_ph)
                    # Target policy smoothing, by adding clipped noise to target actions
                    target_noise = tf.random_normal(
                        tf.shape(target_policy_out),
                        stddev=self.target_policy_noise)
                    target_noise = tf.clip_by_value(target_noise,
                                                    -self.target_noise_clip,
                                                    self.target_noise_clip)
                    # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                    noisy_target_action = tf.clip_by_value(
                        target_policy_out + target_noise, -1 + 1e-2, 1 - 1e-2)
                    # Q values when following the target policy
                    if self.model_type == "QR":
                        target_qrtau = self.qrtau
                        qf1_target, qf2_target = self.target_policy_tf.make_critics(
                            self.processed_next_obs_ph,
                            noisy_target_action,
                            n_support=self.n_support)
                    elif self.model_type == "IQN":
                        target_qrtau = tf.random_uniform([
                            tf.shape(self.processed_next_obs_ph)[0],
                            self.n_support
                        ],
                                                         minval=self.tau_clamp,
                                                         maxval=1.0 -
                                                         self.tau_clamp)
                        qf1_target, qf2_target = self.target_policy_tf.make_critics(
                            self.processed_next_obs_ph,
                            noisy_target_action,
                            model_type=self.model_type,
                            iqn_tau=target_qrtau,
                            n_support=self.n_support)

                with tf.variable_scope("loss", reuse=False):

                    # Risk-sensitive weighting of the policy objective: with
                    # risk_factor > 0 the lower quantiles are overweighted
                    # (risk-averse), with risk_factor < 0 the upper ones are.
                    quantile_weight = 1.0 - self.risk_factor_ph * (
                        2.0 * qrtau_pi - 1.0)
                    # Lowest / highest quantile of the on-policy critic,
                    # logged to tensorboard below.
                    min_quantile = tf.reduce_mean(qf1_pi[:, 0])
                    max_quantile = tf.reduce_mean(qf1_pi[:, -1])

                    # Clipped Double-Q for distributions: compare the two
                    # critics on their top quantile and keep, per sample, the
                    # full quantile vector of the critic whose top quantile is
                    # smaller.
                    qf1_t_flag = qf1_target[:, -1]
                    qf2_t_flag = qf2_target[:, -1]
                    min_flag = qf1_t_flag < qf2_t_flag
                    min_qf_target = tf.where(min_flag, qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1.0 - self.terminals_ph) *
                                                self.gamma * min_qf_target)

                    # Compute Q-Function loss
                    # Broadcast the quantile fractions over
                    # [batch, n_support, n_support]: tau varies along the
                    # predicted-quantile axis, target samples along the last.
                    qrtau = tf.tile(tf.expand_dims(self.qrtau, axis=2),
                                    [1, 1, self.n_support])
                    logit_valid_tile = tf.tile(
                        tf.expand_dims(q_backup, axis=1),
                        [1, self.n_support, 1])

                    # Quantile Huber loss for the first critic: weight the
                    # Huber term by |tau - 1{bellman_error < 0}|, sum over
                    # predicted quantiles and average over target samples.
                    theta_loss_tile = tf.tile(tf.expand_dims(qf1, axis=2),
                                              [1, 1, self.n_support])
                    Huber_loss = tf.compat.v1.losses.huber_loss(
                        logit_valid_tile,
                        theta_loss_tile,
                        reduction=tf.losses.Reduction.NONE,
                        delta=self.kappa) / self.kappa
                    bellman_errors = logit_valid_tile - theta_loss_tile
                    Loss = tf.abs(qrtau - tf.stop_gradient(
                        tf.to_float(bellman_errors < 0))) * Huber_loss
                    qf1_losses = tf.reduce_mean(tf.reduce_sum(Loss, axis=1),
                                                axis=1)
                    qf1_loss = tf.reduce_mean(qf1_losses)

                    # Same quantile Huber loss for the second critic
                    theta_loss_tile = tf.tile(tf.expand_dims(qf2, axis=2),
                                              [1, 1, self.n_support])
                    Huber_loss = tf.compat.v1.losses.huber_loss(
                        logit_valid_tile,
                        theta_loss_tile,
                        reduction=tf.losses.Reduction.NONE,
                        delta=self.kappa) / self.kappa
                    bellman_errors = logit_valid_tile - theta_loss_tile
                    Loss = tf.abs(qrtau - tf.stop_gradient(
                        tf.to_float(bellman_errors < 0))) * Huber_loss
                    qf2_losses = tf.reduce_mean(tf.reduce_sum(Loss, axis=1),
                                                axis=1)
                    qf2_loss = tf.reduce_mean(qf2_losses)
                    qvalues_losses = qf1_loss + qf2_loss

                    # Policy loss: maximise the risk-weighted Q value
                    self.policy_loss = policy_loss = -tf.reduce_mean(
                        tf.multiply(qf1_pi, quantile_weight))
                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    qvalues_params = tf_util.get_trainable_vars(
                        'model/values_fn/')

                    # Q Values and policy target params
                    source_params = tf_util.get_trainable_vars("model/")
                    target_params = tf_util.get_trainable_vars("target/")

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)

                    # Initializing target to match source variables
                    self.target_init_op = tf.group([
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ])

                    train_values_op = qvalues_optimizer.minimize(
                        qvalues_losses, var_list=qvalues_params)

                    with tf.control_dependencies([train_values_op]):
                        policy_train_op = policy_optimizer.minimize(
                            policy_loss,
                            var_list=tf_util.get_trainable_vars('model/pi'))
                        self.policy_train_op = policy_train_op

                    with tf.control_dependencies([self.policy_train_op]):
                        # Polyak averaging for target variables
                        self.target_ops = tf.group([
                            tf.assign(target, (1.0 - self.tau) * target +
                                      self.tau * source) for target, source in
                            zip(target_params, source_params)
                        ])

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [
                        qf1_loss, qf2_loss, qf1, qf2, train_values_op
                    ]

                    # Monitor losses and quantile bounds in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('min_quantile', min_quantile)
                    tf.summary.scalar('max_quantile', max_quantile)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    # Optional weight/gradient histograms would require
                    # computing the gradients explicitly with tf.gradients()
                    # instead of calling optimizer.minimize() directly.

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(self.target_init_op)

                self.summary = tf.summary.merge_all()
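
To make the tiled loss block in this example easier to check, here is a per-sample NumPy sketch of the same quantile Huber loss and risk-weighted policy objective (function names and single-sample shapes are illustrative; the TF graph vectorizes this over the batch):

import numpy as np

def quantile_huber_loss(theta, target, taus, kappa=1.0):
    # theta:  (n,) predicted quantile values
    # target: (m,) target quantile values (treated as constants)
    # taus:   (n,) quantile fractions attached to theta
    # Pairwise Bellman errors: delta[i, j] = target[j] - theta[i]
    delta = target[None, :] - theta[:, None]
    abs_delta = np.abs(delta)
    huber = np.where(abs_delta <= kappa,
                     0.5 * delta ** 2,
                     kappa * (abs_delta - 0.5 * kappa)) / kappa
    # Asymmetric quantile weight |tau - 1{delta < 0}|
    weight = np.abs(taus[:, None] - (delta < 0.0).astype(np.float64))
    # Sum over predicted quantiles, average over target samples
    return (weight * huber).sum(axis=0).mean()

def risk_weighted_policy_loss(q_quantiles, taus, risk_factor=0.0):
    # Mirrors quantile_weight above: risk_factor > 0 overweights the
    # low quantiles (risk-averse behaviour).
    weight = 1.0 - risk_factor * (2.0 * taus - 1.0)
    return -np.mean(weight * q_quantiles)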