Example #1
def aac_loss_def(act_target,
                 adv_target,
                 r_target,
                 pi_logits,
                 pi_vf,
                 pi_prime_logits,
                 entropy_beta,
                 epsilon=None,
                 name='_aac_',
                 verbose=False):
    """
    Advantage Actor Critic loss definition.
    Paper: https://arxiv.org/abs/1602.01783

    Args:
        act_target:      tensor holding policy action targets;
        adv_target:      tensor holding policy estimated advantage targets;
        r_target:        tensor holding policy empirical return targets;
        pi_logits:       policy logits output tensor;
        pi_vf:           policy value function output tensor;
        pi_prime_logits: not used;
        entropy_beta:    entropy regularization constant;
        epsilon:         not used;
        name:            scope;
        verbose:         summary level.

    Returns:
        tensor holding estimated AAC loss;
        list of related tensorboard summaries.
    """
    with tf.name_scope(name + '/aac'):
        neg_pi_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=pi_logits, labels=act_target)
        pi_loss = tf.reduce_mean(neg_pi_log_prob * adv_target)
        vf_loss = 0.5 * tf.losses.mean_squared_error(r_target, pi_vf)
        entropy = tf.reduce_mean(cat_entropy(pi_logits))

        loss = pi_loss + vf_loss - entropy * entropy_beta

        mean_vf = tf.reduce_mean(pi_vf)
        mean_t_target = tf.reduce_mean(r_target)

        summaries = [
            tf.summary.scalar('policy_loss', pi_loss),
            tf.summary.scalar('value_loss', vf_loss),
        ]
        if verbose:
            summaries += [
                tf.summary.scalar('entropy', entropy),
                tf.summary.scalar('value_fn', mean_vf),
                # tf.summary.scalar('empirical_return',mean_t_target),
                # tf.summary.histogram('value_fn', pi_vf),
                # tf.summary.histogram('empirical_return', r_target),
            ]

    return loss, summaries
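
The entropy helper cat_entropy used here (and in the examples below) is external to this listing. A minimal sketch of a numerically stable categorical entropy computed from logits, in the style of OpenAI baselines' a2c utilities, assuming TensorFlow 1.x graph mode:

import tensorflow as tf

def cat_entropy(logits):
    # Row-wise entropy of the categorical distribution defined by `logits`,
    # computed with the log-sum-exp trick for numerical stability.
    a0 = logits - tf.reduce_max(logits, axis=-1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)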
Example #2
def meta_loss_def_1_0(
        act_target_train,
        act_target_test,
        adv_target_train,
        adv_target_test,
        r_target_train,
        r_target_test,
        pi_logits_train,
        pi_logits_test,
        pi_vf_train,
        pi_vf_test,
        pi_prime_logits,
        entropy_beta,
        epsilon=None,
        name='_meta_',
        verbose=False
):
    """
    Meta-loss definition combining estimates from a 'train' and a 'test' episode:
    policy term, both value function losses and an entropy bonus.

    Args:
        act_target_train/test:  tensors holding policy action targets;
        adv_target_train:       not used as written (the policy term is weighted by adv_target_test only);
        adv_target_test:        tensor holding policy estimated advantage targets;
        r_target_train:         not used as written (both value losses regress on r_target_test);
        r_target_test:          tensor holding policy empirical return targets;
        pi_logits_train/test:   policy logits output tensors;
        pi_vf_train/test:       policy value function output tensors;
        pi_prime_logits:        not used;
        entropy_beta:           entropy regularization constant;
        epsilon:                not used;
        name:                   scope;
        verbose:                summary level.

    Returns:
        tensor holding estimated meta-loss;
        list of related tensorboard summaries.
    """
    with tf.name_scope(name + '/meta'):
        neg_pi_log_prob_train = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=pi_logits_train,
            labels=act_target_train
        )
        neg_pi_log_prob_test = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=pi_logits_test,
            labels=act_target_test
        )
        pi_loss = tf.reduce_mean(
            (neg_pi_log_prob_train + neg_pi_log_prob_test) * adv_target_test
        )
        vf_loss_train = 0.5 * tf.losses.mean_squared_error(r_target_test, pi_vf_train)
        vf_loss_test = 0.5 * tf.losses.mean_squared_error(r_target_test, pi_vf_test)

        entropy = tf.reduce_mean(cat_entropy(pi_logits_test))

        loss = pi_loss + vf_loss_test + vf_loss_train - entropy * entropy_beta

        mean_vf_test = tf.reduce_mean(pi_vf_test)
        mean_vf_train = tf.reduce_mean(pi_vf_train)

        summaries = [
            tf.summary.scalar('meta_policy_loss', pi_loss),
            tf.summary.scalar('meta_value_loss_test', vf_loss_test),
        ]
        if verbose:
            summaries += [
                tf.summary.scalar('entropy', entropy),
                tf.summary.scalar('value_fn_test', mean_vf_test),
                tf.summary.scalar('value_fn_train', mean_vf_train)
            ]

    return loss, summaries
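
A calling sketch for the meta-loss above; every name on the right-hand side is an illustrative assumption (placeholders for the six targets and a policy object run once on a 'train' and once on a 'test' rollout), not an identifier from the original project:

# loss, summaries = meta_loss_def_1_0(
#     act_target_train=act_pl_train, act_target_test=act_pl_test,
#     adv_target_train=adv_pl_train, adv_target_test=adv_pl_test,
#     r_target_train=r_pl_train,     r_target_test=r_pl_test,
#     pi_logits_train=pi.train_logits, pi_logits_test=pi.test_logits,
#     pi_vf_train=pi.train_vf,         pi_vf_test=pi.test_vf,
#     pi_prime_logits=None,            # unused by this loss
#     entropy_beta=0.01,
#     verbose=True,
# )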
Example #3
def ppo_loss_def(act_target,
                 adv_target,
                 r_target,
                 pi_logits,
                 pi_vf,
                 pi_prime_logits,
                 entropy_beta,
                 epsilon,
                 name='_ppo_',
                 verbose=False):
    """
    PPO clipped surrogate loss definition, as (7) in https://arxiv.org/pdf/1707.06347.pdf

    Args:
        act_target:      tensor holding policy action targets;
        adv_target:      tensor holding policy estimated advantage targets;
        r_target:        tensor holding policy empirical return targets;
        pi_logits:       policy logits output tensor;
        pi_vf:           policy value function output tensor;
        pi_prime_logits: old_policy logits output tensor;
        entropy_beta:    entropy regularization constant;
        epsilon:         L^Clip epsilon tensor;
        name:            scope;
        verbose:         summary level.

    Returns:
        tensor holding estimated PPO L^Clip loss;
        list of related tensorboard summaries.
    """
    #act_target = tf.placeholder(tf.float32, [None, env.action_space.n], name="on_policy_action_pl")
    #adv_target = tf.placeholder(tf.float32, [None], name="on_policy_advantage_pl")
    #r_target = tf.placeholder(tf.float32, [None], name="on_policy_return_pl")
    with tf.name_scope(name + '/ppo'):
        pi_log_prob = -tf.nn.softmax_cross_entropy_with_logits(
            logits=pi_logits, labels=act_target)
        pi_old_log_prob = tf.stop_gradient(
            -tf.nn.softmax_cross_entropy_with_logits(logits=pi_prime_logits,
                                                     labels=act_target))
        pi_ratio = tf.exp(pi_log_prob - pi_old_log_prob)

        surr1 = pi_ratio * adv_target  # surrogate from conservative policy iteration
        surr2 = tf.clip_by_value(pi_ratio, 1.0 - epsilon,
                                 1.0 + epsilon) * adv_target

        pi_surr_loss = -tf.reduce_mean(tf.minimum(
            surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
        vf_loss = tf.losses.mean_squared_error(r_target, pi_vf)  # V.fn. loss
        entropy = tf.reduce_mean(cat_entropy(pi_logits))

        loss = pi_surr_loss + vf_loss - entropy * entropy_beta

        # Info:
        mean_pi_ratio = tf.reduce_mean(pi_ratio)
        mean_vf = tf.reduce_mean(pi_vf)
        mean_kl_old_new = tf.reduce_mean(
            kl_divergence(pi_prime_logits, pi_logits))

        summaries = [
            tf.summary.scalar('l_clip_loss', pi_surr_loss),
            tf.summary.scalar('value_loss', vf_loss),
        ]
        if verbose:
            summaries += [
                tf.summary.scalar('entropy', entropy),
                tf.summary.scalar('Dkl_old_new', mean_kl_old_new),
                tf.summary.scalar('pi_ratio', mean_pi_ratio),
                tf.summary.scalar('value_f', mean_vf),
            ]

    return loss, summaries
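
The kl_divergence helper feeding the Dkl_old_new summary is also external to this listing. A sketch of the KL divergence between two categorical distributions given by logits, in the same stable log-sum-exp style as cat_entropy above (an assumption, not necessarily the project's exact implementation):

def kl_divergence(logits_1, logits_2):
    # Row-wise D_KL( softmax(logits_1) || softmax(logits_2) ).
    a0 = logits_1 - tf.reduce_max(logits_1, axis=-1, keepdims=True)
    a1 = logits_2 - tf.reduce_max(logits_2, axis=-1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
    z1 = tf.reduce_sum(tf.exp(a1), axis=-1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * ((a0 - tf.log(z0)) - (a1 - tf.log(z1))), axis=-1)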
Example #4
File: model.py  Project: jm16/boosters
    def __init__(
            self,
            data_batch,
            labels_batch=None,
            keep_prob=tf.ones([],),
            activation=tf.nn.elu,
            name='model',
            reuse=False):
        """
        Builds the model graph: a causal conv-1d attention encoder, a noisy dense
        layer and two heads, a regression head for the log-sum target and a
        2-class flag head; losses and an AUC metric are defined when labels_batch
        is provided.

        Args:
            data_batch:    dict of input tensors, expects key 'features';
            labels_batch:  optional dict of label tensors, expects keys
                           'target_sum' and 'target_flag';
            keep_prob:     dropout keep probability (scalar tensor);
            activation:    activation function used throughout;
            name:          variable scope name;
            reuse:         variable scope reuse flag.
        """
        self.data_batch = data_batch
        self.labels_batch = labels_batch
        with tf.variable_scope(name_or_scope=name, reuse=reuse):
            hidden = conv_1d_casual_attention_encoder(
                data_batch['features'],
                keep_prob=keep_prob,
                conv_1d_num_filters=64,
                conv_1d_filter_size=2,
                conv_1d_activation=activation,
                reuse=False,
            )
            hidden = tf.layers.flatten(hidden)

            # print(hidden.shape)

            # hidden = tf.layers.dense(
            #     inputs=hidden,
            #     units=512,
            #     activation=activation,
            # )

            hidden = noisy_linear(
                x=hidden,
                size=64,
                activation_fn=activation,
                name='dense1'
            )
            hidden = tf.nn.dropout(hidden, keep_prob=keep_prob)

            self.predicted_log_sum = tf.layers.dense(
                inputs=hidden,
                units=1,
                activation=activation,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
            )

            # self.predicted_log_sum = noisy_linear(
            #     x=hidden,
            #     size=1,
            #     activation_fn=activation,
            #     name='log_sum'
            # )
            self.predicted_target_sum = tf.clip_by_value(
                tf.exp(self.predicted_log_sum) - 1,
                clip_value_min=0,
                clip_value_max=1e20
            )

            self.predicted_flag_logits = tf.layers.dense(
                inputs=hidden,
                units=2,
                activation=activation,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
            )

            # self.predicted_flag_logits = noisy_linear(
            #     x=tf.concat([hidden, self.predicted_log_sum], axis=-1),
            #     size=2,
            #     activation_fn=activation,
            #     name='flag'
            # )
            self.predicted_flag_probs = tf.nn.softmax(self.predicted_flag_logits)

            self.predicted_flag = tf.argmax(
                self.predicted_flag_probs,
                axis=-1
            )

            self.class_entropy = tf.reduce_mean(cat_entropy(self.predicted_flag_logits))

            self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

            if labels_batch is not None:
                self.regress_loss = tf.losses.mean_squared_error(
                    labels=labels_batch['target_sum'][..., None],
                    predictions=self.predicted_log_sum
                )
                self.regress_loss = tf.sqrt(self.regress_loss)

                self.class_loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        logits=self.predicted_flag_logits,
                        labels=labels_batch['target_flag'],
                    )
                )
                self.auc, self.auc_update_op = tf.metrics.auc(
                    labels=labels_batch['target_flag'],
                    predictions=self.predicted_flag_probs,
                    weights=None,
                    num_thresholds=200,
                )

            else:
                self.regress_loss = None
                self.class_loss = None
                self.auc = 0
                self.auc_update_op = None
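
A minimal training-wiring sketch for the model above; the class name Model, the loss combination, the optimizer and the learning rate are illustrative assumptions (the class header itself is not part of this listing):

# model = Model(data_batch, labels_batch, keep_prob=tf.constant(0.9))
# total_loss = model.regress_loss + model.class_loss
# train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss, var_list=model.var_list)
# with tf.Session() as sess:
#     # tf.metrics.auc keeps its accumulators in local variables:
#     sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
#     sess.run([train_op, model.auc_update_op])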
Example #5
File: aac.py  Project: yanbingms/btgym
    def _make_loss(self, pi, pi_prime, name='base', verbose=True, **kwargs):
        """
        Defines the on-policy AAC loss, the policy state encoder classification loss, placeholders and summaries.

        Args:
            pi:                 policy network obj.
            pi_prime:           optional policy network obj.
            name:               str, name scope
            verbose:            summary level

        Returns:
            tensor holding estimated loss graph
            list of related summaries
        """
        with tf.name_scope(name):
            # On-policy AAC loss definition:
            pi.on_pi_act_target = tf.placeholder(
                tf.float32, [None, self.ref_env.action_space.n],
                name="on_policy_action_pl")
            pi.on_pi_adv_target = tf.placeholder(tf.float32, [None],
                                                 name="on_policy_advantage_pl")
            pi.on_pi_r_target = tf.placeholder(tf.float32, [None],
                                               name="on_policy_return_pl")

            clip_epsilon = tf.cast(
                self.clip_epsilon * self.learn_rate_decayed /
                self.opt_learn_rate, tf.float32)

            on_pi_loss, on_pi_summaries = self.on_policy_loss(
                act_target=pi.on_pi_act_target,
                adv_target=pi.on_pi_adv_target,
                r_target=pi.on_pi_r_target,
                pi_logits=pi.on_logits,
                pi_vf=pi.on_vf,
                pi_prime_logits=pi_prime.on_logits,
                entropy_beta=self.model_beta,
                epsilon=clip_epsilon,
                name='on_policy',
                verbose=verbose)

            # Classification loss for price movements prediction:

            # oracle_labels = tf.one_hot(tf.argmax(pi.expert_actions, axis=-1), depth=4)

            if self.class_use_rnn:
                class_logits = pi.on_logits

            else:
                class_logits = pi.on_simple_logits

            # class_loss = tf.reduce_mean(
            #     tf.nn.softmax_cross_entropy_with_logits_v2(
            #         labels=pi.expert_actions,#oracle_labels,
            #         logits=class_logits,
            #     )
            # )

            class_loss = tf.losses.mean_squared_error(
                labels=pi.expert_actions[..., 1:3],
                predictions=tf.nn.softmax(class_logits)[..., 1:3],
            )
            entropy = tf.reduce_mean(cat_entropy(class_logits))

            # self.accuracy = tf.metrics.accuracy(
            #     labels=tf.argmax(pi.expert_actions, axis=-1),
            #     predictions=tf.argmax(class_logits, axis=-1)
            # )

            self.accuracy = tf.metrics.accuracy(
                labels=tf.argmax(pi.expert_actions[..., 1:3], axis=-1),
                predictions=tf.argmax(class_logits[..., 1:3], axis=-1))

            model_summaries = [
                tf.summary.scalar('class_loss', class_loss),
                tf.summary.scalar('class_accuracy', self.accuracy[0])
            ]
            # Accumulate total loss:
            loss = float(self.class_lambda) * class_loss + float(self.aac_lambda) * on_pi_loss\
                - float(self.model_beta) * entropy

            model_summaries += on_pi_summaries

        return loss, model_summaries
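
Here self.on_policy_loss is expected to be a callable with the same signature as the loss definitions in Examples #1 and #3. A binding sketch; the attribute names other than on_policy_loss, and the place of assignment, are assumptions for illustration:

# e.g. somewhere in the algorithm's constructor:
# self.on_policy_loss = ppo_loss_def    # or aac_loss_def
# ...
# self.loss, model_summaries = self._make_loss(
#     pi=self.local_network, pi_prime=self.local_network_prime, name='base')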