Example #1
    def create_pretrain_loss(self):
        pretrain_loss = self.loss_for_reconstruction()

        # training updates
        train_op = get_train_op(self.learning_rate, pretrain_loss,
                                self.g_params, self.clip_val)
        return pretrain_loss, train_op
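All of these examples delegate the optimizer step to a get_train_op helper that is not shown on this page. As a rough sketch only, assuming it clips gradients and applies a TF1-style Adam update in graph mode (the repository's actual helper may differ in optimizer choice and clipping strategy), it could look like this:

    import tensorflow as tf

    def get_train_op(learning_rate, loss, variables, clip_val=None):
        # Hypothetical sketch of the helper: gradient clipping plus an Adam
        # update, TF1-style graph mode (tf.compat.v1 / tf.gradients).
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
        grads = tf.gradients(loss, variables)
        if clip_val is not None:
            # Clip by global norm to keep the RL-style updates stable.
            grads, _ = tf.clip_by_global_norm(grads, clip_val)
        return optimizer.apply_gradients(zip(grads, variables))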
Example #2
    def create_critic_loss(self, cumulative_rewards, missing=None):
        # Regress the estimated values onto the discounted cumulative rewards.
        # `missing` masks the regression to the relevant time steps; with no
        # mask every step is weighted equally.
        if missing is not None:
            missing = tf.cast(missing, tf.float32)
        else:
            missing = 1.0

        loss = tf.compat.v1.losses.mean_squared_error(labels=cumulative_rewards, predictions=self.estimated_values, weights=missing)
        vars = [v for v in tf.trainable_variables() if v.op.name.startswith(self.name)]
        train_op = get_train_op(self.generator.learning_rate, loss, vars)

        return loss, train_op
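The weights argument simply selects which time steps the critic is fit on. The small self-contained illustration below (made-up numbers, batch of one) spells out the same masked mean-squared error; with 0/1 weights the last line matches what the default reduction of tf.compat.v1.losses.mean_squared_error computes:

    import tensorflow as tf

    targets = tf.constant([[1.0, 2.0, 3.0]])   # cumulative rewards
    preds   = tf.constant([[1.5, 2.0, 0.0]])   # critic's estimated values
    weights = tf.constant([[1.0, 1.0, 0.0]])   # last step masked out

    # Only the unmasked positions contribute: ((1.5 - 1)^2 + 0) / 2 = 0.125
    masked_mse = tf.reduce_sum(weights * tf.square(targets - preds)) / tf.reduce_sum(weights)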
Example #3
    def create_pretrain_loss(self):
        pretrain_recon_loss = self.loss_for_reconstruction()
        # Cross-entropy between the sampled actions and the predicted action
        # distribution, averaged over every position in the batch.
        pretrain_act_loss = -tf.reduce_sum(
            tf.one_hot(tf.cast(tf.reshape(self.acts, [-1]), tf.int32), self.n_actions, 1.0, 0.0) * clip_and_log(
                tf.reshape(self.a_predictions, [-1, self.n_actions])
            )
        ) / (self.sequence_length * self.batch_size)

        # training updates
        pretrain_loss = pretrain_recon_loss + pretrain_act_loss
        train_op = get_train_op(self.learning_rate, pretrain_loss, self.g_params)
        return pretrain_loss, pretrain_act_loss, train_op
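clip_and_log is another helper that does not appear on this page. Judging by how it is used, it presumably clips probabilities away from zero before taking the logarithm so the cross-entropy above stays finite; a minimal sketch under that assumption (the epsilon value is a guess):

    import tensorflow as tf

    def clip_and_log(probs, eps=1e-20):
        # Assumed behaviour: clip into [eps, 1.0] and take the log so that
        # log(0) can never produce -inf values or NaN gradients.
        return tf.math.log(tf.clip_by_value(probs, eps, 1.0))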
Example #4
    def create_adversarial_loss(self, dis_predictions):
        missing = tf.cast(self.missing, tf.float32)

        # Per-step reward: log of the discriminator's probability that the
        # token is real (clipped so the log stays finite).
        rewards = tf.nn.sigmoid(dis_predictions)
        rewards = clip_and_log(rewards)

        log_probs = self.gen_log_p * missing

        rewards_list = tf.unstack(rewards, axis=1)
        missing_list = tf.unstack(missing, axis=1)

        # Cumulative Discounted Returns.  The true value function V*(s).
        cumulative_rewards = []
        for t in range(self.sequence_length):
            cum_value = tf.zeros(shape=[self.batch_size])
            for s in range(t, self.sequence_length):
                cum_value += missing_list[s] * np.power(
                    self.reward_gamma, (s - t)) * rewards_list[s]
            cumulative_rewards.append(cum_value)
        cumulative_rewards = tf.stack(cumulative_rewards, axis=1)
        print("cumulative_rewards:", cumulative_rewards.shape)

        # Fit the critic's value estimates to the cumulative rewards.
        self.critic_loss, self.critic_updates = self.critic.create_critic_loss(
            cumulative_rewards, self.missing)

        # Unstack tensors into per-time-step lists.
        baselines = tf.unstack(self.critic.estimated_values, axis=1)
        log_probs_list = tf.unstack(log_probs, axis=1)

        g_loss = 0.
        for t in range(self.sequence_length):
            log_probability = log_probs_list[t]

            cum_advantage = tf.zeros(shape=[self.batch_size])
            for s in range(t, self.sequence_length):
                cum_advantage += missing_list[s] * np.power(
                    self.reward_gamma, (s - t)) * rewards_list[s]
            cum_advantage -= baselines[t]

            # Clip advantages.
            cum_advantage = tf.clip_by_value(cum_advantage, -self.clip_val,
                                             self.clip_val)
            # REINFORCE term: log-probability of the sampled token at a
            # missing position, weighted by the stop-gradient advantage.
            g_loss += tf.multiply(missing_list[t] * log_probability,
                                  tf.stop_gradient(cum_advantage))

        train_op = get_train_op(self.learning_rate, -g_loss, self.g_params,
                                self.clip_val)
        return g_loss, train_op
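The double loop above implements ordinary discounted returns with a learned baseline subtracted. The toy NumPy walk-through below, with made-up numbers and a batch of one, shows the same computation for a four-step sequence:

    import numpy as np

    gamma   = 0.9
    rewards = np.array([0.5, 0.2, 0.8, 0.1])   # per-step rewards from the discriminator
    mask    = np.array([1.0, 1.0, 0.0, 1.0])   # 1 where the token was missing

    # R_t = sum over s >= t of mask_s * gamma^(s - t) * r_s
    returns = np.array([
        sum(mask[s] * gamma ** (s - t) * rewards[s] for s in range(t, len(rewards)))
        for t in range(len(rewards))
    ])

    baselines  = np.array([0.4, 0.3, 0.2, 0.1])   # critic's value estimates
    advantages = returns - baselines              # multiplied with the log-probs above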
Example #5
    def create_loss(self, fake_predictions, real_predictions, missing):
        real_labels = tf.ones_like(real_predictions)
        # Filled-in (missing) positions are labelled fake (0); positions the
        # generator merely copied through keep the real label (1).
        fake_labels = 1 - missing

        loss_real = tf.compat.v1.losses.sigmoid_cross_entropy(real_labels,
                                                              real_predictions,
                                                              weights=missing)
        loss_fake = tf.compat.v1.losses.sigmoid_cross_entropy(fake_labels,
                                                              fake_predictions,
                                                              weights=missing)

        loss = (loss_fake + loss_real) / 2.

        vars = [
            param for param in tf.trainable_variables()
            if 'discriminator' in param.name
        ]
        train_op = get_train_op(self.generator.learning_rate, loss, vars)
        return loss_fake, loss_real, train_op
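The weighted sigmoid cross-entropy can be spelled out with the lower-level tf.nn op to make the masking explicit. The sketch below is only an illustration (made-up numbers); with 0/1 weights it mirrors what the tf.compat.v1.losses call computes under its default reduction, which averages over the non-zero weights:

    import tensorflow as tf

    logits  = tf.constant([[2.0, -1.0, 0.5]])
    labels  = tf.constant([[1.0,  0.0, 1.0]])
    weights = tf.constant([[1.0,  1.0, 0.0]])   # third position ignored

    per_token = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
    loss = tf.reduce_sum(per_token * weights) / tf.reduce_sum(weights)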
Example #6
    def create_loss(self, fake_predictions, real_predictions,
                    fake_sequence, real_sequence,
                    fake_missing, real_missing,
                    fake_weights=1.0, real_weights=1.0):

        real_labels = tf.ones_like(real_predictions)
        fake_labels = tf.zeros_like(fake_predictions)

        real_presented = tf.cast(real_sequence, tf.float32) * (1 - real_missing)
        fake_presented = tf.cast(fake_sequence, tf.float32) * (1 - fake_missing)

        # Presented token ids are positive, while masked positions were zeroed
        # out above; replace those zeros with -1 so they can never match the
        # real sequence in the comparison below.
        _fake_presented = tf.where(tf.math.equal(fake_presented, 0), tf.ones_like(fake_presented) * -1, fake_presented)
        # Generated tokens that reproduce the presented real token are
        # relabelled as real.
        fake_labels = tf.where(tf.math.equal(real_presented, _fake_presented), real_labels, fake_labels)

        loss_real = tf.compat.v1.losses.sigmoid_cross_entropy(real_labels, real_predictions, weights=real_weights)
        loss_fake = tf.compat.v1.losses.sigmoid_cross_entropy(fake_labels, fake_predictions, weights=fake_weights)

        loss = (loss_fake + loss_real) / 2.

        vars = [param for param in tf.trainable_variables() if 'discriminator' in param.name]
        train_op = get_train_op(self.generator.learning_rate, loss, vars)

        return loss_fake, loss_real, train_op
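The tf.where relabelling above is easier to see on concrete values. A NumPy walk-through with made-up token ids and a single shared mask (positions where the generator reproduces the presented real token receive the real label):

    import numpy as np

    real_seq = np.array([5.0, 7.0, 2.0, 9.0])
    fake_seq = np.array([5.0, 3.0, 2.0, 4.0])
    missing  = np.array([0.0, 1.0, 0.0, 1.0])   # 1 where the token was masked

    real_presented = real_seq * (1 - missing)                             # [5, 0, 2, 0]
    fake_presented = fake_seq * (1 - missing)                             # [5, 0, 2, 0]
    fake_presented = np.where(fake_presented == 0, -1.0, fake_presented)  # [5, -1, 2, -1]

    # Matching presented tokens are labelled real (1); everything else stays fake (0).
    fake_labels = np.where(real_presented == fake_presented, 1.0, 0.0)    # [1, 0, 1, 0]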
Example #7
    def create_adversarial_loss(self, dis_predictions):
        missing = get_mask(self.gen_act)

        # mask_sent = tf.cast(get_mask_for_pad(self.gen_x, self.gen_act), tf.float32)
        mask_sent = tf.ones_like(missing, tf.float32)

        rewards = tf.nn.sigmoid(dis_predictions)
        rewards = clip_and_log(rewards)

        missing = tf.cast(missing, tf.float32)
        present = (1 - missing)

        mask_act_log_probs = clip_and_log(self.gen_mask_act_p)
        util_act_log_probs = clip_and_log(1 - self.gen_mask_act_p)

        gen_log_p = self.gen_log_p
        gen_act_log_p = self.gen_act_log_p - tf.stop_gradient(util_act_log_probs)

        probs4Tok = tf.exp(gen_log_p)
        probs4Man = tf.exp(gen_act_log_p)

        log_probs = gen_log_p * missing
        act_log_probs = gen_act_log_p * present

        rewards_list = tf.unstack(rewards, axis=1)
        missing_list = tf.unstack(missing, axis=1)
        present_list = tf.unstack(present, axis=1)
        mask_sent_list = tf.unstack(mask_sent, axis=1)

        # Cumulative Discounted Returns.  The true value function V*(s).
        cumulative_rewards = []
        for t in range(self.sequence_length):
            cum_value = tf.zeros(shape=[self.batch_size])
            for s in range(t, self.sequence_length):
                cum_value += mask_sent_list[s] * np.power(self.reward_gamma, (s - t)) * rewards_list[s]
            cumulative_rewards.append(cum_value)
        cumulative_rewards = tf.stack(cumulative_rewards, axis=1)

        self.critic_loss, self.critic_updates = self.critic.create_critic_loss(cumulative_rewards, missing=mask_sent)

        baselines = tf.unstack(self.critic.estimated_values, axis=1)
        probs4Tok_list = tf.unstack(probs4Tok, axis=1)
        probs4Man_list = tf.unstack(probs4Man, axis=1)
        log_probs_list = tf.unstack(log_probs, axis=1)
        act_log_probs_list = tf.unstack(act_log_probs, axis=1)
        mask_act_log_probs_list = tf.unstack(mask_act_log_probs, axis=1)
        util_act_log_probs_list = tf.unstack(util_act_log_probs, axis=1)

        g_loss = 0.
        for t in range(self.sequence_length):
            prob = probs4Tok_list[t]
            act_prob = probs4Man_list[t]

            log_probability = log_probs_list[t]
            act_log_probability = act_log_probs_list[t]

            mask_log_prob = mask_act_log_probs_list[t]
            util_log_prob = util_act_log_probs_list[t]

            mask_prob = tf.exp(mask_log_prob)
            util_prob = tf.exp(util_log_prob)

            cum_advantage = tf.zeros(shape=[self.batch_size])
            for s in range(t, self.sequence_length):
                cum_advantage += mask_sent_list[s] * np.power(self.reward_gamma, (s - t)) * rewards_list[s]
            cum_advantage -= baselines[t]

            # Clip advantages.
            cum_advantage = tf.clip_by_value(cum_advantage, -self.clip_val, self.clip_val)

            # REINFORCE terms: token log-probs at missing positions and action
            # log-probs at present positions, plus the corresponding mask and
            # utility action terms; gradients flow only through the explicit
            # log-probability factor in each product.
            g_loss += tf.multiply(missing_list[t] * log_probability * tf.stop_gradient(mask_prob), tf.stop_gradient(cum_advantage))
            g_loss += tf.multiply(present_list[t] * act_log_probability * tf.stop_gradient(util_prob), tf.stop_gradient(cum_advantage))
            g_loss += tf.multiply(missing_list[t] * mask_log_prob * tf.stop_gradient(prob), tf.stop_gradient(cum_advantage))
            g_loss += tf.multiply(present_list[t] * util_log_prob * tf.stop_gradient(act_prob), tf.stop_gradient(cum_advantage))

        train_op = get_train_op(self.learning_rate, -g_loss, self.g_params, self.clip_val)
        return g_loss, train_op
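Each of the four loss terms above multiplies one log-probability by stop_gradient-wrapped partners and advantage, so only that log-probability receives gradients. A tiny eager-mode illustration of the pattern (not code from the repository):

    import tensorflow as tf

    x = tf.Variable(0.5)
    with tf.GradientTape() as tape:
        log_p = tf.math.log(x)
        advantage = 3.0 * x                     # pretend the advantage also depends on x
        surrogate = log_p * tf.stop_gradient(advantage)
    # Gradient flows only through log_p: d/dx = advantage / x = 1.5 / 0.5 = 3.0.
    grad = tape.gradient(surrogate, x)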