def expected_loss_objective(decoder: Decoder,
                            reward_function: RewardFunction,
                            control_variate: str = None) -> Objective:
    """Construct Expected Loss objective for training with bandit feedback.

    'Bandit Structured Prediction for Neural Sequence-to-Sequence Learning'
    Details: http://www.aclweb.org/anthology/P17-1138

    :param decoder: a recurrent decoder to sample from
    :param reward_function: any evaluator object
    :param control_variate: optional 'baseline' average reward
    :return: Objective object to be used in generic trainer
    """
    check_argument_types()

    # run the decoder in sampling mode and keep the logits and the sampled
    # output indices (shape (time, batch))
    # pylint: disable=protected-access
    sample_loop_result = decoder.decoding_loop(train_mode=False, sample=True)
    sample_logits = sample_loop_result[0]
    sample_decoded = sample_loop_result[3]

    reference = decoder.train_inputs

    def _score_with_reward_function(references: np.ndarray,
                                    hypotheses: np.ndarray) -> np.ndarray:
        """Score (time, batch) arrays with sentence-based reward function.

        Parts of the sentence after generated <pad> or </s> are ignored.
        BPE-postprocessing is also included.

        :param references: array of indices of references, shape (time, batch)
        :param hypotheses: array of indices of hypotheses, shape (time, batch)
        :return: an array of batch length with float rewards
        """
        rewards = []
        for refs, hyps in zip(references.transpose(), hypotheses.transpose()):
            ref_seq = []
            hyp_seq = []
            for r_token in refs:
                token = decoder.vocabulary.index_to_word[r_token]
                if token == END_TOKEN or token == PAD_TOKEN:
                    break
                ref_seq.append(token)
            for h_token in hyps:
                token = decoder.vocabulary.index_to_word[h_token]
                if token == END_TOKEN or token == PAD_TOKEN:
                    break
                hyp_seq.append(token)
            # join BPEs, split on " " to prepare list for evaluator
            refs_tokens = " ".join(ref_seq).replace("@@ ", "").split(" ")
            hyps_tokens = " ".join(hyp_seq).replace("@@ ", "").split(" ")
            reward = float(reward_function([hyps_tokens], [refs_tokens]))
            rewards.append(reward)
        return np.array(rewards, dtype=np.float32)
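
    # Illustrative example of the truncation and BPE re-join above:
    #   sampled ids -> tokens ["Do@@", "ing", "well", "</s>", "<pad>", ...]
    #   -> cut at "</s>" -> "Do@@ ing well" -> "Doing well" -> ["Doing", "well"]
    #   which is the token list handed to the reward_function.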

    # rewards, shape (batch)
    sample_reward = tf.py_func(_score_with_reward_function,
                               [reference, sample_decoded], tf.float32)
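    # Note: tf.py_func evaluates the numpy scorer at graph execution time and
    # has no gradient; this is fine here, because the reward is treated as a
    # constant with respect to the model parameters.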

    # if specified, compute the running average reward as a baseline
    baseline = None

    if control_variate == "baseline":
        reward_counter = tf.Variable(0.0, trainable=False,
                                     name="reward_counter")
        reward_sum = tf.Variable(0.0, trainable=False, name="reward_sum")
        # accumulate the number of scored sentences and the total reward
        reward_counter = tf.assign_add(reward_counter,
                                       tf.to_float(decoder.batch_size))
        reward_sum = tf.assign_add(reward_sum, tf.reduce_sum(sample_reward))
        # baseline: average of all rewards observed so far
        baseline = tf.div(reward_sum, tf.maximum(reward_counter, 1.0))

    tf.summary.scalar("sample_{}/reward".format(decoder.data_id),
                      tf.reduce_mean(sample_reward),
                      collections=["summary_train"])

    # REINFORCE score computed from the sampled outputs and their rewards
    sent_loss = reinforce_score(sample_reward, baseline, sample_decoded,
                                sample_logits)

    batch_loss = tf.reduce_mean(sent_loss)

    tf.summary.scalar("train_{}/self_bandit_cost".format(decoder.data_id),
                      batch_loss,
                      collections=["summary_train"])

    return Objective(name="{}_bandit".format(decoder.name),
                     decoder=decoder,
                     loss=batch_loss,
                     gradients=None,
                     weight=None)
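

# A minimal pure-numpy sketch (not part of the original module) of the kind of
# estimator the objective above builds in the TensorFlow graph: the loss is
# the negative, optionally baseline-corrected reward times the sentence
# log-probability, averaged over the batch. The function below is hypothetical
# and purely illustrative; the actual graph computation is done by
# `reinforce_score`.
def _expected_loss_sketch(rewards: np.ndarray,
                          sent_logprobs: np.ndarray,
                          baseline: float = 0.0) -> float:
    """Return the scalar REINFORCE-style loss for one batch of samples."""
    return float(np.mean(-(rewards - baseline) * sent_logprobs))

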
def rl_objective(decoder: Decoder,
                 reward_function: RewardFunction,
                 subtract_baseline: bool = False,
                 normalize: bool = False,
                 temperature: float = 1.,
                 ce_smoothing: float = 0.,
                 alpha: float = 1.,
                 sample_size: int = 1) -> Objective:
    """Construct RL objective for training with sentence-level feedback.

    Depending on the options the objective corresponds to:
    1) sample_size = 1, normalize = False, ce_smoothing = 0.0
     Bandit objective (Eq. 2) described in 'Bandit Structured Prediction for
     Neural Sequence-to-Sequence Learning'
     (http://www.aclweb.org/anthology/P17-1138)
     It's recommended to set subtract_baseline = True.
    2) sample_size > 1, normalize = True, ce_smoothing = 0.0
     Minimum Risk Training as described in 'Minimum Risk Training for Neural
     Machine Translation' (http://www.aclweb.org/anthology/P16-1159) (Eq. 12).
    3) sample_size > 1, normalize = False, ce_smoothing = 0.0
     The Google 'Reinforce' objective as proposed in 'Google’s NMT System:
     Bridging the Gap between Human and Machine Translation'
     (https://arxiv.org/pdf/1609.08144.pdf) (Eq. 8).
    4) sample_size > 1, normalize = False, ce_smoothing > 0.0
     Google's 'Mixed' objective in the above paper (Eq. 9),
     where ce_smoothing implements alpha.

    Note that 'alpha' controls the sharpness of the normalized distribution,
    while 'temperature' controls the sharpness during sampling.

    :param decoder: a recurrent decoder to sample from
    :param reward_function: any evaluator object
    :param subtract_baseline: subtract the average reward from obtained reward
    :param normalize: re-normalize the probabilities of the samples
    :param temperature: the softmax temperature used for sampling
    :param ce_smoothing: add the cross-entropy loss with this coefficient
    :param alpha: determines the sharpness of the normalized distribution
    :param sample_size: number of samples to obtain feedback for
    :return: Objective object to be used in generic trainer
    """
    check_argument_types()

    reference = decoder.train_inputs

    def _score_with_reward_function(references: np.ndarray,
                                    hypotheses: np.ndarray) -> np.ndarray:
        """Score (time, batch) arrays with sentence-based reward function.

        Parts of the sentence after generated <pad> or </s> are ignored.
        BPE-postprocessing is also included.

        :param references: array of indices of references, shape (time, batch)
        :param hypotheses: array of indices of hypotheses, shape (time, batch)
        :return: an array of batch length with float rewards
        """
        rewards = []
        for refs, hyps in zip(references.transpose(), hypotheses.transpose()):
            ref_seq = []
            hyp_seq = []
            for r_token in refs:
                token = decoder.vocabulary.index_to_word[r_token]
                if token == END_TOKEN or token == PAD_TOKEN:
                    break
                ref_seq.append(token)
            for h_token in hyps:
                token = decoder.vocabulary.index_to_word[h_token]
                if token == END_TOKEN or token == PAD_TOKEN:
                    break
                hyp_seq.append(token)
            # join BPEs, split on " " to prepare list for evaluator
            refs_tokens = " ".join(ref_seq).replace("@@ ", "").split(" ")
            hyps_tokens = " ".join(hyp_seq).replace("@@ ", "").split(" ")
            reward = float(reward_function([hyps_tokens], [refs_tokens]))
            rewards.append(reward)
        return np.array(rewards, dtype=np.float32)

    samples_rewards = []
    samples_logprobs = []

    for _ in range(sample_size):
        # run the decoder in sampling mode with the given softmax temperature
        sample_loop_result = decoder.decoding_loop(train_mode=False,
                                                   sample=True,
                                                   temperature=temperature)
        sample_logits = sample_loop_result[0]
        # sampled output indices, shape (time, batch)
        sample_decoded = sample_loop_result[3]

        # rewards, shape (batch); bandit feedback is simulated by scoring the
        # sample against the reference
        sample_reward = tf.py_func(_score_with_reward_function,
                                   [reference, sample_decoded], tf.float32)

        # pylint: disable=invalid-unary-operand-type
        word_logprobs = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sample_decoded, logits=sample_logits)

        # sum word log prob to sentence log prob
        # no masking here, since otherwise shorter sentences are preferred
        sent_logprobs = tf.reduce_sum(word_logprobs, axis=0)

        samples_rewards.append(sample_reward)  # sample_size x batch
        samples_logprobs.append(sent_logprobs)  # sample_size x batch

    # stack samples, sample_size x batch
    samples_rewards_stacked = tf.stack(samples_rewards)
    samples_logprobs_stacked = tf.stack(samples_logprobs)

    if subtract_baseline:
        # if specified, compute the average reward baseline
        reward_counter = tf.Variable(0.0,
                                     trainable=False,
                                     name="reward_counter")
        reward_sum = tf.Variable(0.0, trainable=False, name="reward_sum")
        # increment the cumulative reward
        reward_counter = tf.assign_add(
            reward_counter, tf.to_float(decoder.batch_size * sample_size))
        # sum over batch and samples
        reward_sum = tf.assign_add(reward_sum,
                                   tf.reduce_sum(samples_rewards_stacked))
        # compute baseline: avg of previous rewards
        baseline = tf.div(reward_sum, tf.maximum(reward_counter, 1.0))
        samples_rewards_stacked -= baseline

        tf.summary.scalar("train_{}/rl_reward_baseline".format(
            decoder.data_id),
                          tf.reduce_mean(baseline),
                          collections=["summary_train"])

    if normalize:
        # normalize over sample space
        samples_logprobs_stacked = tf.nn.softmax(
            samples_logprobs_stacked * alpha, dim=0)
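        # samples_logprobs_stacked now holds the renormalized distribution
        # over the drawn samples (sharpened by alpha); the loss below then
        # becomes the expected cost under that distribution, i.e. the
        # Minimum Risk Training risk.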

    scored_probs = tf.stop_gradient(
        tf.negative(samples_rewards_stacked)) * samples_logprobs_stacked
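    # Note: tf.stop_gradient above keeps the (possibly baseline-corrected)
    # rewards out of backpropagation, so gradients flow only through the
    # log-probabilities (or renormalized probabilities) of the samples.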

    # sum over samples
    total_loss = tf.reduce_sum(scored_probs, axis=0)

    # average over batch
    batch_loss = tf.reduce_mean(total_loss)

    if ce_smoothing > 0.0:
        batch_loss += tf.multiply(ce_smoothing, decoder.cost)

    tf.summary.scalar("train_{}/self_rl_cost".format(decoder.data_id),
                      batch_loss,
                      collections=["summary_train"])

    return Objective(name="{}_rl".format(decoder.name),
                     decoder=decoder,
                     loss=batch_loss,
                     gradients=None,
                     weight=None)
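

# Illustrative keyword-argument combinations (not part of the original module)
# matching the four variants listed in the `rl_objective` docstring. The
# preset names and the concrete sample_size / ce_smoothing values are
# hypothetical; the docstring only requires sample_size > 1 and
# ce_smoothing > 0.0 for the respective variants.
RL_OBJECTIVE_PRESETS = {
    # 1) bandit objective (baseline subtraction recommended)
    "bandit": dict(sample_size=1, normalize=False, ce_smoothing=0.0,
                   subtract_baseline=True),
    # 2) minimum risk training
    "mrt": dict(sample_size=5, normalize=True, ce_smoothing=0.0),
    # 3) 'Reinforce' objective
    "reinforce": dict(sample_size=5, normalize=False, ce_smoothing=0.0),
    # 4) 'Mixed' objective (ce_smoothing plays the role of alpha in Eq. 9)
    "mixed": dict(sample_size=5, normalize=False, ce_smoothing=0.3),
}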