Example #1
def perplexity_eval(device: torch.device, model: lmp.model.BaseRNNModel,
                    sequence: str,
                    tokenizer: lmp.tokenizer.BaseTokenizer) -> float:
    r"""Helper function for calculating perplexity.

    Args:
        device:
            Model running device.
        model:
            Language model.
        sequence:
            Sequence for evaluation.
        tokenizer:
            Tokenizer for encoding sequence.

    Returns:
        Perplexity of `sequence`.
    """
    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Original sequence length: S.
    # New sequence length: S + 2.
    sequence = tokenizer.encode(sequence, max_seq_len=-1)

    # Take `sequence[:-2]` as input: the prediction made at the `[BOS]`
    # position is kept, while the last token and `[EOS]` are never fed to the
    # model. `x.shape = (S)`.
    x = torch.LongTensor(sequence[:-2]).to(device)

    # Targets are the original tokens without `[BOS]` and `[EOS]`.
    # `y.shape = (S)`.
    y = sequence[1:-1]

    # Reshape into `(1, S)` to fit model.
    x = x.reshape(1, -1)

    # Get model vocabulary prediction with shape `(1, S, V)`.
    pred_y = model.predict(x)

    # Reshape into `(S)` for easier manipulation.
    x = x.squeeze(0)

    # Reshape into `(S, V)` for easier manipulation.
    pred_y = pred_y.squeeze(0)

    # Accumulate negative log-likelihood.
    nll = torch.zeros(1).to(device)

    # Iterate through each prediction.
    for pos, token_id in enumerate(y):
        probs = pred_y[pos, token_id]
        nll = nll - torch.log(probs)

    # Normalize by sequence length.
    nll = nll / x.size(0)

    # Take exponential to cancel the logarithm.
    return nll.exp().item()
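
To make the final normalization and exponentiation concrete, the minimal sketch below computes perplexity directly from a handful of per-token probabilities with plain PyTorch. It does not use `lmp`, and the probability values are invented purely for illustration.

import torch

# Hypothetical probabilities a model assigned to the S = 4 target tokens
# of some sequence (values are made up for illustration).
token_probs = torch.tensor([0.25, 0.10, 0.60, 0.05])

# Accumulate negative log-likelihood over the sequence.
nll = -token_probs.log().sum()

# Normalize by sequence length, then exponentiate: PPL = exp(NLL / S).
perplexity = (nll / token_probs.size(0)).exp().item()

print(perplexity)  # Roughly 6.0 for the probabilities above.
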
def perplexity_eval(
        device: torch.device,
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        sequence: str,
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> float:
    r"""Helper function for calculating perplexity.

    Args:
        device:
            Model running device.
        model:
            Language model.
        sequence:
            Sequence for evaluation. Must not be empty.
        tokenizer:
            Tokenizer for encoding sequence.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.

    Returns:
        Perplexity of `sequence`.
    """
    # Type check.
    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(model, (
            lmp.model.BaseRNNModel,
            lmp.model.BaseResRNNModel
    )):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
        )

    if not isinstance(sequence, str):
        raise TypeError('`sequence` must be an instance of `str`.')

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    # Value check.
    if not sequence:
        raise ValueError('`sequence` must not be empty.')

    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Original sequence length: S.
    # New sequence length: S + 2.
    sequence = tokenizer.encode(sequence, max_seq_len=-1)

    # Take `sequence[:-2]` as input: the prediction made at the `[bos]`
    # position is kept, while the last token and `[eos]` are never fed to the
    # model. `x.shape = (S)`.
    x = torch.LongTensor(sequence[:-2]).to(device)

    # Targets are the original tokens without `[bos]` and `[eos]`.
    # `y.shape = (S)`.
    y = sequence[1:-1]

    # Reshape into `(1, S)` to fit model.
    x = x.reshape(1, -1)

    # Get model vocabulary prediction with shape `(1, S, V)`.
    pred_y = model.predict(x)

    # Reshape into `(S)` for easier manipulation.
    x = x.squeeze(0)

    # Reshape into `(S, V)` for easier manipulation.
    pred_y = pred_y.squeeze(0)

    # Accumulate negative log-likelihood.
    nll = torch.zeros(1).to(device)

    # Iterate through each prediction.
    for pos, token_id in enumerate(y):
        probs = pred_y[pos, token_id]
        nll = nll - torch.log(probs)

    # Normalize by sequence length.
    nll = nll / x.size(0)

    # Take exponential to cancel the logarithm.
    return nll.exp().item()
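
Assuming the `perplexity_eval` defined above is in scope, the calls below exercise only the added argument guards; no model or tokenizer is needed because each call is rejected before `model.eval()` is ever reached. This is a sketch, not part of the project's test suite.

import torch

try:
    # `device` is a plain string, so the first type check fires.
    perplexity_eval(device='cpu', model=None, sequence='hi', tokenizer=None)
except TypeError as err:
    print(err)  # `device` must be an instance of `torch.device`.

try:
    # `device` is now valid, so the `model` type check fires instead.
    perplexity_eval(device=torch.device('cpu'), model=None, sequence='hi',
                    tokenizer=None)
except TypeError as err:
    print(err)  # `model` must be an instance of `Union[...]`.
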
def generate_sequence(beam_width: int, begin_of_sequence: str,
                      device: torch.device, max_seq_len: int,
                      model: lmp.model.BaseRNNModel,
                      tokenizer: lmp.tokenizer.BaseTokenizer) -> List[str]:
    r"""Sequences generation using beam search.

    Args:
        beam_width:
            Number of candidate sequences to output.
        begin_of_sequence:
            Beginning of the sequence which the model will auto-complete.
        device:
            Model running device.
        max_seq_len:
            Maximum length of output sequences.
        model:
            Language model.
        tokenizer:
            Tokenizer for encoding and decoding sequences.

    Returns:
        Generated sequences.
    """
    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Remove `[EOS]` since we only
    # feed the beginning of the sequence.
    cur_seq = tokenizer.encode(begin_of_sequence, max_seq_len=-1)
    cur_seq = torch.LongTensor(cur_seq)[:-1].to(device)

    # Get begin sequence length.
    seq_len = cur_seq.size(-1)

    # Generated sequence.
    # Start shape (1, S).
    # Final shape (B, S).
    cur_seq = cur_seq.reshape(1, seq_len)

    # Accumulated negative log-likelihood. Working in log space turns the
    # product of consecutive probabilities into a sum of log probabilities,
    # which avoids numerical underflow. Initialized to zero with shape (B).
    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len - seq_len):
        # Model prediction has shape (B, S, V).
        pred_y = model.predict(cur_seq)

        # Record every beam's predictions.
        # Each beam predicts `beam_width` candidates, so there are
        # `beam_width * beam_width` candidates in total.
        top_k_in_all_beams = []
        for out_beam in range(cur_seq.size(0)):
            # Get `beam_width` different predictions from beam `out_beam`.
            # `top_k_prob_in_beam` has shape (B) and
            # `top_k_index_in_beam` has shape (B).
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beam, -1].topk(
                    k=beam_width,
                    dim=-1
                )

            # Record each beam's negative log-likelihood and concatenate
            # the next token id based on the prediction.
            for in_beam in range(beam_width):
                # Accumulate negative log-likelihood. Since log outputs a
                # negative value when its input is in range (0, 1), we negate
                # it to keep the accumulated value positive.
                prob = accum_prob[out_beam] - \
                    top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                # Concatenate the next predicted token id.
                seq = torch.cat([
                    cur_seq[out_beam],
                    top_k_index_in_beam[in_beam].unsqueeze(0)
                ], dim=-1).unsqueeze(0)

                # Record result.
                top_k_in_all_beams.append({'prob': prob, 'seq': seq})

        # Compare every recorded candidate across all beams. Concatenate the
        # accumulated NLL tensors, then use `topk` with `largest=False` to
        # keep the `beam_width` candidates with the lowest NLL, i.e. the most
        # probable sequences.
        _, top_k_index_in_all_beams = torch.cat([
            beam['prob']
            for beam in top_k_in_all_beams
        ]).topk(k=beam_width, dim=0, largest=False)

        # Update `cur_seq` to the `beam_width` most probable candidates.
        cur_seq = torch.cat([
            top_k_in_all_beams[index]['seq']
            for index in top_k_index_in_all_beams
        ], dim=0)

        # Update accumulated negative log-likelihood.
        accum_prob = torch.cat([
            top_k_in_all_beams[index]['prob']
            for index in top_k_index_in_all_beams
        ], dim=0)

    return tokenizer.batch_decode(cur_seq.tolist())
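
The heart of the pruning step above is a single `topk` call over the accumulated negative log-likelihoods of all `beam_width * beam_width` candidates. The short sketch below isolates that step with made-up NLL values; since lower NLL means higher probability, `largest=False` selects the candidates worth keeping.

import torch

# Accumulated NLL of `beam_width * beam_width = 4` candidate continuations
# (made-up values); lower NLL means a more probable sequence.
beam_width = 2
candidate_nll = torch.tensor([1.2, 0.7, 2.5, 0.9])

# Keep the `beam_width` most probable candidates, i.e. the smallest NLL.
best_nll, best_index = candidate_nll.topk(k=beam_width, dim=0, largest=False)

print(best_index.tolist())  # [1, 3]
print(best_nll.tolist())    # Approximately [0.7, 0.9]
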
Example #4
def generate_sequence(
        beam_width: int,
        begin_of_sequence: str,
        device: torch.device,
        max_seq_len: int,
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> List[str]:
    r"""Sequences generation using beam search.

    Args:
        beam_width:
            Number of candidate sequences to output. Must be bigger than or
            equal to `1`.
        begin_of_sequence:
            Beginning of the sequence which the model will auto-complete.
        device:
            Model running device.
        max_seq_len:
            Maximum of output sequences length. Must be bigger than or equal to
            `2`.
        model:
            Language model.
        tokenizer:
            Tokenizer for encoding and decoding sequences.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.
        ValueError:
            When one of the arguments does not follow its constraints. See
            the docstring above for argument constraints.

    Returns:
        Generated sequences.
    """
    # Type check.
    if not isinstance(beam_width, int):
        raise TypeError('`beam_width` must be an instance of `int`.')

    if not isinstance(begin_of_sequence, str):
        raise TypeError('`begin_of_sequence` must be an instance of `str`.')

    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(max_seq_len, int):
        raise TypeError('`max_seq_len` must be an instance of `int`.')

    if not isinstance(model, (
            lmp.model.BaseRNNModel,
            lmp.model.BaseResRNNModel
    )):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
        )

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of '
            '`lmp.tokenizer.BaseTokenizer`.'
        )

    # Value check.
    if beam_width < 1:
        raise ValueError('`beam_width` must be bigger than or equal to `1`.')

    if max_seq_len < 2:
        raise ValueError('`max_seq_len` must be bigger than or equal to `2`.')

    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Remove `[eos]` since we only
    # feed the beginning of the sequence.
    cur_seq = tokenizer.encode(begin_of_sequence, max_seq_len=-1)
    cur_seq = torch.LongTensor(cur_seq)[:-1].to(device)

    # Get begin sequence length.
    seq_len = cur_seq.size(-1)

    # Generated sequence.
    # Start shape (1, S).
    # Final shape (B, S).
    cur_seq = cur_seq.reshape(1, seq_len)

    # Accumulated negative log-likelihood. Working in log space turns the
    # product of consecutive probabilities into a sum of log probabilities,
    # which avoids numerical underflow. Initialized to zero with shape (B).
    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len - seq_len):
        # Model prediction has shape (B, S, V).
        pred_y = model.predict(cur_seq)

        # Record every beam's predictions.
        # Each beam predicts `beam_width` candidates, so there are
        # `beam_width * beam_width` candidates in total.
        top_k_in_all_beams = []
        for out_beam in range(cur_seq.size(0)):
            # Get `beam_width` different predictions from beam `out_beam`.
            # `top_k_prob_in_beam` has shape (B) and
            # `top_k_index_in_beam` has shape (B).
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beam, -1].topk(
                    k=beam_width,
                    dim=-1
                )

            # Record each beam's negative log-likelihood and concatenate
            # the next token id based on the prediction.
            for in_beam in range(beam_width):
                # Accumulate negative log-likelihood. Since log outputs a
                # negative value when its input is in range (0, 1), we negate
                # it to keep the accumulated value positive.
                prob = accum_prob[out_beam] - \
                    top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                # Concatenate the next predicted token id.
                seq = torch.cat([
                    cur_seq[out_beam],
                    top_k_index_in_beam[in_beam].unsqueeze(0)
                ], dim=-1).unsqueeze(0)

                # Record result.
                top_k_in_all_beams.append({
                    'prob': prob,
                    'seq': seq
                })

        # Compare every recorded candidate across all beams. Concatenate the
        # accumulated NLL tensors, then use `topk` with `largest=False` to
        # keep the `beam_width` candidates with the lowest NLL, i.e. the most
        # probable sequences.
        _, top_k_index_in_all_beams = torch.cat([
            beam['prob']
            for beam in top_k_in_all_beams
        ]).topk(k=beam_width, dim=0, largest=False)

        # Update `cur_seq` to the `beam_width` most probable candidates.
        cur_seq = torch.cat([
            top_k_in_all_beams[index]['seq']
            for index in top_k_index_in_all_beams
        ], dim=0)

        # Update accumulated negative log-likelihood.
        accum_prob = torch.cat([
            top_k_in_all_beams[index]['prob']
            for index in top_k_index_in_all_beams
        ], dim=0)

    return tokenizer.batch_decode(cur_seq.tolist())
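
The comment about working in log space can be checked directly: multiplying many small probabilities underflows to zero in float32, while summing their negative logs stays finite. A minimal sketch, using an arbitrary probability of 0.01 repeated 2000 times:

import torch

probs = torch.full((2000,), 0.01)

# The direct product of 2000 probabilities of 0.01 underflows to exactly zero.
direct_product = probs.prod()

# Summing negative logs instead stays finite.
summed_nll = -probs.log().sum()

print(direct_product.item())  # 0.0
print(summed_nll.item())      # Roughly 9210.3 (= 2000 * ln(100)).
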