Example #1
from typing import List, Text, Tuple

def align_bpe(text: Text, bpe_tokenizer: Tokenizer) -> Tuple[TokenAligner, List[Text]]:
    """Alignment function for a BPE tokenizer, as used in GPT and XLM."""
    # GPT/XLM BPE vocabularies mark the *end* of each word with a </w>
    # suffix, so mark the space-tokenized source tokens the same way;
    # lowercase the source tokens for better matching.
    eow_tokens = space_tokenize_with_eow(text.lower())
    bpe_tokens = bpe_tokenizer.tokenize(text)
    ta = TokenAligner(eow_tokens, bpe_tokens)
    return ta, bpe_tokens
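This example assumes Tokenizer, TokenAligner, and space_tokenize_with_eow are defined in the surrounding module. As orientation, here is a minimal sketch of what space_tokenize_with_eow plausibly does, inferred only from how it is used above; the real helper may differ:

from typing import List, Text

def space_tokenize_with_eow(text: Text) -> List[Text]:
    # Sketch: whitespace-split, then append an end-of-word marker to each
    # token so the source stream uses the same </w> convention as the
    # GPT/XLM BPE tokens it will be aligned against.
    return [tok + "</w>" for tok in text.split()]

The point of marking both streams identically is that the aligner can match word boundaries as literal text rather than having to infer them.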
Example #2
from typing import List, Text, Tuple

def align_bytebpe(text: Text, bytebpe_tokenizer: Tokenizer) -> Tuple[TokenAligner, List[Text]]:
    """Alignment function for a byte-level BPE tokenizer, as used in GPT-2 and RoBERTa."""
    # Byte-level BPE marks the *beginning* of each word (a word-initial
    # space is encoded as Ġ), so mark the space-tokenized source tokens
    # with beginning-of-word markers.
    bow_tokens = space_tokenize_with_bow(text)
    bytebpe_tokens = bytebpe_tokenizer.tokenize(text)

    # Rewrite the tokenizer's boundary markers into the same beginning-of-word
    # convention used by the source tokens before aligning.
    modified_bytebpe_tokens = list(map(process_bytebpe_for_alignment, bytebpe_tokens))
    ta = TokenAligner(bow_tokens, modified_bytebpe_tokens)
    # Return the unmodified tokens, which are what the model actually consumes.
    return ta, bytebpe_tokens
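process_bytebpe_for_alignment is not shown here. A minimal sketch consistent with its use above, assuming byte-level BPE's usual convention of encoding a word-initial space as Ġ and a <w> beginning-of-word marker on the source side (both details are assumptions, not shown in the example):

import re
from typing import Text

def process_bytebpe_for_alignment(t: Text) -> Text:
    # Sketch: a leading Ġ means this token starts a new word; rewrite it
    # to the same beginning-of-word marker the source tokens carry.
    if t.startswith("Ġ"):
        return "<w>" + re.sub(r"^Ġ", "", t)
    return t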
Example #3
from typing import List, Text, Tuple

def align_sentencepiece(
        text: Text,
        sentencepiece_tokenizer: Tokenizer) -> Tuple[TokenAligner, List[Text]]:
    """Alignment function for a SentencePiece tokenizer, as used in XLNet."""
    # SentencePiece marks the beginning of each word (with a leading ▁), so
    # mark the space-tokenized source tokens with beginning-of-word markers.
    bow_tokens = space_tokenize_with_bow(text)
    sentencepiece_tokens = sentencepiece_tokenizer.tokenize(text)

    # Rewrite the ▁ markers into the same beginning-of-word convention used
    # by the source tokens before aligning.
    modified_sentencepiece_tokens = list(
        map(process_sentencepiece_for_alignment, sentencepiece_tokens))
    ta = TokenAligner(bow_tokens, modified_sentencepiece_tokens)
    return ta, sentencepiece_tokens
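SentencePiece marks word-initial pieces with a leading ▁ (U+2581). A sketch of process_sentencepiece_for_alignment consistent with that convention (inferred, not shown in the example):

import re
from typing import Text

def process_sentencepiece_for_alignment(t: Text) -> Text:
    # Sketch: rewrite SentencePiece's leading ▁ to the beginning-of-word
    # marker used by the source tokens.
    if t.startswith("▁"):
        return "<w>" + re.sub(r"^▁", "", t)
    return t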
Example #4
from typing import List, Text, Tuple

def align_wpm(
    text: Text, wpm_tokenizer: Tokenizer, do_lower_case: bool
) -> Tuple[TokenAligner, List[Text]]:
    """Alignment function for a WordPiece (WPM) tokenizer, as used in BERT."""
    # If using lowercase, do this for the source tokens for better matching.
    bow_tokens = space_tokenize_with_bow(text.lower() if do_lower_case else text)
    wpm_tokens = wpm_tokenizer.tokenize(text)

    # Align using <w> markers for stability w.r.t. word boundaries.
    modified_wpm_tokens = list(map(process_wordpiece_for_alignment, wpm_tokens))
    ta = TokenAligner(bow_tokens, modified_wpm_tokens)
    return ta, wpm_tokens
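WordPiece inverts the marking convention: continuation pieces carry a ## prefix and word-initial pieces are unmarked. A sketch of process_wordpiece_for_alignment consistent with that (inferred, not shown in the example):

import re
from typing import Text

def process_wordpiece_for_alignment(t: Text) -> Text:
    # Sketch: strip the ## continuation prefix; tokens without it start a
    # word, so give them an explicit beginning-of-word marker.
    if t.startswith("##"):
        return re.sub(r"^##", "", t)
    return "<w>" + t

As a hypothetical call site, any object with a compatible .tokenize method should work, for example a Hugging Face BERT tokenizer:

from transformers import AutoTokenizer  # assumption: not part of the example

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ta, wpm_tokens = align_wpm("Membership has its privileges", tokenizer, do_lower_case=True)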