示例#1
0
def align_bert(text: Text,
               model_name: str) -> Tuple[retokenize.TokenAligner, List[Text]]:
    """Align space-tokenized source text to BERT wordpiece tokens.

    Returns a TokenAligner built over begin-of-word source tokens and
    marker-annotated wordpieces, plus the raw wordpiece token list.
    """
    # Uncased models expect lowercased input; lowercase the source side
    # as well so the two token streams match up.
    lowercase = model_name.endswith('uncased')
    source = text.lower() if lowercase else text
    bow_tokens = space_tokenize_with_bow(source)

    tokenizer = _get_bert_tokenizer(model_name, lowercase)
    wpm_tokens = tokenizer.tokenize(text)

    # Annotate wordpieces with <w> markers so the alignment is stable
    # with respect to word boundaries.
    marked_wpm = [process_bert_wordpiece_for_alignment(t) for t in wpm_tokens]
    aligner = retokenize.TokenAligner(bow_tokens, marked_wpm)
    return aligner, wpm_tokens
示例#2
0
def retokenize_record(record):
    """Retokenize edge probing examples. Modifies in-place.

    This can be slow, so recommended to use as a pre-processing step.
    See retokenize_edge_data.py.
    """
    original_text = record['text']
    moses_tokens = utils.TOKENIZER.tokenize(original_text)
    # Align against the unescaped tokens so character offsets line up
    # with the original text.
    aligner = retokenize.TokenAligner(original_text,
                                      utils.unescape_moses(moses_tokens))
    record['text'] = " ".join(moses_tokens)
    for target in record['targets']:
        for span_key in ('span1', 'span2'):
            if span_key in target:
                projected = aligner.project_span(*target[span_key])
                target[span_key] = [int(idx) for idx in projected]
    return record
示例#3
0
def retokenize_record(record):
    """Retokenize edge probing examples. Modifies in-place.

    This can be slow, so recommended to use as a pre-processing step.
    See retokenize_edge_data.py.
    """
    original_text = record['text']
    eow_tokens = space_tokenize_with_eow(original_text)
    bpe_tokens = openai_utils.tokenize(original_text)

    aligner = retokenize.TokenAligner(eow_tokens, bpe_tokens)
    record['text'] = " ".join(bpe_tokens)
    for target in record['targets']:
        for span_key in ('span1', 'span2'):
            if span_key in target:
                projected = aligner.project_span(*target[span_key])
                target[span_key] = [int(idx) for idx in projected]
    return record
示例#4
0
def align_openai(text: Text) -> Tuple[retokenize.TokenAligner, List[Text]]:
    """Align end-of-word space tokens to OpenAI BPE tokens.

    Returns the TokenAligner plus the raw BPE token list.
    """
    source_tokens = space_tokenize_with_eow(text)
    bpe_tokens = openai_utils.tokenize(text)
    aligner = retokenize.TokenAligner(source_tokens, bpe_tokens)
    return aligner, bpe_tokens
示例#5
0
def align_moses(text: Text) -> Tuple[retokenize.TokenAligner, List[Text]]:
    """Align raw text to Moses tokens.

    Alignment is computed against the unescaped tokens so offsets match
    the original text; the escaped tokens are what gets returned.
    """
    # NOTE(review): `tokenize` is invoked on the `MosesTokenizer` name
    # directly rather than on a constructed instance — confirm this name
    # is bound to a prebuilt tokenizer object (cf. utils.TOKENIZER usage
    # elsewhere in this file).
    tokens = MosesTokenizer.tokenize(text)
    unescaped = utils.unescape_moses(tokens)
    aligner = retokenize.TokenAligner(text, unescaped)
    return aligner, tokens