from typing import List, Tuple, Union

import torch
import torch.nn.functional as F
from transformers import BertTokenizerFast

# Note: `BertForSquad` (the assignment's model class) and `device` are assumed to be
# defined elsewhere in the assignment code.


def inference_answer(question: str, context: str, input_ids: List[int],
                     token_type_ids: List[int], start_pos: int, end_pos: int,
                     tokenizer: BertTokenizerFast) -> str:
    """ Inference function for the answer.

    Because the tokenizer lower-cases capital letters and splits off punctuation marks,
    you may get wrong answer words if you detokenize directly. For example, if you
    encode "$5.00 USD" and decode it, you get a different string from the original:

    "$5.00 USD" --(Tokenize)--> ["$", "5", ".", "00", "usd"] --(Detokenize)--> "$ 5. 00 usd"

    Thus, you should find the original words in the context by the start and end token
    positions of the answer. Implement the function inferring the answer from the
    context and the answer token positions.

    Note 1: We have already implemented direct decoding, so you can skip this problem
            if you want.
    Note 2: When we implemented squad_features, we arbitrarily split tokens when the
            answer is a subword, so it is very tricky to extract the original word by
            start_pos and end_pos. However, as None is entered as the answer when
            evaluating, you can assume the word tokens follow the general tokenizing
            rule in this problem. In fact, the most appropriate solution is storing
            the character index when tokenizing.

    Hint: You can find a simple solution if you carefully search the documentation of
          the transformers library.
          Library Link: https://huggingface.co/transformers/index.html

    Arguments:
    question -- Question string
    context -- Context string
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_pos -- Predicted start token position of the answer
    end_pos -- Predicted end token position of the answer
    tokenizer -- Tokenizer to encode and decode the string

    Return:
    answer -- Answer string
    """
    ### YOUR CODE HERE (~4 lines)
    # Naive fallback: decode the answer tokens directly (may mangle casing and
    # punctuation, as described above).
    answer: str = tokenizer.decode(input_ids[start_pos:end_pos + 1])
    # Robust approach: re-encode with character offsets and slice the original context.
    # For tokens of the second sequence (the context), offset_mapping holds
    # (char_start, char_end) pairs that index into the context string.
    encoded_context = tokenizer.encode_plus(question, context, return_offsets_mapping=True)
    answer_char_pos = encoded_context['offset_mapping'][start_pos:end_pos + 1]
    answer = context[answer_char_pos[0][0]:answer_char_pos[-1][1]]
    ### END YOUR CODE
    return answer
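
# A quick sanity check for the offset-mapping approach above (a minimal sketch, not
# part of the assignment). It assumes the public 'bert-base-uncased' checkpoint; the
# toy question/context and the helper name `_demo_inference_answer` are made up for
# illustration. Instead of running a model, the predicted span is located by hand.
def _demo_inference_answer():
    tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    question = "How much does it cost?"
    context = "The ticket costs $5.00 USD at the door."
    enc = tok.encode_plus(question, context, return_offsets_mapping=True)
    # Find the token span covering "$5.00 USD" via character offsets: context tokens
    # have token_type_id 1, and special tokens carry empty (s == e) offsets.
    target = "$5.00 USD"
    char_start = context.index(target)
    char_end = char_start + len(target)
    span = [i for i, (s, e) in enumerate(enc['offset_mapping'])
            if enc['token_type_ids'][i] == 1 and s >= char_start and e <= char_end and s != e]
    answer = inference_answer(question, context, enc['input_ids'],
                              enc['token_type_ids'], span[0], span[-1], tok)
    assert answer == target  # direct decoding would give "$ 5. 00 usd"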
def inference_model(model: BertForSquad, tokenizer: BertTokenizerFast,
                    context: str, question: str, input_ids: List[int],
                    token_type_ids: List[int]) -> str:
    """ Inference function with the model

    Because we don't know how your model works, we cannot infer the answer from it
    ourselves. Implement the inference process for your model. Please use the
    inference_start_end and inference_answer functions you have implemented.

    Arguments:
    model -- Model you have trained
    tokenizer -- Tokenizer to encode and decode the string
    context -- Context string
    question -- Question string
    input_ids -- Input ids
    token_type_ids -- Token type ids

    Return:
    answer -- Answer string
    """
    ### YOUR CODE HERE
    # model_path = './checkpoint'
    # model = model.from_pretrained(model_path)
    # tokenizer = tokenizer.from_pretrained('bert-base-uncased')
    model.eval()
    # Re-encode the pair so we also get the attention mask.
    encoded_input = tokenizer.encode_plus(question, context)
    input_ids = torch.tensor([encoded_input['input_ids']], device=device)
    attention_mask = torch.tensor([encoded_input['attention_mask']], device=device)
    token_type_ids = torch.tensor([encoded_input['token_type_ids']], device=device)
    # No gradients are needed at inference time.
    with torch.no_grad():
        start_logits, end_logits = model(input_ids, attention_mask, token_type_ids)
    start_probs = F.softmax(start_logits, dim=-1)
    end_probs = F.softmax(end_logits, dim=-1)
    # Context tokens start right after the first [SEP] and end right before the final [SEP].
    context_start_pos = encoded_input['input_ids'].index(tokenizer.sep_token_id) + 1
    context_end_pos = len(encoded_input['input_ids']) - 2
    start_pos, end_pos = inference_start_end(start_probs, end_probs,
                                             context_start_pos, context_end_pos)
    answer = inference_answer(question, context, encoded_input['input_ids'],
                              encoded_input['token_type_ids'], start_pos, end_pos,
                              tokenizer)
    ### END YOUR CODE
    return answer
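
# `inference_start_end` is called above but not shown in this section. Below is a
# minimal sketch of one common span-selection strategy, under the assumption that
# start_probs/end_probs are tensors of shape (1, seq_len): pick the in-context pair
# (start, end) with start <= end that maximizes start_probs[start] * end_probs[end].
# The name `_sketch_inference_start_end` is illustrative; your actual implementation
# may differ.
def _sketch_inference_start_end(start_probs: torch.Tensor,
                                end_probs: torch.Tensor,
                                context_start_pos: int,
                                context_end_pos: int) -> Tuple[int, int]:
    start_probs = start_probs.squeeze(0)
    end_probs = end_probs.squeeze(0)
    best_score = -1.0
    best_span = (context_start_pos, context_start_pos)
    # Exhaustive search over valid spans inside the context.
    for start in range(context_start_pos, context_end_pos + 1):
        for end in range(start, context_end_pos + 1):
            score = (start_probs[start] * end_probs[end]).item()
            if score > best_score:
                best_score, best_span = score, (start, end)
    return best_span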
def squad_features_1(context: str, question: str, answer: Union[str, None],
                     start_char_pos: Union[int, None],
                     tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Implement the feature extractor from a Squad sample for your model.
    Return values should follow the [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is a character index, you should convert it
    to the proper token index. Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos
              and end_token_pos
    start_char_pos -- Character index at which the answer starts in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list.
                     This includes the last token located at the index.
                     None if no answer is given.
    """
    input_ids: List[int] = None
    token_type_ids: List[int] = None
    start_token_pos: int = None
    end_token_pos: int = None

    encoded_dict = tokenizer.encode_plus(question, context)
    tokens = tokenizer.tokenize(context)
    input_ids = encoded_dict['input_ids']
    token_type_ids = encoded_dict['token_type_ids']
    if answer is None:
        return input_ids, token_type_ids, None, None

    # Map each context token to its (start, end) character span in the lower-cased context.
    context = context.lower()
    token2char_map = {}
    start = 0
    for j in range(len(tokens)):
        # Strip the "##" continuation prefix from subword tokens.
        for i in range(len(tokens[j])):
            if tokens[j][i] == '#':
                continue
            else:
                break
        token = tokens[j][i:]
        start = context.find(token, start)
        end = start + len(token)
        token2char_map[j] = [start, end - 1]
        start = end

    # The answer starts at the first token whose span begins at or after start_char_pos.
    for i in range(len(tokens)):
        if token2char_map[i][0] >= start_char_pos:
            start_token_pos = i
            break
    # The answer ends at the last token before the first token past the answer's end.
    end_token_pos = len(tokens) - 1
    for i in range(start_token_pos, len(tokens)):
        if token2char_map[i][0] >= start_char_pos + len(answer):
            end_token_pos = i - 1
            break

    # Shift context-relative token indices past [CLS] + question + [SEP]
    # (the original used the hardcoded id 102 instead of tokenizer.sep_token_id).
    num_tokens_before_context = input_ids.index(tokenizer.sep_token_id) + 1
    start_token_pos += num_tokens_before_context
    end_token_pos += num_tokens_before_context
    return input_ids, token_type_ids, start_token_pos, end_token_pos
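
# Example usage of squad_features_1 (a sketch for illustration, assuming the public
# 'bert-base-uncased' tokenizer; the toy sample and the helper name are made up):
# the returned token span should map back to the answer text.
def _demo_squad_features_1():
    tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    context = "BERT was published by Google in 2018."
    question = "Who published BERT?"
    answer = "Google"
    start_char_pos = context.index(answer)
    input_ids, token_type_ids, start_tok, end_tok = squad_features_1(
        context, question, answer, start_char_pos, tok)
    decoded = tok.decode(input_ids[start_tok:end_tok + 1])
    assert decoded == "google"  # lower-cased by the uncased tokenizer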
def squad_features(context: str, question: str, answer: Union[str, None],
                   start_char_pos: Union[int, None],
                   tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Implement the feature extractor from a Squad sample for your model.
    Return values should follow the [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is a character index, you should convert it
    to the proper token index. Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos
              and end_token_pos
    start_char_pos -- Character index at which the answer starts in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list.
                     This includes the last token located at the index.
                     None if no answer is given.
    """
    ### YOUR CODE HERE (~18 lines)
    encoded_dict = tokenizer.encode_plus(question, context)
    input_ids = encoded_dict["input_ids"]
    token_type_ids = encoded_dict["token_type_ids"]
    input_ids_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # print("Input (tokens): ", input_ids_tokens)

    if answer is None and start_char_pos is None:
        start_token_pos = None
        end_token_pos = None
        return input_ids, token_type_ids, start_token_pos, end_token_pos

    # Tokens before the context: [CLS] + question + [SEP], i.e. all type-0 tokens.
    start_token_pos = token_type_ids.count(0)
    # Plus the context tokens preceding the answer.
    start_token_pos += len(tokenizer.tokenize(context[:start_char_pos]))
    end_token_pos = len(tokenizer.tokenize(answer)) + start_token_pos - 1

    # Extract the tokenized answer span only.
    tokenized_answer = " ".join(
        tokenizer.convert_ids_to_tokens(input_ids[start_token_pos:end_token_pos + 1]))
    subword_prefix_original = "##" if "##" in tokenized_answer else ""
    subword_prefix = "##"
    tokenized_answer = tokenized_answer.replace('#', '')

    if (tokenized_answer != answer.lower() and start_token_pos == end_token_pos
            and answer in tokenized_answer):
        # A single token that contains the answer as a proper prefix: arbitrarily
        # re-split it into [answer, remainder] subwords so that a token span can
        # cover exactly the answer.
        new_subword_list = [
            subword_prefix_original + tokenized_answer[:len(answer)],
            subword_prefix + tokenized_answer[len(answer):]
        ]
        # print('new_subword_list : ', new_subword_list)
        input_ids = (input_ids[:start_token_pos]
                     + tokenizer.convert_tokens_to_ids(new_subword_list)
                     + input_ids[end_token_pos + 1:])
        # One token became two, so extend token_type_ids accordingly.
        token_type_ids.append(1)

    # print("Input ids: ", input_ids)
    # input_ids_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # print("Input (tokens) (ADJUSTED): ", input_ids_tokens)
    # print("Segment ids: ", token_type_ids)
    # print('START_CHAR_POS: ', start_char_pos)
    # print("ANSWER: ", answer)
    # print("START: ", start_token_pos)
    # print("END: ", end_token_pos)
    # print("ANSWER SPAN: ", input_ids_tokens[start_token_pos:end_token_pos+1])
    assert len(input_ids) == len(token_type_ids)
    ### END YOUR CODE
    return input_ids, token_type_ids, start_token_pos, end_token_pos
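
# Example of the tricky subword case squad_features handles (a sketch, assuming the
# public 'bert-base-uncased' tokenizer; the toy sample and helper name are made up).
# The answer "play" is only part of the single token "playing", so the token is
# re-split into ["play", "##ing"] and the returned span covers exactly the answer.
def _demo_squad_features_subword():
    tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    context = "He was playing outside."
    question = "What was he doing?"
    answer = "play"
    start_char_pos = context.index("playing")
    input_ids, token_type_ids, start_tok, end_tok = squad_features(
        context, question, answer, start_char_pos, tok)
    span_tokens = tok.convert_ids_to_tokens(input_ids[start_tok:end_tok + 1])
    assert span_tokens == ['play']  # "playing" was split into "play" + "##ing"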
import tensorflow as tf
from transformers import TFBertModel, BertConfig, BertTokenizerFast  # transformers==3.0.2

print(tf.__version__)

config = BertConfig.from_pretrained('../../bert-base-chinese/config.json')
model = TFBertModel.from_pretrained('tf-model.h5', config=config)
model.summary()

tokenizer = BertTokenizerFast('../../bert-base-chinese/vocab.txt')

max_seq_len = 50
texts = ["我喜欢你", "我爱你", "我讨厌你"]  # "I like you", "I love you", "I hate you"
# Placeholder sentiment labels for the three texts (1 = positive, 0 = negative).
# These are assumed for illustration: the original snippet called the loss with no
# labels at all, which would crash.
labels = [1, 1, 0]

# A small classification head over the pooled [CLS] output. This is an assumed
# addition: SparseCategoricalCrossentropy needs class logits, and the original
# snippet passed pooler_output directly as the loss function's only argument.
classifier = tf.keras.layers.Dense(2)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

for text, label in zip(texts, labels):
    be = tokenizer.encode_plus(text, truncation=True, max_length=max_seq_len + 2,
                               padding='max_length', return_tensors="tf")
    with tf.GradientTape() as tape:
        last_hidden_states, pooler_output = model(be)[:2]
        logits = classifier(pooler_output)
        loss = loss_object(tf.constant([label]), logits)
    # Apply the gradients; the original snippet created the optimizer but never used it.
    variables = model.trainable_variables + classifier.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    print(last_hidden_states.shape)
    print(pooler_output.shape)

# model.save('bert_save')