from typing import List, Tuple, Union

import torch
import torch.nn.functional as F
from transformers import BertTokenizerFast

# Note: `BertForSquad` (the assignment's model class) and `device` are assumed to be
# defined elsewhere in the assignment code.


def inference_answer(question: str, context: str, input_ids: List[int],
                     token_type_ids: List[int], start_pos: int, end_pos: int,
                     tokenizer: BertTokenizerFast) -> str:
    """ Inference function for the answer.

    Because the tokenizer lower-cases capital letters and splits off punctuation marks,
    you may get wrong answer words if you detokenize directly. For example, if you
    encode "$5.00 USD" and decode it, you get a different string from the original:

    "$5.00 USD" --(Tokenize)--> ["$", "5", ".", "00", "usd"] --(Detokenize)--> "$ 5. 00 usd"

    Thus, you should find the original words in the context by the start and end token
    positions of the answer. Implement the function inferring the answer from the
    context and the answer token positions.

    Note 1: We have already implemented direct decoding, so you can skip this problem
            if you want.
    Note 2: When we implemented squad_features, we arbitrarily split tokens when the
            answer is a subword, so it is very tricky to extract the original word by
            start_pos and end_pos. However, as None is entered as the answer when
            evaluating, you can assume the word tokens follow the general tokenizing
            rule in this problem. In fact, the most appropriate solution is storing
            the character index when tokenizing.

    Hint: You can find a simple solution if you carefully search the documentation of
          the transformers library.
          Library Link: https://huggingface.co/transformers/index.html

    Arguments:
    question -- Question string
    context -- Context string
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_pos -- Predicted start token position of the answer
    end_pos -- Predicted end token position of the answer
    tokenizer -- Tokenizer to encode and decode the string

    Return:
    answer -- Answer string
    """
    ### YOUR CODE HERE (~4 lines)
    # Naive fallback: decode the answer tokens directly (may mangle casing and
    # punctuation, as described above).
    answer: str = tokenizer.decode(input_ids[start_pos:end_pos + 1])
    # Robust approach: re-encode with character offsets and slice the original context.
    # For tokens of the second sequence (the context), offset_mapping holds
    # (char_start, char_end) pairs that index into the context string.
    encoded_context = tokenizer.encode_plus(question, context, return_offsets_mapping=True)
    answer_char_pos = encoded_context['offset_mapping'][start_pos:end_pos + 1]
    answer = context[answer_char_pos[0][0]:answer_char_pos[-1][1]]
    ### END YOUR CODE
    return answer
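
# A quick sanity check for the offset-mapping approach above (a minimal sketch, not
# part of the assignment). It assumes the public 'bert-base-uncased' checkpoint; the
# toy question/context and the helper name `_demo_inference_answer` are made up for
# illustration. Instead of running a model, the predicted span is located by hand.
def _demo_inference_answer():
    tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    question = "How much does it cost?"
    context = "The ticket costs $5.00 USD at the door."
    enc = tok.encode_plus(question, context, return_offsets_mapping=True)
    # Find the token span covering "$5.00 USD" via character offsets: context tokens
    # have token_type_id 1, and special tokens carry empty (s == e) offsets.
    target = "$5.00 USD"
    char_start = context.index(target)
    char_end = char_start + len(target)
    span = [i for i, (s, e) in enumerate(enc['offset_mapping'])
            if enc['token_type_ids'][i] == 1 and s >= char_start and e <= char_end and s != e]
    answer = inference_answer(question, context, enc['input_ids'],
                              enc['token_type_ids'], span[0], span[-1], tok)
    assert answer == target  # direct decoding would give "$ 5. 00 usd"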
def inference_model(model: BertForSquad, tokenizer: BertTokenizerFast,
                    context: str, question: str, input_ids: List[int],
                    token_type_ids: List[int]) -> str:
    """ Inference function with the model

    Because we don't know how your model works, we cannot infer the answer from it
    ourselves. Implement the inference process for your model. Please use the
    inference_start_end and inference_answer functions you have implemented.

    Arguments:
    model -- Model you have trained
    tokenizer -- Tokenizer to encode and decode the string
    context -- Context string
    question -- Question string
    input_ids -- Input ids
    token_type_ids -- Token type ids

    Return:
    answer -- Answer string
    """
    ### YOUR CODE HERE
    # model_path = './checkpoint'
    # model = model.from_pretrained(model_path)
    # tokenizer = tokenizer.from_pretrained('bert-base-uncased')
    model.eval()
    # Re-encode the pair so we also get the attention mask.
    encoded_input = tokenizer.encode_plus(question, context)
    input_ids = torch.tensor([encoded_input['input_ids']], device=device)
    attention_mask = torch.tensor([encoded_input['attention_mask']], device=device)
    token_type_ids = torch.tensor([encoded_input['token_type_ids']], device=device)
    # No gradients are needed at inference time.
    with torch.no_grad():
        start_logits, end_logits = model(input_ids, attention_mask, token_type_ids)
    start_probs = F.softmax(start_logits, dim=-1)
    end_probs = F.softmax(end_logits, dim=-1)
    # Context tokens start right after the first [SEP] and end right before the final [SEP].
    context_start_pos = encoded_input['input_ids'].index(tokenizer.sep_token_id) + 1
    context_end_pos = len(encoded_input['input_ids']) - 2
    start_pos, end_pos = inference_start_end(start_probs, end_probs,
                                             context_start_pos, context_end_pos)
    answer = inference_answer(question, context, encoded_input['input_ids'],
                              encoded_input['token_type_ids'], start_pos, end_pos,
                              tokenizer)
    ### END YOUR CODE
    return answer
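
# `inference_start_end` is called above but not shown in this section. Below is a
# minimal sketch of one common span-selection strategy, under the assumption that
# start_probs/end_probs are tensors of shape (1, seq_len): pick the in-context pair
# (start, end) with start <= end that maximizes start_probs[start] * end_probs[end].
# The name `_sketch_inference_start_end` is illustrative; your actual implementation
# may differ.
def _sketch_inference_start_end(start_probs: torch.Tensor,
                                end_probs: torch.Tensor,
                                context_start_pos: int,
                                context_end_pos: int) -> Tuple[int, int]:
    start_probs = start_probs.squeeze(0)
    end_probs = end_probs.squeeze(0)
    best_score = -1.0
    best_span = (context_start_pos, context_start_pos)
    # Exhaustive search over valid spans inside the context.
    for start in range(context_start_pos, context_end_pos + 1):
        for end in range(start, context_end_pos + 1):
            score = (start_probs[start] * end_probs[end]).item()
            if score > best_score:
                best_score, best_span = score, (start, end)
    return best_span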
def squad_features_1(context: str, question: str, answer: Union[str, None],
                     start_char_pos: Union[int, None],
                     tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Implement the feature extractor from a Squad sample for your model.
    Return values should follow the [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is a character index, you should convert it
    to the proper token index. Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos
              and end_token_pos
    start_char_pos -- Character index at which the answer starts in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list.
                     This includes the last token located at the index.
                     None if no answer is given.
    """
    input_ids: List[int] = None
    token_type_ids: List[int] = None
    start_token_pos: int = None
    end_token_pos: int = None

    encoded_dict = tokenizer.encode_plus(question, context)
    tokens = tokenizer.tokenize(context)
    input_ids = encoded_dict['input_ids']
    token_type_ids = encoded_dict['token_type_ids']
    if answer is None:
        return input_ids, token_type_ids, None, None

    # Map each context token to its (start, end) character span in the lower-cased context.
    context = context.lower()
    token2char_map = {}
    start = 0
    for j in range(len(tokens)):
        # Strip the "##" continuation prefix from subword tokens.
        for i in range(len(tokens[j])):
            if tokens[j][i] == '#':
                continue
            else:
                break
        token = tokens[j][i:]
        start = context.find(token, start)
        end = start + len(token)
        token2char_map[j] = [start, end - 1]
        start = end

    # The answer starts at the first token whose span begins at or after start_char_pos.
    for i in range(len(tokens)):
        if token2char_map[i][0] >= start_char_pos:
            start_token_pos = i
            break
    # The answer ends at the last token before the first token past the answer's end.
    end_token_pos = len(tokens) - 1
    for i in range(start_token_pos, len(tokens)):
        if token2char_map[i][0] >= start_char_pos + len(answer):
            end_token_pos = i - 1
            break

    # Shift context-relative token indices past [CLS] + question + [SEP]
    # (the original used the hardcoded id 102 instead of tokenizer.sep_token_id).
    num_tokens_before_context = input_ids.index(tokenizer.sep_token_id) + 1
    start_token_pos += num_tokens_before_context
    end_token_pos += num_tokens_before_context
    return input_ids, token_type_ids, start_token_pos, end_token_pos
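
# Example usage of squad_features_1 (a sketch for illustration, assuming the public
# 'bert-base-uncased' tokenizer; the toy sample and the helper name are made up):
# the returned token span should map back to the answer text.
def _demo_squad_features_1():
    tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    context = "BERT was published by Google in 2018."
    question = "Who published BERT?"
    answer = "Google"
    start_char_pos = context.index(answer)
    input_ids, token_type_ids, start_tok, end_tok = squad_features_1(
        context, question, answer, start_char_pos, tok)
    decoded = tok.decode(input_ids[start_tok:end_tok + 1])
    assert decoded == "google"  # lower-cased by the uncased tokenizer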
def squad_features(context: str, question: str, answer: Union[str, None],
                   start_char_pos: Union[int, None],
                   tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Implement the feature extractor from a Squad sample for your model.
    Return values should follow the [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is a character index, you should convert it
    to the proper token index. Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos
              and end_token_pos
    start_char_pos -- Character index at which the answer starts in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list.
                     This includes the last token located at the index.
                     None if no answer is given.
    """
    ### YOUR CODE HERE (~18 lines)
    encoded_dict = tokenizer.encode_plus(question, context)
    input_ids = encoded_dict["input_ids"]
    token_type_ids = encoded_dict["token_type_ids"]
    input_ids_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # print("Input (tokens): ", input_ids_tokens)

    if answer is None and start_char_pos is None:
        start_token_pos = None
        end_token_pos = None
        return input_ids, token_type_ids, start_token_pos, end_token_pos

    # Tokens before the context: [CLS] + question + [SEP], i.e. all type-0 tokens.
    start_token_pos = token_type_ids.count(0)
    # Plus the context tokens preceding the answer.
    start_token_pos += len(tokenizer.tokenize(context[:start_char_pos]))
    end_token_pos = len(tokenizer.tokenize(answer)) + start_token_pos - 1

    # Extract the tokenized answer span only.
    tokenized_answer = " ".join(
        tokenizer.convert_ids_to_tokens(input_ids[start_token_pos:end_token_pos + 1]))
    subword_prefix_original = "##" if "##" in tokenized_answer else ""
    subword_prefix = "##"
    tokenized_answer = tokenized_answer.replace('#', '')

    if (tokenized_answer != answer.lower() and start_token_pos == end_token_pos
            and answer in tokenized_answer):
        # A single token that contains the answer as a proper prefix: arbitrarily
        # re-split it into [answer, remainder] subwords so that a token span can
        # cover exactly the answer.
        new_subword_list = [
            subword_prefix_original + tokenized_answer[:len(answer)],
            subword_prefix + tokenized_answer[len(answer):]
        ]
        # print('new_subword_list : ', new_subword_list)
        input_ids = (input_ids[:start_token_pos]
                     + tokenizer.convert_tokens_to_ids(new_subword_list)
                     + input_ids[end_token_pos + 1:])
        # One token became two, so extend token_type_ids accordingly.
        token_type_ids.append(1)

    # print("Input ids: ", input_ids)
    # input_ids_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # print("Input (tokens) (ADJUSTED): ", input_ids_tokens)
    # print("Segment ids: ", token_type_ids)
    # print('START_CHAR_POS: ', start_char_pos)
    # print("ANSWER: ", answer)
    # print("START: ", start_token_pos)
    # print("END: ", end_token_pos)
    # print("ANSWER SPAN: ", input_ids_tokens[start_token_pos:end_token_pos+1])
    assert len(input_ids) == len(token_type_ids)
    ### END YOUR CODE
    return input_ids, token_type_ids, start_token_pos, end_token_pos
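
# Example of the tricky subword case squad_features handles (a sketch, assuming the
# public 'bert-base-uncased' tokenizer; the toy sample and helper name are made up).
# The answer "play" is only part of the single token "playing", so the token is
# re-split into ["play", "##ing"] and the returned span covers exactly the answer.
def _demo_squad_features_subword():
    tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    context = "He was playing outside."
    question = "What was he doing?"
    answer = "play"
    start_char_pos = context.index("playing")
    input_ids, token_type_ids, start_tok, end_tok = squad_features(
        context, question, answer, start_char_pos, tok)
    span_tokens = tok.convert_ids_to_tokens(input_ids[start_tok:end_tok + 1])
    assert span_tokens == ['play']  # "playing" was split into "play" + "##ing"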
import tensorflow as tf
from transformers import TFBertModel, BertConfig, BertTokenizerFast  # transformers==3.0.2

print(tf.__version__)

config = BertConfig.from_pretrained('../../bert-base-chinese/config.json')
model = TFBertModel.from_pretrained('tf-model.h5', config=config)
model.summary()

tokenizer = BertTokenizerFast('../../bert-base-chinese/vocab.txt')

max_seq_len = 50
texts = ["我喜欢你", "我爱你", "我讨厌你"]  # "I like you", "I love you", "I hate you"
# Placeholder sentiment labels for the three texts (1 = positive, 0 = negative).
# These are assumed for illustration: the original snippet called the loss with no
# labels at all, which would crash.
labels = [1, 1, 0]

# A small classification head over the pooled [CLS] output. This is an assumed
# addition: SparseCategoricalCrossentropy needs class logits, and the original
# snippet passed pooler_output directly as the loss function's only argument.
classifier = tf.keras.layers.Dense(2)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

for text, label in zip(texts, labels):
    be = tokenizer.encode_plus(text, truncation=True, max_length=max_seq_len + 2,
                               padding='max_length', return_tensors="tf")
    with tf.GradientTape() as tape:
        last_hidden_states, pooler_output = model(be)[:2]
        logits = classifier(pooler_output)
        loss = loss_object(tf.constant([label]), logits)
    # Apply the gradients; the original snippet created the optimizer but never used it.
    variables = model.trainable_variables + classifier.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    print(last_hidden_states.shape)
    print(pooler_output.shape)

# model.save('bert_save')