def _improve_answer_span(
    doc_tokens: Sequence[str],
    unimproved_span: Tuple[int, int],
    orig_answer_text: str,
    tokenizer: tokenization.FullTokenizer,
) -> Tuple[int, int]:
  """Returns answer token spans that better match the annotated answer.

  This function is branched from the original BERT `run_squad.py` code.

  Usually question answer span annotations are character based. We first
  project them to whitespace-tokenized words (unigrams). But then after
  WordPiece tokenization, we can often find a "better match". For example:

    Question: What year was John Smith born?
    Context: The leader was John Smith (1895-1943).
    Answer: 1895

  The original whitespace-tokenized answer will be "(1895-1943).". However,
  after WordPiece tokenization, our tokens will be "( 1895 - 1943 ) .", so we
  can match the exact answer, 1895.

  The purpose of this function is to find such a "better match". However, this
  is not always possible. Consider the following:

    Question: What country is the top exporter of electronics?
    Context: The Japanese electronics industry is the largest in the world.
    Answer: Japan

  In this case, the annotator chose "Japan" as a character sub-span of the word
  "Japanese". Since our WordPiece tokenizer does not split "Japanese", we just
  use "Japanese" as the annotation. This is expected to be fairly rare.

  Args:
    doc_tokens: Sequence of Text, the wordpiece tokenized tokens of the doc.
    unimproved_span: Tuple of two ints, the unimproved answer token span. In
      the first example, it is the token span for "(" and ")".
    orig_answer_text: Text, the original answer text. In the first example, it
      is "1895".
    tokenizer: FullTokenizer, wordpiece tokenizer to tokenize the original
      answer text.

  Returns:
    Tuple of two ints, the improved answer token span. In the first example, it
    corresponds to the answer token span for "1895".
  """
  tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

  # Search every sub-span of the unimproved span (widest first for each start
  # position) for an exact match with the re-tokenized answer text.
  for new_begin in range(unimproved_span[0], unimproved_span[1] + 1):
    for new_end in range(unimproved_span[1], new_begin - 1, -1):
      text_span = " ".join(doc_tokens[new_begin:(new_end + 1)])
      if text_span == tok_answer_text:
        return new_begin, new_end

  return unimproved_span
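
# Example usage of `_improve_answer_span` (a minimal sketch; the vocab file
# path, the constructor arguments, and the exact sub-tokens are assumptions --
# the result depends on the actual wordpiece vocab):
#
#   tokenizer = tokenization.FullTokenizer(
#       vocab_file="/path/to/vocab.txt", do_lower_case=True)
#   # Wordpiece tokens for the whitespace word "(1895-1943)."
#   doc_tokens = ["(", "1895", "-", "1943", ")", "."]
#   _improve_answer_span(doc_tokens, (0, 5), "1895", tokenizer)
#   # Returns (1, 1) when the vocab keeps "1895" as a single token, i.e. the
#   # span is tightened from all of "(1895-1943)." down to just "1895".
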
def wordpiece_tokenize_with_indices(
    doc_unigrams: Sequence[str], tokenizer: tokenization.FullTokenizer
) -> Tuple[List[str], List[int], List[int]]:
  """Wordpiece tokenizes unigrams to tokens and returns indices mapping."""
  token_to_unigram_map = []
  unigram_to_token_map = []
  doc_tokens = []
  for (i, token) in enumerate(doc_unigrams):
    unigram_to_token_map.append(len(doc_tokens))
    sub_tokens = tokenizer.tokenize(token)
    token_to_unigram_map.extend([i] * len(sub_tokens))
    doc_tokens.extend(sub_tokens)
  return doc_tokens, unigram_to_token_map, token_to_unigram_map
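
# Example of the index maps returned by `wordpiece_tokenize_with_indices`
# (a sketch assuming an uncased vocab; the sub-token split shown is an
# assumption about that vocab):
#
#   tokens, unigram_to_token, token_to_unigram = (
#       wordpiece_tokenize_with_indices(["John", "Smith", "(1895-1943)."],
#                                       tokenizer))
#   # If "(1895-1943)." splits into ["(", "1895", "-", "1943", ")", "."]:
#   #   tokens           == ["john", "smith", "(", "1895", "-", "1943", ")", "."]
#   #   unigram_to_token == [0, 1, 2]                 # first token per unigram
#   #   token_to_unigram == [0, 1, 2, 2, 2, 2, 2, 2]  # source unigram per token
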
def get_sentencepiece_tokenized_text(
    text: str, tokenizer: tokenization.FullTokenizer) -> TokenizedText:
  """Gets SentencePiece TokenizedText for a text with indices mapping."""
  tokens = [six.ensure_text(tk, "utf-8") for tk in tokenizer.tokenize(text)]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  chars_to_tokens = []
  for i, token in enumerate(tokens):
    num_chars = len(token)
    if i == 0:
      # The leading SentencePiece underline of the first token does not
      # correspond to any character in the detokenized text.
      num_chars -= 1
    chars_to_tokens.extend([i] * num_chars)
  tokenized_text = TokenizedText()
  tokenized_text.text = sentencepiece_detokenize(tokens)
  tokenized_text.tokens = tokens
  tokenized_text.token_ids = token_ids
  tokenized_text.chars_to_tokens = chars_to_tokens
  return tokenized_text
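
# Example of the fields populated by `get_sentencepiece_tokenized_text`
# (a sketch; assumes `tokenizer` was built with a SentencePiece model, and the
# exact pieces shown are an assumption about that model):
#
#   tokenized = get_sentencepiece_tokenized_text("Saaan Franciscooo", tokenizer)
#   # With pieces ['▁Sa', 'aan', '▁Franc', 'isc', 'ooo']:
#   #   tokenized.text            == "Saaan Franciscooo"
#   #   tokenized.chars_to_tokens == [0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2,
#   #                                 3, 3, 3, 4, 4, 4]
#   # i.e. every character of `text` (the space included, via the '▁' of
#   # '▁Franc') maps to the index of the token it came from.
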
def find_candidate_mentions(
    input_text: Text,
    candidate: Text,
    tokenizer: tokenization.FullTokenizer,
    offset=0) -> Tuple[List[Text], List[Tuple[int, int]]]:
  """Finds the candidate string mentions in the sentence post tokenization.

  Args:
    input_text: The input for searching the candidate.
    candidate: The candidate to be searched for mentions in the input.
    tokenizer: The tokenizer to be used. For a BERT tokenizer, we assume an
      uncased vocab.
    offset: Offset to be added to all the span values.

  Returns:
    A tuple of (input_tokens_list, list_of_candidate_spans_in_the_list).

  Example:
    input = "Thisss is Saaan Franciscooo"
    candidate = "saan franciscooo"

    Let's say we are using the ALBERT tokenizer. Tokenizing the input would
    give:
      ['▁This', 'ss', '▁is', '▁Sa', 'aan', '▁Franc', 'isc', 'ooo']

    We return the tokens of the sentence. We also return [(3, 7)] representing
    the only span where the candidate occurs in the tokenized sentence. Note
    that the span is inclusive.
  """
  assert isinstance(tokenizer, tokenization.FullTokenizer)
  input_lower = input_text.lower()
  candidate_lower = candidate.lower()
  tokens = tokenizer.tokenize(input_text)

  if (not candidate_lower or not input_lower or
      candidate_lower not in input_lower):
    return (tokens, [])

  if tokenizer.sp_model is None:
    # BERT wordpiece tokenizer. We assume a tokenizer with a lower cased vocab
    # here, and do a simple sublist match of the candidate tokens against the
    # input text tokens.
    if not tokenizer.basic_tokenizer.do_lower_case:
      raise ValueError("BERT tokenizer should be lower cased.")
    candidate_tokens = tokenizer.tokenize(candidate_lower)
    candidate_len = len(candidate_tokens)
    candidate_spans = []
    for i in range(0, len(tokens)):
      if i + candidate_len <= len(tokens):
        if tokens[i:i + candidate_len] == candidate_tokens:
          candidate_spans.append((offset + i, offset + i + candidate_len - 1))
    return (tokens, candidate_spans)

  # SentencePiece tokenizer. Now that we know the candidate is present in the
  # input_text, we do a best effort matching of every character in input_text
  # to the token it came from.
  spiece_underline = tokenization.SPIECE_UNDERLINE.decode("utf-8")
  char_index_to_token_index = collections.OrderedDict()
  i = 0
  for (j, token) in enumerate(tokens):
    k = 0
    if token.startswith(spiece_underline):
      k += 1
    for c in token[k:len(token)]:
      c = c.lower()
      # Most chars, other than the spiece underline handled above, have a
      # corresponding character in the input_text.
      while i < len(input_lower) and _is_whitespace(input_lower[i]):
        # Spaces, tabs etc. in the input_text generally don't appear in the
        # tokens, so map them to the current token.
        char_index_to_token_index[i] = j
        i += 1
      if _is_whitespace(c):
        # This shouldn't generally happen - the ALBERT tokenizer collapses
        # whitespaces.
        continue
      if i < len(input_lower) and c != input_lower[i]:
        # Tokenizer probably has extra characters for this token.
        continue
      if i < len(input_lower):
        assert c == input_lower[i]
        char_index_to_token_index[i] = j
        i += 1

  if i != len(input_text):
    # Our best effort matching of chars to tokens failed. As a fallback, we
    # just match the given candidate with the entire input_text and return.
    # Because we know that the candidate is already present in the input_text,
    # it's better to assign the candidate to the entire input_text (which in
    # our case is a sentence) than to drop it altogether.
    return (tokens, [(offset, offset + len(tokens) - 1)])

  # We have now matched every char in the input to its corresponding token
  # index successfully.
  candidate_spans = []
  cand_len = len(candidate_lower)
  # Using re.finditer for the substring match seems to throw a weird Python
  # error -- "nothing to repeat at position 0" -- in some cases (the candidate
  # is used as an unescaped regex pattern), so we do a brute force substring
  # match instead.
  for start in range(0, len(input_lower)):
    if (start + cand_len <= len(input_lower) and
        input_lower[start:start + cand_len] == candidate_lower):
      end = start + cand_len - 1
      assert start in char_index_to_token_index, (
          "no mapping found for index %d for candidate %s" % (start, candidate))
      assert end in char_index_to_token_index, (
          "no mapping found for index %d for candidate %s" % (end, candidate))
      token_span_start = char_index_to_token_index[start]
      token_span_end = char_index_to_token_index[end]
      candidate_spans.append(
          (offset + token_span_start, offset + token_span_end))

  return (tokens, candidate_spans)
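
# Example usage of `find_candidate_mentions` (a sketch; the pieces shown are
# the ones from the docstring example above and depend on the actual
# SentencePiece model):
#
#   tokens, spans = find_candidate_mentions(
#       "Thisss is Saaan Franciscooo", "saan franciscooo", tokenizer, offset=10)
#   # tokens == ['▁This', 'ss', '▁is', '▁Sa', 'aan', '▁Franc', 'isc', 'ooo']
#   # spans  == [(13, 17)]  # inclusive token span (3, 7), shifted by offset=10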