Exemplo n.º 1
def _get_wordpiece_detokenized_text(
    token_span: _SpanType, raw_prediction: _RawPredictionType,
    tokenizer: tokenization.FullTokenizer) -> str:
  """Returns the normalized answer text covered by a WordPiece token span.

  Args:
    token_span: (begin, end) indices into the long input token ids; `end` is
      inclusive.
    raw_prediction: Mapping that holds the input token ids under the
      "long_token_ids" key.
    tokenizer: Tokenizer used to map the token ids back to WordPiece tokens.

  Returns:
    The result of `data_utils.wordpiece_tokens_to_normalized_text` applied to
    the tokens inside the span.
  """
  begin, end = token_span[0], token_span[1]
  # The span end is inclusive, hence the `+ 1` on the slice bound.
  span_ids = raw_prediction["long_token_ids"][begin:end + 1]
  span_tokens = tokenizer.convert_ids_to_tokens(span_ids)
  return data_utils.wordpiece_tokens_to_normalized_text(span_tokens)
Exemplo n.º 2
def _get_sentencepiece_detokenized_text(token_span: _SpanType,
                                        raw_prediction: _RawPredictionType,
                                        tokenizer: tokenization.FullTokenizer):
  """Returns the detokenized answer text for a SentencePiece token span.

  Args:
    token_span: (begin, end) indices into the long input token ids; `end` is
      inclusive.
    raw_prediction: Mapping that holds the input token ids under the
      "long_token_ids" key. The sliced ids must support `.tolist()`
      (presumably a NumPy array -- confirm against the caller).
    tokenizer: Tokenizer used to map the token ids back to SentencePiece
      tokens.

  Returns:
    The result of `data_utils.sentencepiece_detokenize` applied to the tokens
    inside the span.
  """
  first, last = token_span[0], token_span[1]
  # The span end is inclusive, hence the `+ 1` on the slice bound.
  span_ids = raw_prediction["long_token_ids"][first:last + 1].tolist()
  pieces = tokenizer.convert_ids_to_tokens(span_ids)
  return data_utils.sentencepiece_detokenize(pieces)
Exemplo n.º 3
def get_sentencepiece_tokenized_text(
    text: str, tokenizer: tokenization.FullTokenizer) -> TokenizedText:
  """Gets SentencePiece TokenizedText for a text with indices mapping.

  Args:
    text: Raw text to tokenize.
    tokenizer: Tokenizer providing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    A `TokenizedText` with:
      text: The detokenized token sequence (not the raw input `text`).
      tokens: The SentencePiece tokens as unicode text.
      token_ids: The vocabulary ids of `tokens`.
      chars_to_tokens: One token index per token character, except that the
        first token contributes one character fewer.
  """
  tokens = [six.ensure_text(tk, "utf-8") for tk in tokenizer.tokenize(text)]
  chars_to_tokens = []
  for i, token in enumerate(tokens):
    num_chars = len(token)
    if i == 0:
      # NOTE(review): presumably the first token starts with the SentencePiece
      # whitespace marker, which detokenization drops from the start of the
      # text -- confirm against `sentencepiece_detokenize`.
      num_chars -= 1
    chars_to_tokens.extend([i] * num_chars)
  # Convert the tokens to ids exactly once.
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  tokenized_text = TokenizedText()
  tokenized_text.text = sentencepiece_detokenize(tokens)
  tokenized_text.tokens = tokens
  tokenized_text.token_ids = token_ids
  tokenized_text.chars_to_tokens = chars_to_tokens
  return tokenized_text
Exemplo n.º 4
def _improve_answer_span(
    doc_tokens: Sequence[str],
    unimproved_span: Tuple[int, int],
    orig_answer_text: str,
    tokenizer: tokenization.FullTokenizer,
) -> Tuple[int, int]:
  """Returns answer token spans that better match the annotated answer.

  This function is branched from the original BERT `run_squad.py` code.

  Usually question answer span annotations are character based. We first project
  them to whitespace-tokenized words (unigrams). But then after WordPiece
  tokenization, we can often find a "better match". For example:

    Question: What year was John Smith born?
    Context: The leader was John Smith (1895-1943).
    Answer: 1895

  The original whitespace-tokenized answer will be "(1895-1943).". However
  after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
  the exact answer, 1895. The purpose of this function is to find such "better
  matches".

  However, this is not always possible. Consider the following:

    Question: What country is the top exporter of electronics?
    Context: The Japanese electronics industry is the largest in the world.
    Answer: Japan

  In this case, the annotator chose "Japan" as a character sub-span of
  the word "Japanese". Since our WordPiece tokenizer does not split
  "Japanese", we just use "Japanese" as the annotation. This is expected to be
  fairly rare.

  Args:
    doc_tokens: Sequence of Text, the wordpiece tokenized tokens of the doc.
    unimproved_span: Tuple of two ints, the unimproved answer token span (both
      ends inclusive). In the first example, it is the token span of the
      whitespace word "(1895-1943).".
    orig_answer_text: Text, the original answer text. In the first example, it
      is "1895".
    tokenizer: FullTokenizer, wordpiece tokenizer to tokenize the original
      answer text.

  Returns:
    Tuple of two ints, the improved answer token span. In the first example, it
    corresponds to the answer token span for "1895". When no sub-span matches,
    `unimproved_span` is returned unchanged.
  """
  tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

  # Try every sub-span of the unimproved span: earliest begin first and, for
  # each begin, the longest candidate first.
  for new_begin in range(unimproved_span[0], unimproved_span[1] + 1):
    for new_end in range(unimproved_span[1], new_begin - 1, -1):
      text_span = " ".join(doc_tokens[new_begin:(new_end + 1)])
      if text_span == tok_answer_text:
        return new_begin, new_end

  return unimproved_span
Exemplo n.º 5
def wordpiece_tokenize_with_indices(
    doc_unigrams: Sequence[str], tokenizer: tokenization.FullTokenizer
) -> Tuple[List[str], List[int], List[int]]:
  """Wordpiece tokenizes unigrams to tokens and returns indices mapping.

  Args:
    doc_unigrams: The unigrams (words) of the document, in order.
    tokenizer: Tokenizer whose `tokenize` splits one unigram into sub-tokens.

  Returns:
    A tuple `(doc_tokens, unigram_to_token_map, token_to_unigram_map)`:
      doc_tokens: All sub-tokens, in document order.
      unigram_to_token_map: For each unigram, the index in `doc_tokens` of its
        first sub-token.
      token_to_unigram_map: For each token, the index of the unigram it was
        produced from.
  """
  token_to_unigram_map = []
  unigram_to_token_map = []
  doc_tokens = []
  for i, unigram in enumerate(doc_unigrams):
    # Record where this unigram's tokens start before appending them.
    unigram_to_token_map.append(len(doc_tokens))
    sub_tokens = tokenizer.tokenize(unigram)
    token_to_unigram_map.extend([i] * len(sub_tokens))
    doc_tokens.extend(sub_tokens)
  return doc_tokens, unigram_to_token_map, token_to_unigram_map
Exemplo n.º 6
def get_wordpiece_tokenized_text(
    text: str, tokenizer: tokenization.FullTokenizer) -> TokenizedText:
  """Builds the WordPiece `TokenizedText` of `text` with its index mappings.

  The text is first split with `whitespace_split_with_indices`, then each
  unigram is WordPiece tokenized with `wordpiece_tokenize_with_indices`.

  Args:
    text: Raw text to tokenize.
    tokenizer: Tokenizer used for WordPiece splitting and id lookup.

  Returns:
    A `TokenizedText` carrying the original text, its tokens and token ids,
    the unigrams, and the chars->unigrams, unigrams->tokens and
    tokens->unigrams mappings.
  """
  # The second value returned by the whitespace split is not needed here.
  words, _, char_to_word = whitespace_split_with_indices(text)
  pieces, word_to_piece, piece_to_word = wordpiece_tokenize_with_indices(
      words, tokenizer)
  piece_ids = tokenizer.convert_tokens_to_ids(pieces)

  result = TokenizedText()
  result.text = text
  result.tokens = pieces
  result.token_ids = piece_ids
  result.unigrams = words
  result.chars_to_unigrams = char_to_word
  result.unigrams_to_tokens = word_to_piece
  result.tokens_to_unigrams = piece_to_word
  return result
Exemplo n.º 7
def find_candidate_mentions(
    input_text: Text,
    candidate: Text,
    tokenizer: tokenization.FullTokenizer,
    offset=0) -> Tuple[List[Text], List[Tuple[int, int]]]:
  """Finds the candidate string mentions in the sentence post tokenization.

  Args:
    input_text: The input for searching the candidate.
    candidate: The candidate to be searched for mentions in the input.
    tokenizer: The tokenizer to be used. For BERT tokenizer, we assume an
      uncased vocab.
    offset: Offset to be added to all the span values.

  Returns:
    A tuple of (input_tokens_list, list_of_candidate_spans_in_the_list). Each
    span is a (start, end) pair of token indices with an inclusive end.

  Raises:
    ValueError: If a BERT (non-SentencePiece) tokenizer is not lower cased.

  Example:
    input = "Thisss is Saaan Franciscooo"
    candidate = "saan franciscooo"

    Let's say we are using the ALBERT tokenizer. Tokenizing the input would
    give:
    ['▁This', 'ss', '▁is', '▁Sa', 'aan', '▁Franc', 'isc', 'ooo']

    We return the tokens of the sentence.
    We also return [(3, 7)] representing the only span where the candidate
    occurs in the tokenized sentence. Note that the span is inclusive.
  """
  assert isinstance(tokenizer, tokenization.FullTokenizer)

  input_lower = input_text.lower()
  candidate_lower = candidate.lower()
  tokens = tokenizer.tokenize(input_text)
  if (not candidate_lower or not input_lower or
      candidate_lower not in input_lower):
    return (tokens, [])

  # NOTE(review): given the isinstance assertion above, this branch always
  # returns. The best-effort matching further below only runs when assertions
  # are disabled and a different tokenizer type is passed in.
  if isinstance(tokenizer, tokenization.FullTokenizer):
    # We assume a tokenizer with lower cased vocab here. We do a simple
    # substring match of the candidate tokens to the input text tokens.
    if (tokenizer.sp_model is None and
        not tokenizer.basic_tokenizer.do_lower_case):
      raise ValueError("BERT tokenizer should be lower cased.")
    candidate_tokens = tokenizer.tokenize(candidate_lower)
    candidate_len = len(candidate_tokens)
    if not candidate_len:
      # A candidate that tokenizes to nothing (e.g. whitespace only) would
      # otherwise "match" at every position with an inverted (i, i - 1) span.
      return (tokens, [])
    candidate_spans = [
        (offset + i, offset + i + candidate_len - 1)
        for i in range(len(tokens) - candidate_len + 1)
        if tokens[i:i + candidate_len] == candidate_tokens
    ]
    return (tokens, candidate_spans)

  # Now that we know the candidate is present in the input_text, we do a
  # best effort matching.
  spiece_underline = tokenization.SPIECE_UNDERLINE.decode("utf-8")
  char_index_to_token_index = collections.OrderedDict()
  i = 0
  for (j, token) in enumerate(tokens):
    # Skip the leading SentencePiece underline marker, if present.
    k = 0
    if token.startswith(spiece_underline):
      k += 1
    for c in token[k:len(token)]:
      c = c.lower()
      # Most chars, in general, other than the special_token in tokens have a
      # corresponding mapping in the input_text.
      while i < len(input_lower) and _is_whitespace(input_lower[i]):
        # To handle cases like a space etc in the input_text. Spaces, tabs etc.
        # generally don't appear in the tokens.
        char_index_to_token_index[i] = j
        i += 1
      if _is_whitespace(c):
        # This shouldn't generally happen - ALBERT tokenizer collapses
        # whitespaces.
        continue
      if i >= len(input_lower) or c != input_lower[i]:
        # Tokenizer probably has extra characters for this token. The bounds
        # check comes first so that running past the end of the input is
        # treated as a mismatch instead of raising IndexError.
        break
      char_index_to_token_index[i] = j
      i += 1

  # `i` walks `input_lower`, so completion is checked against its length
  # (lower-casing can change the length of some Unicode strings).
  if i != len(input_lower):
    # Our best effort matching chars to tokens failed. As a fallback, we will
    # just match the given candidate with the entire input_text and return.
    # Because we know that the candidate is already present in the input_text,
    # it's better to assign the candidate to the entire input_text (which in
    # our case is a sentence), rather than dropping it altogether.
    return (tokens, [(offset, offset + len(tokens) - 1)])

  # We now have matched every char in the input to its corresponding token
  # index successfully.
  candidate_spans = []
  cand_len = len(candidate_lower)
  # Using re.finditer for substring match seems to be throwing a weird python
  # error -- "nothing to repeat at position 0", in some cases. So doing a
  # brute force substring match.
  for start in range(len(input_lower) - cand_len + 1):
    if input_lower[start:start + cand_len] != candidate_lower:
      continue
    end = start + cand_len - 1
    assert start in char_index_to_token_index, (
        "no mapping found for index %d for candidate %s " % (start, candidate))
    assert end in char_index_to_token_index, (
        "no mapping found for index %d for candidate %s " % (end, candidate))
    token_span_start = char_index_to_token_index[start]
    token_span_end = char_index_to_token_index[end]
    candidate_spans.append(
        (offset + token_span_start, offset + token_span_end))

  return (tokens, candidate_spans)