def _get_answer(self, context, context_ids, answer_start, answer_end):
    """Recovers the answer text from subtoken indices into the context."""
    encoder = load_encoder(self.config.vocab_path)
    subtokens = [
        encoder._subtoken_id_to_subtoken_string(s) for s in context_ids
    ]
    # Normalize everything to unicode before span matching.
    if not isinstance(subtokens[0], unicode):
        subtokens = [x.decode('utf-8') for x in subtokens]
    if not isinstance(context, unicode):
        context = context.decode('utf-8')
    assert isinstance(context, unicode)
    assert isinstance(subtokens[0], unicode)
    # Character-offset span of each subtoken within the original context.
    spans = tokenizer_util.match_subtokens_to_string(context, subtokens)
    start = spans[answer_start][0]
    end = spans[answer_end][1]  # + 1
    text = context[start:end]
    return text
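
# Illustrative sketch (not part of the original module): _get_answer relies
# on tokenizer_util.match_subtokens_to_string returning one (start, end)
# character-offset pair per subtoken, with an exclusive end offset. Under
# that assumption, the answer text falls out of a single slice of the
# original context. The spans and the _demo_get_answer name below are
# hypothetical stand-ins.
def _demo_get_answer():
    context = u'The quick brown fox'
    spans = [(0, 3), (4, 9), (10, 15), (16, 19)]
    answer_start, answer_end = 2, 3  # subtoken indices of u'brown', u'fox'
    start = spans[answer_start][0]
    end = spans[answer_end][1]
    assert context[start:end] == u'brown fox'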

def get_answer_index(context, context_tokens, answer_start, answer):
    """Maps a character-offset answer span to token start/end indices."""
    assert isinstance(answer, unicode)
    assert isinstance(context, unicode)
    assert isinstance(context_tokens[0], unicode)
    spans = tokenizer_util.match_subtokens_to_string(context, context_tokens)
    answer_end = answer_start + len(answer)
    word_answer_start = None
    word_answer_end = None
    for word_idx, (start, _) in enumerate(spans):
        if (start <= answer_start and
                # Check that we aren't a part of the same token
                (word_answer_start is None or
                 spans[word_answer_start][0] != start)):
            word_answer_start = word_idx
        if start < answer_end:
            word_answer_end = word_idx
    assert word_answer_start <= word_answer_end, (
        context, context_tokens, answer_start, answer)
    return word_answer_start, word_answer_end
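
# Self-contained sketch of the forward direction get_answer_index performs:
# mapping a character-offset answer span to token indices. The left-to-right
# span matching below is a simplified stand-in for
# tokenizer_util.match_subtokens_to_string, assuming tokens appear in order
# in the context; the _demo_answer_index name is hypothetical.
def _demo_answer_index():
    context = u'The quick brown fox'
    tokens = [u'The', u'quick', u'brown', u'fox']
    # Simplified span matching: locate each token left-to-right.
    spans, cursor = [], 0
    for tok in tokens:
        start = context.index(tok, cursor)
        spans.append((start, start + len(tok)))
        cursor = start + len(tok)
    # spans == [(0, 3), (4, 9), (10, 15), (16, 19)]
    answer = u'brown fox'
    answer_start = context.index(answer)       # character offset 10
    answer_end = answer_start + len(answer)    # character offset 19
    # Same scan as get_answer_index: last token starting at or before the
    # answer start, and last token starting before the answer end.
    word_start = max(i for i, (s, _) in enumerate(spans) if s <= answer_start)
    word_end = max(i for i, (s, _) in enumerate(spans) if s < answer_end)
    assert (word_start, word_end) == (2, 3)  # u'brown' .. u'fox'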