def convert_examples_to_features(examples, tokenizer, max_query_length,
                                 is_training):
    features = []
    for (example_index, example) in enumerate(examples):
        query_tokens = tokenizer.tokenize(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[:max_query_length]

        # Map positions between the original whitespace-delimited tokens and
        # their WordPiece sub-tokens.
        tok_to_orig_index = []
        tok_to_orig_map = {}
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_map[len(all_doc_tokens)] = i
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position +
                                                     1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position,
                tokenizer, example.orig_answer_text)

        if 16 <= example_index < 20:  # log a handful of examples for inspection
            logger.info("*** Example ***")
            logger.info("example_index: %s" % (example_index))
            logger.info("tokens: %s" % " ".join(all_doc_tokens))
            if is_training:
                answer_text = " ".join(
                    all_doc_tokens[tok_start_position:(tok_end_position + 1)])
                logger.info("start_position: %d" % (tok_start_position))
                logger.info("end_position: %d" % (tok_end_position))
                logger.info("orig answer: %s" % (example.orig_answer_text))
                logger.info("answer: %s" % (answer_text))

        features.append(
            ExampleFeature(example_index=example_index,
                           query_tokens=query_tokens,
                           doc_tokens=all_doc_tokens,
                           tok_to_orig_map=tok_to_orig_map,
                           start_position=tok_start_position,
                           end_position=tok_end_position))
    return features
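
# A minimal usage sketch for the function above, assuming the WordPiece
# BertTokenizer from the `transformers` package and a hypothetical
# SimpleNamespace standing in for the example objects; ExampleFeature and
# _improve_answer_span are assumed to be defined elsewhere in this module.
from types import SimpleNamespace
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
example = SimpleNamespace(
    question_text="Where does she live?",
    doc_tokens=["She", "lives", "in", "Paris", "."],
    start_position=3,          # answer span over doc_tokens (training only)
    end_position=3,
    orig_answer_text="Paris",
)
features = convert_examples_to_features([example], tokenizer,
                                        max_query_length=64, is_training=True)
print(features[0].doc_tokens)       # WordPiece sub-tokens of the document
print(features[0].start_position)   # answer start in sub-token space
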
def convert_examples_to_features(examples, tokenizer, max_query_length,
                                 is_training, append_history):
    features = []
    for (example_index, example) in enumerate(examples):
        all_query_tokens = [tokenizer.tokenize(question_text) for question_text in example.questions]
        cur_query_tokens = all_query_tokens[-1]
        prev_query_tokens = all_query_tokens[:-1]
        if append_history:
            prev_query_tokens = prev_query_tokens[::-1]
        flat_prev_query_tokens = []
        for query_tokens in prev_query_tokens:
            flat_prev_query_tokens += query_tokens

        if len(cur_query_tokens) + len(flat_prev_query_tokens) + 1 <= max_query_length:
            if append_history:
                query_tokens = cur_query_tokens + ['[SEP]'] + flat_prev_query_tokens
            else:
                query_tokens = flat_prev_query_tokens + ['[SEP]'] + cur_query_tokens
        else:
            # Token budget left for previous questions after the current
            # question and the single [SEP].
            prev_query_len = max_query_length - 1 - len(cur_query_tokens)
            if prev_query_len <= 0:
                # The current question alone fills the budget; drop the history.
                query_tokens = cur_query_tokens[:max_query_length]
            elif append_history:
                query_tokens = cur_query_tokens + ['[SEP]'] + flat_prev_query_tokens[:prev_query_len]
            else:
                query_tokens = flat_prev_query_tokens[-prev_query_len:] + ['[SEP]'] + cur_query_tokens

        tok_to_orig_index = []
        tok_to_orig_map = {}
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_map[len(all_doc_tokens)] = i
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        # start/end position
        tok_start_position = None
        tok_end_position = None
        if is_training:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example.orig_answer_text)
        features.append(
            ExampleFeature(
                example_index=example_index,
                query_tokens=query_tokens,
                doc_tokens=all_doc_tokens,
                tok_to_orig_map=tok_to_orig_map,
                start_position=tok_start_position,
                end_position=tok_end_position,
                yes_no_flag=example.yes_no_flag,
                yes_no_ans=example.yes_no_ans,
                followup=example.followup))
    return features
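
# A standalone illustration of the history-joining order implemented above,
# using hypothetical word lists in place of WordPiece tokens (assumes the
# combined length fits within max_query_length):
history = [["who", "is", "she", "?"], ["where", "does", "she", "live", "?"]]
current = ["why", "?"]
# append_history=True: current question first, then history, most recent first.
print(current + ["[SEP]"] + sum(history[::-1], []))
# append_history=False: history in chronological order, then the current question.
print(sum(history, []) + ["[SEP]"] + current)
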
def convert_examples_to_features(examples, tokenizer, max_query_length,
                                 is_training, append_history):
    """
    @input format:
    if append_history is True, query_tokens=[query, prev_queries,]
    if append_history is False, query_tokens= [prev_queries, query]
    """
    features = []
    for (example_index, example) in enumerate(examples):
        all_query_tokens = [
            tokenizer.tokenize(question_text)
            for question_text in example.question_texts
        ]
        # Same tokenization as basic BERT; when appending history, reverse so
        # the current (last) question comes first.
        if append_history:
            all_query_tokens = all_query_tokens[::-1]
        flat_all_query_tokens = []
        for query_tokens in all_query_tokens:
            flat_all_query_tokens += query_tokens
        if append_history:
            query_tokens = flat_all_query_tokens[:max_query_length]
        else:
            query_tokens = flat_all_query_tokens[-1 * max_query_length:]

        # doc_tokens
        tok_to_orig_index = []
        # tok_to_orig_map:
        # map the token position in tokenized all_doc_tokens to
        # the token position of original text by doc_tokens
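        # e.g. with a WordPiece tokenizer (hypothetical tokenization):
        #   doc_tokens      = ["Johanson", "lives"]
        #   all_doc_tokens  = ["johan", "##son", "lives"]
        #   tok_to_orig_map = {0: 0, 1: 0, 2: 1}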
        tok_to_orig_map = {}
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            # the orig word is mapped to its first sub token
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_map[len(all_doc_tokens)] = i
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        # start/end position
        tok_start_position = None
        tok_end_position = None
        if is_training:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                # tok_end_position is the last sub token of orig end_position
                tok_end_position = orig_to_tok_index[example.end_position +
                                                     1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position,
                tokenizer, example.orig_answer_text)

        features.append(
            ExampleFeature(example_index=example_index,
                           query_tokens=query_tokens,
                           doc_tokens=all_doc_tokens,
                           tok_to_orig_map=tok_to_orig_map,
                           start_position=tok_start_position,
                           end_position=tok_end_position,
                           yes_no_flag=example.yes_no_flag,
                           yes_no_ans=example.yes_no_ans))
    return features
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training,
                                 append_history):
    """
    Features for each chunk from a document
    A document can have multiple equally-spaced chunks
    """
    unique_id = 1000000000
    features = []
    for (example_index, example) in enumerate(examples):
        # query_tokens
        all_query_tokens = [
            tokenizer.tokenize(question_text)
            for question_text in example.questions
        ]
        if append_history:
            all_query_tokens = all_query_tokens[::-1]
        flat_all_query_tokens = []
        for query_tokens in all_query_tokens:
            flat_all_query_tokens += query_tokens
        if append_history:
            query_tokens = flat_all_query_tokens[:max_query_length]
        else:
            query_tokens = flat_all_query_tokens[-1 * max_query_length:]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position +
                                                     1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position,
                tokenizer, example.orig_answer_text)
        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
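        # Each chunk is packed as
        #   [CLS] q_1 ... q_m [SEP] d_1 ... d_n [SEP] <zero padding>
        # with segment_ids 0 for the query segment and 1 for the document
        # segment; padding positions get segment id 0 and input_mask 0.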
        # sliding window to generate multiple document spans
        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans,
                                                       doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)
            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            start_position = None
            end_position = None
            yes_no_flag = None
            yes_no_ans = None
            followup = None
            if is_training:
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                if (example.start_position < doc_start
                        or example.end_position < doc_start
                        or example.start_position > doc_end
                        or example.end_position > doc_end):
                    continue
                doc_offset = len(query_tokens) + 2
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset
                yes_no_flag = example.yes_no_flag
                yes_no_ans = example.yes_no_ans
                followup = example.followup
            if 16 <= example_index < 20:  # log a handful of examples for inspection
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(tokens))
                if is_training:
                    answer_text = " ".join(
                        tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info("answer: %s" % (answer_text))
            features.append(
                ChunkFeature(unique_id=unique_id,
                             example_index=example_index,
                             doc_span_index=doc_span_index,
                             tokens=tokens,
                             token_to_orig_map=token_to_orig_map,
                             token_is_max_context=token_is_max_context,
                             input_ids=input_ids,
                             input_mask=input_mask,
                             segment_ids=segment_ids,
                             start_position=start_position,
                             end_position=end_position,
                             yes_no_flag=yes_no_flag,
                             yes_no_ans=yes_no_ans,
                             followup=followup))
            unique_id += 1
    return features
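
# A quick standalone check of the sliding-window chunking used above. The
# helper name make_doc_spans is introduced here only for illustration, with
# hypothetical numbers: a 250-token document, 128 tokens per chunk, stride 64.
import collections

_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    # Same windowing logic as inside convert_examples_to_features above.
    doc_spans, start_offset = [], 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

print(make_doc_spans(250, 128, 64))
# -> [DocSpan(start=0, length=128), DocSpan(start=64, length=128),
#     DocSpan(start=128, length=122)]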