from collections import OrderedDict

import numpy as np

# `whitespace_tokenize` and `InputFeatures` are assumed to come from the
# repository's own data utilities; the `transformers` helper imported below is
# a drop-in substitute for the whitespace splitter.
from transformers.models.bert.tokenization_bert import whitespace_tokenize


def tokenize_input_sequence_to_subtokens(examples, tokenizer, clip_length):
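    """Collect sub-token length statistics for a list of {"query", "context"} dicts.

    Returns an OrderedDict with the max/min/avg sub-token lengths of the
    queries, contexts and full inputs (query + context + 3 special tokens),
    the number of examples, and how many examples reach or exceed
    `clip_length`.
    """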
    len_of_queries = []
    len_of_contexts = []
    len_of_inputs = []

    summary_of_inputs = OrderedDict()
    oob_counter = 0

    for example_idx, example_item in enumerate(examples):
        context_subtokens_lst = []
        query_item = example_item["query"]
        context_item = example_item["context"]
        query_subtokens = tokenizer.tokenize(query_item)
        context_whitespace_tokens = whitespace_tokenize(context_item)
        for word_item in context_whitespace_tokens:
            tmp_subword_lst = tokenizer.tokenize(word_item)
            context_subtokens_lst.extend(tmp_subword_lst)
        len_of_queries.append(len(query_subtokens))
        len_of_contexts.append(len(context_subtokens_lst))
        # +3 accounts for the [CLS] token and the two [SEP] tokens.
        len_of_inputs.append(
            len(query_subtokens) + len(context_subtokens_lst) + 3)
        # Count examples whose query + context sub-tokens (special tokens not
        # included) already reach the clip length.
        if len(context_subtokens_lst) + len(query_subtokens) >= clip_length:
            oob_counter += 1

    summary_of_inputs["max_query"] = max(len_of_queries)
    summary_of_inputs["max_context"] = max(len_of_contexts)
    summary_of_inputs["max_inputs"] = max(len_of_inputs)

    summary_of_inputs["min_query"] = min(len_of_queries)
    summary_of_inputs["min_context"] = min(len_of_contexts)
    summary_of_inputs["min_inputs"] = min(len_of_inputs)

    summary_of_inputs["avg_query"] = sum(len_of_queries) / len(len_of_queries)
    summary_of_inputs["avg_context"] = sum(len_of_contexts) / len(
        len_of_contexts)
    summary_of_inputs["avg_inputs"] = sum(len_of_inputs) / len(len_of_inputs)

    summary_of_inputs["num_examples"] = len(len_of_queries)
    summary_of_inputs["oob_examples"] = oob_counter

    return summary_of_inputs
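

# Usage sketch (illustrative): the checkpoint name and the toy examples below
# are assumptions, not values taken from the original repository.
def _demo_length_stats():
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    examples = [
        {"query": "Find all person entities in the text.",
         "context": "Barack Obama visited Berlin in 2013."},
        {"query": "Find all location entities in the text.",
         "context": "The Rhine flows through Basel and Cologne."},
    ]
    stats = tokenize_input_sequence_to_subtokens(examples, tokenizer,
                                                 clip_length=128)
    print(stats["max_inputs"], stats["avg_inputs"], stats["oob_examples"])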

# Example #2

def convert_examples_to_features(examples,
                                 tokenizer,
                                 label_lst,
                                 max_seq_length,
                                 is_training=True,
                                 allow_impossible=True,
                                 pad_sign=True):
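    """Convert MRC-style NER examples into a list of `InputFeatures`.

    Each example's query and context are WordPiece-tokenized, whitespace-level
    start/end labels are projected onto sub-token positions, gold spans are
    marked in a (max_seq_length x max_seq_length) matrix, and the sequence is
    packed as [CLS] query [SEP] context [SEP] with zero padding when
    `pad_sign` is True.
    """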
    label_map = {tmp: idx for idx, tmp in enumerate(label_lst)}
    features = []

    for (example_idx, example) in enumerate(examples):

        query_tokens = tokenizer.tokenize(example.query_item)
        whitespace_doc = whitespace_tokenize(example.context_item)
        # Room left for the document: max_seq_length minus the query and the
        # three special tokens ([CLS] and two [SEP]).
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        if len(example.start_position) == 0 and len(example.end_position) == 0:
            # No gold spans for this query: all-zero start/end labels and an
            # all-zero span matrix.
            doc_start_pos = []
            doc_end_pos = []
            all_doc_tokens = []

            for token_item in whitespace_doc:
                tmp_subword_lst = tokenizer.tokenize(token_item)
                all_doc_tokens.extend(tmp_subword_lst)
            doc_start_pos = [0] * len(all_doc_tokens)
            doc_end_pos = [0] * len(all_doc_tokens)
            doc_span_pos = np.zeros((max_seq_length, max_seq_length),
                                    dtype=int)

        else:
            doc_start_pos = []
            doc_end_pos = []
            doc_span_pos = np.zeros((max_seq_length, max_seq_length),
                                    dtype=int)

            all_doc_tokens = []
            offset_idx_dict = {}

            fake_start_pos = [0] * len(whitespace_doc)
            fake_end_pos = [0] * len(whitespace_doc)

            for start_item in example.start_position:
                fake_start_pos[start_item] = 1
            for end_item in example.end_position:
                fake_end_pos[end_item] = 1

            # Project the whitespace-token level start/end labels onto
            # WordPiece sub-token positions.
            for idx, (token, start_label, end_label) in enumerate(
                    zip(whitespace_doc, fake_start_pos, fake_end_pos)):
                tmp_subword_lst = tokenizer.tokenize(token)

                if len(tmp_subword_lst) > 1:
                    offset_idx_dict[idx] = len(all_doc_tokens)

                    doc_start_pos.append(start_label)
                    doc_start_pos.extend([0] * (len(tmp_subword_lst) - 1))

                    doc_end_pos.append(end_label)
                    doc_end_pos.extend([0] * (len(tmp_subword_lst) - 1))

                    all_doc_tokens.extend(tmp_subword_lst)
                elif len(tmp_subword_lst) == 1:
                    offset_idx_dict[idx] = len(all_doc_tokens)
                    doc_start_pos.append(start_label)
                    doc_end_pos.append(end_label)
                    all_doc_tokens.extend(tmp_subword_lst)
                else:
                    raise ValueError(
                        "Tokenizer returned no sub-tokens for token: {!r}".format(token))

            # Mark gold entity spans; len(query_tokens) + 2 shifts document
            # sub-token offsets past [CLS], the query tokens and the first [SEP].
            for span_item in example.span_position:
                s_idx, e_idx = span_item.split(";")
                start_offset = len(query_tokens) + 2 + offset_idx_dict[int(s_idx)]
                end_offset = len(query_tokens) + 2 + offset_idx_dict[int(e_idx)]
                if start_offset <= max_tokens_for_doc and end_offset <= max_tokens_for_doc:
                    doc_span_pos[start_offset][end_offset] = 1

        assert len(all_doc_tokens) == len(doc_start_pos)
        assert len(all_doc_tokens) == len(doc_end_pos)
        assert len(doc_start_pos) == len(doc_end_pos)

        # Truncate the document so [CLS] + query + [SEP] + doc + [SEP] fits max_seq_length.
        if len(all_doc_tokens) >= max_tokens_for_doc:
            all_doc_tokens = all_doc_tokens[:max_tokens_for_doc]
            doc_start_pos = doc_start_pos[:max_tokens_for_doc]
            doc_end_pos = doc_end_pos[:max_tokens_for_doc]
        if len(example.start_position) == 0 and len(example.end_position) == 0:
            doc_span_pos = np.zeros((max_seq_length, max_seq_length),
                                    dtype=int)

        # input_mask: 1 for real tokens, 0 for padding (only real tokens are
        # attended to).
        # segment_ids: 0 for the [CLS] + query + [SEP] segment, 1 for the
        # document segment.
        input_tokens = []
        segment_ids = []
        input_mask = []
        start_pos = []
        end_pos = []

        input_tokens.append("[CLS]")
        segment_ids.append(0)
        input_mask.append(1)
        start_pos.append(0)
        end_pos.append(0)

        for query_item in query_tokens:
            input_tokens.append(query_item)
            segment_ids.append(0)
            input_mask.append(1)
            start_pos.append(0)
            end_pos.append(0)

        input_tokens.append("[SEP]")
        segment_ids.append(0)
        input_mask.append(1)
        start_pos.append(0)
        end_pos.append(0)

        input_tokens.extend(all_doc_tokens)
        segment_ids.extend([1] * len(all_doc_tokens))
        input_mask.extend([1] * len(all_doc_tokens))
        start_pos.extend(doc_start_pos)
        end_pos.extend(doc_end_pos)

        input_tokens.append("[SEP]")
        segment_ids.append(1)
        input_mask.append(1)
        start_pos.append(0)
        end_pos.append(0)

        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)

        # zero-padding up to the sequence length
        if len(input_ids) < max_seq_length and pad_sign:
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            start_pos += padding
            end_pos += padding

        features.append(
            InputFeatures(unique_id=example.qas_id,
                          tokens=input_tokens,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          start_position=start_pos,
                          end_position=end_pos,
                          span_position=doc_span_pos.tolist(),
                          is_impossible=example.is_impossible,
                          ner_cate=label_map[example.ner_cate]))

    return features
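

# Usage sketch (illustrative) for the list-returning variant above. The
# example object is a SimpleNamespace stand-in exposing the attribute names
# the converter expects; it assumes `InputFeatures` from the repository's
# data utilities is importable, and the checkpoint and labels are placeholders.
def _demo_convert_to_features():
    from types import SimpleNamespace

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    example = SimpleNamespace(
        qas_id="demo-0",
        query_item="Find all person entities in the text.",
        context_item="Barack Obama visited Berlin in 2013.",
        start_position=[0],     # whitespace-token index of "Barack"
        end_position=[1],       # whitespace-token index of "Obama"
        span_position=["0;1"],  # "start;end" pairs at whitespace-token level
        is_impossible=False,
        ner_cate="PER",
    )
    features = convert_examples_to_features([example], tokenizer,
                                             label_lst=["PER", "LOC", "ORG"],
                                             max_seq_length=64)
    first = next(iter(features))
    print(first.tokens[:8])
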
def convert_examples_to_features(examples,
                                 tokenizer,
                                 label_lst,
                                 max_seq_length,
                                 is_training=True,
                                 allow_impossible=True,
                                 pad_sign=True,
                                 entity_scheme="bes"):
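    """Generator variant of `convert_examples_to_features`.

    Yields one `InputFeatures` per example instead of building a list. When
    `entity_scheme` is "bes" (the default here is an assumption), gold spans
    are marked symmetrically in the span matrix and a span label mask is
    built over the non-padded positions.
    """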

    print("EXAMPLES LENGTH", len(examples))

    label_map = {tmp: idx for idx, tmp in enumerate(label_lst)}
    features = []

    for (example_idx, example) in enumerate(examples):

        query_tokens = tokenizer.tokenize(example.query_item)
        whitespace_doc = whitespace_tokenize(example.context_item)
        # Room left for the document: max_seq_length minus the query and the
        # three special tokens ([CLS] and two [SEP]).
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        if len(example.start_position) == 0 and len(example.end_position) == 0:
            # No gold spans for this query: all-zero start/end labels and an
            # all-zero span matrix.
            doc_start_pos = []
            doc_end_pos = []
            all_doc_tokens = []

            for token_item in whitespace_doc:
                tmp_subword_lst = tokenizer.tokenize(token_item)
                all_doc_tokens.extend(tmp_subword_lst)
            doc_start_pos = [0] * len(all_doc_tokens)
            doc_end_pos = [0] * len(all_doc_tokens)
            doc_span_pos = np.zeros((max_seq_length, max_seq_length),
                                    dtype=int)

        else:
            doc_start_pos = []
            doc_end_pos = []
            doc_span_pos = np.zeros((max_seq_length, max_seq_length),
                                    dtype=int)

            all_doc_tokens = []
            offset_idx_dict = {}

            fake_start_pos = [0] * len(whitespace_doc)
            fake_end_pos = [0] * len(whitespace_doc)

            for start_item in example.start_position:
                fake_start_pos[start_item] = 1
            for end_item in example.end_position:
                fake_end_pos[end_item] = 1

            # Project the whitespace-token level start/end labels onto
            # WordPiece sub-token positions.
            for idx, (token, start_label, end_label) in enumerate(
                    zip(whitespace_doc, fake_start_pos, fake_end_pos)):
                tmp_subword_lst = tokenizer.tokenize(token)

                if len(tmp_subword_lst) > 1:
                    offset_idx_dict[idx] = len(all_doc_tokens)

                    doc_start_pos.append(start_label)
                    doc_start_pos.extend([0] * (len(tmp_subword_lst) - 1))

                    doc_end_pos.append(end_label)
                    doc_end_pos.extend([0] * (len(tmp_subword_lst) - 1))

                    all_doc_tokens.extend(tmp_subword_lst)
                elif len(tmp_subword_lst) == 1:
                    offset_idx_dict[idx] = len(all_doc_tokens)
                    doc_start_pos.append(start_label)
                    doc_end_pos.append(end_label)
                    all_doc_tokens.extend(tmp_subword_lst)
                else:
                    # The tokenizer produced no sub-tokens for this word; skip it.
                    # No offset is recorded, so spans touching it are dropped below.
                    print("Tokenizer returned no sub-tokens for token:", repr(token))

            if entity_scheme == "bes":
                # Mark gold spans symmetrically; len(query_tokens) + 2 shifts
                # document sub-token offsets past [CLS], the query tokens and
                # the first [SEP].
                for span_item in example.span_position:
                    s_idx, e_idx = span_item.split(";")
                    if int(s_idx) not in offset_idx_dict or int(e_idx) not in offset_idx_dict:
                        # A span boundary fell on a word the tokenizer dropped above.
                        continue
                    start_offset = offset_idx_dict[int(s_idx)]
                    end_offset = offset_idx_dict[int(e_idx)]
                    if start_offset <= max_tokens_for_doc and end_offset <= max_tokens_for_doc:
                        doc_span_pos[len(query_tokens) + 2 + start_offset][
                            len(query_tokens) + 2 + end_offset] = 1
                        doc_span_pos[len(query_tokens) + 2 + end_offset][
                            len(query_tokens) + 2 + start_offset] = 1

        assert len(all_doc_tokens) == len(doc_start_pos)
        assert len(all_doc_tokens) == len(doc_end_pos)
        assert len(doc_start_pos) == len(doc_end_pos)

        # Truncate the document so [CLS] + query + [SEP] + doc + [SEP] fits max_seq_length.
        if len(all_doc_tokens) >= max_tokens_for_doc:
            all_doc_tokens = all_doc_tokens[:max_tokens_for_doc]
            doc_start_pos = doc_start_pos[:max_tokens_for_doc]
            doc_end_pos = doc_end_pos[:max_tokens_for_doc]
        if len(example.start_position) == 0 and len(example.end_position) == 0:
            doc_span_pos = np.zeros((max_seq_length, max_seq_length),
                                    dtype=int)

        # input_mask:
        #   the mask has 1 for real tokens and 0 for padding tokens.
        #   only real tokens are attended to.
        # segment_ids:
        #   segment token indices to indicate first and second portions of the inputs.
        input_tokens = []
        segment_ids = []
        input_mask = []
        start_pos = []
        end_pos = []

        input_tokens.append("[CLS]")
        segment_ids.append(0)
        start_pos.append(0)
        end_pos.append(0)

        for query_item in query_tokens:
            input_tokens.append(query_item)
            segment_ids.append(0)
            start_pos.append(0)
            end_pos.append(0)

        input_tokens.append("[SEP]")
        segment_ids.append(0)
        input_mask.append(1)
        start_pos.append(0)
        end_pos.append(0)

        input_tokens.extend(all_doc_tokens)
        segment_ids.extend([1] * len(all_doc_tokens))
        start_pos.extend(doc_start_pos)
        end_pos.extend(doc_end_pos)

        input_tokens.append("[SEP]")
        segment_ids.append(1)
        start_pos.append(0)
        end_pos.append(0)
        # Every token built so far is real; padding zeros are appended below.
        input_mask = [1] * len(input_tokens)
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)

        if entity_scheme == "bes":
            # Allow span predictions only inside the non-padded part of the
            # sequence (from the end of the query to the last real token).
            span_label_mask = np.zeros((max_seq_length, max_seq_length),
                                       dtype=int)
            span_label_mask[len(query_tokens):len(input_ids),
                            len(query_tokens):len(input_ids)] = 1
        else:
            span_label_mask = None

        # zero-padding up to the sequence length
        if len(input_ids) < max_seq_length and pad_sign:
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            start_pos += padding
            end_pos += padding

        input_ids = np.array(input_ids, dtype=np.int32)
        input_mask = np.array(input_mask, dtype=np.int32)
        segment_ids = np.array(segment_ids, dtype=np.int32)
        start_pos = np.array(start_pos, dtype=np.int32)
        end_pos = np.array(end_pos, dtype=np.int32)
        doc_span_pos = np.array(doc_span_pos, dtype=np.int32)
        if span_label_mask is not None:
            span_label_mask = np.array(span_label_mask, dtype=np.int32)

        input_features = InputFeatures(unique_id=example.qas_id,
                                       tokens=input_tokens,
                                       input_ids=input_ids,
                                       input_mask=input_mask,
                                       segment_ids=segment_ids,
                                       start_position=start_pos,
                                       end_position=end_pos,
                                       span_position=doc_span_pos,
                                       span_label_mask=span_label_mask,
                                       is_impossible=example.is_impossible,
                                       ner_cate=label_map[example.ner_cate])
        yield input_features
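

# Usage sketch (illustrative): the generator variant above yields one
# InputFeatures at a time, so features can be consumed lazily (e.g. while
# serializing a cached dataset) instead of being held in memory all at once.
def _demo_stream_features(examples, tokenizer, label_lst, max_seq_length=128):
    num_features = 0
    for feature in convert_examples_to_features(examples, tokenizer, label_lst,
                                                max_seq_length,
                                                entity_scheme="bes"):
        # e.g. write `feature` to disk here instead of appending to a list
        num_features += 1
    print("features produced:", num_features)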