import numpy as np
from collections import OrderedDict


def tokenize_input_sequence_to_subtokens(examples, tokenizer, clip_length):
    """Collect subtoken-length statistics over (query, context) pairs.

    Returns an OrderedDict with max/min/average lengths of queries, contexts
    and full inputs, plus the number of examples whose query + context would
    exceed ``clip_length`` subtokens.
    """
    len_of_queries = []
    len_of_contexts = []
    len_of_inputs = []
    summary_of_inputs = OrderedDict()
    oob_counter = 0

    for example_idx, example_item in enumerate(examples):
        context_subtokens_lst = []
        query_item = example_item["query"]
        context_item = example_item["context"]
        query_subtokens = tokenizer.tokenize(query_item)
        context_whitespace_tokens = whitespace_tokenize(context_item)
        for word_item in context_whitespace_tokens:
            tmp_subword_lst = tokenizer.tokenize(word_item)
            context_subtokens_lst.extend(tmp_subword_lst)

        len_of_queries.append(len(query_subtokens))
        len_of_contexts.append(len(context_subtokens_lst))
        # +3 accounts for the [CLS] and the two [SEP] special tokens.
        len_of_inputs.append(len(query_subtokens) + len(context_subtokens_lst) + 3)
        if len(context_subtokens_lst) + len(query_subtokens) >= clip_length:
            oob_counter += 1

    summary_of_inputs["max_query"] = max(len_of_queries)
    summary_of_inputs["max_context"] = max(len_of_contexts)
    summary_of_inputs["max_inputs"] = max(len_of_inputs)
    summary_of_inputs["min_query"] = min(len_of_queries)
    summary_of_inputs["min_context"] = min(len_of_contexts)
    summary_of_inputs["min_inputs"] = min(len_of_inputs)
    summary_of_inputs["avg_query"] = sum(len_of_queries) / len(len_of_queries)
    summary_of_inputs["avg_context"] = sum(len_of_contexts) / len(len_of_contexts)
    summary_of_inputs["avg_inputs"] = sum(len_of_inputs) / len(len_of_inputs)
    summary_of_inputs["num_examples"] = len(len_of_queries)
    summary_of_inputs["oob_examples"] = oob_counter

    return summary_of_inputs
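
# ---------------------------------------------------------------------------
# `whitespace_tokenize` is used above but not defined in this file. The sketch
# below is an assumption: it mirrors the BERT-style helper of the same name
# (strip the text, then split on whitespace). If the repo already exports this
# helper, import that one instead of redefining it here.
# ---------------------------------------------------------------------------
def whitespace_tokenize(text):
    """Run basic whitespace splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    return text.split()


# Hypothetical usage of the statistics helper; `ToyTokenizer` and the toy
# examples are illustrative stand-ins, not part of this repo.
def _demo_length_statistics():
    class ToyTokenizer(object):
        def tokenize(self, text):
            return whitespace_tokenize(text.lower())

    toy_examples = [
        {"query": "find all person names", "context": "Barack Obama met Angela Merkel"},
        {"query": "find all locations", "context": "Paris is in France"},
    ]
    stats = tokenize_input_sequence_to_subtokens(toy_examples, ToyTokenizer(), clip_length=128)
    for key, value in stats.items():
        print(key, value)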

def convert_examples_to_features(examples, tokenizer, label_lst, max_seq_length,
                                 is_training=True, allow_impossible=True,
                                 pad_sign=True):
    """Convert MRC-NER examples into padded InputFeatures (list-returning variant).

    Each example contributes one feature whose token layout is
    [CLS] query [SEP] context [SEP]; start/end labels are projected from
    word-level positions onto the first subtoken of each word.
    """
    label_map = {tmp: idx for idx, tmp in enumerate(label_lst)}
    features = []
    for (example_idx, example) in enumerate(examples):
        query_tokens = tokenizer.tokenize(example.query_item)
        whitespace_doc = whitespace_tokenize(example.context_item)
        # Reserve room for [CLS], [SEP] and the final [SEP].
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        if len(example.start_position) == 0 and len(example.end_position) == 0:
            # No gold entities: all start/end labels are zero.
            doc_start_pos = []
            doc_end_pos = []
            all_doc_tokens = []
            for token_item in whitespace_doc:
                tmp_subword_lst = tokenizer.tokenize(token_item)
                all_doc_tokens.extend(tmp_subword_lst)
            doc_start_pos = [0] * len(all_doc_tokens)
            doc_end_pos = [0] * len(all_doc_tokens)
            doc_span_pos = np.zeros((max_seq_length, max_seq_length), dtype=int)
        else:
            doc_start_pos = []
            doc_end_pos = []
            doc_span_pos = np.zeros((max_seq_length, max_seq_length), dtype=int)
            all_doc_tokens = []
            # Maps word index -> index of that word's first subtoken.
            offset_idx_dict = {}

            # Word-level start/end indicators.
            fake_start_pos = [0] * len(whitespace_doc)
            fake_end_pos = [0] * len(whitespace_doc)
            for start_item in example.start_position:
                fake_start_pos[start_item] = 1
            for end_item in example.end_position:
                fake_end_pos[end_item] = 1

            # Project word-level labels onto subtokens: the label goes to the
            # first subtoken of a word, the remaining subtokens get 0.
            for idx, (token, start_label, end_label) in enumerate(
                    zip(whitespace_doc, fake_start_pos, fake_end_pos)):
                tmp_subword_lst = tokenizer.tokenize(token)
                if len(tmp_subword_lst) > 1:
                    offset_idx_dict[idx] = len(all_doc_tokens)
                    doc_start_pos.append(start_label)
                    doc_start_pos.extend([0] * (len(tmp_subword_lst) - 1))
                    doc_end_pos.append(end_label)
                    doc_end_pos.extend([0] * (len(tmp_subword_lst) - 1))
                    all_doc_tokens.extend(tmp_subword_lst)
                elif len(tmp_subword_lst) == 1:
                    offset_idx_dict[idx] = len(all_doc_tokens)
                    doc_start_pos.append(start_label)
                    doc_end_pos.append(end_label)
                    all_doc_tokens.extend(tmp_subword_lst)
                else:
                    raise ValueError("Please check the result of tokenizer !!! !!!")

            # Mark gold (start, end) span pairs in the span matrix, shifted by
            # the query length and the two leading special tokens.
            for span_item in example.span_position:
                s_idx, e_idx = span_item.split(";")
                if len(query_tokens) + 2 + offset_idx_dict[int(s_idx)] <= max_tokens_for_doc and \
                        len(query_tokens) + 2 + offset_idx_dict[int(e_idx)] <= max_tokens_for_doc:
                    doc_span_pos[len(query_tokens) + 2 + offset_idx_dict[int(s_idx)]][
                        len(query_tokens) + 2 + offset_idx_dict[int(e_idx)]] = 1
                else:
                    continue

        assert len(all_doc_tokens) == len(doc_start_pos)
        assert len(all_doc_tokens) == len(doc_end_pos)
        assert len(doc_start_pos) == len(doc_end_pos)

        # Truncate the context so the full input fits into max_seq_length.
        if len(all_doc_tokens) >= max_tokens_for_doc:
            all_doc_tokens = all_doc_tokens[:max_tokens_for_doc]
            doc_start_pos = doc_start_pos[:max_tokens_for_doc]
            doc_end_pos = doc_end_pos[:max_tokens_for_doc]

        if len(example.start_position) == 0 and len(example.end_position) == 0:
            doc_span_pos = np.zeros((max_seq_length, max_seq_length), dtype=int)

        # Assemble [CLS] query [SEP] context [SEP].
        # input_mask is 0 for [CLS], the query and the first [SEP]; only the
        # context tokens and the final [SEP] are marked as real tokens.
        input_tokens = []
        segment_ids = []
        input_mask = []
        start_pos = []
        end_pos = []

        input_tokens.append("[CLS]")
        segment_ids.append(0)
        input_mask.append(0)
        start_pos.append(0)
        end_pos.append(0)

        for query_item in query_tokens:
            input_tokens.append(query_item)
            segment_ids.append(0)
            input_mask.append(0)
            start_pos.append(0)
            end_pos.append(0)

        input_tokens.append("[SEP]")
        segment_ids.append(0)
        input_mask.append(0)
        start_pos.append(0)
        end_pos.append(0)

        input_tokens.extend(all_doc_tokens)
        segment_ids.extend([1] * len(all_doc_tokens))
        input_mask.extend([1] * len(all_doc_tokens))
        start_pos.extend(doc_start_pos)
        end_pos.extend(doc_end_pos)

        input_tokens.append("[SEP]")
        segment_ids.append(1)
        input_mask.append(1)
        start_pos.append(0)
        end_pos.append(0)

        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)

        # Zero-pad up to the sequence length.
        if len(input_ids) < max_seq_length and pad_sign:
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            start_pos += padding
            end_pos += padding

        features.append(
            InputFeatures(unique_id=example.qas_id,
                          tokens=input_tokens,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          start_position=start_pos,
                          end_position=end_pos,
                          span_position=doc_span_pos.tolist(),
                          is_impossible=example.is_impossible,
                          ner_cate=label_map[example.ner_cate]))

    return features

def convert_examples_to_features(examples, tokenizer, label_lst, max_seq_length,
                                 is_training=True, allow_impossible=True,
                                 pad_sign=True, entity_scheme="bes"):
    """Generator variant of convert_examples_to_features.

    Yields one InputFeatures at a time instead of building a list, and adds a
    symmetric span matrix plus a span label mask when ``entity_scheme`` is
    "bes". It shares its name with the list-returning variant above, so within
    a single module this later definition shadows the earlier one.

    Note: the original code referenced ``entity_scheme`` without defining it;
    it is exposed here as a keyword argument (assumed default: "bes").
    """
    print("EXAMPLES LENGTH", len(examples))
    label_map = {tmp: idx for idx, tmp in enumerate(label_lst)}
    features = []  # kept from the original; unused because features are yielded
    for (example_idx, example) in enumerate(examples):
        query_tokens = tokenizer.tokenize(example.query_item)
        whitespace_doc = whitespace_tokenize(example.context_item)
        # Reserve room for [CLS], [SEP] and the final [SEP].
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        if len(example.start_position) == 0 and len(example.end_position) == 0:
            # No gold entities: all start/end labels are zero.
            doc_start_pos = []
            doc_end_pos = []
            all_doc_tokens = []
            for token_item in whitespace_doc:
                tmp_subword_lst = tokenizer.tokenize(token_item)
                all_doc_tokens.extend(tmp_subword_lst)
            doc_start_pos = [0] * len(all_doc_tokens)
            doc_end_pos = [0] * len(all_doc_tokens)
            doc_span_pos = np.zeros((max_seq_length, max_seq_length), dtype=int)
        else:
            doc_start_pos = []
            doc_end_pos = []
            doc_span_pos = np.zeros((max_seq_length, max_seq_length), dtype=int)
            all_doc_tokens = []
            # Maps word index -> index of that word's first subtoken.
            offset_idx_dict = {}

            # Word-level start/end indicators.
            fake_start_pos = [0] * len(whitespace_doc)
            fake_end_pos = [0] * len(whitespace_doc)
            for start_item in example.start_position:
                fake_start_pos[start_item] = 1
            for end_item in example.end_position:
                fake_end_pos[end_item] = 1

            # Improve the answer span: project word-level labels onto the first
            # subtoken of each word; remaining subtokens get 0.
            for idx, (token, start_label, end_label) in enumerate(
                    zip(whitespace_doc, fake_start_pos, fake_end_pos)):
                tmp_subword_lst = tokenizer.tokenize(token)
                if len(tmp_subword_lst) > 1:
                    offset_idx_dict[idx] = len(all_doc_tokens)
                    doc_start_pos.append(start_label)
                    doc_start_pos.extend([0] * (len(tmp_subword_lst) - 1))
                    doc_end_pos.append(end_label)
                    doc_end_pos.extend([0] * (len(tmp_subword_lst) - 1))
                    all_doc_tokens.extend(tmp_subword_lst)
                elif len(tmp_subword_lst) == 1:
                    offset_idx_dict[idx] = len(all_doc_tokens)
                    doc_start_pos.append(start_label)
                    doc_end_pos.append(end_label)
                    all_doc_tokens.extend(tmp_subword_lst)
                else:
                    print("TOKEN: ", token)
                    print("tmp_subword_list", tmp_subword_lst)
                    print("Please check the result of tokenizer !!! !!!")

            if entity_scheme == "bes":
                # Mark gold (start, end) pairs symmetrically in the span matrix,
                # shifted by the query length and the two leading special tokens.
                for span_item in example.span_position:
                    s_idx, e_idx = span_item.split(";")
                    if offset_idx_dict[int(s_idx)] <= max_tokens_for_doc and \
                            offset_idx_dict[int(e_idx)] <= max_tokens_for_doc:
                        doc_span_pos[len(query_tokens) + 2 + offset_idx_dict[int(s_idx)]][
                            len(query_tokens) + 2 + offset_idx_dict[int(e_idx)]] = 1
                        doc_span_pos[len(query_tokens) + 2 + offset_idx_dict[int(e_idx)]][
                            len(query_tokens) + 2 + offset_idx_dict[int(s_idx)]] = 1
                    else:
                        continue

        assert len(all_doc_tokens) == len(doc_start_pos)
        assert len(all_doc_tokens) == len(doc_end_pos)
        assert len(doc_start_pos) == len(doc_end_pos)

        # Truncate the context so the full input fits into max_seq_length.
        if len(all_doc_tokens) >= max_tokens_for_doc:
            all_doc_tokens = all_doc_tokens[:max_tokens_for_doc]
            doc_start_pos = doc_start_pos[:max_tokens_for_doc]
            doc_end_pos = doc_end_pos[:max_tokens_for_doc]

        if len(example.start_position) == 0 and len(example.end_position) == 0:
            doc_span_pos = np.zeros((max_seq_length, max_seq_length), dtype=int)

        # input_mask:
        #   1 for real tokens and 0 for padding tokens; only real tokens are
        #   attended to.
        # segment_ids:
        #   segment token indices to indicate the first and second portions of
        #   the inputs.
        input_tokens = []
        segment_ids = []
        input_mask = []
        start_pos = []
        end_pos = []

        input_tokens.append("[CLS]")
        segment_ids.append(0)
        start_pos.append(0)
        end_pos.append(0)

        for query_item in query_tokens:
            input_tokens.append(query_item)
            segment_ids.append(0)
            start_pos.append(0)
            end_pos.append(0)

        input_tokens.append("[SEP]")
        segment_ids.append(0)
        start_pos.append(0)
        end_pos.append(0)

        input_tokens.extend(all_doc_tokens)
        segment_ids.extend([1] * len(all_doc_tokens))
        start_pos.extend(doc_start_pos)
        end_pos.extend(doc_end_pos)

        input_tokens.append("[SEP]")
        segment_ids.append(1)
        start_pos.append(0)
        end_pos.append(0)

        # Unlike the variant above, every real token (including the query) is
        # marked in input_mask; padding positions stay 0 after padding below.
        input_mask = [1] * len(input_tokens)
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)

        if entity_scheme == "bes":
            # Mark the square window [len(query_tokens), len(input_ids)) of the
            # span label mask; outside it the mask stays 0.
            span_label_mask = np.zeros((max_seq_length, max_seq_length), dtype=int)
            span_label_mask[len(query_tokens):len(input_ids),
                            len(query_tokens):len(input_ids)] = 1
        else:
            span_label_mask = None

        # Zero-pad up to the sequence length.
        if len(input_ids) < max_seq_length and pad_sign:
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            start_pos += padding
            end_pos += padding

        input_ids = np.array(input_ids, dtype=np.int32)
        input_mask = np.array(input_mask, dtype=np.int32)
        segment_ids = np.array(segment_ids, dtype=np.int32)
        start_pos = np.array(start_pos, dtype=np.int32)
        end_pos = np.array(end_pos, dtype=np.int32)
        doc_span_pos = np.array(doc_span_pos, dtype=np.int32)
        if span_label_mask is not None:
            # Casting None to an int array would fail, so only convert when the
            # "bes" scheme produced a mask.
            span_label_mask = np.array(span_label_mask, dtype=np.int32)

        input_features = InputFeatures(unique_id=example.qas_id,
                                       tokens=input_tokens,
                                       input_ids=input_ids,
                                       input_mask=input_mask,
                                       segment_ids=segment_ids,
                                       start_position=start_pos,
                                       end_position=end_pos,
                                       span_position=doc_span_pos,
                                       span_label_mask=span_label_mask,
                                       is_impossible=example.is_impossible,
                                       ner_cate=label_map[example.ner_cate])
        yield input_features
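
# Hypothetical usage of the generator variant above; the example object and
# tokenizer are illustrative stand-ins, not part of this repo. The generator is
# consumed with list() so all features are materialized at once.
def _demo_generator_variant():
    from types import SimpleNamespace

    class ToyTokenizer(object):
        def tokenize(self, text):
            return text.lower().split()

        def convert_tokens_to_ids(self, tokens):
            # Deterministic toy ids within one process; a real tokenizer maps
            # tokens to vocabulary ids.
            return [hash(token) % 30000 for token in tokens]

    example = SimpleNamespace(
        qas_id="demo-0",
        query_item="find all person names",
        context_item="Barack Obama met Angela Merkel",
        start_position=[0, 3],
        end_position=[1, 4],
        span_position=["0;1", "3;4"],
        is_impossible=False,
        ner_cate="PER",
    )
    features = list(convert_examples_to_features(
        [example], ToyTokenizer(), label_lst=["PER", "LOC"], max_seq_length=32))
    feature = features[0]
    print(feature.input_ids.shape, feature.span_position.shape,
          int(feature.span_label_mask.sum()))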