def gen_tf_record():
    sequence_length = 300
    data_loader = get_biobert_nli_data_loader(sequence_length)
    todo = [("train", [data_loader.train_file]), ("dev", [data_loader.dev_file])]
    batch_size = 32
    dir_path = os.path.join(output_path, "biobert_mnli_{}".format(sequence_length))
    exist_or_mkdir(dir_path)
    for name, files in todo[::-1]:
        output_file = os.path.join(dir_path, name)
        writer = RecordWriterWrap(output_file)
        for file in files:
            for e in data_loader.example_generator(file):
                f = entry_to_feature_dict(e)
                f["is_real_example"] = create_int_feature([1])
                writer.write_feature(f)

        if name == "dev":
            # Pad the dev split with non-real examples so the record count
            # is a multiple of the batch size.
            while writer.total_written % batch_size != 0:
                f["is_real_example"] = create_int_feature([0])
                writer.write_feature(f)

        writer.close()
        print("Wrote %d total instances" % writer.total_written)

def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0
    data_id_idx = 0
    data_id_info = {}
    for query_id_str in ranked_list:
        query_rep = queries[query_id_str]
        query_str = query_rep['query']
        for ranked_entry in ranked_list[query_id_str]:
            # Map data_id -> (query_id, doc_id) so predictions can be joined
            # back to the ranked list after inference.
            data_id = data_id_idx
            data_id_idx += 1
            data_id_info[data_id] = (query_id_str, ranked_entry.doc_id)

            text = text_reader(ranked_entry.doc_id)
            tokens, segment_ids = encode_query_and_text(tokenizer, query_str, text, max_seq_length)
            features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)

    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()

def encode_two_inputs(max_seq_length, tokenizer, inst: PayloadAsTokens) -> OrderedDict:
    tokens_1_1: List[str] = inst.text1
    tokens_1_2: List[str] = inst.text2
    tokens_2_1: List[str] = tokens_1_2
    max_seg2_len = max_seq_length - 3 - len(tokens_2_1)
    tokens_2_2 = inst.passage[:max_seg2_len]

    def combine(tokens1, tokens2):
        effective_length = max_seq_length - 3
        if len(tokens1) + len(tokens2) > effective_length:
            half = int(effective_length / 2 + 1)
            tokens1 = tokens1[:half]
            remain = effective_length - len(tokens1)
            tokens2 = tokens2[:remain]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    tokens_A, segment_ids_A = combine(tokens_1_1, tokens_1_2)
    tokens_B, segment_ids_B = combine(tokens_2_1, tokens_2_2)
    features = combine_features_B(tokens_A, segment_ids_A, tokens_B, segment_ids_B, tokenizer, max_seq_length)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode_classification_instance_w_data_id(
        tokenizer, max_seq_length, inst: ClassificationInstanceWDataID) -> OrderedDict:
    feature: OrderedDict = get_basic_input_feature(tokenizer, max_seq_length, inst.tokens, inst.seg_ids)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature

def entry_to_feature_dict(e):
    input_ids, input_mask, segment_ids, label = e
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([label])
    return features

def pairwise_entry_to_feature_dict(pair):
    features = collections.OrderedDict()
    for idx, e in enumerate(pair):
        input_ids, input_mask, segment_ids, label = e
        features["input_ids" + str(idx + 1)] = create_int_feature(input_ids)
        features["input_mask" + str(idx + 1)] = create_int_feature(input_mask)
        features["segment_ids" + str(idx + 1)] = create_int_feature(segment_ids)
        features["label_ids" + str(idx + 1)] = create_int_feature([label])
    return features

def encode_inst_as_input_ids(max_seq_length, inst: InstAsInputIds) -> OrderedDict:
    # Pads input_ids/seg_ids to max_seq_length and builds the matching input mask.
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list_all_ids(
        inst.input_ids, inst.seg_ids, max_seq_length)
    feature = ordered_dict_from_input_segment_mask_ids(input_ids, input_mask, segment_ids)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature

def encode_w_data_id(tokenizer, max_seq_length, t: Tuple[str, bool, int]):
    text, is_correct, data_id = t
    tokens1: List[str] = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([int(is_correct)])
    features['data_id'] = create_int_feature([int(data_id)])
    return features

def write(self, insts: List[Instance], out_path):
    writer = RecordWriterWrap(out_path)
    for inst in insts:
        feature = get_basic_input_feature(self.tokenizer, self.max_seq_length, inst.tokens, inst.seg_ids)
        feature["data_id"] = create_int_feature([int(inst.data_id)])
        feature["label_ids"] = create_int_feature([int(inst.label)])
        writer.write_feature(feature)
    writer.close()

def encode(inst: Instance) -> OrderedDict:
    tokens1: List[str] = tokenizer.tokenize(inst.text1)
    max_seg2_len = max_seq_length - 3 - len(tokens1)
    tokens2 = tokenizer.tokenize(inst.text2)[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.label])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode_query_doc_instance(tokenizer, doc_token_length, inst: QueryDocInstance) -> OrderedDict:
    doc_segment_ids = [1] * len(inst.doc_tokens)
    doc_input_ids, doc_input_mask, doc_segment_ids = get_basic_input_feature_as_list(
        tokenizer, doc_token_length, inst.doc_tokens, doc_segment_ids)
    feature = collections.OrderedDict()
    feature['query'] = create_int_feature(tokenizer.convert_tokens_to_ids(inst.query_tokens))
    feature['doc'] = create_int_feature(doc_input_ids)
    feature['doc_mask'] = create_int_feature(doc_input_mask)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature

def encode(inst: Payload) -> OrderedDict:
    tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
    max_seg2_len = max_seq_length - 3 - len(tokens1)
    tokens2 = tokenize_from_tokens(inst.passage)[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode_fn(self, inst: QKInstance) -> OrderedDict:
    max_seq_length = self.max_seq_length
    tokens1: List[str] = self.tokenizer.tokenize(inst.query_text)
    max_seg2_len = self.max_seq_length - 3 - len(tokens1)
    tokens2 = inst.doc_tokens[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(self.tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode_fn(self, inst: TokenScoringInstance) -> OrderedDict:
    max_seq_length = self.max_seq_length
    tokens1: List[str] = self.tokenizer.tokenize(inst.query_text)
    max_seg2_len = self.max_seq_length - 3 - len(tokens1)
    tokens2, scores = self.tokenize_from_tokens_w_scores(inst.doc_tokens, inst.score)
    tokens2 = tokens2[:max_seg2_len]
    scores: ScoreVector = scores[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(self.tokenizer, max_seq_length, tokens, segment_ids)
    score_vector = pad_score_vector(scores, max_seq_length, len(tokens1))
    if len(score_vector) != max_seq_length:
        # Debug output before the assertion below fails.
        print(score_vector)
        print(len(score_vector))
        print(max_seq_length)
        print(len(scores))
        print(scores)
    assert len(score_vector) == max_seq_length
    features['label_ids'] = score_vector_to_feature(score_vector)
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode(inst: TextInstance) -> OrderedDict:
    tokens = tokenizer.tokenize(inst.text)
    max_len = max_seq_length - 2
    if len(tokens) > max_len:
        nonlocal long_count
        long_count = long_count + 1
        if long_count > 10 and long_warning:
            print("long text count", long_count)
        tokens = tokens[:max_len]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    seg_ids = [0] * len(tokens)
    feature: OrderedDict = get_basic_input_feature(tokenizer, max_seq_length, tokens, seg_ids)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature

def encode_three_inputs(max_seq_length_list: List[int], tokenizer, inst: PayloadAsTokens) -> OrderedDict:
    tokens1: List[str] = inst.text1
    tokens2: List[str] = inst.text2
    tokens3: List[str] = inst.passage
    tokens_list = [tokens1, tokens2, tokens3]
    features = collections.OrderedDict()
    for i in range(3):
        input_ids, input_mask, segment_ids = encode_single(tokenizer, tokens_list[i], max_seq_length_list[i])
        features["input_ids{}".format(i)] = input_ids
        features["input_mask{}".format(i)] = input_mask
        features["segment_ids{}".format(i)] = segment_ids
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode(self, inst: Instance) -> OrderedDict:
    if not self.reverse:
        tokens1 = self.get_p_tokens(inst.pid)
        tokens2 = inst.sent
    else:
        tokens1 = inst.sent
        tokens2 = self.get_p_tokens(inst.pid)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    max_seq_length = self.max_seq_length
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(self.tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([0])
    features['data_ids'] = create_int_feature([inst.data_id])
    return features

def gen_mismatched():
    sequence_length = 300
    data_loader = get_modified_nli_data_loader(sequence_length)
    dir_path = os.path.join(output_path, "nli_tfrecord_cls_{}".format(sequence_length))
    name = "dev_mis"
    output_file = os.path.join(dir_path, name)
    batch_size = 32
    writer = RecordWriterWrap(output_file)
    for e in data_loader.example_generator(data_loader.dev_file2):
        f = entry_to_feature_dict(e)
        f["is_real_example"] = create_int_feature([1])
        writer.write_feature(f)

    # Pad with non-real examples so the record count is a multiple of the batch size.
    while writer.total_written % batch_size != 0:
        f["is_real_example"] = create_int_feature([0])
        writer.write_feature(f)

    writer.close()
    print("Wrote %d total instances" % writer.total_written)

def write_tfrecord(ranked_list_d: RankedListDict,
                   queries: List[Query],
                   q_rels: Dict[str, List[str]],
                   save_path):
    max_seq_length = 512
    tokenizer = get_tokenizer()
    encoder = AllSegmentAsDoc(max_seq_length)
    writer = RecordWriterWrap(save_path)
    data_id = 0
    data_info = []
    for query in queries:
        if query.qid not in ranked_list_d:
            print("Warning query {} not found".format(query.qid))
            continue
        print(query.qid)
        ranked_list = ranked_list_d[query.qid]
        doc_ids = [doc_entry.doc_id for doc_entry in ranked_list]
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        q_tokens = tokenizer.tokenize(query.text)
        for doc_entry in ranked_list:
            try:
                tokens_list: List[List[str]] = load(BertTokenizedCluewebDoc, doc_entry.doc_id)
                tokens = flatten(tokens_list)
                insts: List[Tuple[List, List]] = encoder.encode(q_tokens, tokens)
                for inst in insts:
                    label = doc_entry.doc_id in q_rels[query.qid]
                    input_tokens, segment_ids = inst
                    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
                    feature["label_ids"] = create_int_feature([int(label)])
                    feature["data_id"] = create_int_feature([int(data_id)])
                    writer.write_feature(feature)
                    data_info.append((data_id, query.qid, doc_entry.doc_id))
                    data_id += 1
            except KeyError:
                print("doc {} not found".format(doc_entry.doc_id))
    # Flush the record file before returning the data_id mapping.
    writer.close()
    return data_info

def enc_to_feature(tokenizer, max_seq_length, pc: PerspectiveCandidate) -> OrderedDict:
    seg1 = tokenizer.tokenize(pc.claim_text)
    seg2 = tokenizer.tokenize(pc.p_text)
    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
    feature["label_ids"] = create_int_feature([int(pc.label)])
    return feature

def encode(inst: PairedInstance) -> OrderedDict:
    tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
    max_seg2_len = max_seq_length - 3 - len(tokens1)

    def concat_tokens(raw_tokens: List[str]):
        tokens2 = tokenize_from_tokens(raw_tokens)[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    out_tokens1, seg1 = concat_tokens(inst.passage_good)
    out_tokens2, seg2 = concat_tokens(inst.passage_worse)
    features = combine_features(out_tokens1, seg1, out_tokens2, seg2, tokenizer, max_seq_length)
    features['strict_good'] = create_int_feature([inst.strict_good])
    features['strict_bad'] = create_int_feature([inst.strict_bad])
    return features

def encode_classification_feature(
        max_seq_length, data: Iterable[Tuple[str, str, int]]) -> Iterable[OrderedDict]:
    tokenizer = get_tokenizer()
    encoder = FirstSegmentAsDoc(max_seq_length)
    for query, text, label in data:
        q_tokens = tokenizer.tokenize(query)
        text_tokens = tokenizer.tokenize(text)
        input_tokens, segment_ids = encoder.encode(q_tokens, text_tokens)[0]
        feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
        feature['label_ids'] = create_int_feature([label])
        yield feature

def encode_inner(max_seq_length, tokenizer, inst: PayloadAsTokens) -> OrderedDict:
    tokens_1: List[str] = inst.text1
    tokens_2: List[str] = inst.text2
    tokens_3: List[str] = inst.passage

    def combine(tokens1, tokens2):
        return combine_with_sep_cls(max_seq_length, tokens1, tokens2)

    features = collections.OrderedDict()
    for tokens_a, tokens_b, postfix in [(tokens_1, tokens_2, ""),
                                        (tokens_2, tokens_3, "2"),
                                        (tokens_1, tokens_3, "3")]:
        tokens, segment_ids = combine(tokens_a, tokens_b)
        input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
            tokenizer, max_seq_length, tokens, segment_ids)
        features["input_ids" + postfix] = create_int_feature(input_ids)
        features["input_mask" + postfix] = create_int_feature(input_mask)
        features["segment_ids" + postfix] = create_int_feature(segment_ids)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode_two_input_ids(max_seq_length, tokenizer, inst: PayloadAsIds) -> OrderedDict:
    tokens_1_1: List[int] = inst.text1
    tokens_1_2: List[int] = inst.text2
    tokens_2_1: List[int] = tokens_1_2
    cls_id = tokenizer.convert_tokens_to_ids(["[CLS]"])[0]
    sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
    max_seg2_len = max_seq_length - 3 - len(tokens_2_1)
    tokens_2_2 = inst.passage[:max_seg2_len]

    def combine(tokens1, tokens2):
        effective_length = max_seq_length - 3
        if len(tokens1) + len(tokens2) > effective_length:
            half = int(effective_length / 2 + 1)
            tokens1 = tokens1[:half]
            remain = effective_length - len(tokens1)
            tokens2 = tokens2[:remain]
        input_ids = [cls_id] + tokens1 + [sep_id] + tokens2 + [sep_id]
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        input_ids = input_ids[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return input_ids, segment_ids

    input_ids_A, segment_ids_A = combine(tokens_1_1, tokens_1_2)
    input_ids_B, segment_ids_B = combine(tokens_2_1, tokens_2_2)

    features = collections.OrderedDict()
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list_all_ids(
        input_ids_A, segment_ids_A, max_seq_length)
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list_all_ids(
        input_ids_B, segment_ids_B, max_seq_length)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode(inst: Tuple[str, int]) -> OrderedDict:
    text, label = inst
    tokens = tokenizer.tokenize(text)
    max_len = max_seq_length - 2
    if len(tokens) > max_len:
        nonlocal long_count
        long_count = long_count + 1
        if long_count > 10:
            print("long text count", long_count)
        tokens = tokens[:max_len]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    seg_ids = [0] * len(tokens)
    feature: OrderedDict = get_basic_input_feature(tokenizer, max_seq_length, tokens, seg_ids)
    feature['label_ids'] = create_int_feature([label])
    return feature

def enc_to_feature(pc: PerspectiveCandidate) -> OrderedDict:
    emb_model = get_aux_embedding_fn(pc.cid)
    seg1 = tokenizer.tokenize(pc.claim_text)
    seg2 = tokenizer.tokenize(pc.p_text)
    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]

    aux_emb = get_word_embedding(emb_model, input_tokens, dims)
    aux_emb += (max_seq_length - len(aux_emb)) * [zero_vector]
    aux_emb = np.array(aux_emb)
    flat_aux_emb = np.reshape(aux_emb, [-1])

    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
    feature["label_ids"] = create_int_feature([int(pc.label)])
    feature["aux_emb"] = create_float_feature(flat_aux_emb)
    return feature

def encode_three_inputs(max_seq_length, tokenizer, inst: PayloadAsTokens) -> collections.OrderedDict:
    tokens_1_1: List[str] = inst.text1
    tokens_1_2: List[str] = inst.text2
    tokens_2_1: List[str] = tokens_1_2
    tokens_2_2 = inst.passage[:max_seq_length]

    def combine(tokens1, tokens2):
        effective_length = max_seq_length - 3
        if len(tokens1) + len(tokens2) > effective_length:
            half = int(effective_length / 2 + 1)
            tokens1 = tokens1[:half]
            remain = effective_length - len(tokens1)
            tokens2 = tokens2[:remain]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    def fill(tokens1, seg_id):
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"]
        segment_ids = [seg_id] * (len(tokens1) + 2)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        return tokens, segment_ids

    tokens_A, segment_ids_A = combine(tokens_1_1, tokens_1_2)
    tokens_B, segment_ids_B = fill(tokens_2_1, 0)
    tokens_C, segment_ids_C = fill(tokens_2_2, 1)

    features = collections.OrderedDict()
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_A, segment_ids_A)
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_B, segment_ids_B)
    features["input_ids1"] = create_int_feature(input_ids)
    features["input_mask1"] = create_int_feature(input_mask)
    features["segment_ids1"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens_C, segment_ids_C)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features