def show_tfrecord(file_path):
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(file_path)
    html = HtmlVisualizer(name + ".html")
    for features in itr:
        input_ids = take(features["input_ids"])
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)
        p_cells = [Cell(p_tokens[i], 100 if p_mask[i] else 0)
                   for i in range(len(p_tokens))]
        h_cells = [Cell(h_tokens[i], 100 if h_mask[i] else 0)
                   for i in range(len(h_tokens))]
        label = take(features["label_ids"])[0]
        html.write_paragraph("Label : {}".format(label))
        html.write_table([p_cells])
        html.write_table([h_cells])

def combine_segment(features_c, features_p) -> OrderedDict:
    # input_ids does not contain CLS, SEP
    c_seg_id = list(take(features_c['segment_ids']))
    max_seq_len = len(c_seg_id)
    st = c_seg_id.index(1)
    input_mask = list(take(features_c['input_mask']))
    ed = input_mask.index(0)
    feature_c_input_ids = take(features_c['input_ids'])
    paragraph = feature_c_input_ids[st:ed]
    c_input_ids = feature_c_input_ids[:st]

    feature_p_input_ids = take(features_p['input_ids'])
    p_seg_id = list(take(features_p['segment_ids']))
    st = p_seg_id.index(1)
    p_input_ids = feature_p_input_ids[:st]

    # Layout: [CLS] c_input_ids p_input_ids [SEP] paragraph [SEP],
    # with segment 0 up to and including the first [SEP], segment 1 after.
    input_ids = ([CLS_ID] + c_input_ids + p_input_ids + [SEP_ID]
                 + paragraph + [SEP_ID])  # + [random.randint(10, 13)]
    segment_ids = ([0] * (2 + len(c_input_ids) + len(p_input_ids))
                   + [1] * (1 + len(paragraph)))
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list_all_ids(
        input_ids, segment_ids, max_seq_len)
    return ordered_dict_from_input_segment_mask_ids(input_ids, input_mask,
                                                    segment_ids)

def main(dir_path):
    output_path = os.path.join(dir_path, "all_balanced")
    pos_insts = []
    neg_insts = []
    all_insts = [neg_insts, pos_insts]
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                label = take(record['label_ids'])[0]
                all_insts[label].append(new_features)

    # Downsample the majority class so both labels are equally represented.
    random.shuffle(pos_insts)
    random.shuffle(neg_insts)
    num_sel = min(len(pos_insts), len(neg_insts))
    print("{} insts per label".format(num_sel))
    insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel]
    writer = RecordWriterWrap(output_path)
    foreach(writer.write_feature, insts_to_write)
    writer.close()

def feature_transformer(feature):
    # Re-windows two concatenated multi-segment inputs ("input_ids1" /
    # "input_ids2") into fixed-size windows of window_size, zero-padding each
    # segment and appending empty segments up to max_num_seg. Relies on
    # enclosing-scope input_names1, input_names2, seq_length, window_size,
    # and max_num_seg.
    new_features = collections.OrderedDict()

    def put(feature_name):
        return create_int_feature(take(feature[feature_name]))

    for left_right_idx in [1, 2]:
        input_names = [input_names1, input_names2][left_right_idx - 1]
        input_ids = take(feature["input_ids{}".format(left_right_idx)])
        input_masks = take(feature["input_mask{}".format(left_right_idx)])
        cls_loc = []
        last_non_pad = -1
        for i in range(seq_length):
            if input_ids[i] == 101:  # 101 is BERT's [CLS] token id
                cls_loc.append(i)
            if input_masks[i]:
                last_non_pad = i
        assert last_non_pad >= 0
        assert last_non_pad > cls_loc[-1]
        assert len(cls_loc) <= max_num_seg
        num_seg = len(cls_loc)
        input_building = {}
        for name in input_names:
            input_building[name] = []
        for i in range(num_seg):
            st = cls_loc[i]
            ed = cls_loc[i + 1] if i + 1 < num_seg else last_non_pad + 1
            pad_len = window_size - (ed - st)
            for input_name in input_names:
                arr = take(feature[input_name])
                seq = arr[st:ed] + pad_len * [0]
                input_building[input_name].extend(seq)
        n_empty_seg = max_num_seg - num_seg
        for i in range(n_empty_seg):
            for input_name in input_names:
                input_building[input_name].extend([0] * window_size)

        # Zero padding does not change the sum, so a checksum verifies that
        # no real values were dropped by the re-windowing.
        for input_name in input_names:
            checksum1 = sum(input_building[input_name])
            checksum2 = sum(take(feature[input_name]))
            assert checksum1 == checksum2

        for input_name in input_names:
            new_features[input_name] = create_int_feature(
                input_building[input_name])
    new_features["data_ids"] = put("data_ids")
    return new_features

def combine_feature(lm_entry, nli_entry):
    new_features = collections.OrderedDict()
    for key in lm_entry:
        new_features[key] = create_int_feature(take(lm_entry[key]))
    for key in nli_entry:
        if key == "label_ids":
            new_features[key] = create_int_feature(take(nli_entry[key]))
        else:
            new_key = "nli_" + key
            new_features[new_key] = create_int_feature(take(nli_entry[key]))
    return new_features

def write_feature_to_html(feature, html, tokenizer):
    input_ids = take(feature['input_ids'])
    label_ids = take(feature['label_ids'])
    seg1, seg2 = split_p_h_with_input_ids(input_ids, input_ids)
    text1 = tokenizer.convert_ids_to_tokens(seg1)
    text2 = tokenizer.convert_ids_to_tokens(seg2)
    text1 = pretty_tokens(text1, True)
    text2 = pretty_tokens(text2, True)
    html.write_headline("{}".format(label_ids[0]))
    html.write_paragraph(text1)
    html.write_paragraph(text2)

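# A minimal sketch (pure assumption, not the project's implementation) of the
# pretty_tokens helper used above: it appears to join WordPiece tokens into
# readable text, with the second argument asking for "##" continuation
# markers to be merged away. The name is suffixed to mark it as hypothetical.
def pretty_tokens_sketch(tokens, drop_sharp=False):
    text = " ".join(tokens)
    if drop_sharp:
        text = text.replace(" ##", "")
    return text
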
def load(file_no):
    path = os.path.join(data_path, "pc_rel_tfrecord_dev", str(file_no))
    d = {}
    for feature in load_record(path):
        data_id = take(feature["data_id"])[0]
        input_ids = take(feature["input_ids"])
        segment_ids = take(feature["segment_ids"])
        d[data_id] = input_ids, segment_ids
        print(data_id)
    print("loaded {} data".format(len(d)))
    return d

def write_feature_to_html(feature, html, tokenizer):
    input_ids = take(feature['input_ids'])
    focus_mask = take(feature['focus_mask'])
    label_ids = take(feature['label_ids'])
    text1 = tokenizer.convert_ids_to_tokens(input_ids)
    row = []
    for i in range(len(input_ids)):
        highlight_score = 100 if focus_mask[i] else 0
        row.append(Cell(text1[i], highlight_score))
    html.write_headline("{}".format(label_ids[0]))
    html.multirow_print(row)

def get_paragraph(features_c, features_p) -> List[int]:
    # input_ids does not contain CLS, SEP
    # Note: features_p is not used; the paragraph comes from features_c alone.
    c_seg_id = list(take(features_c['segment_ids']))
    st = c_seg_id.index(1)
    input_mask = list(take(features_c['input_mask']))
    ed = input_mask.index(0)
    feature_c_input_ids = take(features_c['input_ids'])
    paragraph = feature_c_input_ids[st:ed]
    return paragraph

def feature_transformer(feature):
    # Builds "alt_emb_mask", marking token spans that exactly match one of the
    # sequences in seq_set; instances with no match are dropped (return None).
    new_features = collections.OrderedDict()
    success = False
    for key in feature:
        v = take(feature[key])
        if key == "input_ids":
            alt_emb_mask = [0] * len(v)
            s = set(v)
            # Cheap overlap pre-filter before scanning for each sequence.
            if len(s.intersection(all_tokens)) >= min_overlap:
                for word in seq_set:
                    pre_match = 0
                    for i in range(len(v)):
                        if v[i] == word[pre_match]:
                            pre_match += 1
                        else:
                            # Naive reset: does not re-check whether v[i]
                            # could start a new match.
                            pre_match = 0
                        if pre_match == len(word):
                            pre_match = 0
                            for j in range(i - len(word) + 1, i + 1):
                                alt_emb_mask[j] = 1
                            success = True
            new_features["alt_emb_mask"] = create_int_feature(alt_emb_mask)
        new_features[key] = create_int_feature(v)
    if success:
        return new_features
    else:
        return None

def rel_filter_to_para(tfrecord_itr,
                       relevance_scores: Dict[DataID, Tuple[CPIDPair, Logits, Logits]],
                       cpid_to_label: Dict[CPIDPair, int]) -> Iterator[Tuple]:
    # Records arrive in consecutive pairs; buffer the first of each pair.
    last_feature = None
    for features in tfrecord_itr:
        if last_feature is None:
            last_feature = features
            continue
        data_id = take(features["data_id"])[0]
        t = relevance_scores[data_id]
        cpid: CPIDPair = t[0]
        c_logits = t[1]
        p_logits = t[2]
        c_score = softmax(c_logits)[1]
        p_score = softmax(p_logits)[1]
        weight = c_score * p_score
        label: int = cpid_to_label[cpid]
        if weight > 0.5:
            paragraph = get_paragraph(last_feature, features)
        else:
            paragraph = []
        output_entry = cpid, label, paragraph, c_score, p_score
        yield output_entry
        # Reset so the next record starts a new pair (mirrors rel_filter).
        last_feature = None

def debug_call_back(features):
    # Closure over inst_cnt and tokenizer; prints the first few instances.
    nonlocal inst_cnt
    if inst_cnt < 4:
        input_tokens = tokenizer.convert_ids_to_tokens(
            take(features['input_ids']))
        print(pretty_tokens(input_tokens))
        inst_cnt += 1

def rel_filter(tfrecord_itr,
               relevance_scores: Dict[DataID, Tuple[CPIDPair, Logits, Logits]],
               cpid_to_label: Dict[CPIDPair, int]) -> Iterator[OrderedDict]:
    # Records arrive in consecutive pairs; buffer the first of each pair.
    last_feature = None
    for features in tfrecord_itr:
        if last_feature is None:
            last_feature = features
            continue
        data_id = take(features["data_id"])[0]
        t = relevance_scores[data_id]
        cpid: CPIDPair = t[0]
        c_logits = t[1]
        p_logits = t[2]
        c_score = softmax(c_logits)[1]
        p_score = softmax(p_logits)[1]
        weight = c_score * p_score
        label: int = cpid_to_label[cpid]
        # Keep only pairs where both relevance scores are high.
        if weight > 0.5:
            new_feature = combine_segment(last_feature, features)
            # new_feature['weight'] = create_float_feature([weight])
            new_feature['label_ids'] = create_int_feature([label])
            new_feature['data_id'] = create_int_feature([data_id])
            yield new_feature
        last_feature = None

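# softmax() above is not defined in this file. A minimal numpy version,
# consistent with the softmax(logits)[1] call sites, would be the standard
# numerically stable implementation (the name is suffixed to mark it as a
# sketch rather than the project's own helper):
import numpy as np

def softmax_sketch(logits):
    e = np.exp(np.array(logits) - np.max(logits))
    return e / e.sum()
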
def count_terms(file_path):
    # Counts runs of consecutive tokens flagged by alt_emb_mask; each run is
    # keyed by its space-joined token-id signature.
    counter = Counter()
    for feature in load_record_v2(file_path):
        input_ids = take(feature["input_ids"])
        alt_emb_mask = take(feature["alt_emb_mask"])
        cur_words = []
        for i in range(len(input_ids)):
            if alt_emb_mask[i]:
                cur_words.append(input_ids[i])
            else:
                if cur_words:
                    sig = " ".join([str(num) for num in cur_words])
                    counter[sig] += 1
                    cur_words = []
    return counter

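# Hypothetical convenience for inspecting count_terms output: each key is a
# space-joined run of token ids, so it can be mapped back to tokens with the
# same tokenizer used elsewhere in this file. The function name is ours, not
# from the original code.
def decode_signature(sig, tokenizer):
    ids = [int(s) for s in sig.split()]
    return " ".join(tokenizer.convert_ids_to_tokens(ids))
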
def tfrecord_to_old_stype(tfrecord_path, feature_names: List):
    all_insts = []
    for feature in load_record(tfrecord_path):
        inst = []
        for key in feature_names:
            v = take(feature[key])
            inst.append(list(v))
        all_insts.append(inst)
    return all_insts

def feature_transformer(feature):
    # Collapse three-way labels into binary: 0 -> 0, {1, 2} -> 1.
    new_features = collections.OrderedDict()
    mapping = {0: 0, 1: 1, 2: 1}
    for key in feature:
        v = take(feature[key])
        if key == "label_ids":
            v = [mapping[v[0]]]
        new_features[key] = create_int_feature(v)
    return new_features

def get_correctness(filename, file_path):
    itr = load_record_v2(file_path)
    data = EstimatorPredictionViewerGosford(filename)
    correctness = []
    for entry in data:
        features = itr.__next__()
        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        # The prediction file and the tfrecord must be aligned one-to-one.
        assert np.all(input_ids == input_ids2)
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)
        if pred == label:
            correctness.append(1)
        else:
            correctness.append(0)
    return correctness

def main(dir_path):
    output_path = os.path.join(dir_path, "all")
    writer = RecordWriterWrap(output_path)
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                writer.write_feature(new_features)
    writer.close()

def generate_training_data(data_id):
    num_samples_list = open(
        os.path.join(working_path, "entry_prediction_n", data_id),
        "r").readlines()
    p = os.path.join(working_path, "entry_loss",
                     "entry{}.pickle".format(data_id))
    loss_outputs_list = pickle.load(open(p, "rb"))
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))
    instance_idx = 0
    writer = tf.python_io.TFRecordWriter(
        os.path.join(working_path, "entry_prediction_train", data_id))
    n = len(num_samples_list)
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        # The first instance of each group is the no-dictionary baseline.
        first_inst = feature_itr.__next__()
        if instance_idx + n_sample >= len(loss_outputs):
            break
        if n_sample == 1:
            continue
        no_dict_loss = loss_outputs[instance_idx]
        instance_idx += 1
        for j in range(1, n_sample):
            feature = feature_itr.__next__()
            loss = loss_outputs[instance_idx]
            instance_idx += 1  # advance to the next loss output
            # Label an entry useful if it cuts the baseline loss by over 10%.
            if loss < no_dict_loss * 0.9:
                label = 1
            else:
                label = 0
            new_features = collections.OrderedDict()
            for key in feature:
                new_features[key] = btd.create_int_feature(take(feature[key]))
            new_features["useful_entry"] = btd.create_int_feature([label])
            example = tf.train.Example(features=tf.train.Features(
                feature=new_features))
            writer.write(example.SerializeToString())
    writer.close()

def check_feature(feature):
    feature_d = {}
    for key in feature:
        v = take(feature[key])
        feature_d[key] = v
    input_ids = feature_d["input_ids"]
    alt_emb_mask = feature_d["alt_emb_mask"]
    # Report positions whose mask is set but whose token is not in all_tokens.
    for i in range(len(input_ids)):
        if alt_emb_mask[i] and input_ids[i] not in all_tokens:
            print(i, input_ids[i])

def feature_transformer(feature):
    # Collapse three-valued segment ids to two: {0, 1} -> 0, 2 -> 1.
    new_features = collections.OrderedDict()
    mapping = {0: 0, 1: 0, 2: 1}
    for key in feature:
        values = take(feature[key])
        if key == "segment_ids":
            values = [mapping[v] for v in values]
        new_features[key] = create_int_feature(values)
    return new_features

def show_prediction(filename, file_path, correctness_1, correctness_2):
    data = EstimatorPredictionViewerGosford(filename)
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(filename)
    html = HtmlVisualizer(name + ".html")
    idx = 0
    for entry in data:
        features = itr.__next__()
        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        assert np.all(input_ids == input_ids2)
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)
        p_cells = [Cell(p_tokens[i], 100 if p_mask[i] else 0)
                   for i in range(len(p_tokens))]
        h_cells = [Cell(h_tokens[i], 100 if h_mask[i] else 0)
                   for i in range(len(h_tokens))]
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)
        # Only show instances that at least one of the two runs got wrong.
        if not correctness_1[idx] or not correctness_2[idx]:
            html.write_paragraph("Label : {} Correct: {}/{}".format(
                label, correctness_1[idx], correctness_2[idx]))
            html.write_table([p_cells])
            html.write_table([h_cells])
        idx += 1

def transform(max_seq_length, feature):
    query_ids = take(feature["query_ids"])
    doc_ids = feature["doc_ids"].int64_list.value
    label_ids = feature["label"].int64_list.value[0]
    input_ids = list(query_ids) + list(doc_ids)
    segment_ids = [0] * len(query_ids) + [1] * len(doc_ids)
    input_mask = [1] * len(input_ids)
    # Zero-pad all three sequences up to max_seq_length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    return input_ids, input_mask, segment_ids, label_ids

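# Usage sketch for transform() with toy token ids (values are illustrative
# only). The feature dict mirrors the access pattern above: "doc_ids" and
# "label" are read via .int64_list.value, and take() is assumed to do the
# same (see the take() sketch at the end of this file).
import tensorflow as tf

def _transform_usage_sketch():
    def _int64(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    toy_feature = {
        "query_ids": _int64([2054, 2003]),
        "doc_ids": _int64([7592, 2088, 999]),
        "label": _int64([1]),
    }
    input_ids, input_mask, segment_ids, label_ids = transform(8, toy_feature)
    # input_ids   -> [2054, 2003, 7592, 2088, 999, 0, 0, 0]
    # input_mask  -> [1, 1, 1, 1, 1, 0, 0, 0]
    # segment_ids -> [0, 0, 1, 1, 1, 0, 0, 0]
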
def build_word_tf(continuation_tokens: Set[int], file_path):
    # Term frequency over multi-piece words: a word is a leading token
    # followed by one or more continuation pieces; single-piece words are
    # not counted.
    feature_itr = load_record_v2(file_path)
    counter = Counter()
    for feature in feature_itr:
        if not is_real_example(feature):
            continue
        input_ids = take(feature["input_ids"])
        cur_word = []
        for idx, token_id in enumerate(input_ids):
            if token_id in continuation_tokens:
                cur_word.append(token_id)
            else:
                if len(cur_word) > 1:
                    word_sig = " ".join([str(t) for t in cur_word])
                    counter[word_sig] += 1
                cur_word = [token_id]
    return counter

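# A sketch, under the assumption that continuation_tokens are the ids of BERT
# WordPiece subwords whose surface form starts with "##" (so build_word_tf
# groups a leading piece with its continuations). The helper name and the
# .vocab access on the tokenizer are our assumptions, not from the original.
def get_continuation_tokens_sketch(tokenizer):
    return {idx for token, idx in tokenizer.vocab.items()
            if token.startswith("##")}
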
def feature_transformer(feature):
    new_features = collections.OrderedDict()
    success = False
    for key in feature:
        v = take(feature[key])
        if key == "input_ids":
            input_ids = v
            success, alt_emb_mask, alt_input_ids = get_alt_emb(input_ids)
            if not success and include_not_match:
                # Emit all-zero alternative features for non-matching inputs.
                assert len(input_ids) > 0
                alt_emb_mask = [0] * len(input_ids)
                alt_input_ids = [0] * len(input_ids)
            new_features["alt_emb_mask"] = create_int_feature(alt_emb_mask)
            new_features["alt_input_ids"] = create_int_feature(alt_input_ids)
        new_features[key] = create_int_feature(v)
    if success or include_not_match:
        return new_features
    else:
        return None

def write_instance(self, instances, output_path):
    writer = RecordWriterWrap(output_path)
    for (inst_index, instance) in enumerate(instances):
        new_features = collections.OrderedDict()
        feature, contexts = instance
        for key in feature:
            v = take(feature[key])
            new_features[key] = create_int_feature(v[:self.max_seq_length])

        # Encode each context window, then pad with empty windows so every
        # instance carries exactly self.max_context windows.
        context_input_ids = []
        context_input_mask = []
        context_segment_ids = []
        for tokens in contexts:
            segment_ids = [0] * len(tokens)
            input_ids, input_mask, segment_ids = \
                get_basic_input_feature_as_list(self.tokenizer,
                                                self.max_context_len,
                                                tokens, segment_ids)
            context_input_ids.extend(input_ids)
            context_input_mask.extend(input_mask)
            context_segment_ids.extend(segment_ids)
        dummy_len = self.max_context - len(contexts)
        for _ in range(dummy_len):
            input_ids, input_mask, segment_ids = \
                get_basic_input_feature_as_list(self.tokenizer,
                                                self.max_context_len, [], [])
            context_input_ids.extend(input_ids)
            context_input_mask.extend(input_mask)
            context_segment_ids.extend(segment_ids)

        new_features["context_input_ids"] = create_int_feature(context_input_ids)
        new_features["context_input_mask"] = create_int_feature(context_input_mask)
        new_features["context_segment_ids"] = create_int_feature(context_segment_ids)
        writer.write_feature(new_features)
        if inst_index < 20:
            log_print_feature(new_features)
    writer.close()

def feature_to_ordered_dict(feature):
    new_features = collections.OrderedDict()
    for key in feature:
        new_features[key] = create_int_feature(take(feature[key]))
    return new_features

def put(feature_name):
    return create_int_feature(take(feature[feature_name]))

def condition_fn(features):
    return id_keyword in take(features['input_ids'])

def is_real_example(feature):
    if "is_real_example" not in feature:
        return True
    return take(feature["is_real_example"])[0] == 1

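# The take() helper used throughout is not defined in this file. Given that
# transform() above reads feature["doc_ids"].int64_list.value directly, a
# minimal sketch consistent with every call site would unwrap a
# tf.train.Feature's int64 values (the suffixed name marks it as ours):
def take_sketch(feature):
    return feature.int64_list.value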