def enum_dir_records(dir_path):
    # Yield records from every file in the directory, repeating indefinitely.
    file_path_list = get_dir_files(dir_path)
    while True:
        for file_path in file_path_list:
            for item in load_record(file_path):
                yield item
def main(dir_path):
    # Merge the per-shard record files into one label-balanced output file.
    output_path = os.path.join(dir_path, "all_balanced")
    pos_insts = []
    neg_insts = []
    all_insts = [neg_insts, pos_insts]
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                label = take(record['label_ids'])[0]
                all_insts[label].append(new_features)

    # Downsample the larger class so both labels are written in equal numbers.
    random.shuffle(pos_insts)
    random.shuffle(neg_insts)
    num_sel = min(len(pos_insts), len(neg_insts))
    print("{} insts per label".format(num_sel))
    insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel]
    writer = RecordWriterWrap(output_path)
    foreach(writer.write_feature, insts_to_write)
    writer.close()
def tfrecord_to_old_stype(tfrecord_path, feature_names: List):
    all_insts = []
    for feature in load_record(tfrecord_path):
        inst = []
        for key in feature_names:
            v = take(feature[key])
            inst.append(list(v))
        all_insts.append(inst)
    return all_insts
def do_filtering(file_path, out_path, condition_fn, debug_call_back=None):
    writer = RecordWriterWrap(out_path)
    for item in load_record(file_path):
        features = feature_to_ordered_dict(item)
        if condition_fn(features):
            if debug_call_back is not None:
                debug_call_back(features)
            writer.write_feature(features)
    writer.close()
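# Minimal usage sketch for do_filtering, not part of the repo: keep_positive and the
# file names below are illustrative. It assumes take() extracts feature values from the
# ordered feature dict, as it does elsewhere in this code.
def keep_positive(features):
    # Keep only records whose label_ids value is 1.
    return take(features["label_ids"])[0] == 1

# do_filtering("in.tfrecord", "out_filtered.tfrecord", keep_positive)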
def run(dir_path, save_dir):
    exist_or_mkdir(save_dir)
    for split in ["train", "dev"]:
        for idx, topic in enumerate(data_generator.argmining.ukp_header.all_topics):
            file_name = "{}_{}".format(split, topic)
            file_path = os.path.join(dir_path, file_name)
            save_path = os.path.join(save_dir, file_name)
            augment_topic_ids(load_record(file_path), save_path)
def read(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(os.path.join(data_path, "bert_voca.txt"))
    for feature in examples:
        print(inst2str(feature, tokenizer))
        print()
        print()
def work(self, job_id):
    tfrecord_path = os.path.join(self.input_dir, str(job_id))
    features = load_record(tfrecord_path)
    save_path = os.path.join(self.out_dir, str(job_id))
    writer = RecordWriterWrap(save_path)
    for f in collect_passages(features, self.relevance_scores, self.cpid_to_label,
                              self.num_max_para, self.window_size):
        writer.write_feature(f)
    writer.close()
def work(self, job_id):
    tfrecord_path = os.path.join(self.input_dir, str(job_id))
    features = load_record(tfrecord_path)
    save_path = os.path.join(self.out_dir, str(job_id))
    writer = RecordWriterWrap(save_path)
    for f in rel_filter(features, self.relevance_scores, self.cpid_to_label):
        writer.write_feature(f)
    writer.close()
def main(dir_path):
    # Concatenate all existing per-shard record files into a single output file.
    output_path = os.path.join(dir_path, "all")
    writer = RecordWriterWrap(output_path)
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                writer.write_feature(new_features)
    writer.close()
def load(file_no):
    path = os.path.join(data_path, "pc_rel_tfrecord_dev", str(file_no))
    d = {}
    for feature in load_record(path):
        data_id = take(feature["data_id"])[0]
        input_ids = take(feature["input_ids"])
        segment_ids = take(feature["segment_ids"])
        d[data_id] = input_ids, segment_ids
        print(data_id)
    print("loaded {} data".format(len(d)))
    return d
def work(self, job_id):
    tfrecord_path = os.path.join(self.input_dir, str(job_id))
    features = load_record(tfrecord_path)
    save_path = os.path.join(self.out_dir, str(job_id))
    all_entry = []
    for entry in rel_filter_to_para(features, self.relevance_scores, self.cpid_to_label):
        all_entry.append(entry)
    with open(save_path, "wb") as f:
        pickle.dump(all_entry, f)
def get_lm_tf(fn, sample_size=None, as_subword=True):
    tokenizer = get_tokenizer()
    tfrecord_itr = load_record(fn)
    lm = LM(as_subword, tokenizer)
    for idx, inst in enumerate(tfrecord_itr):
        if sample_size is not None and idx > sample_size:
            break
        input_ids = inst["input_ids"].int64_list.value
        lm.update(input_ids)
    return lm.tf
def print_as_html(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(os.path.join(data_path, "bert_voca.txt"))
    html_output = HtmlVisualizer("out_name.html")
    for feature in examples:
        masked_inputs = feature["input_ids"].int64_list.value
        idx = 0
        step = 512
        while idx < len(masked_inputs):
            chunk = masked_inputs[idx:idx + step]
            tokens = tokenizer.convert_ids_to_tokens(chunk)
            idx += step
            cells = cells_from_tokens(tokens)
            html_output.multirow_print(cells)
        html_output.write_paragraph("----------")
def work_inner(self, input_file_path, output_path):
    # Split each packed record (inner_batch_size examples per record) back into
    # individual examples, shuffle them, and write them out.
    itr = load_record(input_file_path)
    writer = RecordWriterWrap(output_path)

    def reform_a_input(raw_input):
        return np.reshape(raw_input, [self.inner_batch_size, self.max_seq_length])

    def reform_mask_input(raw_input):
        return np.reshape(raw_input, [self.inner_batch_size, self.max_predictions_per_seq])

    def get_as_list(feature, name):
        ids = list(feature[name].int64_list.value)
        ids_list = reform_a_input(ids)
        return ids_list

    all_features = []
    for feature in itr:
        listed_inputs = {}
        for key in ["input_ids", "input_mask", "segment_ids"]:
            listed_inputs[key] = get_as_list(feature, key)
        for key in ["masked_lm_positions", "masked_lm_ids"]:
            ids = list(feature[key].int64_list.value)
            listed_inputs[key] = reform_mask_input(ids)
        listed_inputs["masked_lm_weights"] = reform_mask_input(feature["masked_lm_weights"].float_list.value)
        for i in range(self.inner_batch_size):
            new_features = collections.OrderedDict()
            for key, value in listed_inputs.items():
                if key == "masked_lm_weights":
                    new_features[key] = create_float_feature(value[i])
                else:
                    new_features[key] = create_int_feature(value[i])
            all_features.append(new_features)
    random.shuffle(all_features)
    for f in all_features:
        writer.write_feature(f)
    writer.close()
def sample_median():
    # We don't want either of the (bad/good) splits to end up with shorter text than the other.
    all_scores = []
    scorer = get_lm_scorer()
    files = get_dir_files(tf_record_dir)
    random.shuffle(files)
    for file_path in files[:10]:
        tfrecord_itr = load_record(file_path)
        ticker = TimeEstimator(1000)
        for idx, inst in enumerate(tfrecord_itr):
            all_scores.append(scorer(inst))
            if idx > 1000:
                break
            ticker.tick()
    all_scores.sort()
    l = len(all_scores)
    print(l)
    mid = int(l / 2)
    print(all_scores[mid])
def get_iterator():
    return load_record(os.path.join(data_path, "nli", "bert_code_train.tf_record"))
def translate(tfrecord_path, st, ed):
    max_seq_length = 512
    transform_fn = partial(transform, max_seq_length)
    itr = slice_iterator(load_record(tfrecord_path), st, ed)
    for entry in itr:
        yield transform_fn(entry)
def load_tfrecord(record_path):
    for feature in load_record(record_path):
        input_ids = feature["input_ids"].int64_list.value
        label_ids = feature["label_ids"].int64_list.value[0]
        yield input_ids, label_ids
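# Hedged usage sketch, not from the repo: count_labels and the record_path argument are
# illustrative. load_tfrecord is a generator, so it can be consumed lazily, e.g. to
# inspect the label distribution of a record file.
def count_labels(record_path):
    counts = collections.Counter()
    for _input_ids, label_id in load_tfrecord(record_path):
        counts[label_id] += 1
    return counts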
def create_instances(self, input_path, target_topic, target_seq_length):
    tokenizer = get_tokenizer()
    doc_top_k = 1000

    # Keep only the training features whose topic token matches the target topic.
    all_train_data = list(load_record(input_path))
    train_data = []
    for feature in all_train_data:
        input_ids = feature["input_ids"].int64_list.value
        token_id = input_ids[1]
        topic = token_ids_to_topic[token_id]
        if target_topic == topic:
            train_data.append(feature)
    print("Selected {} from {}".format(len(train_data), len(all_train_data)))

    # Build candidate context passages from the top-ranked documents for the topic.
    doc_dict = load_tokens_for_topic(target_topic)
    token_doc_list = []
    ranked_list = sydney_get_ukp_ranked_list()[target_topic]
    print("Ranked list contains {} docs, selecting top-{}".format(len(ranked_list), doc_top_k))
    doc_ids = [doc_id for doc_id, _, _ in ranked_list[:doc_top_k]]
    for doc_id in doc_ids:
        doc = doc_dict[doc_id]
        token_doc = pool_tokens(doc, target_seq_length)
        token_doc_list.extend(token_doc)

    ranker = Ranker()
    target_tf_list = lmap(ranker.get_terms, token_doc_list)
    ranker.init_df_from_tf_list(target_tf_list)

    # Inverted index over candidate passages, skipping very common terms.
    inv_index = collections.defaultdict(list)
    for doc_idx, doc_tf in enumerate(target_tf_list):
        for term in doc_tf:
            if ranker.df[term] < ranker.N * 0.3:
                inv_index[term].append(doc_idx)

    def get_candidate_from_inv_index(inv_index, terms):
        s = set()
        for t in terms:
            s.update(inv_index[t])
        return s

    # Rank candidate passages for each source sentence by BM25 and keep the top contexts.
    source_tf_list = []
    selected_context = []
    for s_idx, feature in enumerate(train_data):
        input_ids = feature["input_ids"].int64_list.value
        topic_seg, sent = split_p_h_with_input_ids(input_ids, input_ids)
        source_tf = ranker.get_terms_from_ids(sent)
        source_tf_list.append(source_tf)
        ranked_list = []
        candidate_docs = get_candidate_from_inv_index(inv_index, source_tf.keys())
        for doc_idx in candidate_docs:
            target_tf = target_tf_list[doc_idx]
            score = ranker.bm25(source_tf, target_tf)
            ranked_list.append((doc_idx, score, target_tf))
        ranked_list.sort(key=lambda x: x[1], reverse=True)
        ranked_list = list(filter_overlap(ranked_list))
        ranked_list = ranked_list[:self.max_context]
        if s_idx < 10:
            print("--- Source sentence : \n", pretty_tokens(tokenizer.convert_ids_to_tokens(sent), True))
            print("-------------------")
            for rank, (idx, score, target_tf) in enumerate(ranked_list):
                ranker.bm25(source_tf, target_tf, True)
                print("Rank#{} {} : ".format(rank, score) + pretty_tokens(token_doc_list[idx], True))
        if s_idx % 100 == 0:
            print(s_idx)
        contexts = list([token_doc_list[idx] for idx, score, _ in ranked_list])
        selected_context.append(contexts)

    for sent_idx, feature in enumerate(train_data):
        contexts = selected_context[sent_idx]
        yield feature, contexts
def run(dir_path, save_dir):
    for idx, topic in enumerate(data_generator.argmining.ukp_header.all_topics):
        file_path = os.path.join(dir_path, topic)
        save_path = os.path.join(save_dir, topic)
        augment_topic_ids(load_record(file_path), idx, save_path)
def show_feature_text(tfrecord_path, output_file_name):
    html = HtmlVisualizer(output_file_name)
    tokenizer = get_tokenizer()
    for feature in load_record(tfrecord_path):
        write_feature_to_html(feature, html, tokenizer)
def itr():
    for file in get_dir_files(path):
        for item in load_record(file):
            yield item