def __init__(self):
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.stemmer = CacheStemmer()
    self.stopword = load_stopwords()
    self.df = self.load_galgo_df_stat()
def __init__(self, window_size):
    self.stemmer = CacheStemmer()
    self.window_size = window_size
    self.doc_posting = None
    self.stopword = load_stopwords()
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=True)

    def load_pickle(name):
        p = os.path.join(cpath.data_path, "adhoc", name + ".pickle")
        return pickle.load(open(p, "rb"))

    self.doc_len_dict = load_pickle("doc_len")
    self.qdf = load_pickle("robust_qdf_ex")
    self.meta = load_pickle("robust_meta")
    self.head_tokens = load_pickle("robust_title_tokens")
    self.seg_info = load_pickle("robust_seg_info")
    self.not_found = set()
    self.total_doc_n = len(self.doc_len_dict)
    self.avdl = sum(self.doc_len_dict.values()) / len(self.doc_len_dict)
    tprint("Init PassageRanker")
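# Illustrative sketch only, not part of the original class: the statistics loaded above
# (doc_len_dict, qdf, total_doc_n, avdl) are exactly the inputs a BM25-style scorer needs.
# The actual PassageRanker scoring code is not shown here, and the k1/b values below are
# assumed defaults.
import math

def bm25_term_score(tf, df, doc_len, total_doc_n, avdl, k1=1.2, b=0.75):
    # Okapi BM25 contribution of one query term with document frequency df and
    # term frequency tf in a document of length doc_len.
    idf = math.log((total_doc_n - df + 0.5) / (df + 0.5) + 1)
    denom = tf + k1 * (1 - b + b * doc_len / avdl)
    return idf * tf * (k1 + 1) / denom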
def dev():
    train_data_feeder = load_cache("train_data_feeder")
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))
    html_writer = HtmlVisualizer("nli_w_dict.html", dark_mode=False)

    for _ in range(100):
        batch = train_data_feeder.get_random_batch(1)
        input_ids, input_mask, segment_ids, d_input_ids, d_input_mask, d_location_ids, y = batch

        # Bold the tokens that have a dictionary definition attached.
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        for i in range(len(tokens)):
            if i != 0 and i in d_location_ids:
                tokens[i] = "<b>{}</b>".format(tokens[i])
            if tokens[i] == "[unused3]":
                tokens[i] = "[SEP]\n"
        s = tokenizer_wo_tf.pretty_tokens(tokens)
        html_writer.write_headline("Input")
        html_writer.write_paragraph(s)

        # Render the dictionary definition segment, breaking on the [unused5] separator.
        d_tokens = tokenizer.convert_ids_to_tokens(d_input_ids[0])
        for i in range(len(d_tokens)):
            if d_tokens[i] == "[unused5]":
                d_tokens[i] = "<br>\n"
        s = tokenizer_wo_tf.pretty_tokens(d_tokens)
        html_writer.write_headline("Dict def")
        html_writer.write_paragraph(s)

    html_writer.close()
def main():
    mark_path = os.path.join(working_path, "wiki_eval_token")
    mtm = MTM(100, mark_path)
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    docs_dict = {}
    job_id = mtm.pool_job()
    print("Job id : ", job_id)

    todo = "dev"
    if todo == "train":
        out_path_format = "/mnt/nfs/work3/youngwookim/data/enwiki4bert/enwiki_train_tokens.{}"
        in_path_format = "/mnt/nfs/work3/youngwookim/data/enwiki4bert/enwiki_train.txt.line.{}"
    elif todo == "dev":
        out_path_format = "/mnt/nfs/work3/youngwookim/data/enwiki4bert/enwiki_eval_tokens.{}"
        in_path_format = "/mnt/nfs/work3/youngwookim/data/enwiki4bert/enwiki_eval.txt.line.{}"
    else:
        assert False

    while job_id is not None:
        i = int(job_id / 100)
        if i not in docs_dict:
            file_path = in_path_format.format(i)
            docs_dict[i] = parse_wiki(file_path)
        work(job_id, docs_dict[i], tokenizer, out_path_format)
        job_id = mtm.pool_job()
        print("Job id : ", job_id)
def worker_p(job_id):
    max_seq = 512
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    p = os.path.join(cpath.data_path, "tlm", "instances", "inst_{}.pickle".format(job_id))
    if not os.path.exists(p):
        return
    output_path = os.path.join(cpath.data_path, "tlm", "tf_record_pred", "tf_{}.pickle".format(job_id))
    #if os.path.exists(output_path):
    #    return
    inst_list, info_list = filter_instances(pickle.load(open(p, "rb")))
    uid_list = []
    info_d = {}
    for inst, info in zip(inst_list, info_list):
        a, b, c = info.split("_")
        unique_id = int(a) * 1000 * 1000 + int(b) * 10 + int(c)
        uid_list.append(unique_id)
        info_d[unique_id] = info

    max_pred = 20
    data = zip(inst_list, uid_list)
    p = os.path.join(cpath.data_path, "tlm", "pred", "info_d_{}.pickle".format(job_id))
    pickle.dump(info_d, open(p, "wb"))
    write_predict_instance(data, tokenizer, max_seq, max_pred, [output_path])
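# Illustrative helper, not in the original worker: unique_id above packs the three
# underscore-separated fields of `info` into one integer. Decoding it back assumes the
# middle field stays below 100000 and the last field below 10, as the encoding implies.
def decode_unique_id(unique_id):
    a = unique_id // (1000 * 1000)
    rem = unique_id % (1000 * 1000)
    b = rem // 10
    c = rem % 10
    return a, b, c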
def load_and_analyze_gradient():
    p = os.path.join(output_path, "dict_grad1.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))
    analyze_gradient(data, tokenizer)
def __init__(self):
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.masked_lm_prob = 0.15
    self.max_seq_length = 512
    self.dupe_factor = 1
    self.rng = random.Random(time.time())
def __init__(self, number_of_pairs=10000):
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.number_of_pairs = number_of_pairs
    self.max_seq_length = 200
    self.rng = random.Random(time.time())
def __init__(self):
    self.token_reader = load_seg_token_readers()
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    vocab = list(self.tokenizer.vocab.keys())
    self.tf_inst_maker = TFInstanceMakerPair(vocab)
def __init__(self, dictionary_pickle, max_word_tokens, max_seq_length, out_dir):
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=True)
    self.d = dictionary_pickle
    self.max_word_tokens = max_word_tokens
    self.max_seq_length = max_seq_length
    self.out_dir = out_dir
def read(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))
    for feature in examples:
        print(inst2str(feature, tokenizer))
        print()
        print()
def __init__(self):
    super(LMTrainGen, self).__init__()
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.short_seq_prob = 0.1
    self.problem_per_job = 100 * 1000
    self.max_predictions_per_seq = int(self.max_seq_length * self.masked_lm_prob)
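# Note (assumption, not from the original code): max_seq_length and masked_lm_prob are
# expected to be set by the parent class before this line runs. If they are 512 and 0.15
# as in the other generators here, max_predictions_per_seq = int(512 * 0.15) = int(76.8) = 76.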
def print_token_id():
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    tokens = ["[CLS]", "[SEP]", "[MASK]", "[PAD]"]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    for token, token_id in zip(tokens, ids):
        print(token, token_id)
def dump_robust_cap_tokens():
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    cap_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    c = trec.load_robust_ingham()
    d = {}
    for key in c:
        doc = c[key]
        d[key] = cap_tokenizer.basic_tokenizer.tokenize(doc)
    dump_dict(d, "robust_token_cap")
def __init__(self, window_size):
    super().__init__(window_size)
    self.date_dict = load_from_pickle("robust_date")
    #self.token_reader = get_token_reader()
    self.token_dump = DumpAccess("robust_token")
    self.text_dump = DumpAccess("robust")
    c_path = os.path.join(data_path, "stream_pickled", "CandiSet_{}_0")
    self.ll = LazyLoader(c_path)
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    self.cap_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
def load_and_analyze_hv():
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))
    p = os.path.join(output_path, "hv_tt.pickle")
    hv_tt = pickle.load(open(p, "rb"))
    p = os.path.join(output_path, "hv_lm.pickle")
    hv_lm = pickle.load(open(p, "rb"))
    p = os.path.join(output_path, "grad.pickle")
    tt_grad = pickle.load(open(p, "rb"))
    analyze_hv(hv_tt, hv_lm, tt_grad, tokenizer)
def __init__(self, max_seq):
    print("TFRecordMaker Init")
    self.max_seq = max_seq
    self.robust_tokens = load_robust_token()
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.vocab_words = list(self.tokenizer.vocab.keys())
    self.rng = random.Random(0)

    def load_pickle(name):
        p = os.path.join(cpath.data_path, "adhoc", name + ".pickle")
        return pickle.load(open(p, "rb"))

    self.seg_info = load_pickle("robust_seg_info")
    print("TFRecordMaker Init Done")
def __init__(self, data, data_info):
    super(DictAuxDataFeeder, self).__init__(data)
    self.stopword = load_stopwords()
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.dict = self.encode_dict_as_feature(self.raw_dictionary)

    # data is already truncated and padded
    self.data = data
    self.data_len = len(self.data)
    if data_info is not None:
        self.data_info = data_info
    else:
        self.data_info = self.nli_data_indexing(data)
def print_as_html(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))
    html_output = HtmlVisualizer("out_name.html")

    for feature in examples:
        masked_inputs = feature["input_ids"].int64_list.value
        idx = 0
        step = 512
        # Print the input ids in chunks of 512 tokens per row group.
        while idx < len(masked_inputs):
            chunk = masked_inputs[idx:idx + step]
            tokens = tokenizer.convert_ids_to_tokens(chunk)
            idx += step
            cells = cells_from_tokens(tokens)
            html_output.multirow_print(cells)
        html_output.write_paragraph("----------")
def __init__(self, out_path):
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.masked_lm_prob = 0.15
    self.short_seq_prob = 0.1
    self.problem_per_job = 100 * 1000
    self.max_seq_length = 512
    self.max_predictions_per_seq = 20
    self.dupe_factor = 1
    self.out_dir = out_path
    seed = time.time()
    self.rng = random.Random(seed)
    print("Loading documents")
    self.documents = self.load_documents_from_pickle()
    print("Loading documents Done : ", len(self.documents))
def tokenize_stream(in_file, out_path):
    dp = DumpPickle(out_path)
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    in_f = open(in_file, "r")

    def read_doc(f):
        # Read one <DOC> ... </DOC> block; raise EndofDocument at end of file.
        line = f.readline()
        if not line:
            raise EndofDocument()
        assert "<DOC>" in line
        line = f.readline()
        assert "<DOCNO>" in line
        pre_n = len("<DOCNO>")
        ed_n = len("</DOCNO>") + 1
        title = line[pre_n:-ed_n].strip()
        line = f.readline()
        assert "<TEXT>" in line
        content = []
        line = f.readline()
        while line.strip() != "</TEXT>":
            content.append(line)
            line = f.readline()
        line = f.readline()
        assert "</DOC>" in line
        return title, content

    try:
        ticker = TimeEstimator(1285381, "reader", 100)
        while True:
            title, content = read_doc(in_f)
            tokens = flatten(lmap(tokenizer.tokenize, content))
            dp.dump(title, tokens)
            ticker.tick()
    except EndofDocument:
        pass
    dp.close()
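# For reference, read_doc above parses TREC-style documents, one tag per line, with the
# document id inside <DOCNO>...</DOCNO> and the body lines between <TEXT> and </TEXT>.
# The sample below is illustrative only; the id and paths are made up, not taken from
# the actual corpus:
#
#   <DOC>
#   <DOCNO>FBIS3-0001</DOCNO>
#   <TEXT>
#   first line of the document body
#   second line of the document body
#   </TEXT>
#   </DOC>
#
# Hypothetical usage (placeholder paths):
#   tokenize_stream("robust04_docs.txt", "robust_tokens.db")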
def worker(job_id):
    max_seq = 512
    print("TF_record_writer")
    rng = random.Random(0)
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    p = os.path.join(cpath.data_path, "tlm", "instances_local", "inst_{}.pickle".format(job_id))
    if not os.path.exists(p):
        return
    output_path = os.path.join(cpath.data_path, "tlm", "tf_record_local", "tf_rand_{}.pickle".format(job_id))
    if os.path.exists(output_path):
        return
    inst_list, info_list = filter_instances(pickle.load(open(p, "rb")))
    rng.shuffle(inst_list)
    max_pred = 20
    print(inst_list[0])
    write_instance_to_example_files(inst_list, tokenizer, max_seq, max_pred, [output_path])
def load_and_visualize():
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    data_id = "1"
    n_list = open(os.path.join(output_path, "lookup_n", data_id), "r").readlines()
    p = os.path.join(output_path, "example_loss.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]["masked_lm_example_loss"]

    feature_itr = load_record_v1(
        os.path.join(output_path, "lookup_example", data_id))

    n = len(n_list)
    feature_idx = 0
    html_writer = HtmlVisualizer("lookup_loss2.html", dark_mode=False)

    for i in range(n):
        n_sample = int(n_list[i])
        rows = []
        assert n_sample > 0
        for j in range(n_sample):
            feature = next(feature_itr)

            input_ids = take(feature["input_ids"])
            masked_lm_ids = take(feature["masked_lm_ids"])
            masked_lm_positions = take(feature["masked_lm_positions"])
            input_mask = take(feature["input_mask"])
            selected_word = take(feature["selected_word"])
            d_input_ids = take(feature["d_input_ids"])
            d_location_ids = take(feature["d_location_ids"])

            word_tokens = tokenizer.convert_ids_to_tokens(selected_word)
            word = tokenizer_wo_tf.pretty_tokens(word_tokens)
            emph_word = "<b>" + word + "</b>"

            if j == 0:
                # Resolve the masked positions once per example and keep the token list
                # around so each sample's dictionary word can be highlighted in it.
                mask_ans = {}
                masked_terms = tokenizer.convert_ids_to_tokens(masked_lm_ids)
                for pos, term in zip(list(masked_lm_positions), masked_terms):
                    mask_ans[pos] = term
                tokens = tokenizer.convert_ids_to_tokens(input_ids)

            for idx in range(len(tokens)):
                if tokens[idx] == "[MASK]":
                    tokens[idx] = "[MASK_{}: {}]".format(idx, mask_ans[idx])
                if idx in d_location_ids and idx != 0:
                    if tokens[idx - 1] != emph_word:
                        tokens[idx] = emph_word
                    else:
                        tokens[idx] = "-"

            def_str = tokenizer_wo_tf.pretty_tokens(
                tokenizer.convert_ids_to_tokens(d_input_ids), True)
            row = [Cell(word), Cell(data[feature_idx]), Cell(def_str)]
            rows.append(row)
            feature_idx += 1

        s = tokenizer_wo_tf.pretty_tokens(tokens, True)
        html_writer.write_paragraph(s)
        html_writer.write_table(rows)

    html_writer.close()