def main():
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    sbc = SubwordConvertor()
    df = Counter()
    collection_size = 0
    ticker = TimeEstimator(485393)
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_ids = feature["input_ids"].int64_list.value
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_idx1 = tokens.index("[SEP]")
            sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
            doc_tokens = tokens[sep_idx1:sep_idx2]
            words = lmap(tuple, sbc.get_word_as_subtoken_tuple(doc_tokens))
            dl = len(words)
            collection_size += dl
            averager.append(dl)
            for word in set(words):
                df[word] += 1
            ticker.tick()

    print("collection length", collection_size)
    print("average dl", averager.get_average())
    save_to_pickle(df, "subword_df_robust_train")
def main():
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            if idx % 3:
                continue
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_mask = feature["input_mask"].int64_list.value
            if input_mask[-1]:
                input_ids = feature["input_ids"].int64_list.value
                tokens = tokenizer.convert_ids_to_tokens(input_ids)
                sep_idx1 = tokens.index("[SEP]")
                sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
                doc_tokens = tokens[sep_idx1:sep_idx2]
                continue_cnt = 0
                for t in doc_tokens:
                    if t[:2] == "##":
                        continue_cnt += 1
                n_words = len(doc_tokens) - continue_cnt
                averager.append(n_words)

    print("average", averager.get_average())
def work(dir_path: FilePath):
    q_config_id = Q_CONFIG_ID_DEV_ALL
    print(dir_path)
    for file_path in get_dir_files(dir_path):
        print(file_path)
        # insert_ranked_list_from_path(file_path, q_config_id)
def enum_dir_records(dir_path):
    file_path_list = get_dir_files(dir_path)
    while True:
        for file_path in file_path_list:
            for item in load_record(file_path):
                yield item
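# Usage sketch (added, not from the original code): enum_dir_records loops forever
# because of the `while True`, so a caller has to bound the iteration itself.
# `records_dir` and `take_n_records` are hypothetical names.
from itertools import islice

def take_n_records(records_dir, n):
    # materialize only the first n records from the endless stream
    return list(islice(enum_dir_records(records_dir), n))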
def run(in_dir_path, out_dir_path, keyword):
    exist_or_mkdir(out_dir_path)
    tokenizer = get_tokenizer()
    ids = tokenizer.convert_tokens_to_ids([keyword])
    assert len(ids) == 1
    id_keyword = ids[0]

    def condition_fn(features):
        return id_keyword in take(features['input_ids'])

    inst_cnt = 0

    def debug_call_back(features):
        nonlocal inst_cnt
        if inst_cnt < 4:
            input_tokens = tokenizer.convert_ids_to_tokens(
                take(features['input_ids']))
            print(pretty_tokens(input_tokens))
        inst_cnt += 1

    for file_path in get_dir_files(in_dir_path):
        inst_cnt = 0
        name = os.path.basename(file_path)
        out_path = os.path.join(out_dir_path, name)
        do_filtering(file_path, out_path, condition_fn)
def run_dir(in_dir_name: FileName, out_dir_name: FileName):
    in_dir = pjoin(sydney_working_dir, in_dir_name)
    out_dir = pjoin(sydney_working_dir, out_dir_name)
    exist_or_mkdir(out_dir)
    for file_path in get_dir_files(in_dir):
        name = FileName(os.path.basename(file_path))
        out_path = pjoin(out_dir, name)
        convert_to_2way(file_path, out_path)
def main():
    dir_path = os.path.join(job_man_dir, "qcknc3_dev_info")
    out_dir_path = os.path.join(job_man_dir, "qcknc3_dev_info_light")
    exist_or_mkdir(out_dir_path)
    for file_path in get_dir_files(FilePath(dir_path)):
        print(file_path)
        if file_path.endswith(".info"):
            out_file_path = os.path.join(out_dir_path, os.path.basename(file_path))
            drop_tokens(file_path, out_file_path)
def load_tokens_for_topic(token_path, topic):
    d = {}
    for path in get_dir_files(token_path):
        if topic.replace(" ", "_") in path:
            data = pickle.load(open(path, "rb"))
            if len(data) < 10000:
                print("{} has {} data".format(path, len(data)))
            d.update(data)
    print("Loaded {} docs for {}".format(len(d), topic))
    return d
def main():
    dir_path = os.path.join(data_path, "pc_evi_qck_predict_dev_info")
    out_dir_path = os.path.join(data_path, "pc_evi_qck_predict_dev_info_fixed")
    exist_or_mkdir(out_dir_path)
    for file_path in get_dir_files(FilePath(dir_path)):
        # print(file_path)
        # file_path = "/mnt/nfs/work3/youngwookim/job_man/pc_evi_qck_predict_dev_info/0.info"
        # out_file_path = "/mnt/nfs/work3/youngwookim/job_man/temp_0.info"
        out_file_path = os.path.join(out_dir_path, os.path.basename(file_path))
        modify_and_save(file_path, out_file_path)
def load_ranked_list(relevance_list_path):
    all_ranked_list = {}
    for file_path in get_dir_files(relevance_list_path):
        file_name = os.path.basename(file_path)
        ranked_list_d = load_galago_ranked_list(file_path)
        queries = ranked_list_d.keys()
        any_query = list(queries)[0]
        ranked_list = ranked_list_d[any_query]
        all_ranked_list[file_name] = ranked_list
    return all_ranked_list
def load_combine_info_jsons(dir_path, convert_map, drop_kdp=True) -> Dict:
    if os.path.isdir(dir_path):
        d = {}
        for file_path in get_dir_files(dir_path):
            if file_path.endswith(".info"):
                j = json.load(open(file_path, "r", encoding="utf-8"))
                parse_info(j, convert_map, drop_kdp)
                d.update(j)
    else:
        d = json.load(open(dir_path, "r"))
        parse_info(d, convert_map, drop_kdp)
    return d
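# Usage sketch (assumption, not from the original code): load_combine_info_jsons
# accepts either a directory of *.info files or a single json file path.
# `info_dir` is a hypothetical path, and passing an empty convert_map assumes
# parse_info tolerates a no-op mapping.
def load_dev_info(info_dir):
    convert_map = {}  # hypothetical: no field conversion requested
    return load_combine_info_jsons(info_dir, convert_map, drop_kdp=False)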
def load():
    root = os.path.join(scope_dir, "by_time")
    l_all = []
    for dir_path in get_dir_dir(root):
        print(dir_path)
        for file_path in get_dir_files(dir_path):
            l = load_article_only_short_url(file_path)
            l_all.extend(l)
    print("Total of {} articles ".format(len(l_all)))
    out_path = os.path.join(root, "list.pickle")
    pickle.dump(l_all, open(out_path, "wb"))
def load_corpus():
    dir_path = FilePath("/mnt/nfs/work3/youngwookim/data/bert_tf/clueweb12_13B_word_tokens/")
    corpus = []
    cnt = 0
    for file_path in get_dir_files(dir_path):
        tokens_list = load_pickle_from(file_path)
        corpus.extend(tokens_list)
        if cnt > 50:
            break
        cnt += 1
    return corpus
def load_combine_info_jsons(dir_path, convert_map):
    token_d = {}
    for file_path in get_dir_files(dir_path):
        if file_path.endswith(".info"):
            j = json.load(open(file_path, "r", encoding="utf-8"))
            parse_info(j, convert_map, False)
            for data_id, info in j.items():
                kdp: KDP = info["kdp"]
                key = kdp.doc_id, kdp.passage_idx
                if key in token_d:
                    if str(token_d[key]) != str(kdp.tokens):
                        print(key)
                token_d[key] = kdp.tokens
def load_all_docs() -> List[MPQARawDoc]:
    docs = []
    doc_dir_path = os.path.join(root_dir, "docs")
    for parent_dir in get_dir_dir(doc_dir_path):
        parent_name = os.path.basename(parent_dir)
        for doc_leaf_path in get_dir_files(parent_dir):
            file_name = os.path.basename(doc_leaf_path)
            doc_id = parent_name + "/" + file_name
            try:
                content = open(doc_leaf_path, "r", encoding="utf-8").read()
                docs.append(MPQARawDoc(doc_id, content))
            except UnicodeDecodeError:
                print(doc_leaf_path)
                raise
    return docs
def summarize_runner(summarizer, out_root):
    dir_root = "/mnt/nfs/scratch1/youngwookim/data/clueweb12_10000_pred_ex"
    for file_path in get_dir_files(dir_root):
        try:
            if "abortion" not in file_path:
                continue
            print(file_path)
            file_name = os.path.basename(file_path)
            obj = pickle.load(open(file_path, "rb"))
            r = summarizer(obj)
            out_path = os.path.join(out_root, file_name)
            pickle.dump(r, open(out_path, "wb"))
        except Exception as e:
            print(e)
def enum_docs_and_stance():
    topic = "abortion"
    summary_path = "/mnt/nfs/work3/youngwookim/data/stance/clueweb12_10000_pred_ex_summary_w_logit"
    relevance_list_path = "/home/youngwookim/work/ukp/relevant_docs/clueweb12"
    all_tokens = ukp_load_tokens_for_topic(topic)
    all_ranked_list = load_ranked_list(relevance_list_path)
    for file_path in get_dir_files(summary_path):
        if topic not in file_path:
            continue
        file_name = os.path.basename(file_path)
        predictions = pickle.load(open(file_path, "rb"))
        for doc_idx, preds in predictions:
            doc_id, rank, score = all_ranked_list[file_name][doc_idx]
            doc = all_tokens[doc_id]
            yield doc, preds
def loss_view(dir_path):
    tokenizer = get_tokenizer()
    html_writer = HtmlVisualizer("ukp_lm_grad_high.html", dark_mode=False)
    for file_path in get_dir_files(dir_path):
        items = pickle.load(open(file_path, "rb"))
        for e in items:
            input_ids, masked_input_ids, masked_lm_example_loss = e
            tokens = mask_resolve_1(
                tokenizer.convert_ids_to_tokens(input_ids),
                tokenizer.convert_ids_to_tokens(masked_input_ids))
            highlight = lmap(is_mask, tokens)
            cells = cells_from_tokens(tokens, highlight)
            html_writer.multirow_print(cells)
def load_all_annotations() -> List[Tuple[str, List[MPQAAnnLine]]]:
    doc_dir_path = os.path.join(root_dir, "man_anns")
    for parent_dir in get_dir_dir(doc_dir_path):
        parent_name = os.path.basename(parent_dir)
        for doc_leaf_path in get_dir_dir(parent_dir):
            doc_leaf_name = os.path.basename(doc_leaf_path)
            doc_id = parent_name + "/" + doc_leaf_name
            ann_set_list = []
            for ann_file_path in get_dir_files(doc_leaf_path):
                ann_file_type = os.path.basename(ann_file_path)
                assert ann_file_type in [
                    "gateman.mpqa.lre.2.0",
                    "gatesentences.mpqa.2.0",
                    "answer.mpqa.2.0",
                ]
                lines = read_mqpa_anns(ann_file_path)
                ann_set_list.extend(lines)
            yield doc_id, ann_set_list
def sample_median():
    # we don't want one of the (bad/good) splits to end up with shorter text than the other
    files = get_dir_files(get_prediction_dir(working_dir))
    random.shuffle(files)
    all_scores = []
    for file_path in files[:10]:
        data = pickle.load(open(file_path, "rb"))
        data = flatten_batches(data)
        t = scorer(data["prob1"], data["prob2"])
        all_scores.extend(t)
    all_scores.sort()
    l = len(all_scores)
    print(l)
    mid = int(l / 2)
    print(all_scores[mid])
def load_multiple_ranked_list(dir_path, get_key_from_name):
    files = get_dir_files(dir_path)
    data = []
    for file_path in files:
        name = os.path.basename(file_path)
        ranked_list_d = load_galago_ranked_list(file_path)
        for query, ranked_list in ranked_list_d.items():
            data.append((name, ranked_list))

    new_d = {}
    key_fn = lambda x: get_key_from_name(x[0])
    for key, sub_data in group_by(data, key_fn).items():
        ranked_list = right(sub_data)
        new_d[key] = merge_ranked_list_list(ranked_list)
    return new_d
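# Usage sketch (hypothetical): get_key_from_name turns a ranked-list file name into a
# grouping key so that partial lists sharing the same key are merged by
# merge_ranked_list_list. The "_part" naming convention below is an assumption,
# not taken from the original code.
def merge_by_prefix(dir_path):
    def get_key_from_name(name):
        return name.split("_part")[0]
    return load_multiple_ranked_list(dir_path, get_key_from_name)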
def estimator_prediction_loader(p, fetch_field_list=None):
    if os.path.isdir(p):
        data = []
        for file_path in get_dir_files(p):
            data.extend(pickle.load(open(file_path, "rb")))
    else:
        data = pickle.load(open(p, "rb"))

    if fetch_field_list is None:
        keys = list(data[0].keys())
        vectors = flatten_batches(data)
    else:
        keys = list([k for k in data[0].keys() if k in fetch_field_list])
        vectors = flatten_batches_inner(data, fetch_field_list)
    any_key = keys[0]
    data_len = len(vectors[any_key])
    return vectors, keys, data_len
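# Usage sketch (assumption): the loader returns per-field vectors keyed by the
# prediction field names, the list of those names, and the number of rows.
# `pred_path` and the "logits" field name are hypothetical.
def print_first_logits(pred_path):
    vectors, keys, data_len = estimator_prediction_loader(pred_path, ["logits"])
    print("fields:", keys, "rows:", data_len)
    print(vectors["logits"][0])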
def __init__(self, select_by_preds):
    summary_path = "/mnt/nfs/work3/youngwookim/data/stance/clueweb12_10000_pred_summary"
    relevance_list_path = "/home/youngwookim/work/ukp/relevant_docs/clueweb12"
    all_ranked_list = load_ranked_list(relevance_list_path)
    self.selected = set()
    for file_path in get_dir_files(summary_path):
        file_name = os.path.basename(file_path)
        predictions = pickle.load(open(file_path, "rb"))
        n_reject = 0
        for doc_idx, preds in predictions:
            doc_id, rank, score = all_ranked_list[file_name][doc_idx]
            assert rank == doc_idx + 1
            if select_by_preds(preds):
                self.selected.add(doc_id)
            else:
                n_reject += 1
        print("{} Reject {}".format(file_name, n_reject / len(predictions)))
def count_terms_for_dir(dir_path):
    def sig_to_terms(sig: str):
        token_ids = [int(t) for t in sig.split(" ")]
        terms = tokenizer.convert_ids_to_tokens(token_ids)
        return "".join(terms)

    counter = Counter()
    file_list = get_dir_files(dir_path)
    ticker = TimeEstimator(len(file_list))
    for file_path in file_list:
        counter.update(count_terms(file_path))
        ticker.tick()

    tokenizer = get_tokenizer()
    for sig, cnt in counter.items():
        term = sig_to_terms(sig)
        print(term, cnt)
    return
def sample_median():
    # we don't want one of the (bad/good) splits to end up with shorter text than the other
    all_scores = []
    scorer = get_lm_scorer()
    files = get_dir_files(tf_record_dir)
    random.shuffle(files)
    for file_path in files[:10]:
        tfrecord_itr = load_record(file_path)
        ticker = TimeEstimator(1000)
        for idx, inst in enumerate(tfrecord_itr):
            all_scores.append(scorer(inst))
            if idx > 1000:
                break
            ticker.tick()
    all_scores.sort()
    l = len(all_scores)
    print(l)
    mid = int(l / 2)
    print(all_scores[mid])
def show(dir_path):
    topic = "abortion"
    tokenizer = get_tokenizer()
    for file_path in get_dir_files(dir_path):
        if topic not in file_path:
            continue
        file_name = os.path.basename(file_path)
        predictions = pickle.load(open(file_path, "rb"))
        for doc in predictions:
            show_doc = False
            for e in doc:
                sout, input_ids = e
                if sout[2] > 0.5:
                    show_doc = True
            if show_doc:
                for e in doc:
                    sout, input_ids = e
                    tokens = tokenizer.convert_ids_to_tokens(input_ids)
                    pred = np.argmax(sout)
                    print(pred, pretty_tokens(tokens, True))
                print("------------")
def collect_unique_passage(dir_path):
    key_set = set()
    unique_passages = []

    def update(j):
        for doc_id, value in j.items():
            kdp = KDP(*value['kdp'])
            key = kdp.doc_id, kdp.passage_idx
            if key not in key_set:
                unique_passages.append(kdp)
                key_set.add(key)

    if os.path.isdir(dir_path):
        for file_path in get_dir_files(dir_path):
            if file_path.endswith(".info"):
                j = json.load(open(file_path, "r", encoding="utf-8"))
                update(j)
    else:
        d = json.load(open(dir_path, "r"))
        update(d)
    return unique_passages
def get_dir_all_itr(dir_path):
    for file_path in get_dir_files(dir_path):
        one_itr = load_record_v2(file_path)
        for item in one_itr:
            yield item
def load_all_comments(dir_path):
    for comment_path in get_dir_files(dir_path):
        yield parse_comment.parse_comments(comment_path)
def iter_gz_files_for_group(self, group_name):
    dir_to_iter = os.path.join(self.root_dir, group_name_to_subdir_path(group_name))
    file_list = get_dir_files(FilePath(dir_to_iter))
    file_list.sort()
    return file_list