def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        return len(tokens) == 1 and tokens[0] in stopwords

    tokenizer = get_tokenizer()
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            word = word.strip()
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(word, diff, pos, neg))

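# Usage sketch for main() above. This is an assumption, not part of the original
# module: the split name and pickle path below are hypothetical placeholders, and
# the pickle is expected to map query_id -> {WordAsID: np.array([pos_prob, neg_prob])}.
if __name__ == "__main__":
    example_config = {
        "split": "dev",                         # hypothetical sub-split name
        "word_prob_path": "word_probs.pickle",  # hypothetical pickle path
    }
    main(example_config)
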
def get_input_as_text(self, resolve_mask=False, highlight_lookup_word=False):
    if resolve_mask:
        mask_ans = self.get_mask_answer_dict()
    if highlight_lookup_word:
        d_location_ids = self.get_d_location_ids()
        word = self.get_selected_word_text()
        emph_word = "<b>" + word + "</b>"

    tokens = self.tokenizer.convert_ids_to_tokens(self.get_input_ids())
    for i in range(len(tokens)):
        if resolve_mask and tokens[i] == "[MASK]":
            tokens[i] = "[MASK_{}: {}]".format(i, mask_ans[i])
        if highlight_lookup_word and i in d_location_ids and i != 0:
            print(i, emph_word)
            if tokens[i - 1] != emph_word:
                tokens[i] = emph_word
            else:
                tokens[i] = "-"
    return tokenization.pretty_tokens(tokens, True)

def __init__(self, d: Dict[WordAsID, np.array], skip_stopwords=True, stem=True):
    self.tokenizer = get_tokenizer()
    self.stopwords_as_ids: Set[WordAsID] = set()

    new_d = {}
    if skip_stopwords:
        stopwords = load_stopwords_for_query()
        for key in d.keys():
            tokens = decode_word_as_id(self.tokenizer, key)
            if len(tokens) == 1 and tokens[0] in stopwords:
                self.stopwords_as_ids.add(key)
            else:
                new_d[key] = d[key]
        d = new_d

    if stem:
        # Merge entries whose surface forms share a stem by averaging their scores.
        d_raw = defaultdict(list)
        stemmer = Stemmer()
        for key in d.keys():
            tokens = decode_word_as_id(self.tokenizer, key)
            plain_word = pretty_tokens(tokens, True)
            stemmed = stemmer.stem(plain_word)
            d_raw[stemmed].append(d[key])

        new_d: Dict[str, TokenScore] = {}
        for key, items in d_raw.items():
            score: TokenScore = [average([t[0] for t in items]),
                                 average([t[1] for t in items])]
            new_d[key] = score
        d = new_d
        self.stem = True
        self.stemmer = stemmer
        self.log_odd = self.log_odd_w_stem

    self.d = d
    self.smoothing = 0.1

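# Sketch (assumption, not the project's actual method): given the [pos, neg]
# TokenScore layout built above and self.smoothing, log_odd_w_stem presumably
# computes an additively smoothed log-odds for a surface word, roughly:
#
#   def log_odd_w_stem(self, word: str) -> float:
#       pos, neg = self.d.get(self.stemmer.stem(word), (0.0, 0.0))
#       return math.log(pos + self.smoothing) - math.log(neg + self.smoothing)
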
def analyze_gradient(data, tokenizer):
    gradients = data['gradients']
    d_input_ids = data['d_input_ids']
    mask_input_ids = data['masked_input_ids']
    masked_lm_positions = data["masked_lm_positions"]

    n_inst, seq_len = mask_input_ids.shape
    n_inst2, def_len = d_input_ids.shape
    assert n_inst == n_inst2
    def_len = 256
    hidden_dim = 768
    reshaped_grad = reshape_gradienet(gradients, n_inst, def_len, hidden_dim)
    print(reshaped_grad.shape)

    n_pred = reshaped_grad.shape[1]
    # Per-token gradient magnitude: sum of absolute gradients over the hidden dimension.
    grad_per_token = np.sum(np.abs(reshaped_grad), axis=3)

    html_writer = HtmlVisualizer("dict_grad.html", dark_mode=False)
    for inst_idx in range(n_inst):
        tokens = tokenizer.convert_ids_to_tokens(mask_input_ids[inst_idx])
        #ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])
        for i in range(len(tokens)):
            if tokens[i] == "[MASK]":
                tokens[i] = "[MASK_{}]".format(i)
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"
        def_tokens = tokenizer.convert_ids_to_tokens(d_input_ids[inst_idx])
        s = tokenizer_wo_tf.pretty_tokens(tokens)

        lines = []
        grad_total_max = 0
        for pred_idx in range(n_pred):
            row = []
            max_val = max(grad_per_token[inst_idx, pred_idx])
            total = sum(grad_per_token[inst_idx, pred_idx])
            mask_pos = masked_lm_positions[inst_idx, pred_idx]
            if total > grad_total_max:
                grad_total_max = total

            row.append(Cell(mask_pos))
            row.append(Cell(int(total)))
            for def_idx in range(def_len):
                term = def_tokens[def_idx]
                cont_right = def_idx + 1 < def_len and def_tokens[def_idx + 1][:2] == "##"
                cont_left = term[:2] == "##"
                space_left = " " if not cont_left else ""
                space_right = " " if not cont_right else ""
                if term == "[PAD]":
                    break
                if term == "[unused5]":
                    term = "[\\n]"
                score = grad_per_token[inst_idx, pred_idx, def_idx] / (hidden_dim * 2)
                bg_color = get_color(score)
                row.append(Cell(term, score, not cont_left, not cont_right))
                print("{}({})".format(term, grad_per_token[inst_idx, pred_idx, def_idx]), end=" ")

            lines.append((mask_pos, row))
            print("")
        lines.sort(key=lambda x: x[0])

        s = s.replace("[unused4]", "<b>DictTerm</b>")
        html_writer.write_paragraph(s)
        if grad_total_max > 5000000:
            html_writer.write_headline("HIGH Gradient")
        rows = right(lines)
        html_writer.write_table(rows)
        print("----------")
    html_writer.close()

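# Note on the reshape step above (assumption): reshaped_grad is consumed as a
# (n_inst, n_pred, def_len, hidden_dim) tensor, so a minimal reshape_gradienet
# could look roughly like the sketch below, if the raw gradients arrive as a
# flat (n_inst * n_pred * def_len, hidden_dim) array:
#
#   def reshape_gradienet(gradients, n_inst, def_len, hidden_dim):
#       flat = np.reshape(gradients, [-1, hidden_dim])
#       return np.reshape(flat, [n_inst, -1, def_len, hidden_dim])
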
def create_instances(self, input_path, target_topic, target_seq_length):
    tokenizer = get_tokenizer()
    doc_top_k = 1000

    all_train_data = list(load_record(input_path))
    train_data = []
    for feature in all_train_data:
        input_ids = feature["input_ids"].int64_list.value
        token_id = input_ids[1]
        topic = token_ids_to_topic[token_id]
        if target_topic == topic:
            train_data.append(feature)
    print("Selected {} from {}".format(len(train_data), len(all_train_data)))

    doc_dict = load_tokens_for_topic(target_topic)
    token_doc_list = []
    ranked_list = sydney_get_ukp_ranked_list()[target_topic]
    print("Ranked list contains {} docs, selecting top-{}".format(len(ranked_list), doc_top_k))
    doc_ids = [doc_id for doc_id, _, _ in ranked_list[:doc_top_k]]
    for doc_id in doc_ids:
        doc = doc_dict[doc_id]
        token_doc = pool_tokens(doc, target_seq_length)
        token_doc_list.extend(token_doc)

    ranker = Ranker()
    target_tf_list = lmap(ranker.get_terms, token_doc_list)
    ranker.init_df_from_tf_list(target_tf_list)

    inv_index = collections.defaultdict(list)
    for doc_idx, doc_tf in enumerate(target_tf_list):
        for term in doc_tf:
            if ranker.df[term] < ranker.N * 0.3:
                inv_index[term].append(doc_idx)

    def get_candidate_from_inv_index(inv_index, terms):
        s = set()
        for t in terms:
            s.update(inv_index[t])
        return s

    source_tf_list = []
    selected_context = []
    for s_idx, feature in enumerate(train_data):
        input_ids = feature["input_ids"].int64_list.value
        topic_seg, sent = split_p_h_with_input_ids(input_ids, input_ids)
        source_tf = ranker.get_terms_from_ids(sent)
        source_tf_list.append(source_tf)

        ranked_list = []
        candidate_docs = get_candidate_from_inv_index(inv_index, source_tf.keys())
        for doc_idx in candidate_docs:
            target_tf = target_tf_list[doc_idx]
            score = ranker.bm25(source_tf, target_tf)
            ranked_list.append((doc_idx, score, target_tf))
        ranked_list.sort(key=lambda x: x[1], reverse=True)
        ranked_list = list(filter_overlap(ranked_list))
        ranked_list = ranked_list[:self.max_context]

        if s_idx < 10:
            print("--- Source sentence : \n", pretty_tokens(tokenizer.convert_ids_to_tokens(sent), True))
            print("-------------------")
            for rank, (idx, score, target_tf) in enumerate(ranked_list):
                ranker.bm25(source_tf, target_tf, True)
                print("Rank#{} {} : ".format(rank, score) + pretty_tokens(token_doc_list[idx], True))
        if s_idx % 100 == 0:
            print(s_idx)

        contexts = list([token_doc_list[idx] for idx, score, _ in ranked_list])
        selected_context.append(contexts)

    for sent_idx, feature in enumerate(train_data):
        contexts = selected_context[sent_idx]
        yield feature, contexts

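# Usage sketch (assumption: the receiver name, tfrecord path, and topic below are
# hypothetical, not from the original code). create_instances is a generator that
# pairs each selected training feature with its top BM25-ranked context passages:
#
#   gen = instance_writer.create_instances("ukp_train.tfrecord", "abortion", 256)
#   for feature, contexts in gen:
#       handle_instance(feature, contexts)   # hypothetical consumer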
def ids_to_pretty_text(self, ids):
    tokens = self.tokenizer.convert_ids_to_tokens(ids)
    return tokenization.pretty_tokens(tokens, True)

def load_and_visualize():
    tokenizer = tokenizer_wo_tf.FullTokenizer(os.path.join(data_path, "bert_voca.txt"))

    data_id = "1"
    n_list = open(os.path.join(output_path, "lookup_n", data_id), "r").readlines()
    p = os.path.join(output_path, "example_loss.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]["masked_lm_example_loss"]

    feature_itr = load_record_v1(os.path.join(output_path, "lookup_example", data_id))

    n = len(n_list)
    feature_idx = 0
    html_writer = HtmlVisualizer("lookup_loss2.html", dark_mode=False)

    for i in range(n):
        n_sample = int(n_list[i])
        rows = []
        assert n_sample > 0
        for j in range(n_sample):
            feature = feature_itr.__next__()

            input_ids = take(feature["input_ids"])
            masked_lm_ids = take(feature["masked_lm_ids"])
            masked_lm_positions = take(feature["masked_lm_positions"])
            input_mask = take(feature["input_mask"])
            selected_word = take(feature["selected_word"])
            d_input_ids = take(feature["d_input_ids"])
            d_location_ids = take(feature["d_location_ids"])

            word_tokens = tokenizer.convert_ids_to_tokens(selected_word)
            word = tokenizer_wo_tf.pretty_tokens(word_tokens)
            emph_word = "<b>" + word + "</b>"

            if j == 0:
                mask_ans = {}
                masked_terms = tokenizer.convert_ids_to_tokens(masked_lm_ids)
                for pos, id in zip(list(masked_lm_positions), masked_terms):
                    mask_ans[pos] = id

                tokens = tokenizer.convert_ids_to_tokens(input_ids)
                for t_i in range(len(tokens)):
                    if tokens[t_i] == "[MASK]":
                        tokens[t_i] = "[MASK_{}: {}]".format(t_i, mask_ans[t_i])
                    if t_i in d_location_ids and t_i != 0:
                        if tokens[t_i - 1] != emph_word:
                            tokens[t_i] = emph_word
                        else:
                            tokens[t_i] = "-"

            def_str = tokenizer_wo_tf.pretty_tokens(
                tokenizer.convert_ids_to_tokens(d_input_ids), True)
            row = list()
            row.append(Cell(word))
            row.append(Cell(data[feature_idx]))
            row.append(Cell(def_str))
            rows.append(row)
            feature_idx += 1

        s = tokenizer_wo_tf.pretty_tokens(tokens, True)
        html_writer.write_paragraph(s)
        html_writer.write_table(rows)

    html_writer.close()

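# Sketch (assumption, not the project's actual helper): `take` above appears to
# unwrap a tf.train.Feature into a plain Python sequence, analogous to the
# feature["input_ids"].int64_list.value access used in create_instances, e.g.:
#
#   def take(v):
#       return v.int64_list.value
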
def loss_drop_tendency():
    tokenizer = get_tokenizer()
    filename = "ukp_all_loss_1.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))
    batch_size, seq_length = data[0]['input_ids'].shape
    keys = list(data[0].keys())

    vectors = {}
    for e in data:
        for key in keys:
            if key not in vectors:
                vectors[key] = []
            vectors[key].append(e[key])
    for key in keys:
        vectors[key] = np.concatenate(vectors[key], axis=0)

    n_instance = len(vectors['input_ids'])
    print("n_instance ", n_instance)

    token_cnt = Counter()
    acc_prob_before = Counter()
    acc_prob_after = Counter()
    num_predictions = len(vectors["grouped_positions"][0][0])
    prev_word = defaultdict(list)
    context = defaultdict(list)

    def bin_fn(v):
        return int(v / 0.05)

    bin_avg_builder = BinAverage(bin_fn)

    for i in range(n_instance):
        tokens = tokenizer.convert_ids_to_tokens(vectors['input_ids'][i])
        positions = vectors["grouped_positions"][i]
        num_trials = len(positions)
        for t_i in range(num_trials):
            for p_i in range(num_predictions):
                loc = vectors["grouped_positions"][i][t_i][p_i]
                loss1 = vectors["grouped_loss1"][i][t_i][p_i]
                loss2 = vectors["grouped_loss2"][i][t_i][p_i]
                # get the term at the target location
                t = combine(tokens[loc - 1], tokens[loc])
                ctx = pretty_tokens(tokens[loc - 5:loc + 4], drop_sharp=False)
                prob_before = math.exp(-loss1)
                prob_after = math.exp(-loss2)
                prev_word[t].append(tokens[loc - 1])
                context[t].append(ctx)
                token_cnt[t] += 1
                acc_prob_before[t] += prob_before
                acc_prob_after[t] += prob_after
                bin_avg_builder.add(prob_before, prob_after)

    bin_avg = bin_avg_builder.all_average()

    infos = []
    for t in token_cnt:
        cnt = token_cnt[t]
        avg_prob_before = acc_prob_before[t] / cnt
        avg_prob_after = acc_prob_after[t] / cnt
        avg_diff = avg_prob_before - avg_prob_after
        e = t, avg_prob_before, avg_prob_after, avg_diff, cnt
        infos.append(e)

    infos = list([e for e in infos if e[4] > 30])
    info_d = {e[0]: e for e in infos}

    relation_extraction_keywords = [
        "founded_by", "located_in", "died_of", "such_as",
        "or_other", "and_other", "is_buried", "was_born"
    ]
    ukp_keywords = [
        "do_you", "the_above", "once_the", "would_save", "therefore_,",
        "they_viewed", "lead_to", "an_increase", "may_not", "was_common",
        "the_number"
    ]
    group_A = relation_extraction_keywords
    group_B = ukp_keywords

    def get_avg_drop_nli(score):
        mapping = [
            0.00592526, 0.046476, 0.06966591, 0.0852695, 0.0819463,
            0.11604176, 0.13313128, 0.14524656, 0.17160586, 0.18507012,
            0.19524361, 0.23223796, 0.23867801, 0.2618752, 0.28670366,
            0.31369072, 0.3431509, 0.39701927, 0.45573084, 0.72065012,
            0.99999865
        ]
        st = [
            0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
            0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.,
        ]
        for i, st_i in enumerate(st):
            if st_i <= score < st_i + 0.05:
                return mapping[i]
        assert False

    def get_avg_drop(v):
        bin_id = bin_fn(v)
        return bin_avg[bin_id]

    for t in group_A + group_B:
        if t not in info_d:
            print("Does not exist: ", t)
        else:
            t, avg_prob_before, avg_prob_after, avg_diff, cnt = info_d[t]
            v = get_avg_drop(avg_prob_before)
            if avg_prob_after > v:
                print("RIGHT: {} bf={} af={} av_af={}".format(t, avg_prob_before, avg_prob_after, v))
            else:
                print("LEFT: {} bf={} af={} av_af={}".format(t, avg_prob_before, avg_prob_after, v))

    def entropy(cnt_dict: Counter):
        total = sum(cnt_dict.values())
        ent = 0
        for key, value in cnt_dict.items():
            p = value / total
            ent += -p * math.log(p)
        return ent

    def print_n(e_list, n):
        for e in e_list[:n]:
            t, avg_prob_before, avg_prob_after, avg_diff, cnt = e
            print("{} ({})".format(t, cnt))
            print("Before : {0:3f}".format(avg_prob_before))
            print("After  : {0:3f}".format(avg_prob_after))
            print("AvgDiff: {0:3f}".format(avg_diff))
            term_stat = Counter(prev_word[e[0]])
            print(term_stat)
            print(context[t])
            print("Entropy: ", entropy(term_stat))

    print(type(infos[0][0]))
    print(type(infos[0][1]))
    print(type(infos[0][2]))

    print("<< Most common >>")
    infos.sort(key=lambda x: x[1], reverse=True)
    print_n(infos, 10)
    print("---------------------")

    infos.sort(key=lambda x: x[3], reverse=True)
    print("<< Big Drop >>")
    print_n(infos, 10)
    print("---------------------")

    infos.sort(key=lambda x: x[3], reverse=False)
    print("<< Negative Drop (NLI Improve) >>")
    print_n(infos, 10)
    print("---------------------")

    plt.rcParams.update({'font.size': 22})
    fig, ax = plt.subplots()
    infos = list([e for e in infos if e[4] > 30])
    y = list([x[1] for x in infos])
    z = list([x[2] for x in infos])
    fig.set_size_inches(18.5, 10.5)
    ax.scatter(z, y)
    x = np.linspace(0, 1, 1000)
    ax.plot(x, x)
    for i, e in enumerate(infos):
        ax.annotate(e[0], (z[i], y[i]))
    mpld3.show()

def main():
    doc_id = "clueweb12-0005wb-96-30750"
    doc = load(BertTokenizedCluewebDoc, doc_id)
    print("doc has {} lines".format(len(doc)))
    print("last line:", pretty_tokens(doc[-1], True))

def text_print():
    d = get_tokens()
    for key, tokens in d:
        print(key)
        print(pretty_tokens(tokens))

def print_ids(ids):
    print(pretty_tokens(tokenizer.convert_ids_to_tokens(ids), True))