def __init__(self, window_size):
    self.stemmer = CacheStemmer()
    self.window_size = window_size
    self.doc_posting = None
    self.stopword = load_stopwords()
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=True)

    def load_pickle(name):
        p = os.path.join(cpath.data_path, "adhoc", name + ".pickle")
        return pickle.load(open(p, "rb"))

    self.doc_len_dict = load_pickle("doc_len")
    self.qdf = load_pickle("robust_qdf_ex")
    self.meta = load_pickle("robust_meta")
    self.head_tokens = load_pickle("robust_title_tokens")
    self.seg_info = load_pickle("robust_seg_info")
    self.not_found = set()
    self.total_doc_n = len(self.doc_len_dict)
    self.avdl = sum(self.doc_len_dict.values()) / len(self.doc_len_dict)
    tprint("Init PassageRanker")
def display(tf1, tf2, label_name1="pos", label_name2="neg"):
    odd_dict = get_all_term_odd(tf1, tf2, 0.95)

    def contrib(e):
        key, value = e
        return (tf1[key] + tf2[key]) * value

    odd_list = list(odd_dict.items())
    odd_list.sort(key=contrib, reverse=True)
    stopword = load_stopwords()

    def valid(e):
        key, value = e
        return key not in stopword and tf1[key] > 10 and tf2[key] > 10

    acc = 0
    for key, value in odd_list:
        acc += value * (tf1[key] + tf2[key])
    ctf = sum(tf1.values()) + sum(tf2.values())
    print(acc, acc / ctf)

    k = 50
    odd_list = list(filter(valid, odd_list))
    print("Top {} ".format(label_name1))
    for key, value in odd_list[:k]:
        print(key, tf1[key], tf2[key], odd_dict[key])
    print("Top {} ".format(label_name2))
    for idx in range(len(odd_list) - 1, len(odd_list) - 1 - k, -1):
        key, value = odd_list[idx]
        print(key, contrib(odd_list[idx]), tf1[key], tf2[key], odd_dict[key])
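# Illustrative usage sketch (not from the original source): the Counters below are made-up
# term frequencies. `display` prints the covered log-odds contribution mass and then the
# top terms for each class, assuming `get_all_term_odd` and `load_stopwords` are available
# in this module.
def _demo_display():
    pos_tf = Counter({"good": 120, "great": 80, "bad": 15})
    neg_tf = Counter({"bad": 150, "good": 20, "poor": 60})
    display(pos_tf, neg_tf, label_name1="pos", label_name2="neg")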
def sum_random_walk_score(name_class):
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    prob_score_d = load_from_pickle("pc_{}_word_prob_train".format(name_class))
    stopwords = load_stopwords()
    acc_counter_prob_init = Counter()
    for claim_id, prob_scores in prob_score_d.items():
        for k, v in prob_scores:
            if k not in stopwords:
                acc_counter_prob_init[k] += v

    rw_score = dict(load_from_pickle("bias_random_walk_train_{}".format(name_class)))
    acc_counter = Counter()
    for claim_id, qtf in rw_score.items():
        for k, v in qtf.items():
            acc_counter[k] += v

    acc_counter_prob_init = normalize_counter_to_sum1(acc_counter_prob_init)
    acc_counter = normalize_counter_to_sum1(acc_counter)
    new_counter = Counter()
    for k, v in acc_counter.items():
        if len(k) > 2:
            new_v = v - acc_counter_prob_init[k]
            new_counter[k] = new_v
    return new_counter
def merge_batch(parsed_data):
    stopwords = load_stopwords()
    all_annotations = defaultdict(list)
    for entry in parsed_data:
        url_id = entry['url_id']
        all_annotations[url_id].extend(entry['statements'])

    def get_dist(text1, text2):
        tokens1 = tokenize(text1, stopwords)
        tokens2 = tokenize(text2, stopwords)
        common = set(tokens1).intersection(set(tokens2))
        n_common = len(common)
        return (n_common / len(tokens1)) * (n_common / len(tokens2))

    result = dict()
    dist_thres = 0.5
    for key in all_annotations:
        annot_list = all_annotations[key]
        n = len(annot_list)
        for idx1 in range(n):
            for idx2 in range(idx1 + 1, n):
                annot1 = annot_list[idx1]
                annot2 = annot_list[idx2]
                dist = get_dist(annot1, annot2)
                if dist > dist_thres:
                    print("Diff : ")
                    print(annot1)
                    print(annot2)
        result[key] = annot_list
    return result
def __init__(self):
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.stemmer = CacheStemmer()
    self.stopword = load_stopwords()
    self.df = self.load_galgo_df_stat()
def segment_per_doc_index(task_id):
    token_reader = get_token_reader()
    stemmer = CacheStemmer()
    stopword = load_stopwords()
    p = os.path.join(cpath.data_path, "adhoc", "robust_seg_info.pickle")
    seg_info = pickle.load(open(p, "rb"))

    def get_doc_posting_list(doc_id):
        doc_posting = defaultdict(list)
        for interval in seg_info[doc_id]:
            (loc, loc_ed), (_, _) = interval
            tokens = token_reader.retrieve(doc_id)
            st_tokens = list([stemmer.stem(t) for t in tokens])
            ct = Counter(st_tokens[loc:loc_ed])
            for term, cnt in ct.items():
                if term in stopword:
                    continue
                doc_posting[term].append((loc, cnt))
        return doc_posting

    doc_id_list = get_doc_task(task_id)
    ticker = TimeEstimator(len(doc_id_list))
    doc_posting_d = {}
    for doc_id in doc_id_list:
        doc_posting_d[doc_id] = get_doc_posting_list(doc_id)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc",
                             "per_doc_posting_{}.pickle".format(task_id))
    pickle.dump(doc_posting_d, open(save_path, "wb"))
def get_odd_list():
    result = load_from_pickle("majority_3gram")
    tf0, tf1, tf2 = result
    odd_dict = get_all_term_odd(tf1, tf2, 0.95)

    def contrib(e):
        key, value = e
        return (tf1[key] + tf2[key]) * value

    odd_list = list(odd_dict.items())
    odd_list.sort(key=contrib, reverse=True)
    stopword = load_stopwords()

    def valid(e):
        key, value = e
        return key not in stopword and tf1[key] > 10 and tf2[key] > 10

    acc = 0
    for key, value in odd_list:
        acc += value * (tf1[key] + tf2[key])
    ctf = sum(tf1.values()) + sum(tf2.values())
    print(acc, acc / ctf)
    return list(filter(valid, odd_list))
def summarize(self):
    topic = data_generator.argmining.ukp_header.all_topics[0]
    data_loader = ukp.DataLoader(topic)
    stopwords = load_stopwords()

    def tokenize(x):
        return tokenizer.tokenize(x, stopwords)

    def sent_score(token_sent, bow_score):
        score = 0
        factor = 1
        for t in token_sent:
            score += bow_score[t] * factor
            factor *= 0.5
        return score

    def is_argument(entry):
        return entry['annotation'] == "Argument_for" or entry['annotation'] == "Argument_against"

    for topic in data_generator.argmining.ukp_header.all_topics:
        entries = data_loader.all_data[topic]
        raw_sents = list([e['sentence'] for e in entries if e['set'] == 'train'])
        token_sents = list(map(tokenize, raw_sents))
        tprint("Running TextRank")
        text_rank = TextRank(token_sents)
        tr_score = Counter(text_rank.run(flatten(token_sents)))
        tprint("claim_gen.generate")
        raw_sents.sort(key=lambda x: sent_score(tokenize(x), tr_score), reverse=True)
        for i in range(10):
            print(raw_sents[i])
def __init__(self, target_topic, is_3way, max_sequence, vocab_filename):
    DataLoader.__init__(self, target_topic, is_3way)
    self.max_seq = max_sequence
    voca_path = os.path.join(data_path, vocab_filename)
    self.stopwords = load_stopwords()
    self.lower_case = True
    self.sep_char = "#"
    self.encoder = FullTokenizerWarpper(voca_path)
def __init__(self):
    self.stopword = load_stopwords()
    self.stemmer = CacheStemmer()
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    tprint("Loading inv_index for robust")
    self.collection = RobustCollection()
    tprint("Done")
    self.num_candidate = 10
def build3(self, C, NC):
    stopwords = load_stopwords()
    self.supervised = True
    self.NC = NC
    self.C = C
    self.stopword = stopwords
    self.NC_ctf = sum(self.NC.values())
    self.C_ctf = sum(self.C.values())
def __init__(self, data):
    self.p_reset = 0.1
    self.max_repeat = 500
    self.window_size = 10
    self.idf = collections.Counter()
    self.def_idf = 2
    for document in data:
        for word in set(document):
            self.idf[word] += 1
    self.stopword = load_stopwords()
def build_inv_index(sents):
    stopword = load_stopwords()
    stopword.add("should")
    group = {}
    for idx, sent in enumerate(sents):
        for t in sent:
            if len(t) > 1 and t not in stopword:
                assign_list_if_not_exists(group, t)
                group[t].append(idx)
    return group
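# Illustrative usage sketch (not from the original source): `build_inv_index` maps each
# non-stopword term to the indices of the tokenized sentences containing it, assuming the
# function and its helpers are importable from this module.
def _demo_build_inv_index():
    sents = [["cats", "chase", "mice"], ["mice", "eat", "cheese"]]
    inv_index = build_inv_index(sents)
    print(inv_index["mice"])  # expected: [0, 1]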
def build(self, lm_docs_list, bg_tf, bg_ctf):
    self.n_lm = len(lm_docs_list)
    stopwords = load_stopwords()

    def transform(counter):
        if self.stemmer is None:
            new_tf = counter
        else:
            new_tf = Counter()
            for key in counter:
                source = key
                target = self.stemmer(key)
                new_tf[target] += counter[source]
        counter = new_tf
        new_tf = Counter()
        for key in counter:
            if len(key) <= 3 or key in stopwords:
                pass
            else:
                new_tf[key] = counter[key]
        return new_tf

    def remove_stopword(counter):
        new_tf = Counter()
        for key in counter:
            if len(key) < 3 or key in stopwords:
                pass
            else:
                new_tf[key] = counter[key]
        return new_tf

    self.BG = transform(bg_tf)
    self.BG_ctf = bg_ctf
    self.stopword = stopwords
    for lm_docs in lm_docs_list:
        c_tf = collections.Counter()
        for idx, s in enumerate(lm_docs):
            tokens = self.tokenizer(s)
            for token in tokens:
                if token in bg_tf:
                    c_tf[token] += 1
        tf_dict = transform(c_tf)
        self.C.append(tf_dict)
        self.C_ctf.append(sum(tf_dict.values()))
def build2(self, x, y):
    stopwords = load_stopwords()
    self.stopword = stopwords
    self.supervised = True
    self.NC = collections.Counter()
    self.C = collections.Counter()

    def update(counter, tokens):
        for token in tokens:
            counter[token] += 1

    for idx, s in enumerate(x):
        tokens = self.tokenize(s)
        if y[idx] == 0:
            update(self.NC, tokens)
        elif y[idx] == 1:
            update(self.C, tokens)

    self.NC_ctf = sum(self.NC.values())
    self.C_ctf = sum(self.C.values())

    vectors = []
    for idx, s in enumerate(x):
        tokens = self.tokenize(s)
        odd = self.log_odd_binary(tokens)
        vectors.append((odd, y[idx]))
    vectors.sort(key=lambda x: x[0], reverse=True)

    total = len(vectors)
    p = np.count_nonzero(y)
    fp = 0
    max_acc = 0
    self.opt_alpha = 0
    for idx, (odd, label) in enumerate(vectors):
        alpha = odd - 1e-8
        if label == 0:
            fp += 1
        tp = (idx + 1) - fp
        fn = p - tp
        tn = total - (idx + 1) - fn
        acc = (tp + tn) / (total)
        if acc > max_acc:
            self.opt_alpha = alpha
            max_acc = acc
    print("Train acc : {}".format(max_acc))
def build_krovetz_index():
    stemmer = Stemmer()
    stopwords = load_stopwords()
    stem_dict = dict()

    def stem(token):
        if token in stem_dict:
            return stem_dict[token]
        else:
            r = stemmer.stem(token)
            stem_dict[token] = r
            return r

    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    inv_index = dict()
    ticker = TimeEstimator(len(collection))
    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        terms = dict()
        for idx, t in enumerate(tokens):
            if t in stopwords:
                continue
            t_s = stem(t)
            if t_s not in terms:
                terms[t_s] = list()
            terms[t_s].append(idx)

        for t_s in terms:
            if t_s not in inv_index:
                inv_index[t_s] = list()
            posting = (doc_id, terms[t_s])
            inv_index[t_s].append(posting)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc", "robust_inv_index.pickle")
    pickle.dump(inv_index, open(save_path, "wb"))
def __init__(self, data, data_info):
    super(DictAuxDataFeeder, self).__init__(data)
    self.stopword = load_stopwords()
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.dict = self.encode_dict_as_feature(self.raw_dictionary)
    # data is already truncated and padded
    self.data = data
    self.data_len = len(self.data)
    if data_info is not None:
        self.data_info = data_info
    else:
        self.data_info = self.nli_data_indexing(data)
def get_simple_claim_query(claims, drop_stopwords=False) -> List[DocQuery]:
    if drop_stopwords:
        stopword = load_stopwords()

    queries: List[DocQuery] = []
    for c in claims:
        cid = str(c["cId"])
        claim_text = c["text"]
        q_terms: List[str] = clean_tokenize_str_to_tokens(claim_text)
        print(q_terms)
        if drop_stopwords:
            q_terms = list([t for t in q_terms if t not in stopword])
        q_terms = list([t.replace(".", "") for t in q_terms])
        print(q_terms)
        q_entry: DocQuery = format_query_simple(cid, q_terms)
        queries.append(q_entry)
    return queries
def build_co_occurrence(list_tokens: List[List[str]],
                        window_size,
                        stemmer: CacheStemmer) -> Counter:
    list_tokens: List[List[str]] = lmap(stemmer.stem_list, list_tokens)
    stopword = load_stopwords()

    def remove_stopwords(tokens: List[str]) -> List[str]:
        return list([t for t in tokens if t not in stopword])

    list_tokens: List[List[str]] = lmap(remove_stopwords, list_tokens)
    counter = Counter()

    def count_co_ocurrence_fn(token_list):
        count_co_ocurrence(window_size, counter, token_list)

    foreach(count_co_ocurrence_fn, list_tokens)
    return counter
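# Illustrative usage sketch (not from the original source): the document lists are made up,
# and the exact key format of the returned Counter depends on `count_co_ocurrence`, which is
# assumed here to accumulate term pairs seen within `window_size` positions of each other
# after stemming and stopword removal.
def _demo_build_co_occurrence():
    stemmer = CacheStemmer()
    docs = [["solar", "panels", "reduce", "emissions"],
            ["emissions", "from", "coal", "plants"]]
    cooc = build_co_occurrence(docs, window_size=3, stemmer=stemmer)
    print(cooc.most_common(5))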
def tf_stat(self):
    topic = data_generator.argmining.ukp_header.all_topics[0]
    data_loader = ukp.DataLoader(topic)
    stopwords = load_stopwords()

    def tokenize(x):
        return tokenizer.tokenize(x, stopwords)

    for topic in data_generator.argmining.ukp_header.all_topics:
        print("-----------")
        print(topic)
        entries = data_loader.all_data[topic]
        token_sents = list([tokenize(e['sentence']) for e in entries if e['set'] == 'train'])
        tf_dict = Counter(flatten(token_sents))
        for word, tf in tf_dict.most_common(30):
            print(word, tf)
def test_generative_model():
    train, val = load_feature_and_split()
    print("Training lm")
    classifier = learn_lm(train)
    stopwords = load_stopwords()

    def filter_fn(data_point: Dict):
        remove_stopword_and_punct(stopwords, data_point['feature'])

    foreach(filter_fn, train)

    def is_correct(data_point: Dict):
        x = data_point['feature']
        y = int(data_point['label'])
        return classifier.predict(x) == int(y)

    correctness = lmap(is_correct, val)
    print("val acc: ", average(correctness))
def from_pos_neg(pos_docs, neg_docs):
    stemmer = None
    stopwords = load_stopwords()
    y = list(1 for _ in pos_docs) + list(0 for _ in neg_docs)

    def transform(counter):
        if stemmer is None:
            new_tf = counter
        else:
            new_tf = Counter()
            for key in counter:
                source = key
                target = stemmer(key)
                new_tf[target] += counter[source]
        counter = new_tf
        new_tf = Counter()
        for key in counter:
            if len(key) <= 3 or key in stopwords:
                pass
            else:
                new_tf[key] = counter[key]
        return new_tf

    def count_word_parallel(documents):
        split = 30
        p = Pool(split)
        args = chunks(documents, split)
        counters = p.map(count_word, args)
        g_counter = Counter()
        for counter in counters:
            for key in counter.keys():
                g_counter[key] += counter[key]
        return g_counter

    c_counter = transform(count_word_parallel(pos_docs))
    nc_counter = transform(count_word_parallel(neg_docs))
    tokenizer = lambda x: tokenize(x, set(), False)
    classifier = LMClassifer(tokenizer, None, fulltext=True)
    classifier.build3(c_counter, nc_counter)
    return classifier
def get_yw_may():
    from dispute.guardian import load_local_pickle
    stopwords = load_stopwords()
    tokenizer = lambda x: tokenize(x, stopwords, False)

    class YWMay:
        def __init__(self):
            self.stopwords = stopwords
            self.topic_info = load_local_pickle("topic_score")

        def get_tf10(self, tokens):
            counter = Counter()
            for t in tokens:
                if t not in self.stopwords and len(t) > 2:
                    counter[t] += 1
            return counter.most_common(10)

        def score(self, docs):
            def term_odd(token):
                if token not in self.topic_info:
                    return 0
                else:
                    p = self.topic_info[token]
                    if p > 0.9999 or p < 0.0001:
                        return 0
                    else:
                        return math.log(p) - math.log(1 - p)

            def predict(doc):
                tokens = tokenizer(doc)
                sum_odd = 0
                top10 = left(list(self.get_tf10(tokens)))
                odd_list = lmap(term_odd, tokens)
                result = sum(odd_list)
                return result

            return lmap(predict, docs)

    return YWMay()
def __init__(self):
    tprint("Pipeline Init")
    self.stemmer = CacheStemmer()
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.iteration_dir = "/mnt/scratch/youngwookim/data/tlm_iter1"
    if not os.path.exists("/mnt/scratch/youngwookim/"):
        self.iteration_dir = "/mnt/nfs/work3/youngwookim/data/tlm_iter1"
    self.seg_max_seq = 256
    self.model_max_seq = 512
    self.rng = random.Random(0)
    self.masked_lm_prob = 0.15
    self.short_seq_prob = 0.1
    self.inst_per_job = 1000
    self.stopword = load_stopwords()
    self.pr = FeatureExtractor(self.seg_max_seq - 3)
    self.tf_record_maker = None
    self.code_tick = CodeTiming()
    tprint("Pipeline Init Done")
def build4(self, c_tf, bg_tf, bg_ctf):
    stopwords = load_stopwords()
    self.stopword = stopwords

    def transform(counter: Counter):
        if self.stemmer is None:
            new_tf = counter
        else:
            new_tf = Counter()
            for key in counter:
                source = key
                try:
                    target = self.stemmer(key)
                    new_tf[target] += counter[source]
                except:
                    pass
        counter = new_tf
        new_tf = Counter()
        for key in counter:
            if len(key) <= 3 or key in stopwords:
                pass
            else:
                new_tf[key] = counter[key]
        return new_tf

    def remove_stopword(counter):
        new_tf = Counter()
        for key in counter:
            if len(key) < 3 or key in stopwords:
                pass
            else:
                new_tf[key] = counter[key]
        return new_tf

    self.BG = transform(bg_tf)
    self.BG_ctf = bg_ctf
    self.C = transform(c_tf)
    self.C_ctf = sum(self.C.values())
def count_it(data: Dict[str, List[ScoreParagraph]]) -> List[Tuple[str, Counter]]:
    stemmer = CacheStemmer()
    r = []
    stopword = load_stopwords()

    def remove_stopwords(tokens: List[str]) -> List[str]:
        return list([t for t in tokens if t not in stopword])

    ticker = TimeEstimator(len(data))
    for cid, para_list in data.items():
        ticker.tick()
        tokens_list: List[List[str]] = [e.paragraph.tokens for e in para_list]
        list_tokens: List[List[str]] = lmap(stemmer.stem_list, tokens_list)
        list_tokens: List[List[str]] = lmap(remove_stopwords, list_tokens)
        all_cnt = Counter()
        for tokens in list_tokens:
            all_cnt.update(Counter(tokens))
        r.append((cid, all_cnt))
    return r
def lm_contribution():
    train, val = load_feature_and_split()
    print("Training lm")
    stopwords = load_stopwords()

    def filter_fn(data_point):
        remove_stopword_and_punct(stopwords, data_point[0][0])

    foreach(filter_fn, train)
    classifier = learn_lm(train)
    acc_contrib = Counter()
    for data_point in train:
        (tf, num), y = data_point
        contrib = classifier.counter_contribution(tf)
        # print("{} {} {}".format(y, classifier.predict(tf), classifier.counter_odd(tf)))
        # print("--------------")
        for t, score in contrib.most_common(100):
            acc_contrib[t] += score

    for t, score in acc_contrib.most_common(100):
        print(t, score, classifier.P_w_C_dict[t], classifier.P_w_NC_dict[t])
def save_qdf_ex():
    ii_path = os.path.join(cpath.data_path, "adhoc", "robust_inv_index.pickle")
    inv_index = pickle.load(open(ii_path, "rb"))
    save_path = os.path.join(cpath.data_path, "adhoc", "robust_meta.pickle")
    meta = pickle.load(open(save_path, "rb"))
    stopwords = load_stopwords()
    stemmer = CacheStemmer()

    simple_posting = {}
    qdf_d = Counter()
    for term in inv_index:
        simple_posting[term] = set()
        for doc_id, _ in inv_index[term]:
            simple_posting[term].add(doc_id)

    for doc in meta:
        date, headline = meta[doc]
        tokens = nltk.tokenize.wordpunct_tokenize(headline)
        terms = set()
        for idx, t in enumerate(tokens):
            if t in stopwords:
                continue
            t_s = stemmer.stem(t)
            terms.add(t_s)
        for t in terms:
            simple_posting[t].add(doc)

    for term in inv_index:
        qdf = len(simple_posting[term])
        qdf_d[term] = qdf

    save_path = os.path.join(cpath.data_path, "adhoc", "robust_qdf_ex.pickle")
    pickle.dump(qdf_d, open(save_path, "wb"))
def divergence(self):
    # Compare Arg vs Non-Arg
    topic = data_generator.argmining.ukp_header.all_topics[0]
    data_loader = ukp.DataLoader(topic)
    stopwords = load_stopwords()

    def tokenize(x):
        return tokenizer.tokenize(x, stopwords)

    def is_argument(entry):
        return entry['annotation'] == "Argument_for" or entry['annotation'] == "Argument_against"

    for topic in data_generator.argmining.ukp_header.all_topics:
        print("-----------")
        print(topic)
        entries = data_loader.all_data[topic]
        token_sents = list([tokenize(e['sentence']) for e in entries if e['set'] == 'train'])
        topic_tf = Counter(flatten(token_sents))
        arg_div = []
        narg_div = []
        for e in entries:
            sent_tf = Counter(tokenize(e['sentence']))
            div = kl.kl_divergence_subset(sent_tf, topic_tf)
            assert not math.isnan(div)
            if e['set'] == 'train' and is_argument(e):
                arg_div.append(div)
            elif e['set'] == 'train':
                narg_div.append(div)
        print("Arg KL mean : ", average(arg_div))
        print("Non-Arg KL mean : ", average(narg_div))
def divergence_lr(self):
    f1_list = []
    for dev_topic in data_generator.argmining.ukp_header.all_topics:
        print(dev_topic)
        data_loader = ukp.DataLoader(dev_topic)
        idx_for = data_loader.labels.index("Argument_for")
        idx_against = data_loader.labels.index("Argument_against")
        train_data = data_loader.get_train_data()
        dev_data = data_loader.get_dev_data()
        train_X, train_y = zip(*train_data)
        dev_X, dev_y = zip(*dev_data)
        feature = CountVectorizer()
        train_X_v = feature.fit_transform(train_X)
        stopwords = load_stopwords()

        def tokenize(x):
            return tokenizer.tokenize(x, stopwords)

        # Write the per-sentence KL divergence into the last feature column of the train vectors.
        data_idx = 0
        for topic in data_generator.argmining.ukp_header.all_topics:
            if topic == dev_topic:
                continue
            entries = data_loader.all_data[topic]
            token_sents = list([tokenize(e['sentence']) for e in entries if e['set'] == 'train'])
            topic_tf = Counter(flatten(token_sents))
            for e in entries:
                if e['set'] == 'train':
                    sent_tf = Counter(tokenize(e['sentence']))
                    div = kl.kl_divergence_subset(sent_tf, topic_tf)
                    assert not math.isnan(div)
                    train_X_v[data_idx, -1] = div
                    data_idx += 1
        assert data_idx == len(train_X)

        classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
        #classifier = LinearSVC()
        classifier = MLPClassifier()
        classifier.fit(train_X_v, train_y)

        # Same divergence feature for the dev vectors.
        dev_X_v = feature.transform(dev_X)
        token_sents = list([tokenize(e['sentence'])
                            for e in data_loader.all_data[dev_topic] if e['set'] == 'val'])
        topic_tf = Counter(flatten(token_sents))
        data_idx = 0
        for e in data_loader.all_data[dev_topic]:
            if e['set'] == 'val':
                sent_tf = Counter(tokenize(e['sentence']))
                div = kl.kl_divergence_subset(sent_tf, topic_tf)
                dev_X_v[data_idx, -1] = div
                data_idx += 1
        assert data_idx == len(dev_X)

        train_pred = classifier.predict(train_X_v)
        dev_pred = classifier.predict(dev_X_v)

        def print_eval(pred_y, gold_y):
            all_result = eval_3label(pred_y, gold_y)
            for_result = all_result[idx_for]
            against_result = all_result[idx_against]
            f1 = sum([result['f1'] for result in all_result]) / 3
            print("F1", f1)
            print("P_arg+", for_result['precision'])
            print("R_arg+", for_result['recall'])
            print("P_arg-", against_result['precision'])
            print("R_arg-", against_result['recall'])
            return f1

        #print("Train")
        #print_eval(train_pred, train_y)
        f1 = print_eval(dev_pred, dev_y)
        f1_list.append(f1)
    print(average(f1_list))