def main(output_path, norm, stop):
    dirname, fname = os.path.split(output_path)
    if dirname != "" and not os.path.exists(dirname):
        os.makedirs(dirname)
    output_path = os.path.join(
        dirname,
        "{}.norm-{}{}.spl.gz".format(fname, norm, ".stop" if stop else ""))

    print "Domain: {}".format(fname)
    print "Output Path: {}".format(output_path)

    events = [event for event in cuttsum.events.get_events()
              if event.type in dom2type[fname] and event.query_num < 26]

    ne = False
    #if ne is True:
    #    annotators = ["tokenize", "ssplit", "pos", "lemma", "ner"]
    if norm == "lemma":
        annotators = ["tokenize", "ssplit", "pos", "lemma"]
    else:
        annotators = ["tokenize", "ssplit"]

    with cnlp.Server(annotators=annotators, mem="6G", port=2001,
                     max_message_len=1000000) as client, \
            gzip.open(output_path, "w") as f:

        query_ids = set([event.query_id for event in events])
        updates = matches_df[matches_df["query id"].apply(
            lambda x: x in query_ids)]
        texts = updates.drop_duplicates(
            subset="update id")["update text"].apply(heal_text).tolist()

        central_per = None
        central_loc = None
        central_org = None

        print "processing update text"
        docs = [client.annotate(text) for text in texts]
        for doc in docs[:10]:
            print doc

        print "tokenizing"
        X_upd_txt = tokenize(docs, norm, stop, ne,
                             central_per=central_per,
                             central_loc=central_loc,
                             central_org=central_org)

        print "writing"
        for line in X_upd_txt:
            f.write(line + "\n")

def main2():
    events = cuttsum.events.get_events()
    df = cuttsum.judgements.get_merged_dataframe()
    stopwords = english_stopwords()

    with corenlp.Server(
            port=9876, mem="20G", threads=8, max_message_len=524288,
            annotators=["tokenize", "ssplit", "pos", "lemma"],  #, "ner"],
            corenlp_props={
                "pos.maxlen": 50, "ssplit.eolonly": "true"}) as pipeline:

        for event in events[8:9]:
            matches = df[df["query id"] == event.query_id]
            matching_update_ids = set(matches["update id"].tolist())
            all_nuggets = matches.groupby("nugget id")
            thrsh_nuggets = all_nuggets.filter(lambda x: len(x) <= 10)
            nugget_ids = list(set(thrsh_nuggets["nugget id"].tolist()))
            nugget_ids.sort()

            for nugget_id in nugget_ids:
                if event.query_id.startswith("TS13"):
                    updates = cuttsum.judgements.get_2013_updates()
                elif event.query_id.startswith("TS14"):
                    updates = cuttsum.judgements.get_2014_sampled_updates()
                updates = updates[updates["query id"] == event.query_id]
                non_matching_updates = updates[updates["update id"].apply(
                    lambda x: x not in matching_update_ids)]
                matching_updates = matches[matches["nugget id"] == nugget_id]
                nugget_text = matching_updates.iloc[0]["nugget text"]
                print nugget_text

                n_matching = len(matching_updates)
                n_nonmatching = len(non_matching_updates)
                n_instances = n_matching + n_nonmatching

                #matching_updates["update text"] = \
                #    matching_updates["update text"].apply(lambda x: x.lower())
                non_matching_updates = non_matching_updates.iloc[
                    np.random.permutation(len(non_matching_updates))]
                non_matching_updates = non_matching_updates.iloc[
                    np.arange(n_nonmatching)]
                #non_matching_updates["text"] = \
                #    non_matching_updates["text"].apply(lambda x: x.lower())

                y = np.zeros(n_instances, dtype="int32")
                y[:n_matching] = 1
                X_string = matching_updates["update text"].tolist()
                X_string += non_matching_updates.head(
                    n_nonmatching)["text"].tolist()
                assert len(X_string) == n_instances

                p = np.random.permutation(n_instances)
                y = y[p]
                X_string = [X_string[i] for i in p]

                print "pipeline start"
                docs = pipeline.annotate_mp(X_string, n_procs=8)
                nugget_doc = pipeline.annotate(nugget_text)
                print "pipeline done"

                lemmas = []
                for doc in docs:
                    lems = []
                    for sent in doc:
                        for tok in sent:
                            if unicode(tok).lower() not in stopwords \
                                    and len(unicode(tok)) < 50:
                                lems.append(tok.lem.lower())
                    #print lems
                    lemmas.append(set(lems))

                nugget_lems = []
                for sent in nugget_doc:
                    for tok in sent:
                        if unicode(tok).lower() not in stopwords \
                                and len(unicode(tok)) < 50:
                            nugget_lems.append(tok.lem.lower())
                nugget_lems = set(nugget_lems)

                n_lems = float(len(nugget_lems))
                if n_lems == 1:
                    print
                    continue

                for i in xrange(n_instances):
                    if len(lemmas[i]) > 50:
                        continue
                    cov = len(nugget_lems.intersection(lemmas[i])) / n_lems
                    if cov > .75:
                        if isinstance(nugget_text, str):
                            print nugget_text
                        else:
                            print nugget_text.encode("utf-8")
                        if isinstance(X_string[i], str):
                            print y[i], X_string[i]
                        else:
                            print y[i], X_string[i].encode("utf-8")
                        print

def main():
    events = cuttsum.events.get_events()
    df = cuttsum.judgements.get_merged_dataframe()
    stopwords = english_stopwords()

    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()

    import os
    from sklearn.externals import joblib
    wtmf_models = {}
    wtmf_models["accidents"] = joblib.load(
        os.getenv("TREC_DATA") + "/semsim/accidents.norm-stem.lam20.000.pkl")
    wtmf_models["social-unrest"] = joblib.load(
        os.getenv("TREC_DATA") + "/semsim/social-unrest.norm-stem.lam1.000.pkl")
    wtmf_models["terrorism"] = joblib.load(
        os.getenv("TREC_DATA") + "/semsim/terrorism.norm-stem.lam10.000.pkl")
    wtmf_models["natural-disasters"] = joblib.load(
        os.getenv("TREC_DATA") +
        "/semsim/natural-disasters.norm-stem.lam20.000.pkl")

    all_acc = []
    all_aug_acc = []

    with corenlp.Server(
            port=9876, mem="20G", threads=8, max_message_len=524288,
            annotators=["tokenize", "ssplit", "pos", "lemma"],  #, "ner"],
            corenlp_props={
                "pos.maxlen": 50, "ssplit.eolonly": "true"}) as pipeline:

        for event in events:
            if event.query_num == 7:
                continue
            if event.query_num > 25:
                continue
            if event.type in dom2type["natural-disasters"]:
                wtmf_vec = wtmf_models["natural-disasters"]
            if event.type in dom2type["accidents"]:
                wtmf_vec = wtmf_models["accidents"]
            if event.type in dom2type["social-unrest"]:
                wtmf_vec = wtmf_models["social-unrest"]
            if event.type in dom2type["terrorism"]:
                wtmf_vec = wtmf_models["terrorism"]

            matches = df[df["query id"] == event.query_id]
            matching_update_ids = set(matches["update id"].tolist())
            all_nuggets = matches.groupby("nugget id")
            thrsh_nuggets = all_nuggets.filter(lambda x: len(x) > 10)
            nugget_ids = list(set(thrsh_nuggets["nugget id"].tolist()))
            #nugget_ids.sort()

            for num_nug, nugget_id in enumerate(nugget_ids):
                if event.query_id.startswith("TS13"):
                    updates = cuttsum.judgements.get_2013_updates()
                elif event.query_id.startswith("TS14"):
                    updates = cuttsum.judgements.get_2014_sampled_updates()
                updates = updates[updates["query id"] == event.query_id]
                non_matching_updates = updates[updates["update id"].apply(
                    lambda x: x not in matching_update_ids)]
                matching_updates = matches[matches["nugget id"] == nugget_id]
                nugget_text = matching_updates.iloc[0]["nugget text"]

                n_matching = len(matching_updates)
                n_nonmatching = min(n_matching, len(non_matching_updates))
                n_instances = n_matching + n_nonmatching

                #matching_updates["update text"] = \
                #    matching_updates["update text"].apply(lambda x: x.lower())
                non_matching_updates = non_matching_updates.iloc[
                    np.random.permutation(len(non_matching_updates))]
                non_matching_updates = non_matching_updates.iloc[
                    np.arange(n_nonmatching)]
                #non_matching_updates["text"] = \
                #    non_matching_updates["text"].apply(lambda x: x.lower())

                y = np.zeros(n_instances, dtype="int32")
                y[:n_matching] = 1
                X_string = matching_updates["update text"].tolist()
                X_string += non_matching_updates.head(
                    n_nonmatching)["text"].tolist()
                assert len(X_string) == n_instances

                p = np.random.permutation(n_instances)
                y = y[p]
                X_string = [X_string[i] for i in p]

                print "pipeline start"
                docs = pipeline.annotate_mp(X_string, n_procs=8)
                nugget_doc = pipeline.annotate(nugget_text)
                print "pipeline done"

                lemmas = []
                all_stems = []
                for doc in docs:
                    lems = []
                    stems = []
                    for sent in doc:
                        for tok in sent:
                            if unicode(tok).lower() not in stopwords \
                                    and len(unicode(tok)) < 50:
                                lems.append(tok.lem.lower())
                                stems.append(
                                    stemmer.stem(unicode(tok).lower()))
                    #print lems
                    lemmas.append(lems)
                    all_stems.append(u" ".join(stems))

                nugget_lems = []
                nugget_stems = []
                for sent in nugget_doc:
                    for tok in sent:
                        if unicode(tok).lower() not in stopwords \
                                and len(unicode(tok)) < 50:
                            nugget_lems.append(tok.lem.lower())
                            nugget_stems.append(
                                stemmer.stem(unicode(tok).lower()))
                nugget_stems = [u" ".join(nugget_stems)]

                # map(
                #     lambda doc: [str(tok)
                #     for doc in docs
                #     for sent in doc
                #     for tok in sent

                X_string = [u" ".join(lem) for lem in lemmas]
                vec = TfidfVectorizer(input=u"content", stop_words="english",
                                      ngram_range=(1, 5))
                vec.fit([u" ".join(nugget_lems)] + X_string)
                X = vec.transform(X_string).todense()

                nugget_lems = set(nugget_lems)
                x_cov = [len(nugget_lems.intersection(set(lems)))
                         / float(len(nugget_lems))
                         for lems in lemmas]
                x_cov = np.array(x_cov)[:, np.newaxis]
                X = np.hstack([X, x_cov])
                #print X[:, -1]
                #X_nug = vec.transform([u" ".join(nugget_lems)]).todense()

                from sklearn.cross_validation import StratifiedKFold
                from sklearn.metrics import classification_report
                from sklearn.ensemble import GradientBoostingClassifier
                gbc = GradientBoostingClassifier(
                    n_estimators=500, learning_rate=.1, max_depth=8,
                    random_state=0, max_features="log2")
                #scores = cross_validation.cross_val_score(gbc, X, y, cv=10)
                #print scores.mean()

                K = cosine_similarity(wtmf_vec.transform(all_stems),
                                      wtmf_vec.transform(nugget_stems))
                X_aug = np.hstack([X, K, K * x_cov])

                scores = []
                aug_scores = []
                print event.fs_name(), nugget_text
                for train_index, test_index in StratifiedKFold(y, n_folds=10):
                    X_train = X[train_index]
                    y_train = y[train_index]
                    X_test = X[test_index]
                    y_test = y[test_index]
                    gbc.fit(X_train, y_train)
                    score = gbc.score(X_test, y_test)

                    X_aug_train = X_aug[train_index]
                    y_train = y[train_index]
                    X_aug_test = X_aug[test_index]
                    y_test = y[test_index]
                    gbc.fit(X_aug_train, y_train)
                    score_aug = gbc.score(X_aug_test, y_test)

                    print score, score_aug
                    scores.append(score)
                    aug_scores.append(score_aug)

                print "mean", np.mean(scores), np.mean(aug_scores)
                all_aug_acc.append(np.mean(aug_scores))
                all_acc.append(np.mean(scores))
                print classification_report(y_test, gbc.predict(X_aug_test))

                y_pred = gbc.predict(X_aug)
                for i, c in enumerate(y_pred):
                    if c == 0 and y[i] == 1:
                        print nugget_text  #.encode("utf-8")
                        print X_string[i]  #.encode("utf-8")
                        print
                print "False positives"
                for i, c in enumerate(y_pred):
                    if c == 1 and y[i] == 0:
                        print nugget_text  #.encode("utf-8")
                        print X_string[i]  #.encode("utf-8")

                # model_dir = self.get_model_dir(event, nugget_id)
                # if not os.path.exists(model_dir):
                #     os.makedirs(model_dir)
                # joblib.dump(vec, self.get_vectorizer_path(event, nugget_id),
                #             compress=9)

    ### Classifier shootout here. ###
    #prob_thresh = .5
    print "Macro avg acc", np.mean(all_acc), np.mean(all_aug_acc)

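# The x_cov feature used above is the fraction of the nugget's lemma
# vocabulary that a candidate update covers. Below is a minimal standalone
# sketch of that computation; the helper name and the toy data are
# illustrative only, not part of the original module.
def _coverage_feature_example():
    import numpy as np
    nugget_lems = set(["bridge", "collapse", "injury"])   # hypothetical nugget
    lemmas = [["bridge", "collapse", "rescue"],            # covers 2 of 3
              ["weather", "report"]]                       # covers 0 of 3
    x_cov = [len(nugget_lems.intersection(set(lems))) / float(len(nugget_lems))
             for lems in lemmas]
    # Returned as an (n_instances, 1) column so it can be hstack-ed onto
    # the tf-idf matrix, as in main() above: approximately [[0.667], [0.0]].
    return np.array(x_cov)[:, np.newaxis]
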
def main(input_path, output_path, norm, stop, ne, lam, port):
    dirname, domain = os.path.split(input_path)
    input_path = os.path.join(
        dirname,
        "{}.norm-{}{}{}.lam{:0.3f}.pkl".format(
            domain, norm, ".stop" if stop else "", ".ne" if ne else "", lam))

    print "Domain: {}".format(domain)
    print "Model Path: {}".format(input_path)

    events = [event for event in cuttsum.events.get_events()
              if event.type in dom2type[domain] and event.query_num < 26
              and event.query_num != 7]

    if ne is True:
        annotators = ["tokenize", "ssplit", "pos", "lemma", "ner"]
    elif norm == "lemma":
        annotators = ["tokenize", "ssplit", "pos", "lemma"]
    else:
        annotators = ["tokenize", "ssplit"]

    results = []
    vec = joblib.load(input_path)
    modelname = "{}.norm_{}.stop_{}.ne_{}.lam_{}".format(
        domain, norm, stop, ne, lam)

    with cnlp.Server(annotators=annotators, mem="6G", port=port,
                     max_message_len=1000000) as client:
        for event in events:
            print event
            event_nuggets = nuggets.loc[nuggets["query id"] == event.query_id]

            print "processing nugget text"
            nugget_docs = [client.annotate(text)
                           for text in event_nuggets["text"].tolist()]
            #for doc in nugget_docs:
            #    print doc
            #print

            if ne:
                central_per, central_loc, central_org = find_central_nes(
                    nugget_docs)
            else:
                central_per = None
                central_loc = None
                central_org = None

            X_nug_txt = tokenize(nugget_docs, norm, stop, ne,
                                 central_per=central_per,
                                 central_loc=central_loc,
                                 central_org=central_org)
            nuggets.loc[nuggets["query id"] == event.query_id, "X"] = X_nug_txt
            event_nuggets = nuggets[nuggets["query id"] == event.query_id]
            event_nuggets = event_nuggets[event_nuggets["X"].apply(
                lambda x: len(x.split(" ")) < 50 and len(x.split(" ")) > 0)]

            X_nug_txt = event_nuggets["X"].tolist()
            #for txt in X_nug_txt:
            #    print txt
            #print

            print "transforming nugget text"
            X_nug = vec.transform(X_nug_txt)
            assert X_nug.shape[0] == len(event_nuggets)

            print "getting updates"
            updates.loc[updates["query id"] == event.query_id, "text"] = \
                updates.loc[updates["query id"] == event.query_id,
                            "text"].apply(heal_text)
            event_updates = updates[(updates["query id"] == event.query_id)
                                    & (updates["text"].apply(len) < 1000)]

            print "processing update text"
            docs = [client.annotate(text)
                    for text in event_updates["text"].tolist()]

            X_upd_txt = tokenize(docs, norm, stop, ne,
                                 central_per=central_per,
                                 central_loc=central_loc,
                                 central_org=central_org)
            print "transforming update text"
            X_upd = vec.transform(X_upd_txt)

            for i, (index, nugget) in enumerate(event_nuggets.iterrows()):
                boolean = (matches_df["query id"] == event.query_id) & \
                    (matches_df["nugget id"] == nugget["nugget id"])
                match_ids = set(matches_df.loc[boolean, "update id"].tolist())
                if len(match_ids) == 0:
                    continue
                #print index, nugget["nugget id"], nugget["text"]
                #print X_nug[i]
                if (X_nug[i] == 0).all():
                    continue

                n_matches = 0
                K = cosine_similarity(X_nug[i], X_upd)
                for j in K.ravel().argsort()[::-1][:100]:
                    #print K[0,j],
                    #print event_updates.iloc[j]["text"]
                    if event_updates.iloc[j]["update id"] in match_ids:
                        n_matches += 1
                #print

                P100 = n_matches / 100.
                optP100 = min(1., len(match_ids) / 100.)
                nP100 = P100 / optP100

                results.append({
                    "model": modelname,
                    "nugget id": nugget["nugget id"],
                    "P@100": P100,
                    "opt P@100": optP100,
                    "normP@100": nP100,
                })

    df = pd.DataFrame(results)
    print df
    print df["normP@100"].mean()
    df["model"] = modelname
    return results

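# The normP@100 figure reported per nugget above reduces to the following
# computation. This is a hedged, self-contained sketch with hypothetical
# inputs rather than code from the original module; main() already skips
# nuggets with no matching updates before reaching this point.
def _norm_p_at_100_example(ranked_update_ids, match_ids):
    # Precision@100 of the similarity ranking, normalized by the best
    # precision achievable given how many true matches exist.
    top100 = ranked_update_ids[:100]
    n_matches = sum(1 for uid in top100 if uid in match_ids)
    p100 = n_matches / 100.
    opt_p100 = min(1., len(match_ids) / 100.)
    return p100 / opt_p100
    # e.g. 3 of 5 relevant updates ranked in the top 100: 0.03 / 0.05 = 0.6
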
import wtmf
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.externals import joblib
import corenlp as cnlp
import cuttsum.judgements
import gzip

matches_df = cuttsum.judgements.get_merged_dataframe()
nuggets_df = cuttsum.judgements.get_nuggets()

annotators = ["tokenize", "ssplit"]
with cnlp.Server(annotators=annotators, mem="6G",
                 max_message_len=1000000) as client:
    for event, event_nuggets in nuggets_df.groupby("query id"):
        nugget_docs = []
        for text in event_nuggets["text"].tolist():
            doc = client.annotate(text)
            print doc
            nugget_docs.append(doc)

# for nugget in nuggets_df.iterrows():
#     print nugget

exit()

with gzip.open("wp-lm-preproc/accidents.norm-lemma.stop.spl.gz", "r") as f:
    X = f.readlines()

def do_job_unit(self, event, corpus, unit, **kwargs):
    stopwords = english_stopwords()

    ### Preprocessing here. ###
    df = cuttsum.judgements.get_merged_dataframe()
    matches = df[df["query id"] == event.query_id]
    matching_update_ids = set(matches["update id"].tolist())
    #nuggets = matches.groupby("nugget id")
    #thrsh_nuggets = all_nuggets.filter(lambda x: len(x) > 10)

    nugget_ids = list(set(matches["nugget id"].tolist()))
    #nugget_ids = list(set(all_nuggets["nugget id"].tolist()))
    nugget_ids.sort()
    nugget_id = nugget_ids[unit]

    with corenlp.Server(
            port=9876 + event.query_num * 100 + unit,
            mem="20G", threads=4, max_message_len=524288,
            annotators=["tokenize", "ssplit", "pos", "lemma"],  #, "ner"],
            corenlp_props={
                "pos.maxlen": 50, "ssplit.eolonly": "true"}) as pipeline:

        if event.query_id.startswith("TS13"):
            updates = cuttsum.judgements.get_2013_updates()
        elif event.query_id.startswith("TS14"):
            updates = cuttsum.judgements.get_2014_sampled_updates()
        elif event.query_id.startswith("TS15"):
            updates = cuttsum.judgements.get_2015_sampled_updates()
        updates = updates[updates["query id"] == event.query_id]

        non_matching_updates = updates[updates["update id"].apply(
            lambda x: x not in matching_update_ids)]
        matching_updates = matches[matches["nugget id"] == nugget_id]
        # if len(matching_updates) == 0:
        #     return
        #matching_updates = df[df["nugget id"] == nugget_id]

        nugget_text = matching_updates.iloc[0]["nugget text"]
        n_matching = len(matching_updates)
        n_nonmatching = min(n_matching, len(non_matching_updates))
        n_instances = n_matching + n_nonmatching

        semsim = event2semsim(event)
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

        nugget_doc = pipeline.annotate(nugget_text)
        nugget_lems = []
        nugget_stems = []
        for sent in nugget_doc:
            for tok in sent:
                if unicode(tok).lower() not in stopwords \
                        and len(unicode(tok)) < 50:
                    nugget_lems.append(tok.lem.lower())
                    stem = stemmer.stem(unicode(tok).lower())
                    if len(stem) < 50:
                        nugget_stems.append(stem)
        nugget_stems = [u" ".join(nugget_stems)]

        if n_matching <= 10:
            model_dir = self.get_model_dir(event, nugget_id)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            joblib.dump([None, set(nugget_lems), nugget_stems],
                        self.get_vectorizer_path(event, nugget_id),
                        compress=9)
            joblib.dump([], self.get_model_path(event, nugget_id, "gbc"),
                        compress=9)
            return

        non_matching_updates = non_matching_updates.iloc[
            np.random.permutation(len(non_matching_updates))]
        non_matching_updates = non_matching_updates.iloc[
            np.arange(n_nonmatching)]
        #non_matching_updates["text"] = \
        #    non_matching_updates["text"].apply(lambda x: x.lower())

        y = np.zeros(n_instances, dtype="int32")
        y[:n_matching] = 1
        X_string = matching_updates["update text"].tolist()
        X_string += non_matching_updates.head(n_nonmatching)["text"].tolist()
        assert len(X_string) == n_instances

        p = np.random.permutation(n_instances)
        y = y[p]
        X_string = [X_string[i] for i in p]

        print "pipeline start"
        docs = pipeline.annotate_mp(X_string, n_procs=4)
        print "pipeline done"

        lemmas = []
        all_stems = []
        for doc in docs:
            lems = []
            stems = []
            for sent in doc:
                for tok in sent:
                    if unicode(tok).lower() not in stopwords \
                            and len(unicode(tok)) < 50:
                        lems.append(tok.lem.lower())
                        stem = stemmer.stem(unicode(tok).lower())
                        if len(stem) < 50:
                            stems.append(stem)
            #print lems
            lemmas.append(lems)
            all_stems.append(u" ".join(stems))

        # map(
        #     lambda doc: [str(tok)
        #     for doc in docs
        #     for sent in doc
        #     for tok in sent

        K = cosine_similarity(semsim.transform(all_stems),
                              semsim.transform(nugget_stems))

        X_string = [u" ".join(lem) for lem in lemmas]
        vec = TfidfVectorizer(input=u"content", stop_words="english",
                              ngram_range=(1, 5))
        vec.fit([u" ".join(nugget_lems)] + X_string)
        X = vec.transform(X_string).todense()

        nugget_lems = set(nugget_lems)
        x_cov = [len(nugget_lems.intersection(set(lems)))
                 / float(len(nugget_lems))
                 for lems in lemmas]
        x_cov = np.array(x_cov)[:, np.newaxis]
        X = np.hstack([X, x_cov, K, K * x_cov])

        gbc = GradientBoostingClassifier(
            n_estimators=500, learning_rate=.1, max_depth=8,
            random_state=0, max_features="log2")
        gbc.fit(X, y)
        print "SCORE", gbc.score(X, y)

        model_dir = self.get_model_dir(event, nugget_id)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        joblib.dump([vec, nugget_lems, nugget_stems],
                    self.get_vectorizer_path(event, nugget_id), compress=9)
        joblib.dump(gbc, self.get_model_path(event, nugget_id, "gbc"),
                    compress=9)

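# The training step in do_job_unit amounts to fitting a gradient-boosted
# classifier on the stacked feature matrix [tfidf | x_cov | K | K * x_cov]
# and persisting it with joblib. Below is a self-contained sketch of that
# fit/score/dump cycle on toy data; the array shapes, labels, and dump path
# are made up for illustration and are not part of the original job.
def _gbc_train_persist_example():
    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.externals import joblib

    rng = np.random.RandomState(0)
    X_toy = rng.rand(40, 6)                              # stand-in features
    y_toy = (X_toy[:, 0] + X_toy[:, 5] > 1).astype("int32")  # toy labels

    clf = GradientBoostingClassifier(
        n_estimators=500, learning_rate=.1, max_depth=8,
        random_state=0, max_features="log2")
    clf.fit(X_toy, y_toy)
    print "SCORE", clf.score(X_toy, y_toy)               # training accuracy

    joblib.dump(clf, "nugget-clf.gbc.pkl", compress=9)   # hypothetical path
    return joblib.load("nugget-clf.gbc.pkl")
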
def main(input_dir, output_path, norm, stop, ne, port):
    dirname, fname = os.path.split(output_path)
    if dirname != "" and not os.path.exists(dirname):
        os.makedirs(dirname)
    output_path = os.path.join(
        dirname,
        "{}.norm-{}{}{}.spl.gz".format(fname, norm,
                                       ".stop" if stop else "",
                                       ".ne" if ne else ""))
    print "Writing spl file to {} ...".format(output_path)

    if not os.path.exists(input_dir):
        raise Exception("{} does not exist!".format(input_dir))
    paths = [os.path.join(input_dir, fname)
             for fname in os.listdir(input_dir)]

    texts = []
    for path in paths:
        with open(path, "r") as f:
            text = f.read()
            if text.strip() == "":
                continue
            texts.append(text)
    print "Removed", len(paths) - len(texts), "empty files."

    if ne is True:
        annotators = ["tokenize", "ssplit", "pos", "lemma", "ner"]
    elif norm == "lemma":
        annotators = ["tokenize", "ssplit", "pos", "lemma"]
    else:
        annotators = ["tokenize", "ssplit"]

    if norm == "stem":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

    if stop:
        with open("stopwords.txt", "r") as f:
            sw = set([word.strip().decode("utf-8").lower() for word in f])

    with cnlp.Server(annotators=annotators, mem="6G", port=port,
                     max_message_len=1000000) as client, \
            gzip.open(output_path, "w") as f:

        for i, text in enumerate(texts):
            sys.stdout.write("{:3.1f}%\r".format(i * 100. / len(texts)))
            sys.stdout.flush()
            doc = client.annotate(text)

            if ne:
                per_counts = defaultdict(int)
                org_counts = defaultdict(int)
                loc_counts = defaultdict(int)
                for sent in doc:
                    for tok in sent:
                        if tok.ne == "PERSON":
                            per_counts[unicode(tok.lem).lower()] += 1
                        elif tok.ne == "LOCATION":
                            loc_counts[unicode(tok.lem).lower()] += 1
                        elif tok.ne == "ORGANIZATION":
                            org_counts[unicode(tok.lem).lower()] += 1

                # The most frequent lemma of each type is treated as the
                # "central" person/organization/location for this document.
                if len(per_counts) > 0:
                    central_per = max(per_counts.items(),
                                      key=lambda x: x[1])[0]
                else:
                    central_per = None
                if len(org_counts) > 0:
                    central_org = max(org_counts.items(),
                                      key=lambda x: x[1])[0]
                else:
                    central_org = None
                if len(loc_counts) > 0:
                    central_loc = max(loc_counts.items(),
                                      key=lambda x: x[1])[0]
                else:
                    central_loc = None

            for sent in doc:
                if ne:
                    toks = []
                    for tok in sent:
                        if tok.ne == "PERSON":
                            if unicode(tok.lem).lower() == central_per:
                                print tok, "__CPER__"
                                toks.append(u"__CPER__")
                            else:
                                toks.append(u"__PER__")
                        elif tok.ne == "LOCATION":
                            if unicode(tok.lem).lower() == central_loc:
                                print tok, "__CLOC__"
                                toks.append(u"__CLOC__")
                            else:
                                toks.append(u"__LOC__")
                        elif tok.ne == "ORGANIZATION":
                            if unicode(tok.lem).lower() == central_org:
                                print tok, "__CORG__"
                                toks.append(u"__CORG__")
                            else:
                                toks.append(u"__ORG__")
                        else:
                            if norm == "lemma":
                                form = unicode(tok.lem).lower()
                            elif norm == "stem":
                                form = stemmer.stem(unicode(tok).lower())
                            else:
                                form = unicode(tok).lower()
                            if stop:
                                if form not in sw and len(form) < 50:
                                    toks.append(form)
                            else:
                                if len(form) < 50:
                                    toks.append(form)
                else:
                    if norm == "lemma":
                        toks = [unicode(tok.lem).lower() for tok in sent]
                    elif norm == "stem":
                        toks = [stemmer.stem(unicode(tok).lower())
                                for tok in sent]
                    else:
                        toks = [unicode(tok).lower() for tok in sent]
                    if stop:
                        toks = [tok for tok in toks if tok not in sw]
                    toks = [tok for tok in toks if len(tok) < 50]

                if len(toks) == 0:
                    continue
                string = u" ".join(toks).encode("utf-8") + "\n"
                print string
                f.write(string)

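# The "central" entity of each type in the named-entity branch above is
# simply the most frequent lemma of that type in the document. A tiny
# standalone illustration of that selection (the token list is hypothetical):
def _central_entity_example():
    from collections import defaultdict
    per_counts = defaultdict(int)
    for lemma in ["obama", "putin", "obama", "obama"]:
        per_counts[lemma] += 1
    # The most frequent PERSON lemma becomes the central person ("obama").
    return max(per_counts.items(), key=lambda x: x[1])[0]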