def initialize_labels(self, Y):
    """Fit label encoders and per-class weights from gold structures Y.

    Sets prop/link encoders, their state counts, and class-weight vectors
    on the estimator.
    """
    node_labels = [lbl for y in Y for lbl in y.nodes]
    link_labels = [lbl for y in Y for lbl in y.links]

    self.prop_encoder_ = LabelEncoder().fit(node_labels)
    self.link_encoder_ = LabelEncoder().fit(link_labels)
    self.n_prop_states = len(self.prop_encoder_.classes_)
    self.n_link_states = len(self.link_encoder_.classes_)

    # Proposition classes are left unweighted.
    self.prop_cw_ = np.ones_like(self.prop_encoder_.classes_,
                                 dtype=np.double)
    # Link classes use self.class_weight (e.g. 'balanced'), rescaled so the
    # smallest weight is exactly 1.
    self.link_cw_ = compute_class_weight(self.class_weight,
                                         self.link_encoder_.classes_,
                                         link_labels)
    self.link_cw_ /= self.link_cw_.min()

    node_summary = ", ".join(
        "{}: {}".format(lbl, cw)
        for lbl, cw in zip(self.prop_encoder_.classes_, self.prop_cw_))
    link_summary = ", ".join(
        "{}: {}".format(lbl, cw)
        for lbl, cw in zip(self.link_encoder_.classes_, self.link_cw_))
    logging.info('Setting node class weights {}'.format(node_summary))
    logging.info('Setting link class weights {}'.format(link_summary))
def linear_cv_score(dataset, alpha, l1_ratio, constraints):
    """Cross-validate the linear baseline decoder; results cached on disk.

    Returns a list with one score tuple per fold.
    """
    cache_file = cache_fname("linear_cv_score",
                             (dataset, alpha, l1_ratio, constraints))
    if os.path.exists(cache_file):
        logging.info("Loading {}".format(cache_file))
        with open(cache_file, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    # ukp uses 5 folds; other datasets use 3.
    n_folds = 5 if dataset == 'ukp' else 3

    scores = []
    for fold, (_, val_ix) in enumerate(KFold(n_folds).split(ids)):
        # The same alpha serves as both link and prop regularization.
        Y_marg, baseline = saga_decision_function(dataset, fold, alpha,
                                                  alpha, l1_ratio)
        val_docs = list(load(ids[val_ix]))
        Y_true = [doc.label for doc in val_docs]
        Y_pred = baseline.fast_decode(Y_marg, val_docs, constraints)
        scores.append(baseline._score(Y_true, Y_pred))

    with open(cache_file, "wb") as f:
        logging.info("Saving {}".format(cache_file))
        dill.dump(scores, f)
    return scores
def _set_size_joint_feature(self): # assumes no second order compat_size = self.n_prop_states**2 * self.n_link_states if self.compat_features: compat_size *= self.n_compat_features_ total_n_second_order = (self.n_second_order_features_ * self.n_second_order_factors_) self.size_joint_feature = (self.n_prop_features * self.n_prop_states + self.n_link_features * self.n_link_states + compat_size + total_n_second_order) logging.info("Joint feature size: {}".format(self.size_joint_feature))
def store_optimized_embeddings(dataset, glove_path):
    """Restrict GloVe vectors to the dataset vocabulary and save as .npz."""
    from marseille.datasets import get_dataset_loader

    out_path = os.path.join('data', '{}-glove.npz'.format(dataset))

    # Collect the full training vocabulary.
    load, ids = get_dataset_loader(dataset, "train")
    vocab = set()
    for doc in load(ids):
        vocab.update(doc.tokens())

    glove_vocab, glove_embeds = optimize_glove(glove_path, vocab)
    np.savez(out_path, vocab=glove_vocab, embeds=glove_embeds)

    coverage = len(glove_vocab) / len(vocab)
    logging.info("GloVe coverage: {:.2f}%".format(100 * coverage))
def baseline_argrnn_cv_score(dataset, dynet_weight_decay, mlp_dropout,
                             rnn_dropout, prop_layers, constraints):
    """Cross-validate the baseline argument LSTM; results cached on disk.

    Returns (per-fold score curves, the evaluation iterations, all
    validation predictions).
    """
    fn = cache_fname("baseline_argrnn_cv_score",
                     (dataset, dynet_weight_decay, mlp_dropout, rnn_dropout,
                      prop_layers, constraints))
    if os.path.exists(fn):
        logging.info("Cached file already exists.")
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    embeds = load_embeds(dataset)

    # Iterations at which validation scores are recorded during fit.
    score_at_iter = [10, 25, 50, 75, 100]
    n_folds = 5 if dataset == 'ukp' else 3

    scores = []
    Y_pred = []
    for fold, (tr_ix, val_ix) in enumerate(KFold(n_folds).split(ids)):
        docs_train = list(load(ids[tr_ix]))
        docs_val = list(load(ids[val_ix]))
        Y_train = [doc.label for doc in docs_train]
        Y_val = [doc.label for doc in docs_val]

        rnn = BaselineArgumentLSTM(lstm_dropout=rnn_dropout,
                                   mlp_dropout=mlp_dropout,
                                   prop_mlp_layers=prop_layers,
                                   max_iter=100,
                                   score_at_iter=score_at_iter,
                                   n_mlp=128,
                                   n_lstm=128,
                                   lstm_layers=2,
                                   link_mlp_layers=1,
                                   embeds=embeds,
                                   link_bilinear=True,
                                   constraints=constraints)
        rnn.fit(docs_train, Y_train, docs_val, Y_val)
        Y_pred.extend(rnn.predict(docs_val))
        scores.append(rnn.scores_)

    with open(fn, "wb") as f:
        dill.dump((scores, score_at_iter, Y_pred), f)
    return scores, score_at_iter, Y_pred
def saga_cv_cache(*args):
    """Disk-memoized wrapper around saga_cv, keyed by an SHA1 of the args."""
    digest = sha1(repr(args).encode('utf-8')).hexdigest()
    cache_path = "res/baseline_linear_{}.dill".format(digest)
    try:
        with open(cache_path, 'rb') as fh:
            result = dill.load(fh)
        logging.info("Loaded cached version.")
    except FileNotFoundError:
        logging.info("Computing...")
        result = saga_cv(*args)
        with open(cache_path, 'wb') as fh:
            dill.dump(result, fh)
    return result
def vectorize(train_docs, test_docs, which, n_most_common=500):
    """Train a vectorizer on the training docs and transform the test docs.

    We use a function because scikit-learn vectorizers cannot change the
    number of samples, but we need to extract multiple rows from each doc.
    So we cannot use pipelines.

    Returns a pair of 5-tuples:
    ((prop_X_tr, prop_X_te, prop_y_tr, prop_y_te, prop_vect),
     (X_tr, X_te, y_tr, y_te, vect)).
    """
    logging.info("Vectorizing...")

    # One pass to compute training corpus statistics.
    train_docs = list(train_docs)
    test_docs = list(test_docs)
    stats = stats_train(train_docs)
    _, _, _, pmi_incoming, pmi_outgoing = stats

    # link vectors
    vect, X_tr = link_vectorizer(train_docs, stats, n_most_common,
                                 return_transf=True)
    # np.bool was removed in NumPy >= 1.24; it was an alias for the builtin
    # bool, which we use directly here.
    y_tr = np.array([f['label_'] for doc in train_docs for f in doc.features],
                    dtype=bool)

    test_feats = [f for doc in test_docs for f in doc.features]
    # Side-effecting loop: mutates each feature dict in place (was a
    # throwaway list comprehension).
    for f in test_feats:
        add_pmi_features(f, pmi_incoming, pmi_outgoing)
    y_te = np.array([f['label_'] for f in test_feats], dtype=bool)
    X_te = vect.transform(test_feats)

    # prop vectors
    prop_vect, prop_X_tr = prop_vectorizer(train_docs, which, stats,
                                           n_most_common_tok=None,
                                           n_most_common_dep=2000,
                                           return_transf=True)
    prop_y_tr = np.array([str(f['label_'])
                          for doc in train_docs for f in doc.prop_features])
    prop_y_te = np.array([str(f['label_'])
                          for doc in test_docs for f in doc.prop_features])

    prop_test_feats = [f for doc in test_docs for f in doc.prop_features]
    prop_X_te = prop_vect.transform(prop_test_feats)

    return ((prop_X_tr, prop_X_te, prop_y_tr, prop_y_te, prop_vect),
            (X_tr, X_te, y_tr, y_te, vect))
def svmstruct_cv_score(dataset, C, class_weight, constraints,
                       compat_features, second_order_features):
    """Cross-validate the SVM-struct model; results cached on disk.

    Returns (per-fold scores, all validation predictions).
    """
    fn = cache_fname("svmstruct_cv_score",
                     (dataset, C, class_weight, constraints, compat_features,
                      second_order_features))
    if os.path.exists(fn):
        logging.info("Cached file already exists.")
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 5 if dataset == 'ukp' else 3

    # Which second-order factor families apply is dataset-specific.
    grandparents = second_order_features and dataset == 'ukp'
    coparents = second_order_features
    siblings = second_order_features and dataset == 'cdcp'

    scores = []
    all_Y_pred = []
    for fold, (tr_ix, val_ix) in enumerate(KFold(n_folds).split(ids)):
        train_docs = list(load(ids[tr_ix]))
        val_docs = list(load(ids[val_ix]))
        clf, Y_val, Y_pred = fit_predict(train_docs, val_docs, dataset, C,
                                         class_weight, constraints,
                                         compat_features,
                                         second_order_features, grandparents,
                                         coparents, siblings)
        all_Y_pred.extend(Y_pred)
        scores.append(clf.model._score(Y_val, Y_pred))

    with open(fn, "wb") as f:
        dill.dump((scores, all_Y_pred), f)
    return scores, all_Y_pred
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
    """Fit the SAGA linear baseline on fold k and return its marginals.

    Returns (Y_marg, fitted BaselineStruct); results cached on disk.
    """
    fn = cache_fname("linear_val_df",
                     (dataset, k, link_alpha, prop_alpha, l1_ratio))
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    # Map dataset name to its on-disk directory name.
    ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'
    path = os.path.join("data", "process", ds, "folds", "{}", "{}")

    # Walk the KFold splits until reaching fold k to recover its val ids.
    n_folds = 5 if dataset == 'ukp' else 3
    load, ids = get_dataset_loader(dataset, "train")
    for fold_ix, (_, val_ix) in enumerate(KFold(n_folds).split(ids)):
        if fold_ix == k:
            break
    val_docs = list(load(ids[val_ix]))

    X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
                                    return_y=True)
    X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
                                    return_y=True)
    X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                    return_y=True)
    X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                    return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)
    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump((Y_marg, baseline), f)
    return Y_marg, baseline
def predict(self, docs, exact=None):
    """Run inference on each doc and return the list of predictions.

    Parameters
    ----------
    docs : sized iterable of documents.
    exact : bool or None — whether to use exact inference; defaults to
        self.exact_test when None.
    """
    if exact is None:
        exact = self.exact_test

    pred = []
    statuses = Counter()
    tic = time()
    for doc in docs:
        dy_potentials = self.build_cg(doc, training=False)
        potentials = self._get_potentials(doc, dy_potentials)
        y_pred, status = self._inference(doc, potentials, relaxed=False,
                                         exact=exact,
                                         constraints=self.constraints)
        pred.append(y_pred)
        statuses[status] += 1
    toc = time()

    n_docs = len(docs)
    # Guard: the original divided by len(docs) unconditionally, which
    # raises ZeroDivisionError on an empty batch.
    if n_docs:
        logging.info("Prediction time: {:.2f}s/doc".format(
            (toc - tic) / n_docs))
        logging.info("Test inference status: " + ", ".join(
            "{:.1f}% {}".format(100 * val / n_docs, key)
            for key, val in statuses.most_common()))
    return pred
def fit(self, docs, Y, docs_val=None, Y_val=None):
    """Train the model on (docs, Y) with optional periodic validation.

    Validation scores are appended to self.scores_ whenever the current
    iteration number appears in self.score_at_iter and validation data was
    given.
    """
    self.initialize(docs, Y)
    self.scores_ = []
    if self.score_at_iter:
        score_at_iter = self.score_at_iter
    else:
        score_at_iter = []
    train_time = 0
    for it in range(self.max_iter):
        # evaluate
        if docs_val and it in score_at_iter:
            # Use fast approximate inference for mid-training evaluation.
            Y_val_pred = self.predict(docs_val, exact=False)
            val_scores = self._score(Y_val, Y_val_pred)
            self.scores_.append(val_scores)
            # Suppress warnings while printing the score summary.
            with warnings.catch_warnings() as w:
                warnings.simplefilter('ignore')
                print("\t\t val link: {:.3f}/{:.3f} Node: {:.3f}/{:.3f} "
                      "accuracy {:.3f}".format(*val_scores))
        # Reshuffle the training order each epoch (fixed seed, so the
        # permutation sequence is deterministic across runs).
        docs, Y = shuffle(docs, Y, random_state=0)
        iter_loss = 0
        iter_max_loss = 0
        inference_status = Counter()
        tic = time()
        for doc, y in zip(docs, Y):
            # Skip documents with no propositions.
            if len(y.nodes) == 0:
                continue
            obj, loss, max_loss, status = self._doc_loss(doc, y)
            inference_status[status] += 1
            iter_loss += loss
            iter_max_loss += max_loss
            # No gradient step when this doc's loss is effectively zero.
            if loss < 1e-9:
                continue
            # Forward value must be computed before backward in dynet.
            obj.scalar_value()
            obj.backward()
            self._trainer.update()
        self._trainer.update_epoch()
        self._trainer.status()
        toc = time()
        train_time += toc - tic
        print("Iter {} loss {:.4f}".format(it, iter_loss / iter_max_loss))
        print(", ".join("{:.1f}% {}".format(100 * val / len(docs), key)
                        for key, val in inference_status.most_common()))
        # Converged: total loss over the epoch is effectively zero.
        if iter_loss < 1e-9:
            break
    # Final evaluation if max_iter itself is a scheduled score point.
    if docs_val and self.max_iter in score_at_iter:
        Y_val_pred = self.predict(docs_val, exact=False)
        val_scores = self._score(Y_val, Y_val_pred)
        self.scores_.append(val_scores)
    # NOTE(review): train_time / it raises ZeroDivisionError when training
    # stops after the first iteration (it == 0) — confirm max_iter > 1 in
    # callers.
    logging.info(
        "Training time: {:.2f}s/iteration ({:.2f}s/doc-iter)".format(
            train_time / it, train_time / (it * len(docs))))
def init_params(self):
    """Create the dynet model, trainer, and all learned parameters.

    Builds the embedding table (optionally initialized from GloVe), the
    BiLSTM encoder, the proposition and link classifiers, the
    compatibility factor weights, and optional second-order scorers.
    """
    self.model = dy.Model()
    self._trainer = dy.AdamTrainer(self.model)

    if self.embeds is not None:
        # Pretrained embeddings dictate the embedding dimensionality.
        sz = self.embeds[1].shape[1]
        self.n_embed = sz
        logging.info("Overriding n_embeds to glove size {}".format(sz))

    self._embed = self.model.add_lookup_parameters(
        (len(self.vocab), self.n_embed))

    if self.embeds is not None:
        # initialize embeddings with glove
        logging.info("Initializing embeddings...")
        embed_vocab, embed_data = self.embeds
        inv_embed = {w: k for k, w in enumerate(embed_vocab)}
        # Copy the pretrained vector for every vocab word GloVe covers;
        # uncovered words keep their random initialization.
        for k, w in enumerate(self.vocab):
            if w in inv_embed:
                self._embed.init_row(k, embed_data[inv_embed[w]])
        logging.info("...done")

    self._rnn = dy.BiRNNBuilder(self.lstm_layers, self.n_embed, self.n_lstm,
                                self.model, dy.LSTMBuilder)

    # proposition classifier MLP
    self._prop_mlp = MultiLayerPerceptron(
        [self.n_lstm] + [self.n_mlp] * self.prop_mlp_layers +
        [self.n_prop_states],
        activation=dy.rectify, model=self.model)

    # link classifier MLP (possibly bilinear)
    LinkEncoder = LinkBilinear if self.link_bilinear else LinkMLP
    self._link = LinkEncoder(self.n_lstm, self.n_mlp, self.n_link_states,
                             self.link_mlp_layers, self.model)

    # compatibility (trigram) factors, optionally with features
    n_compat = self.n_prop_states**2 * self.n_link_states
    if self.compat_features:
        n_compat *= self.n_compat_features
    self._compat = self.model.add_parameters(n_compat,
                                             init=dy.ConstInitializer(0))

    # optional second-order scorers
    SecondOrderEncoder = (SecondOrderMultilinear
                          if self.second_order_multilinear
                          else SecondOrderMLP)
    if self.coparent_layers:
        # scorer for a -> b <- c
        self._coparent = SecondOrderEncoder(self.n_lstm, self.n_mlp,
                                            self.coparent_layers, self.model)
    if self.grandparent_layers:
        # scorer for a -> b -> c
        self._grandparent = SecondOrderEncoder(self.n_lstm, self.n_mlp,
                                               self.grandparent_layers,
                                               self.model)
    if self.sibling_layers:
        # scorer for a <- b -> c
        self._sibling = SecondOrderEncoder(self.n_lstm, self.n_mlp,
                                           self.sibling_layers, self.model)
def main():
    """Score trivial link baselines (all-true, all-false, adjacency
    heuristics) over the precomputed cross-validation folds and print mean
    precision/recall/F1 per baseline."""
    from docopt import docopt

    usage = """
    Usage:
        baselines (cdcp|ukp) [--n-folds=N]

    Options:
        --n-folds=N  number of cross-val folds to generate. [default: 3]
    """

    args = docopt(usage)
    n_folds = int(args['--n-folds'])

    all_true = []
    all_false = []
    adjacent = []
    adjacent_ltr = []
    adjacent_rtl = []

    if args['cdcp']:
        path = os.path.join("data", "process", "erule", "folds", "{}", "{}")
    elif args['ukp']:
        path = os.path.join("data", "process", "ukp-essays", "folds", "{}",
                            "{}")

    for k in range(n_folds):
        fname = path.format(k, 'val.npz')
        logging.info("Loading sparse vectorized file {}".format(fname))
        X_te, y_te = load_csr(fname, return_y=True)

        # Column indices are looked up by feature name.
        with open(path.format(k, "fnames.txt")) as f:
            fnames = [line.strip() for line in f]
        props_between = fnames.index('nrm__props_between')
        src_precedes_trg = fnames.index('raw__src_precedes_trg')
        trg_precedes_src = fnames.index('raw__trg_precedes_src')

        y_all_true = np.ones_like(y_te)
        y_all_false = np.zeros_like(y_te)

        # A pair is "adjacent" when no propositions lie between src and trg.
        y_adj = ~(X_te[:, props_between] != 0).A.ravel()
        # np.bool was removed in NumPy >= 1.24; builtin bool is the exact
        # type it aliased.
        is_src_first = X_te[:, src_precedes_trg].astype(bool).A.ravel()
        is_trg_first = X_te[:, trg_precedes_src].astype(bool).A.ravel()
        y_adj_ltr = y_adj & is_src_first
        y_adj_rtl = y_adj & is_trg_first

        def _score(y):
            # Binary P/R/F against this fold's true links.
            p, r, f, _ = precision_recall_fscore_support(y_te, y, pos_label=1,
                                                         average='binary')
            return p, r, f

        all_true.append(_score(y_all_true))
        all_false.append(_score(y_all_false))
        adjacent.append(_score(y_adj))
        adjacent_ltr.append(_score(y_adj_ltr))
        adjacent_rtl.append(_score(y_adj_rtl))

    preds = (all_false, all_true, adjacent, adjacent_ltr, adjacent_rtl)
    # Average each baseline's (p, r, f) across folds.
    preds = [np.array(x).mean(axis=0) for x in preds]
    names = ["All false", "All true", "Adjacent", "Adj s -> t", "Adj t <- s"]
    for name, scores in zip(names, preds):
        print("{:18} {:.4f} {:.4f} {:.4f}".format(name, *scores))
# Script driver: parse CLI args, load train/test splits, and resolve the
# hyperparameters for the chosen method/model.  `usage`, `hyperparams`, and
# `exact_test` are defined elsewhere in this file (not visible here).
args = docopt(usage)
dataset = 'cdcp' if args['cdcp'] else 'ukp'
method = args['--method']
model = args['--model']
params = hyperparams[method][model][dataset]
load_tr, ids_tr = get_dataset_loader(dataset, split="train")
load_te, ids_te = get_dataset_loader(dataset, split="test")
train_docs = list(load_tr(ids_tr))
test_docs = list(load_te(ids_te))
logging.info("{} {} on {} ({})".format(method, model, dataset, params))
filename = os.path.join(
    'test_results',
    'exact={}_{}_{}_{}'.format(exact_test, dataset, method, model))
if not os.path.exists('test_results'):
    os.makedirs('test_results')
# logic for constraints and compat features
# note that compat_features and second_order aren't used
# if the model isn't structured, but it's more readable this way.
# NOTE(review): only the 'bare' branch is visible in this chunk; the other
# model branches presumably follow — confirm in the full file.
if model == 'bare':
    constraints = ''
    compat_features = False
    second_order = False
def saga_score_struct(which, link_alpha, prop_alpha, l1_ratio, decode=False):
    """Cross-validate the SAGA structured baseline on a dataset.

    Parameters
    ----------
    which : 'cdcp' or 'ukp' — selects dataset paths, fold count, and doc
        loader.  Any other value raises ValueError.
    link_alpha, prop_alpha, l1_ratio : regularization settings forwarded to
        BaselineStruct.
    decode : when True, run constrained structured inference on the
        marginals; otherwise round the marginals independently.

    Returns (per-fold scores, all validation predictions).
    """
    if which == 'cdcp':
        n_folds = 3
        ids = np.array(cdcp_train_ids)
        path = os.path.join("data", "process", "erule", "folds", "{}", "{}")
        _tpl = os.path.join("data", "process", "erule", "{}", "{:05d}")
        _load = lambda which, ks: (CdcpArgumentationDoc(_tpl.format(which, k))
                                   for k in ks)
    elif which == 'ukp':
        n_folds = 5
        ids = np.array(ukp_train_ids)
        path = os.path.join("data", "process", "ukp-essays", "folds", "{}",
                            "{}")
        _tpl = os.path.join("data", "process", "ukp-essays", "essay{:03d}")
        # NOTE: unlike the cdcp loader, this lambda ignores its `which`
        # argument — the ukp template has no split component.
        _load = lambda which, ks: (UkpEssayArgumentationDoc(_tpl.format(k))
                                   for k in ks)
    else:
        raise ValueError
    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    all_Y_pred = []
    scores = []
    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        val_docs = list(_load("train", ids[val]))
        # Assemble gold labels from each doc's feature dicts.
        Y_true = []
        for doc in val_docs:
            y_prop = np.array([str(f['label_'])
                               for f in doc.prop_features])
            y_link = np.array([f['label_'] for f in doc.features])
            Y_true.append(DocLabel(y_prop, y_link))
        X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
                                        return_y=True)
        X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
                                        return_y=True)
        X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                        return_y=True)
        X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                        return_y=True)
        baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)
        Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)
        # The linear baseline has no learned compatibility factor; pass an
        # all-zero one to the structured decoder.
        zero_compat = np.zeros((baseline.n_prop_states,
                                baseline.n_prop_states,
                                baseline.n_link_states))
        if decode:
            statuses = Counter()
            Y_pred = []
            for doc, y in zip(val_docs, Y_marg):
                # Attach the link structure the decoder expects on the doc.
                doc.link_to_node_ = np.array(
                    [(f['src__prop_id_'], f['trg__prop_id_'])
                     for f in doc.features],
                    dtype=np.intp)
                doc.second_order_ = []
                potentials = (y.nodes, y.links, zero_compat, [], [], [])
                y_decoded, status = baseline._inference(doc, potentials,
                                                        relaxed=False,
                                                        constraints=which)
                Y_pred.append(y_decoded)
                statuses[status] += 1
            logging.info("Test inference status: " + ", ".join(
                "{:.1f}% {}".format(100 * val / len(val_docs), key)
                for key, val in statuses.most_common()))
        else:
            # No decoding: independently round each marginal.
            Y_pred = [
                baseline._round(y.nodes, y.links, inverse_transform=True)
                for y in Y_marg
            ]
        all_Y_pred.extend(Y_pred)
        scores.append(baseline._score(Y_true, Y_pred))
    return scores, all_Y_pred
docs = list(load(ids)) Y_true = [doc.label for doc in docs] prop_labels = (['MajorClaim', 'Claim', 'Premise'] if dataset == 'ukp' else ['value', 'policy', 'testimony', 'fact', 'reference']) predictions = dict() model_names = [] doc_scores = [] for method in ("linear", "linear-struct", "rnn", "rnn-struct"): for model in ("bare", "full", "strict"): fn = tpl.format(dataset, method, model) if not os.path.isfile(fn): logging.info("Could not find {}".format(fn)) continue with open(fn, "rb") as f: predictions[method, model] = Y_pred = dill.load(f) model_names.append((method, model)) with warnings.catch_warnings() as w: warnings.simplefilter('ignore') doc_scores.append(scores_per_doc(Y_true, Y_pred, prop_labels)) doc_scores = np.array(doc_scores).T # n_samples x n_models margin_win, margin_lose = margins(doc_scores) for k, name in enumerate(model_names): fn = os.path.join("res", "error_analysis", "{}_{}_{}_{}.html".format(
constraint_vals = ('', dataset, dataset + '+strict') for C in Cs: for constraints in constraint_vals: for extras in (False, True): compat_features = extras second_order_features = extras try: results = load_results( "svmstruct_cv_score", (dataset, C, class_weight, constraints, compat_features, second_order_features)) except Exception as e: logging.info("not loaded: dataset={} C={} cw={} " "constraints={} compat_features={} " "second_order_features={} {}".format( dataset, C, class_weight, constraints, compat_features, second_order_features, e)) continue scores, _ = results scores = np.mean(scores, axis=0) link_macro, link_micro, node_macro, node_micro, acc = scores pystruct_df.append( dict(C=C, constraints=constraints, second_order=second_order_features, compat_features=compat_features, link_macro=link_macro, link_micro=link_micro, node_macro=node_macro,
baseline_df = [] full_df = [] for mlp_dropout in (0.05, 0.10, 0.15, 0.20, 0.25): for constraints in ("", dataset, dataset + "+strict"): # get the baseline results try: results = load_results( "baseline_argrnn_cv_score", (dataset, dynet_weight_decay, mlp_dropout, rnn_dropout, prop_layers, constraints)) except Exception as e: logging.info("not loaded: dataset={} dynet_weight_decay={} " "mlp_dropout={} rnn_dropout={} prop_layers={} " "constraints={} error={}".format( dataset, dynet_weight_decay, mlp_dropout, rnn_dropout, prop_layers, constraints, e)) continue scores, score_at_iter, _ = results avg_scores = combine_scores(scores, score_at_iter) for iter, score in zip(score_at_iter, avg_scores): link_macro, link_micro, node_macro, node_micro, acc = score baseline_df.append( dict(mlp_dropout=mlp_dropout, constraints=constraints, iter=iter, link_macro=link_macro, link_micro=link_micro,