def linear_cv_score(dataset, alpha, l1_ratio, constraints):
    """Cross-validated decoding score for the linear baseline.

    Results are memoized on disk under a key built from all arguments;
    a cache hit short-circuits the computation entirely.
    """
    cache_path = cache_fname("linear_cv_score", (dataset, alpha, l1_ratio,
                                                 constraints))
    if os.path.exists(cache_path):
        logging.info("Loading {}".format(cache_path))
        with open(cache_path, "rb") as fh:
            return dill.load(fh)

    load, ids = get_dataset_loader(dataset, split="train")
    # presumably 5 folds for UKP essays, 3 otherwise — mirrors the other
    # *_cv_score helpers in this file
    n_folds = 5 if dataset == 'ukp' else 3

    scores = []
    for fold, (_, val) in enumerate(KFold(n_folds).split(ids)):
        # the same alpha is used for both the link and the prop model
        Y_marg, baseline = saga_decision_function(dataset, fold,
                                                  alpha, alpha, l1_ratio)
        val_docs = list(load(ids[val]))
        Y_true = [doc.label for doc in val_docs]
        Y_pred = baseline.fast_decode(Y_marg, val_docs, constraints)
        scores.append(baseline._score(Y_true, Y_pred))

    with open(cache_path, "wb") as fh:
        logging.info("Saving {}".format(cache_path))
        dill.dump(scores, fh)
    return scores
def baseline_argrnn_cv_score(dataset, dynet_weight_decay, mlp_dropout,
                             rnn_dropout, prop_layers, constraints):
    """K-fold cross-validation scores for the baseline argument LSTM.

    Returns (scores, score_at_iter, Y_pred) and caches the triple on disk.
    NOTE(review): dynet_weight_decay only participates in the cache key
    here — presumably dynet picks it up globally; confirm against caller.
    """
    cache_path = cache_fname("baseline_argrnn_cv_score",
                             (dataset, dynet_weight_decay, mlp_dropout,
                              rnn_dropout, prop_layers, constraints))
    if os.path.exists(cache_path):
        logging.info("Cached file already exists.")
        with open(cache_path, "rb") as fh:
            return dill.load(fh)

    load, ids = get_dataset_loader(dataset, split="train")
    embeds = load_embeds(dataset)

    # iterations at which validation scores are recorded during fit
    score_at_iter = [10, 25, 50, 75, 100]
    n_folds = 5 if dataset == 'ukp' else 3

    scores = []
    Y_pred = []
    for _, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        docs_train = list(load(ids[tr]))
        docs_val = list(load(ids[val]))
        Y_train = [doc.label for doc in docs_train]
        Y_val = [doc.label for doc in docs_val]

        rnn = BaselineArgumentLSTM(lstm_dropout=rnn_dropout,
                                   mlp_dropout=mlp_dropout,
                                   prop_mlp_layers=prop_layers,
                                   max_iter=100,
                                   score_at_iter=score_at_iter,
                                   n_mlp=128,
                                   n_lstm=128,
                                   lstm_layers=2,
                                   link_mlp_layers=1,
                                   embeds=embeds,
                                   link_bilinear=True,
                                   constraints=constraints)
        rnn.fit(docs_train, Y_train, docs_val, Y_val)

        Y_pred.extend(rnn.predict(docs_val))
        scores.append(rnn.scores_)

    with open(cache_path, "wb") as fh:
        dill.dump((scores, score_at_iter, Y_pred), fh)
    return scores, score_at_iter, Y_pred
def store_optimized_embeddings(dataset, glove_path):
    """Restrict GloVe vectors to the dataset vocabulary and save them.

    Writes ``data/<dataset>-glove.npz`` with arrays ``vocab`` and ``embeds``
    and logs what fraction of the dataset vocabulary was covered.
    """
    from marseille.datasets import get_dataset_loader

    out_path = os.path.join('data', '{}-glove.npz'.format(dataset))

    # collect the full training vocabulary
    load, ids = get_dataset_loader(dataset, "train")
    vocab = set()
    for doc in load(ids):
        vocab.update(doc.tokens())

    glove_vocab, glove_embeds = optimize_glove(glove_path, vocab)
    coverage = len(glove_vocab) / len(vocab)

    np.savez(out_path, vocab=glove_vocab, embeds=glove_embeds)
    logging.info("GloVe coverage: {:.2f}%".format(100 * coverage))
def test_merge_spans():
    """Print proposition-label frequencies and how many labels are None.

    Diagnostic helper for the CDCP training split; prints rather than
    asserting.
    """
    from collections import Counter
    from marseille.datasets import get_dataset_loader

    load, ids = get_dataset_loader("cdcp", "train")

    label_counts = Counter()
    n_nones = 0
    for doc in load(ids):
        label_counts.update(doc.prop_labels)
        # drops 14 links in training and 8 in test split
        n_nones += sum(1 for lbl in doc.prop_labels if lbl is None)

    print(label_counts.most_common())
    print(n_nones)
def svmstruct_cv_score(dataset, C, class_weight, constraints,
                       compat_features, second_order_features):
    """K-fold cross-validation scores for the SVM-struct model.

    Returns (scores, all_Y_pred) and memoizes the pair on disk.
    """
    cache_path = cache_fname("svmstruct_cv_score",
                             (dataset, C, class_weight, constraints,
                              compat_features, second_order_features))
    if os.path.exists(cache_path):
        logging.info("Cached file already exists.")
        with open(cache_path, "rb") as fh:
            return dill.load(fh)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 5 if dataset == 'ukp' else 3

    # which second-order factors are enabled depends on the corpus:
    # grandparents only for ukp, siblings only for cdcp, coparents always
    # (when second-order features are requested at all)
    grandparents = second_order_features and dataset == 'ukp'
    coparents = second_order_features
    siblings = second_order_features and dataset == 'cdcp'

    scores = []
    all_Y_pred = []
    for _, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        train_docs = list(load(ids[tr]))
        val_docs = list(load(ids[val]))

        clf, Y_val, Y_pred = fit_predict(train_docs, val_docs, dataset, C,
                                         class_weight, constraints,
                                         compat_features,
                                         second_order_features, grandparents,
                                         coparents, siblings)
        all_Y_pred.extend(Y_pred)
        scores.append(clf.model._score(Y_val, Y_pred))

    with open(cache_path, "wb") as fh:
        dill.dump((scores, all_Y_pred), fh)
    return scores, all_Y_pred
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
    """Fit the linear baseline on fold ``k`` and return ``(Y_marg, baseline)``.

    Loads precomputed feature matrices for the fold from
    ``data/process/<ds>/folds/<k>/...``, fits a :class:`BaselineStruct`, and
    returns its decision function over the fold's validation documents.
    Results are memoized on disk.

    Raises
    ------
    ValueError
        If ``k`` is not a valid fold index for this dataset.
    """
    fn = cache_fname("linear_val_df", (dataset, k, link_alpha, prop_alpha,
                                       l1_ratio))
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'  # sorry
    path = os.path.join("data", "process", ds, "folds", "{}", "{}")

    # sorry again: get val docs
    n_folds = 5 if dataset == 'ukp' else 3
    load, ids = get_dataset_loader(dataset, "train")
    for k_, (_, val) in enumerate(KFold(n_folds).split(ids)):
        if k_ == k:
            break
    else:
        # Bug fix: previously an out-of-range k fell through the loop and
        # silently used the LAST fold's validation split.
        raise ValueError("Invalid fold index k={} for {} folds".format(
            k, n_folds))
    val_docs = list(load(ids[val]))

    X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
                                    return_y=True)
    X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
                                    return_y=True)
    X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                    return_y=True)
    X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                    return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)

    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump((Y_marg, baseline), f)
    return Y_marg, baseline
'Cannot create svg representation by running dot from string: {}' ''.format(dot_string)) return out def render_prediction(doc, Y): labels = ['({}) {:.2}'.format(i, lbl) for i, lbl in enumerate(Y.nodes, 1)] links = doc.link_to_prop[Y.links] return _svg(labels, links) if __name__ == '__main__': dataset = sys.argv[1] load, ids = get_dataset_loader(dataset, split="test") docs = list(load(ids)) Y_true = [doc.label for doc in docs] prop_labels = (['MajorClaim', 'Claim', 'Premise'] if dataset == 'ukp' else ['value', 'policy', 'testimony', 'fact', 'reference']) predictions = dict() model_names = [] doc_scores = [] for method in ("linear", "linear-struct", "rnn", "rnn-struct"): for model in ("bare", "full", "strict"): fn = tpl.format(dataset, method, model) if not os.path.isfile(fn):
import numpy as np from marseille.datasets import get_dataset_loader, load_embeds from marseille.custom_logging import logging from marseille.argrnn import BaselineArgumentLSTM, ArgumentLSTM from marseille.io import load_csr from .exp_svmstruct import fit_predict as fit_pred_pystruct from .exp_linear import BaselineStruct if __name__ == '__main__': exact_test = True dataset = 'cdcp' load_tr, ids_tr = get_dataset_loader(dataset, split="train") train_docs = list(load_tr(ids_tr))[:20] filename = "pickle_test" constraints = '' compat_features = False second_order = False grandparents = coparents = siblings = False Y_train = [doc.label for doc in train_docs] pkl = False if pkl:
# "exact_predictions", "exact=True_{}_{}_{}.predictions.dill") if __name__ == '__main__': dataset = sys.argv[1] if dataset not in ('cdcp', 'ukp'): raise ValueError("Unknown dataset {}. " "Supported: ukp|cdcp.".format(dataset)) link_labels = [False, True] prop_labels = (['MajorClaim', 'Claim', 'Premise'] if dataset == 'ukp' else ['value', 'policy', 'testimony', 'fact', 'reference']) # get true test labels load_te, ids_te = get_dataset_loader(dataset, split='test') Y_true = [doc.label for doc in load_te(ids_te)] print("dataset={}".format(dataset)) scores = dict() for method in ("linear", "linear-struct", "rnn", "rnn-struct"): scores[method] = dict() for model in ("bare", "full", "strict"): scores_ = scores[method][model] = dict() fn = tpl.format(dataset, method, model) if not os.path.isfile(fn): logging.info("Could not find {}".format(fn)) continue
exp_train_test (cdcp|ukp) --method=M --model=N [--dynet-seed N --dynet-mem N] Options: --method: one of (linear, linear-struct, rnn, rnn-struct) --model: one of (bare, full, strict) """ args = docopt(usage) dataset = 'cdcp' if args['cdcp'] else 'ukp' method = args['--method'] model = args['--model'] params = hyperparams[method][model][dataset] load_tr, ids_tr = get_dataset_loader(dataset, split="train") load_te, ids_te = get_dataset_loader(dataset, split="test") train_docs = list(load_tr(ids_tr)) test_docs = list(load_te(ids_te)) logging.info("{} {} on {} ({})".format(method, model, dataset, params)) filename = os.path.join( 'test_results', 'exact={}_{}_{}_{}'.format(exact_test, dataset, method, model)) if not os.path.exists('test_results'): os.makedirs('test_results') # logic for constraints and compat features # note that compat_features and second_order aren't used