Пример #1
0
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
    """Fit a BaselineStruct on fold ``k`` and return validation margins.

    The ``(Y_marg, baseline)`` pair is cached on disk under a key built
    from all arguments; on a cache hit the pickled result is returned
    without refitting.
    """
    cache_key = (dataset, k, link_alpha, prop_alpha, l1_ratio)
    fn = cache_fname("linear_val_df", cache_key)

    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    # Map the dataset name onto its on-disk directory name.
    ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'  # sorry
    path = os.path.join("data", "process", ds, "folds", "{}", "{}")

    # Recover fold k's validation indices from the same KFold split
    # that produced the fold files.
    n_folds = 5 if dataset == 'ukp' else 3
    load, ids = get_dataset_loader(dataset, "train")
    val = None
    for fold_ix, (_, fold_val) in enumerate(KFold(n_folds).split(ids)):
        val = fold_val
        if fold_ix == k:
            break
    val_docs = list(load(ids[val]))

    X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'), return_y=True)
    X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'), return_y=True)

    X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                    return_y=True)
    X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                    return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)

    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump((Y_marg, baseline), f)

    return Y_marg, baseline
Пример #2
0
def main():
    """Score trivial link-prediction baselines on the validation folds.

    For each fold, evaluates always-true, always-false, and three
    adjacency-based predictors against the gold link labels, then prints
    mean precision/recall/F1 over the folds.
    """
    from docopt import docopt

    usage = """
    Usage:
        baselines (cdcp|ukp) [--n-folds=N]

    Options:
        --n-folds=N  number of cross-val folds to generate. [default: 3]
    """

    args = docopt(usage)
    n_folds = int(args['--n-folds'])

    # Per-fold (precision, recall, f1) tuples for each baseline predictor.
    all_true = []
    all_false = []
    adjacent = []
    adjacent_ltr = []
    adjacent_rtl = []

    if args['cdcp']:
        path = os.path.join("data", "process", "erule", "folds", "{}", "{}")
    elif args['ukp']:
        path = os.path.join("data", "process", "ukp-essays", "folds", "{}",
                            "{}")

    for k in range(n_folds):
        fname = path.format(k, 'val.npz')
        logging.info("Loading sparse vectorized file {}".format(fname))
        X_te, y_te = load_csr(fname, return_y=True)

        with open(path.format(k, "fnames.txt")) as f:
            fnames = [line.strip() for line in f]

        # Column indices of the features the adjacency baselines rely on.
        props_between = fnames.index('nrm__props_between')
        src_precedes_trg = fnames.index('raw__src_precedes_trg')
        trg_precedes_src = fnames.index('raw__trg_precedes_src')

        y_all_true = np.ones_like(y_te)
        y_all_false = np.zeros_like(y_te)

        # A pair is "adjacent" when no propositions lie between them.
        # BUGFIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin bool is the documented replacement.
        y_adj = ~(X_te[:, props_between] != 0).A.ravel()
        is_src_first = X_te[:, src_precedes_trg].astype(bool).A.ravel()
        is_trg_first = X_te[:, trg_precedes_src].astype(bool).A.ravel()

        y_adj_ltr = y_adj & is_src_first
        y_adj_rtl = y_adj & is_trg_first

        def _score(y):
            # Binary P/R/F of predictions ``y`` against this fold's gold links.
            p, r, f, _ = precision_recall_fscore_support(y_te,
                                                         y,
                                                         pos_label=1,
                                                         average='binary')
            return p, r, f

        all_true.append(_score(y_all_true))
        all_false.append(_score(y_all_false))
        adjacent.append(_score(y_adj))
        adjacent_ltr.append(_score(y_adj_ltr))
        adjacent_rtl.append(_score(y_adj_rtl))

    preds = (all_false, all_true, adjacent, adjacent_ltr, adjacent_rtl)
    preds = [np.array(x).mean(axis=0) for x in preds]
    names = ["All false", "All true", "Adjacent", "Adj s -> t", "Adj t <- s"]

    for name, scores in zip(names, preds):
        print("{:18} {:.4f} {:.4f} {:.4f}".format(name, *scores))
Пример #3
0
def saga_cv(which, alphas, l1_ratio):
    """Cross-validate SAGA elastic-net classifiers for links and propositions.

    Parameters
    ----------
    which : {'cdcp', 'ukp'}
        Dataset key; determines the number of folds and the data paths.
    alphas : sequence of float
        Total regularization strengths to sweep over.
    l1_ratio : float
        Elastic-net mix: fraction of each strength assigned to the l1
        (``beta``) penalty, the remainder to the l2 (``alpha``) penalty.

    Returns
    -------
    link_scores, prop_scores : ndarray, shape (n_folds, len(alphas))
        Binary F1 for link prediction and macro F1 for proposition
        classification on each fold's validation set.

    Raises
    ------
    ValueError
        If ``which`` is neither 'cdcp' nor 'ukp'.
    """
    if which == 'cdcp':
        n_folds = 3
        path = os.path.join("data", "process", "erule", "folds", "{}", "{}")
    elif which == 'ukp':
        n_folds = 5
        path = os.path.join("data", "process", "ukp-essays", "folds", "{}",
                            "{}")
    else:
        raise ValueError

    clf_link = SAGAClassifier(loss='smooth_hinge',
                              penalty='l1',
                              tol=1e-4,
                              max_iter=100,
                              random_state=0,
                              verbose=0)
    clf_prop = clone(clf_link)

    link_scores = np.zeros((n_folds, len(alphas)))
    prop_scores = np.zeros_like(link_scores)

    for k in range(n_folds):
        X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
                                        return_y=True)
        X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
                                        return_y=True)

        X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                        return_y=True)
        X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                        return_y=True)

        # Proposition labels are strings; encode them as integer classes.
        le = LabelEncoder()
        y_tr_prop_enc = le.fit_transform(y_tr_prop)
        y_te_prop_enc = le.transform(y_te_prop)

        # Reweight link samples to compensate for class imbalance.
        link_sw = compute_sample_weight('balanced', y_tr_link)

        for j, alpha in enumerate(alphas):

            # Split the total strength into l1 (beta) and l2 (alpha_l2)
            # parts instead of mutating the loop variable in place.
            beta = alpha * l1_ratio
            alpha_l2 = alpha * (1 - l1_ratio)
            clf_link.set_params(alpha=alpha_l2, beta=beta)
            clf_prop.set_params(alpha=alpha_l2, beta=beta)

            clf_link.fit(X_tr_link, y_tr_link, sample_weight=link_sw)
            y_pred_link = clf_link.predict(X_te_link)

            clf_prop.fit(X_tr_prop, y_tr_prop_enc)
            y_pred_prop = clf_prop.predict(X_te_prop)

            # FIX: catch_warnings() without record=True returns None, so the
            # original `as w` bound None and was misleading; we only need
            # warning suppression around the metric calls here.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                link_f = f1_score(y_te_link, y_pred_link, average='binary')
                prop_f = f1_score(y_te_prop_enc, y_pred_prop, average='macro')

            link_scores[k, j] = link_f
            prop_scores[k, j] = prop_f

    return link_scores, prop_scores
Пример #4
0
    elif model == 'strict':
        constraints = '{}+strict'.format(dataset)
        compat_features = True
        second_order = True
    else:
        raise ValueError('Invalid model: {}'.format(model))

    # logic for which second order features to use, if any
    grandparents = second_order and dataset == 'ukp'
    coparents = second_order
    siblings = second_order and dataset == 'cdcp'

    if method == 'linear':
        ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'
        path = os.path.join("data", "process", ds, "folds", "traintest", "{}")
        X_tr_link, y_tr_link = load_csr(path.format('train.npz'),
                                        return_y=True)
        X_te_link, y_te_link = load_csr(path.format('test.npz'), return_y=True)

        X_tr_prop, y_tr_prop = load_csr(path.format('prop-train.npz'),
                                        return_y=True)
        X_te_prop, y_te_prop = load_csr(path.format('prop-test.npz'),
                                        return_y=True)

        baseline = BaselineStruct(alpha_link=params['alpha'],
                                  alpha_prop=params['alpha'],
                                  l1_ratio=0,
                                  exact_test=exact_test)
        baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)
        Y_pred = baseline.predict(X_te_link, X_te_prop, test_docs, constraints)

        with open('{}.model.pickle'.format(filename), "wb") as fp:
Пример #5
0
def saga_score_struct(which, link_alpha, prop_alpha, l1_ratio, decode=False):
    """Cross-validate a BaselineStruct model with structured scoring.

    Parameters
    ----------
    which : str
        Dataset key, 'cdcp' or 'ukp'. Also passed as the ``constraints``
        argument to inference when ``decode`` is True.
    link_alpha, prop_alpha : float
        Regularization strengths for the link and proposition models.
    l1_ratio : float
        Elastic-net mixing parameter forwarded to BaselineStruct.
    decode : bool, optional
        If True, run constrained inference per document; otherwise round
        the per-document marginals independently.

    Returns
    -------
    scores : list
        One entry per fold, from ``baseline._score``.
    all_Y_pred : list
        Accumulated predictions; only extended when ``decode`` is False.

    Raises
    ------
    ValueError
        If ``which`` is neither 'cdcp' nor 'ukp'.
    """
    if which == 'cdcp':
        n_folds = 3
        ids = np.array(cdcp_train_ids)
        path = os.path.join("data", "process", "erule", "folds", "{}", "{}")
        _tpl = os.path.join("data", "process", "erule", "{}", "{:05d}")
        # NOTE(review): the lambda's first argument shadows the outer
        # ``which`` parameter; here it is the split name (e.g. "train").
        _load = lambda which, ks: (CdcpArgumentationDoc(_tpl.format(which, k))
                                   for k in ks)
    elif which == 'ukp':
        n_folds = 5
        ids = np.array(ukp_train_ids)
        path = os.path.join("data", "process", "ukp-essays", "folds", "{}",
                            "{}")
        _tpl = os.path.join("data", "process", "ukp-essays", "essay{:03d}")
        # The first argument is accepted only for signature parity with the
        # cdcp loader above; UKP paths are built from the id alone.
        _load = lambda which, ks: (UkpEssayArgumentationDoc(_tpl.format(k))
                                   for k in ks)
    else:
        raise ValueError

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)

    all_Y_pred = []
    scores = []

    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        val_docs = list(_load("train", ids[val]))
        # Assemble gold labels (prop labels as strings, link labels as-is)
        # for every validation document.
        Y_true = []
        for doc in val_docs:
            y_prop = np.array([str(f['label_']) for f in doc.prop_features])
            y_link = np.array([f['label_'] for f in doc.features])
            Y_true.append(DocLabel(y_prop, y_link))

        X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
                                        return_y=True)
        X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
                                        return_y=True)

        X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                        return_y=True)
        X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                        return_y=True)

        baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)
        Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

        # All-zero compatibility potentials: the baseline has no second-order
        # interactions, so inference sees a neutral compat table.
        zero_compat = np.zeros((baseline.n_prop_states, baseline.n_prop_states,
                                baseline.n_link_states))
        if decode:
            statuses = Counter()
            Y_pred = []
            for doc, y in zip(val_docs, Y_marg):
                # Attach the (src, trg) prop-id pairs inference expects.
                doc.link_to_node_ = np.array(
                    [(f['src__prop_id_'], f['trg__prop_id_'])
                     for f in doc.features],
                    dtype=np.intp)
                doc.second_order_ = []
                potentials = (y.nodes, y.links, zero_compat, [], [], [])
                y_decoded, status = baseline._inference(doc,
                                                        potentials,
                                                        relaxed=False,
                                                        constraints=which)
                Y_pred.append(y_decoded)
                statuses[status] += 1

            # Summarize how often each inference status occurred.
            logging.info("Test inference status: " + ", ".join(
                "{:.1f}% {}".format(100 * val / len(val_docs), key)
                for key, val in statuses.most_common()))
        else:
            # No decoding: round each document's marginals independently.
            Y_pred = [
                baseline._round(y.nodes, y.links, inverse_transform=True)
                for y in Y_marg
            ]
            all_Y_pred.extend(Y_pred)

        scores.append(baseline._score(Y_true, Y_pred))

    return scores, all_Y_pred