Example #1
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])
    # (for quicker runs: [l[:3000] for l in load(tag_prefix_masks=[])])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]

    maskedtoks, maskedtags = np.vstack(
        (Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True),
                        np.full(yeval.shape, False))

    d = {}
    for params in clfs:
        if params.get('clf', None) == MostFrequentTag:
            clfname = 'baseline'
            clf, total_score = evaluate(params, maskedtoks, maskedtags,
                                        istrain)
            print('known word frac:', clf.knownwords(Xeval))
            clf.scope = 'known'  # restrict evaluation to words seen in training
            _, known_scores = evaluate(params,
                                       maskedtoks,
                                       maskedtags,
                                       istrain,
                                       oldclf=clf)
            clf.scope = 'unk'  # restrict evaluation to unknown words
            _, unk_scores = evaluate(params,
                                     maskedtoks,
                                     maskedtags,
                                     istrain,
                                     oldclf=clf)
            df = pd.DataFrame([unk_scores, known_scores, total_score]).T
        else:
            clfname = r'\textsc{c}ore\textsc{nlp}'
            with stopwatch():
                # 2 clfs, 3 scopes, 3 metrics
                _, score = evaluate(params,
                                    maskedtoks,
                                    maskedtags,
                                    istrain,
                                    raw=True)
            df = pd.DataFrame([
                score.loc[measure, :]
                for measure in ['accuracy', 'avg util', 'avg setsize']
            ])
        df.index = ['Accuracy', 'Utility', 'Set size']
        df.columns = ['Unknown', 'Known', 'Total']
        for col in df:
            d[(clfname, col)] = df[col]
    print()
    return d
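The dictionary returned above maps (classifier name, scope) pairs to a metric-indexed Series, so it can be assembled directly into a results table. A minimal, hypothetical consumer (not part of the original example; it assumes pandas is importable as pd):

import pandas as pd

results = calc_results()                    # {('baseline', 'Known'): Series, ...}
table = pd.DataFrame(results)               # rows: Accuracy / Utility / Set size
table = table.sort_index(axis=1)            # group the two-level columns by classifier
print(table.to_latex(float_format='%.3f'))  # e.g. for a LaTeX results table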
Example #2
def calc_results():
    doc = 'St2'
    parts = 50
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the training data for train/eval cross-validation
    x_t, y_t, g_t = [l[train] for l in [toks, tags, groups]]
    x_e, y_e, g_e = [l[test] for l in [toks, tags, groups]]
    x_t, y_t = [l[g_t == doc] for l in [x_t, y_t]]
    x_e, y_e = [l[g_e == doc] for l in [x_e, y_e]]

    d = {}
    for frac in (1 + np.array([-.5, 0, 1, 2, 5, 10, 19, 34, 49
                               ])) * x_t.shape[0] / parts:
        frac = int(frac)
        print(frac, x_e.shape[0])
        x_t_, y_t_ = [l[:frac] for l in [x_t, y_t]]
        maskedtoks, maskedtags = np.vstack((x_t_, x_e)), np.append(y_t_, y_e)
        for clf in clfs:
            istrain = np.append(np.full(y_t_.shape, True),
                                np.full(y_e.shape, False))
            clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag
                       else r'\textsc{c}ore\textsc{nlp}')
            # two clfs, 10 fractions, 1 doc, 1 metric -> line plots (both clfs,
            #           over fractions) with marks and a dashed line;
            #           table (clf rows, fraction columns)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(clfname, frac)] = scores
            print(scores)
        print()
    return d
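The (classifier, training-size) dictionary built here is the input for the learning-curve line plot mentioned in the comment above. A rough sketch of that downstream step (hypothetical code, assuming evaluate() returns a metric-indexed Series and matplotlib is available):

import matplotlib.pyplot as plt
import pandas as pd

results = calc_results()                   # {('baseline', 1234): scores, ...}
df = pd.DataFrame(results).T               # rows indexed by (classifier, training size)
for clfname in df.index.get_level_values(0).unique():
    curve = df.loc[clfname].sort_index()
    plt.plot(curve.index, curve.iloc[:, 0], marker='o', label=clfname)
plt.xlabel('training tokens')
plt.ylabel(str(df.columns[0]))
plt.legend()
plt.show()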
Example #3
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])
    # (for quicker runs: [l[:3000] for l in load(tag_prefix_masks=[])])
    ctr = Counter(groups)

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the training data for train/eval cross-validation
    toks_t, tags_t, groups_t = [l[train] for l in [toks, tags, groups]]

    # take the held-out data for evaluation
    toks_e, tags_e, groups_e = [l[test] for l in [toks, tags, groups]]

    d = {}
    for doc, _ in filter(lambda t: t[1] > 7000, ctr.items()):
        mask = groups != doc
        mask_e = groups_e == doc
        Xtrain, ytrain = toks[mask], tags[mask]
        Xeval, yeval = toks_e[mask_e], tags_e[mask_e]
        print(doc, np.sum(mask), np.sum(mask_e))
        maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
        for clf in clfs:
            istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))

            clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag
                       else 'TreeTagger' if clf.get('clf', None) == TreeTagger
                       else r'\textsc{c}ore\textsc{nlp}')
            docname = ''.join(r'\textsc{' + c.lower() + '}' if re.match('[A-Z]', c) else c
                              for c in doc)
            # two clfs, 9 docs, 3 metrics -> two grouped bar plots (over datasets),
            #           one for accuracy and one for utility;
            #           table (clf rows, dataset columns, a subtable per metric in each cell)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(docname, clfname)] = scores
            print(scores)
    print()
    return d
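The per-document dictionary from this leave-one-document-out loop feeds the grouped bar plots and the table described in the comment. A hypothetical illustration of that step (again assuming evaluate() returns a metric-indexed Series):

import pandas as pd

results = calc_results()              # {(docname, clfname): scores, ...}
df = pd.DataFrame(results).T          # rows indexed by (document, classifier)
acc = df.iloc[:, 0].unstack(level=1)  # documents as rows, classifiers as columns
ax = acc.plot.bar(rot=0)              # one grouped bar plot per metric follows the same pattern
ax.set_ylabel(str(df.columns[0]))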
Example #4
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]

    maskedtoks, maskedtags = np.vstack(
        (Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True),
                        np.full(yeval.shape, False))
    with stopwatch():
        # note: `clf` is not defined in this function; it is presumably a module-level classifier config
        df, _ = evaluate(clf, maskedtoks, maskedtags, istrain, raw=True)
    return df
Example #5
def calc_results(betas=None):
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, eval_g = [l[test] for l in [toks, tags, groups]]

    d = {}
    print(Xtrain.shape[0], Xeval.shape[0])

    for clf in clfs:
        clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag
                   else 'TreeTagger' if clf.get('clf', None) == TreeTagger
                   else r'\textsc{c}ore\textsc{nlp}')
        # two clfs, 9 docs, 3 metrics -> two grouped bar plots (over datasets),
        #           one for accuracy and one for utility;
        #           table (clf rows, dataset columns, a subtable per metric in each cell)
        with stopwatch():
            old_clf = None
            for g, _ in groupby(eval_g):
                Xeval_, yeval_ = [l[eval_g == g] for l in [Xeval, yeval]]
                maskedtoks, maskedtags = np.vstack(
                    (Xtrain, Xeval_)), np.append(ytrain, yeval_)
                istrain = np.append(np.full(ytrain.shape, True),
                                    np.full(yeval_.shape, False))
                old_clf, scores = evaluate(clf,
                                           maskedtoks,
                                           maskedtags,
                                           istrain,
                                           oldclf=old_clf,
                                           betas=betas)

                d[(clfname, g)] = scores
        print(pd.DataFrame(d).T.loc[clfname].mean().tolist())
    print()
    return d