Example #1
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])  # use [l[:3000] for l in load(tag_prefix_masks=[])] for quick tests
    ctr = Counter(groups)

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the training data for train/eval cross-validation
    toks_t, tags_t, groups_t = [l[train] for l in [toks, tags, groups]]

    # take the held-out data for evaluation
    toks_e, tags_e, groups_e = [l[test] for l in [toks, tags, groups]]

    d = {}
    for doc, _ in filter(lambda t: t[1] > 7000, ctr.items()):  # only documents with more than 7000 tokens
        mask = groups != doc
        mask_e = groups_e == doc
        Xtrain, ytrain = toks[mask], tags[mask]
        Xeval, yeval = toks_e[mask_e], tags_e[mask_e]
        print(doc, np.sum(mask), np.sum(mask_e))
        maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
        for clf in clfs:
            istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))

            clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else (
                'TreeTagger' if clf.get('clf', None) == TreeTagger else r'\textsc{c}ore\textsc{nlp}')
            docname = ''.join(((r'\textsc{' + c.lower() + '}' if re.match('[A-Z]', c) is not None else c) for c in doc))
            # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one for acc; util
            #           table (clf row, dataset col, subtable in cell for each metric)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(docname, clfname)] = scores
            print(scores)
    print()
    return d
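
# A minimal, hypothetical sketch of the presentation step described in the
# inline comment above, assuming each `scores` value is a pandas Series of
# metrics (pd.DataFrame(d) is used the same way in Example #8) and that one
# metric is named 'accuracy':
import matplotlib.pyplot as plt
import pandas as pd

def plot_grouped_bars(d, metric='accuracy'):
    df = pd.DataFrame(d).T  # rows: (doc, clf) pairs; columns: metrics
    df.index.names = ['doc', 'clf']
    df[metric].unstack('clf').plot.bar()  # bars grouped by document, one per clf
    plt.ylabel(metric)
    plt.tight_layout()
    plt.show()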
Example #2
def calc_results():
    doc = 'St2'
    parts = 50
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # split into the train and eval partitions
    x_t, y_t, g_t = [l[train] for l in [toks, tags, groups]]
    x_e, y_e, g_e = [l[test] for l in [toks, tags, groups]]
    x_t, y_t = [l[g_t == doc] for l in [x_t, y_t]]
    x_e, y_e = [l[g_e == doc] for l in [x_e, y_e]]

    d = {}
    for frac in (1 + np.array([-.5, 0, 1, 2, 5, 10, 19, 34, 49])) * x_t.shape[0] / parts:
        frac = int(frac)
        print(frac, x_e.shape[0])
        x_t_, y_t_ = [l[:frac] for l in [x_t, y_t]]
        maskedtoks, maskedtags = np.vstack((x_t_, x_e)), np.append(y_t_, y_e)
        for clf in clfs:
            istrain = np.append(np.full(y_t_.shape, True),
                                np.full(y_e.shape, False))
            clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag
                       else r'\textsc{c}ore\textsc{nlp}')
            # two clfs, 10 fractions, 1 doc, 1 metrics -> lineplots (both clfs) (over fractions) with marks and dashed line
            #           table (clf row, fracs col)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(clfname, frac)] = scores
            print(scores)
        print()
    return d
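
# A minimal, hypothetical sketch of the lineplot described in the inline
# comment above, assuming each `scores` value reduces to a single number per
# (classifier, fraction) pair; the exact return shape of evaluate is an
# assumption:
import matplotlib.pyplot as plt
import pandas as pd

def plot_learning_curves(d):
    s = pd.Series(d)  # MultiIndex: (clfname, frac)
    for clfname, curve in s.groupby(level=0):
        curve = curve.droplevel(0).sort_index()
        # marks on the data points, dashed connecting line
        plt.plot(curve.index, curve.values, marker='o', linestyle='--', label=clfname)
    plt.xlabel('training tokens')
    plt.ylabel('score')
    plt.legend()
    plt.show()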
Example #3
def get_ytrain():
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, _ = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # keep only the training tags
    _, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    return ytrain
Example #4
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])
    # use [l[:3000] for l in load(tag_prefix_masks=[])] for quick tests

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the train and eval partitions
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]

    maskedtoks, maskedtags = np.vstack(
        (Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True),
                        np.full(yeval.shape, False))

    d = {}
    for params in clfs:
        if params.get('clf', None) == MostFrequentTag:
            clfname = 'baseline'
            clf, total_score = evaluate(params, maskedtoks, maskedtags,
                                        istrain)
            print('known word frac:', clf.knownwords(Xeval))
            clf.scope = 'known'
            _, known_scores = evaluate(params,
                                       maskedtoks,
                                       maskedtags,
                                       istrain,
                                       oldclf=clf)
            clf.scope = 'unk'
            _, unk_scores = evaluate(params,
                                     maskedtoks,
                                     maskedtags,
                                     istrain,
                                     oldclf=clf)
            df = pd.DataFrame([unk_scores, known_scores, total_score]).T
        else:
            clfname = r'\textsc{c}ore\textsc{nlp}'
            with stopwatch():
                # 2clf, 3 scopes, 3 metrics
                _, score = evaluate(params,
                                    maskedtoks,
                                    maskedtags,
                                    istrain,
                                    raw=True)
            df = pd.DataFrame([
                score.loc[measure, :]
                for measure in ['accuracy', 'avg util', 'avg setsize']
            ])
        df.index = ['Accuracy', 'Utility', 'Set size']
        df.columns = ['Unknown', 'Known', 'Total']
        for col in df:
            d[(clfname, col)] = df[col]
    print()
    return d
Example #5
def calculate_setsizes():
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the train and eval partitions
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]

    clf = CoreNLPTagger()
    clf.fit(Xtrain, ytrain)

    return clf.setsizes(Xeval)
Example #6
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the train and eval partitions
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]

    maskedtoks, maskedtags = np.vstack(
        (Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True),
                        np.full(yeval.shape, False))
    with stopwatch():
        df, _ = evaluate(clf, maskedtoks, maskedtags, istrain, raw=True)  # clf is defined at module scope
    return df
Example #7
def optimize():
    SEED = 1

    toks, tags, groups = load(tag_prefix_masks=[])
    # use [l[:3000] for l in load(tag_prefix_masks=[])] for quick tests
    # train - test split
    train, _ = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the training data for train/eval cross-validation
    toks, tags, groups = [l[train] for l in [toks, tags, groups]]

    def tae(cfg, seed=0):
        return callee.evaluate(cfg,
                               toks,
                               tags,
                               groups,
                               timeout=1200,
                               seed=seed,
                               k=5)

    scenario_dict = {
        'run_obj': 'quality',  # we optimize quality (alternatively runtime)
        'runcount-limit': 100,
        'algo_runs_timelimit': 60 * 60 * 14,
        'cutoff': 1200,  # stop algorithms after 10x default runtime
        "cs": cs,  # configuration space
        "deterministic": 'true',
        'output_dir': 'smac3_test_treetagger'
    }

    smac = create_or_restore_smac(scenario_dict=scenario_dict,
                                  rng=np.random.RandomState(SEED),
                                  tae=tae)
    incumbent = smac.optimize()

    inc_value = tae(incumbent)

    print("Optimized Value: %.2f" % (inc_value))
Example #8
def calc_results(betas=None):
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the train and eval partitions
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, eval_g = [l[test] for l in [toks, tags, groups]]

    d = {}
    print(Xtrain.shape[0], Xeval.shape[0])

    for clf in clfs:
        clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag
                   else 'TreeTagger' if clf.get('clf', None) == TreeTagger
                   else r'\textsc{c}ore\textsc{nlp}')
        # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one for acc; util
        #           table (clf row, dataset col, subtable in cell for each metric)
        with stopwatch():
            old_clf = None
            for g, _ in groupby(eval_g):
                Xeval_, yeval_ = [l[eval_g == g] for l in [Xeval, yeval]]
                maskedtoks, maskedtags = np.vstack(
                    (Xtrain, Xeval_)), np.append(ytrain, yeval_)
                istrain = np.append(np.full(ytrain.shape, True),
                                    np.full(yeval_.shape, False))
                old_clf, scores = evaluate(clf,
                                           maskedtoks,
                                           maskedtags,
                                           istrain,
                                           oldclf=old_clf,
                                           betas=betas)

                d[(clfname, g)] = scores
        print(pd.DataFrame(d).T.loc[clfname].mean().tolist())
    print()
    return d
Example #9
import json
import re
from collections import Counter
from itertools import chain

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from natsort import natsorted

from setpos.data.split import load

if __name__ == '__main__':
    _, tags, groups = load()
    docs = natsorted(set(groups), key=lambda x: str(int(not x.startswith('REN'))) + x)

    result = {}
    for doc in docs:
        t = list(chain.from_iterable([json.loads(tags_).keys() for tags_ in tags[groups == doc]]))
        counter = pd.Series(dict(Counter(t)))
        counter /= len(t)
        result[doc] = counter
    df = pd.DataFrame(result).sort_index().T

    t = list(chain.from_iterable([json.loads(tags_).keys() for tags_ in tags]))
    prior = pd.Series(dict(Counter(t))).sort_index()
    prior /= prior.sum()

    doc_sizes = pd.Series([len(tags[groups == doc]) for doc in docs], index=docs)
    prior_df = prior.sort_values(ascending=False).head(10)
    prior_df.name = 'Probability'
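
    # A minimal, hypothetical continuation: the fragment stops before any
    # plotting; one way to render prior_df with the already-imported seaborn
    # (the chart type is an assumption):
    ax = sns.barplot(x=prior_df.index, y=prior_df.values)
    ax.set_ylabel(prior_df.name)
    plt.xticks(rotation=90)
    plt.show()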
Example #10
import json

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

from setpos.tagger import CoreNLPTagger
from setpos.data.split import MCInDocSplitter, load
from setpos.util import stopwatch

if __name__ == '__main__':
    SEED, n = 7, 6
    toks, tags, groups = load()  # use [l[:3000] for l in load()] for quick tests

    with stopwatch():
        targets = []
        preds = []
        for train, test in MCInDocSplitter(seed=SEED).split(
                toks, tags, groups):
            clf = CoreNLPTagger()
            clf.fit(toks[train], tags[train])
            targets.extend(
                [list(json.loads(tags_).keys())[0] for tags_ in tags[test]])
            preds.extend(clf.predict(toks[test]))

    tags = set(targets) | set(preds)
    print(len(tags))
    conf_mat = confusion_matrix(targets, preds)
    with np.errstate(divide='ignore'):
        conf_mat = conf_mat / conf_mat.sum(axis=1, keepdims=True)  # row-normalize per true tag
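
    # A minimal, hypothetical continuation: the fragment stops after the
    # normalization, but seaborn is already imported; one way to render the
    # row-normalized confusion matrix (plt is not imported above, so add it):
    import matplotlib.pyplot as plt
    labels = sorted(tags)  # confusion_matrix orders string labels lexicographically
    ax = sns.heatmap(conf_mat, xticklabels=labels, yticklabels=labels)
    ax.set(xlabel='predicted tag', ylabel='true tag')
    plt.show()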
Example #11
        # pad the probabilities with zero rows for every tag that has been unseen during training
        tag_idx = {t: i for i, t in enumerate(self.clf.classes_)}
        new_probas = np.zeros((probas.shape[0], len(self._tags)))
        for i, t in enumerate(self._tags):
            if t in tag_idx:
                new_probas[:, i] = probas[:, tag_idx[t]]
        return new_probas

    def predict(self, X):
        if self.clf is None:
            raise NotFittedError
        return super().predict(X)


if __name__ == '__main__':
    toks, tags, groups = [l[:] for l in load()]  # full dataset; slice, e.g. l[:3000], for quick tests

    train, test = next(MCInDocSplitter(seed=1).split(toks, tags, groups))
    # train, test = next(LeaveOneGroupOut().split(toks, tags, groups))

    clf = SimpleTagger()
    clf.fit(toks[train], tags[train])
    print(f'meansetsize: {clf.meansetsize(toks[test]):.2f}')
    print(f'knownwords: {clf.knownwords(toks[test]):.2%}')
    print(
        f'accuracy: {cross_val_score(clf, toks, tags, groups, cv=KFoldInDocSplitter(5, seed=1), n_jobs=3).mean():.2%}'
    )
    clf.set_valued = True
    print(
        f'utility: {cross_val_score(clf, toks, tags, groups, cv=KFoldInDocSplitter(5, seed=1), n_jobs=4).mean():.2%}'
    )
Example #12
import json
import shelve
from functools import partial

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

from scripts.util.func import param_to_key
from setpos.data.split import load, MCInDocSplitter, is_masked
from setpos.tagger import CoreNLPTagger
from setpos.util import stopwatch, draw_cd_diagram

if __name__ == '__main__':
    SEED, n = 1, 15
    toks, tags, groups = load(tag_prefix_masks=[])
    # use [l[:3000] for l in load(tag_prefix_masks=[])] for quick tests
    opt = {"augment_setvalued_targets": False, "filter_tags": [],
           "corenlp_train_params": ["-curWordMinFeatureThreshold", "4", "-minFeatureThreshold", "1",
                                    "-rareWordMinFeatureThresh", "1", "-rareWordThresh", "6", "-sigmaSquared",
                                    "0.7676194187745077", "-veryCommonWordThresh", "234", "-arch",
                                    "order(-2,0),prefix(1,0),prefix(2,0),prefix(3,0),suffix(2,0),suffix(3,0),suffix(4,0),suffix(5,0),wordTag(0,-1),words(-3,2)"]}
    opt_set_open_classes = opt.copy()
    opt_set_open_classes['corenlp_train_params'] = opt_set_open_classes['corenlp_train_params'] + ['-openClassTags',
                                                                                                   '"ADJA ADJA<VVPP ADJA<VVPS ADJD ADJN ADJN<VVPP ADJS ADJV CARDA CARDN CARDS NA NE VAFIN VAFIN.* VAFIN.ind VAFIN.konj VAINF VAPP VKFIN.* VKFIN.ind VKFIN.konj VKINF VKPP VKPS VMFIN.* VMFIN.ind  VMFIN.konj VMINF VVFIN.* VVFIN.ind VVFIN.konj VVIMP VVINF VVPP VVPS OA XY FM"',
                                                                                                   '-lang', '']
    opt_lang_english = opt.copy()
    opt_lang_english['corenlp_train_params'] = opt_lang_english['corenlp_train_params'] + ['-lang', 'english']
    opt_lang_german = opt.copy()
    opt_lang_german['corenlp_train_params'] = opt_lang_german['corenlp_train_params'] + ['-lang', 'german']
    opt_learn_closed = opt.copy()
    opt_learn_closed['corenlp_train_params'] = opt_learn_closed['corenlp_train_params'] + ['-lang', '',
Example #13
import pandas as pd

from setpos.data.split import load, join_to_sents

if __name__ == '__main__':
    X, y, groups = load()  # use [l[:3000] for l in load()] for quick tests

    results = []
    sents = join_to_sents(X, y)

    for sent in sents:
        # full cartesian product over every token's tag set
        new_sents_count_full_carth = pd.Series([len(tags) * 1. for _, tags in sent]).prod()
        # cartesian product restricted to tags with weight >= 1, at least one per token
        new_sents_count_ambig_carth = pd.Series(
            [1.] + [max(1., len(list(filter(lambda v: v >= 1, list(tags.values())))))
                    for _, tags in sent]).prod()
        # enough copies that every tag of the most ambiguous token is used once
        new_sents_count_each_tag_used = pd.Series([len(tags) for _, tags in sent]).max()

        results.append(dict(before=len(sent),
                            carth=new_sents_count_full_carth * len(sent),
                            ambig_carth=new_sents_count_ambig_carth * len(sent),
                            each_tag_used=new_sents_count_each_tag_used * len(sent)))

    df = pd.DataFrame(results)
    with pd.option_context('display.float_format', '{:,.0f}'.format):
        print(df.sum())
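
    # A toy check of the three counts above, on a hypothetical sentence of
    # (token, tag-weight-dict) pairs in the shape join_to_sents is assumed to yield:
    toy = [('schap', {'NA': 1}), ('vnde', {'KON': 1, 'ADV': 0.4}), ('dar', {'PAVD': 1})]
    assert pd.Series([len(t) * 1. for _, t in toy]).prod() == 2  # full cartesian product
    assert pd.Series([1.] + [max(1., len([v for v in t.values() if v >= 1]))
                             for _, t in toy]).prod() == 1  # ambiguous readings only
    assert pd.Series([len(t) for _, t in toy]).max() == 2  # each tag used once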
Example #14
import shelve
from functools import partial

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

from scripts.util.func import param_to_key
from setpos.data.split import load, is_masked, MCInDocSplitter, KFoldInDocSplitter
from setpos.tagger import CoreNLPTagger
from setpos.util import stopwatch, draw_cd_diagram

if __name__ == '__main__':
    SEED, n = 1, 2
    toks, tags, groups = [l[:3000] for l in load(tag_prefix_masks=[])]
    # quick-test subsample; drop the slice for the full dataset
    paramspace = [  # dict(clf=IntergrammTagger),
        {},
        dict(
            augment_setvalued_targets='True',
            corenlp_train_params=[
                '--arch',
                'left5words,suffix(1),prefix(1),suffix(2),prefix(2),suffix(3),prefix(3)'
            ],
            filter_tags=()),
        dict(
            augment_setvalued_targets='False',
            corenlp_train_params=[
                '--arch',
                'left5words,suffix(1),prefix(1),suffix(2),prefix(2),suffix(3),prefix(3)'
Example #15
        results = pd.concat([results, results2])
        if long_result:
            return df, results
        if score is None:
            score = 'accuracy' if not self.set_valued else 'avg util'
        return results[scope].loc[score]


if __name__ == '__main__':
    import numpy as np
    from setpos.data.split import load, sents_to_dataset
    import logging

    logging.basicConfig()

    toks, tags, groups = load()
    g = list(set(groups))
    clf = CoreNLPTagger(loglevel=logging.INFO)

    clf.fit(toks[np.isin(groups, g[1:2])], tags[np.isin(groups, g[1:2])])
    clf.fit(toks[np.isin(groups, g[0:1])], tags[np.isin(groups, g[0:1])])
    # clf.fit(toks, tags)

    # schap/NA vnde/KON dar/PAVD to/PAVAP hebbe/VAFIN ik/PPER ere/PPER gegeuen/VVPP
    print(
        clf.predict_proba(
            sents_to_dataset(
                ['schap vnde dar to hebbe ik ere gegeuen'.split()])))
    print(
        clf.setpredict(
            sents_to_dataset(
Example #16
):
    dist = dists[end_index][0]
    acc = evaluate(train_docs, eval_doc)

    print("curr max dist:", dist)
    print("newly added:", dists[end_index][1])
    print("training docs:", train_docs)
    print("excluded docs:", set(doc_titles - train_docs))
    for i, val in acc.items():
        print(f'{i}: {val:.4f}')
    print()
    return dict(dist=dist, train_docs=train_docs, **acc)


if __name__ == '__main__':
    X, y, groups = load()
    doc_titles = set(groups)

    metadata = pd.read_csv(
        "corpus.metadata.csv",
        index_col='sigle',
        usecols=["sigle", 'year_norm', 'lon_norm', 'lat_norm'])
    os.makedirs("out", exist_ok=True)

    for seed in range(4):
        for selector in doc_titles:
            prefix = f'{selector}-seed:{seed}'
            if isfile(prefix + '.csv'):
                continue
            print('test doc:', prefix, end="\n-------------------------\n\n")
Example #17
            print_stats(data, "avg util",
                        lambda x: x[prefix + "const_util"].mean()))
        result.update(
            print_stats(data, "avg recall",
                        lambda x: x[prefix + "const_recall"].mean()))
        print(
            f"avg sent recall {data.groupby(['sentID'])[[prefix + 'const_recall']].all().mean().iloc[0]:.2%}"
        )
        result.update(
            print_stats(data, "avg precision",
                        lambda x: x[prefix + "const_precision"].mean()))
        result.update(
            print_stats(
                data,
                "avg agreement of set-prediction with default prediction",
                lambda x: x[prefix + "original_pred_in_set_pred"].mean()))

        return data, pd.DataFrame.from_dict(result, 'index')


if __name__ == '__main__':
    print(score(dict(a=1, b=1), {'b'}))
    print(score(dict(a=.6, b=.3), {'b'}))
    print(score(dict(a=1), {'b'}))
    print(score(dict(b=1), {'b'}))

    df, tagsdict = load()
    df, _ = print_classical_pred_stats(df, tagsdict)
    df, _ = print_set_valued_pred_stats(df, tagsdict, True)
    df.to_excel("out.xlsx")
Example #18
import json
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd

from setpos.data.split import load

if __name__ == '__main__':
    X, y, g = load()

    # dict of word -> set of tags
    word_tags_map = defaultdict(set)
    for tags, (_, word) in zip(y, X):
        tags = json.loads(tags).keys()
        for tag in tags:
            word_tags_map[word].add(tag)

    series = pd.Series(word_tags_map).apply(len)
    print(series.describe())
    series.hist()
    plt.show()
Example #19
import logging

from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

from setpos.tagger import CoreNLPTagger, TreeTagger
from setpos.data.split import load

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    toks, tags, groups = load(n=5000)

    clf = CoreNLPTagger()
    scores = cross_val_score(clf,
                             toks,
                             tags,
                             groups,
                             cv=LeaveOneGroupOut(),
                             n_jobs=-1)

    print(f"total accuracy: {scores.mean():.2%} ± {scores.std():.2%}")