Пример #1
0
    def __init__(
        self,
        vocab: Vocab,
        name: str = "morphologizer",
        *,
        overwrite_lemma: bool = False,
    ) -> None:
        """Store the configuration and register the labels this object uses.

        Args:
            vocab: Vocabulary kept on the instance; every label listed below
                is passed to ``vocab.strings.add``.
            name: Value stored as ``self.name``.
            overwrite_lemma: Keyword-only flag stored as
                ``self.overwrite_lemma``.
        """
        super().__init__()

        def register(*labels: str) -> list:
            # One ``strings.add`` call per label, in the order given; the
            # list holds whatever ``add`` returned for each of them.
            return [vocab.strings.add(label) for label in labels]

        self.name = name
        self.vocab = vocab
        self.voikko = libvoikko.Voikko("fi")
        self.lookups = Lookups()
        self.overwrite_lemma = overwrite_lemma

        self.aux_labels = register("aux", "aux:pass")
        self.cop_labels = register("cop", "cop:own")
        self.nsubj_labels = register("nsubj", "nsubj:cop")
        self.ccomp_labels = register(
            "csubj", "csubj:cop", "xcomp", "xcomp:ds"
        )
        # NOTE(review): "ccomp" is listed here rather than in ccomp_labels --
        # confirm this grouping is intended.
        self.relcl_labels = register("acl:relcl", "ccomp")
        self.foreign_tag = vocab.strings.add('Foreign')
Пример #2
0
def main():
    """Evaluate every embedding model on every task and save the scores.

    The LASER location is taken from the ``LASER`` environment variable.
    The evaluation is repeated ``args.num_trials`` times, and the scores
    collected so far are saved to ``args.resultdir`` after each trial.

    Raises:
        SystemExit: If the ``LASER`` environment variable is unset or empty.
    """
    # Explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would let a missing variable slip through.
    laser_dir = os.environ.get('LASER')
    if not laser_dir:
        raise SystemExit('Please set the environment variable LASER')

    voikko = libvoikko.Voikko('fi')
    args = parse_args()
    hyperparameters = Hyperparameters(args.hyperparameters)
    if args.fast:
        hyperparameters.set_logreg()

    tasks = [
        TDTCategoryClassificationTask('TDT categories',
                                      'data/UD_Finnish-TDT',
                                      use_dev_set=args.dev_set,
                                      verbose=args.verbose),
        OpusparcusTask('Opusparcus',
                       'data/opusparcus/opusparcus_v1',
                       use_dev_set=args.dev_set,
                       verbose=args.verbose),
        YlilautaConsecutiveSentencesTask('Ylilauta',
                                         'data/ylilauta',
                                         use_dev_set=args.dev_set,
                                         verbose=args.verbose),
        EduskuntaVKKClassificationTask('Eduskunta-VKK',
                                       'data/eduskunta-vkk',
                                       use_dev_set=args.dev_set,
                                       verbose=args.verbose),
    ]

    models = [
        model_tfidf(voikko),
        model_w2v(),
        model_fasttext(),
        model_sif(),
        model_borep(),
        model_finbert(),
        model_laser(laser_dir, args.verbose),
    ]

    print(f'Running evaluation on {len(tasks)} tasks and {len(models)} models')

    scores = []
    for k in range(args.num_trials):
        if args.num_trials > 1:
            print(f'Trial {k+1}/{args.num_trials}')

        scores.append(evaluate_models(models, tasks, hyperparameters))

        # Saved inside the loop, so the results written so far are on disk
        # after every completed trial.
        save_scores(scores, args.resultdir)
Пример #3
0
 def __init__(self):
     """Create the Voikko handle and the word-class lookup table."""
     # Keys look like Voikko's Finnish word-class names and values like
     # Universal POS tags -- TODO confirm against the code reading tag_map.
     # Kept as ordered pairs so the resulting dict has the same key order.
     class_tag_pairs = (
         ('nimisana', 'NOUN'),
         ('laatusana', 'ADJ'),
         ('nimisana_laatusana', 'ADJ'),
         ('teonsana', 'VERB'),
         ('seikkasana', 'ADV'),
         ('asemosana', 'PRON'),
         ('suhdesana', 'ADP'),
         ('huudahdussana', 'INTJ'),
         ('sidesana', 'CCONJ'),
         ('etunimi', 'PROPN'),
         ('sukunimi', 'PROPN'),
         ('paikannimi', 'PROPN'),
         ('nimi', 'PROPN'),
         ('kieltosana', 'AUX'),
         ('lyhenne', 'ADV'),
         ('lukusana', 'NUM'),
         ('etuliite', 'X'),
     )

     self.name = 'Voikko'
     self.voikko = libvoikko.Voikko('fi')
     self.tag_map = dict(class_tag_pairs)
Пример #4
0
from math import inf
from typing import Callable, List, Optional, Tuple

import cairocffi as cairo
import numpy as np
import pangocairocffi as pangocairo
import pangocffi as pango
from tqdm.cli import tqdm
from voikko import libvoikko

from .document import (Chapter, DocumentObj, Eval, Paragraph, Subenvironment,
                       Table, VSpace, fixMarkup, stripMarkup)
from .params import Parameters

# Voikko is treated as optional: if it cannot be created, the module still
# imports and ``voikko`` is None.  Only ``Exception`` is caught so that
# KeyboardInterrupt and SystemExit are not silently swallowed.
try:
    voikko = libvoikko.Voikko("fi")
except Exception:
    voikko = None

# Module-level debug switch, off by default.
debug = False


def irange(a, b, s=1) -> range:
    """Return a ``range`` from *a* to *b* in steps of *s*, with *b* included.

    The stop value is pushed one past *b* in the direction of travel, so *b*
    itself is produced whenever the step lands on it.  A step of zero raises
    ``ValueError`` from ``range``.
    """
    if s > 0:
        stop = b + 1
    else:
        stop = b - 1
    return range(a, stop, s)


# Type alias: a callable that takes two floats and returns a pair of floats.
FixXY = Callable[[float, float], Tuple[float, float]]


class Line:
    outline: Optional[Tuple[int, str]]
Пример #5
0
 def __init__(self):
     """Set ``name`` to 'FinnPos' and create a Voikko instance for 'fi'."""
     self.name = 'FinnPos'
     self.voikko = libvoikko.Voikko('fi')
Пример #6
0
 def __init__(self, lookups, *args, **kwargs):
     """Initialise the parent class and attach a Voikko instance for "fi".

     ``lookups`` and any extra positional or keyword arguments are forwarded
     unchanged to the parent constructor.
     """
     super(FinnishLemmatizer, self).__init__(lookups, *args, **kwargs)
     self.voikko = libvoikko.Voikko("fi")
Пример #7
0
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import re
from collections import defaultdict
from voikko import libvoikko as lv
from voikko.inflect_word import inflect_word
from . import patternparser as pp

# Module-level mapping; looking up a missing key creates an empty list.
DICTIONARY = defaultdict(list)

# Module-level Voikko instance, created with the "fi-x-morpho" language tag.
voikko = lv.Voikko("fi-x-morpho")


def tokenize(text):
    tokens = []
    for token in voikko.tokens(text):
        if token.tokenType == lv.Token.WHITESPACE:
            continue
        if "-" in token.tokenText:
            index = token.tokenText.rindex("-") + 1
            lastPart = token.tokenText[index:]
            baseformPrefix = token.tokenText[:index].lower()
        else:
            lastPart = token.tokenText
            baseformPrefix = ""
        alternatives = []
Пример #8
0
def tune():
    """Search hyperparameters for every embedding model on every task.

    For each evaluation produced by ``evaluations_for_model`` this runs 50
    TPE trials through hyperopt's ``fmin``, using the negated optimization
    score as the loss.  It prints the best score and parameters, and rewrites
    ``results/hyperparameters.json`` with all best results found so far.
    """
    voikko = libvoikko.Voikko('fi')

    # (task class, task name, data directory); all tasks use the dev set.
    task_table = [
        (TDTCategoryClassificationTask,
         'TDT categories', 'data/UD_Finnish-TDT'),
        (OpusparcusTask,
         'Opusparcus', 'data/opusparcus/opusparcus_v1'),
        (YlilautaConsecutiveSentencesTask,
         'Ylilauta', 'data/ylilauta'),
        (EduskuntaVKKClassificationTask,
         'Eduskunta-VKK', 'data/eduskunta-vkk'),
    ]
    tasks = [
        task_cls(title, data_dir, use_dev_set=True)
        for task_cls, title, data_dir in task_table
    ]

    word2vec_path = 'pretrained/fin-word2vec/fin-word2vec.bin'

    # Builders are invoked with the embedding parameters as keyword
    # arguments, so their parameter names must stay as they are.
    def model_w2v():
        return PooledWord2Vec('Pooled word2vec', word2vec_path)

    def model_fasttext():
        return PooledFastText('Pooled FastText',
                              'pretrained/fasttext-fi/cc.fi.300.bin')

    def model_finbert(layers):
        return Bert('FinBERT', 'TurkuNLP/bert-base-finnish-cased-v1', layers)

    def model_tfidf(min_df):
        return TfidfVectors('TF-IDF', voikko, int(min_df))

    def model_sif():
        return SIF('SIF', 'data/finnish_vocab/finnish_vocab.txt.gz',
                   word2vec_path)

    def model_borep():
        return BOREP('BOREP', word2vec_path, 4096)

    def model_laser():
        return Laser('LASER', os.path.join(os.getcwd(), 'LASER'))

    def classifier_space(hidden_low, hidden_high):
        # Search space shared by all models; only the hidden-size bounds
        # differ.  A fresh dict is returned on every call.
        return {
            'hidden_dim1': hp.quniform('hidden_dim1',
                                       hidden_low, hidden_high, 10),
            'dropout_prop': hp.uniform('dropout_prop', 0.2, 0.8),
        }

    # Each space is built immediately before the call that consumes it.
    per_model = []
    per_model.append(
        evaluations_for_model(model_w2v, tasks, classifier_space(10, 300)))
    per_model.append(
        evaluations_for_model(model_fasttext, tasks,
                              classifier_space(10, 300)))

    finbert_space = classifier_space(30, 768)
    finbert_space['embedding_layers'] = hp.choice(
        'embedding_layers', [[-1], [-2], [-3], [-4], [-1, -2, -3, -4]])
    per_model.append(
        evaluations_for_model(model_finbert, tasks, finbert_space))

    tfidf_space = classifier_space(30, 1000)
    tfidf_space['embedding_min_df'] = hp.quniform(
        'embedding_min_df', 2, 8, 2)
    per_model.append(evaluations_for_model(model_tfidf, tasks, tfidf_space))

    per_model.append(
        evaluations_for_model(model_sif, tasks, classifier_space(10, 300)))
    per_model.append(
        evaluations_for_model(model_borep, tasks, classifier_space(30, 300)))
    per_model.append(
        evaluations_for_model(model_laser, tasks, classifier_space(30, 300)))

    evaluations = itertools.chain(*per_model)

    os.makedirs('results', exist_ok=True)

    best_params = {}
    for evaluation in evaluations:
        task = evaluation['task']
        search_space = evaluation['space']

        # State shared with ``objective`` so the embedding model and the
        # prepared data are reused between trials.
        embedding_model = None
        X_train = y_train = X_test = y_test = None

        def objective(params):
            nonlocal embedding_model, X_train, y_train, X_test, y_test

            embedding_params, classifier_params = \
                split_embedding_and_classifier_params(params)

            must_rebuild = bool(embedding_params) or embedding_model is None
            if must_rebuild:
                if embedding_params:
                    print('Reinitializing the embedding model '
                          'because parameters have changed')

                build_model = evaluation['embedding_model_builder']
                embedding_model = build_model(**embedding_params)
                X_train, y_train, X_test, y_test = \
                    task.prepare_data(embedding_model)

            print(f'{embedding_model.name}, {task.name}')
            print(params)

            clf = task.train_classifier(X_train, y_train, classifier_params)
            score = task.compute_optimization_score(clf, X_test, y_test)
            # fmin minimizes, so the score is negated to form the loss.
            return -score

        trials = Trials()
        raw_best = fmin(fn=objective,
                        space=search_space,
                        algo=tpe.suggest,
                        max_evals=50,
                        trials=trials)
        best = space_eval(search_space, raw_best)
        best_score = -np.min(trials.losses())
        print(
            f'best score for {embedding_model.name} in task {task.name}: {best_score}'
        )
        print('parameters:')
        print(best)

        summary = serialize_results(best, best_score)
        best_params.setdefault(task.name, {})[embedding_model.name] = summary

        # Rewritten after every evaluation with everything found so far.
        with open('results/hyperparameters.json', 'w') as out_file:
            json.dump(best_params, out_file, indent=2)
Пример #9
0
 def __init__(self, vocab: Vocab, name: str = "lemmatizer", overwrite: bool = False) -> None:
     """Initialise the parent in "voikko" mode and create a Voikko instance.

     ``vocab``, ``name`` and ``overwrite`` are forwarded to the parent
     constructor, which is also given ``model=None`` and ``mode="voikko"``.
     """
     super().__init__(vocab, model=None, name=name, mode="voikko", overwrite=overwrite)
     self.voikko = libvoikko.Voikko("fi")
Пример #10
0
# This discards suffixes and some classes of complex compounds
# GPL3 Copyright Théo Friberg 2018

from voikko import libvoikko
from bs4 import BeautifulSoup
import sys

# Read the Kaino dataset from the path given as the first command-line
# argument.  The context manager closes the file even if reading or parsing
# raises, which the manual open()/close() pair did not guarantee.

with open(sys.argv[1]) as f:
    parsed = BeautifulSoup(f.read(), 'lxml')

# Initialise Voikko with the 'fi' language code

v = libvoikko.Voikko('fi')

# Accumulate words into a set

words = set()

for word in parsed.find_all('s'):  # The s-tag in Kaino denotes a word

    s = word.string
    if s.lower() != s or "-" in s:  # Discard suffixes and certain compounds
        continue

    analysis_ = v.analyze(
        s)  # Analyse the word using Voikko; skip if Voikko gets confused
    if len(analysis_) == 0:
        continue