Example #1
def _execute_pos_tagger_training(check_pos_tagger_file_exists):
    from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger
    # derived from https://datascience.blog.wzb.eu/2016/07/13/accurate-part-of-speech-tagging-of-german-texts-with-nltk/
    # and https://github.com/ptnplanet/NLTK-Contributions

    import nltk
    import random
    import pickle
    print(
        "Download the corpus from https://www.ims.uni-stuttgart.de/documents/ressourcen/korpora/TIGERCorpus/download/start.html first!"
    )
    corp = nltk.corpus.ConllCorpusReader(
        '.',
        'tiger_release_aug07.corrected.16012013.conll09',
        ['ignore', 'words', 'ignore', 'ignore', 'pos'],
        encoding='utf-8')

    tagged_sents = list(corp.tagged_sents())
    random.shuffle(tagged_sents)

    # set a split size: use 90% for training, 10% for testing
    split_perc = 0.1
    split_size = int(len(tagged_sents) * split_perc)
    train_sents, test_sents = tagged_sents[split_size:], tagged_sents[:split_size]

    tagger = ClassifierBasedGermanTagger(train=train_sents)

    print('Trained tagger result: ', tagger.evaluate(test_sents))

    with open(check_pos_tagger_file_exists, 'wb') as f:
        pickle.dump(tagger, f, protocol=2)

    return tagger
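Once the pickle has been written, reusing the tagger is straightforward; a minimal loading sketch, assuming the file name matches whatever was passed in as check_pos_tagger_file_exists (the path below is illustrative):

import pickle

# Illustrative path; use the file written by the training function above.
with open('german_pos_tagger.pickle', 'rb') as f:
    tagger = pickle.load(f)

# tag() takes a pre-tokenized sentence and returns (word, tag) pairs.
print(tagger.tag(['Das', 'ist', 'ein', 'Test']))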
Example #2
import json
import pickle

from nltk.corpus.reader.conll import ConllCorpusReader
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger


def ingest_train_data():
    global tagger
    # corpusFile = "tiger_release_aug07.corrected.16012013.conll09"  # or request.args['corpusFile']
    taggerFile = 'SerializedTagger.pickle'
    corp = ConllCorpusReader('../templates',
                             'tiger_release_aug07.corrected.16012013.conll09',
                             ['ignore', 'words', 'ignore', 'ignore', 'pos'],
                             encoding='utf-8')
    tagged_sents = corp.tagged_sents()
    split_percentage = 0.25  # hold out 25% of the sentences for testing
    split_size = int(len(tagged_sents) * split_percentage)
    train_sents, test_sents = tagged_sents[split_size:], tagged_sents[:split_size]
    tagger = ClassifierBasedGermanTagger(train=train_sents)
    with open(taggerFile, 'wb') as output:
        pickle.dump(tagger, output)
    return json.dumps({'message': 'data ingested'})
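The global tagger and the json.dumps return value suggest this function serves as a web endpoint; a minimal wiring sketch, assuming Flask (the route path is an assumption, since the snippet does not show its routing):

from flask import Flask

app = Flask(__name__)

# Hypothetical route; adjust to match the real application.
app.add_url_rule('/ingest', 'ingest_train_data', ingest_train_data)

if __name__ == '__main__':
    app.run()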
Example #3
from datetime import datetime
import pickle
import random

import nltk
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger

print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Read in the TIGER corpus
corp = nltk.corpus.reader.conll.ConllCorpusReader(
    '.',
    'tiger_release_aug07.corrected.16012013.conll09',
    ['ignore', 'words', 'ignore', 'ignore', 'pos'],
    encoding='utf-8')

# Shuffle the sentences
tagged_sents = list(corp.tagged_sents())
random.shuffle(tagged_sents)

# Define the training set; hold out 10% for testing
split_perc = 0.1
split_size = int(len(tagged_sents) * split_perc)
train_sents, test_sents = tagged_sents[split_size:], tagged_sents[:split_size]

# Train
tagger = ClassifierBasedGermanTagger(train=train_sents)

# Print the accuracy
accuracy = tagger.evaluate(test_sents)
print(accuracy)

# Save the model
with open('nltk_german_classifier_data.pickle', 'wb') as f:
    pickle.dump(tagger, f, protocol=2)

print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
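The two timestamp prints bracket the run; if the elapsed training time is what matters, subtracting two datetime objects yields it directly, e.g.:

start = datetime.now()
# ... training, evaluation, and pickling as above ...
print('Elapsed:', datetime.now() - start)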
Example #4
# ConllCorpusReader column types include: WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE
import nltk
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger

# Choose the root directory that holds the corpus file
root = '/output/'  # for a server like FloydHub
# root = '/Users/Till/Dropbox/Deep Learning Udacity/deep-learning/KommasetzungAusweichordner'
fileid = 'tiger_release_aug07.corrected.16012013.conll09'
columntypes = ['ignore', 'words', 'ignore', 'ignore', 'pos']

#Load corpus
corp = nltk.corpus.ConllCorpusReader(root,
                                     fileid,
                                     columntypes,
                                     encoding='utf8')

# Train on the whole corpus (it normally reaches an accuracy of around 94-96%, depending on the text)
tagger = ClassifierBasedGermanTagger(train=corp.tagged_sents())
#tagger = UnigramTagger(corp.tagged_sents())

# In[8]:

# Part-of-speech tagging on the data set
# 'woerter' (the token list) is built in earlier notebook cells that are not shown here
idx = int(len(woerter) * 1.0)  # fraction of the tokens to tag; 1.0 means all of them
w_classes = tagger.tag(woerter[:idx])

# In[9]:

print(w_classes[:10])
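For orientation, the TIGER corpus is annotated with the STTS tagset, so the pairs printed above have roughly this shape (the tokens and tags below are illustrative, not captured output):

# e.g. [('Die', 'ART'), ('Kommission', 'NN'), ('hat', 'VAFIN'), ('entschieden', 'VVPP'), ...]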

# In[10]:

woerter_classes = []
Example #5
""" part of the script is adapted from https://datascience.blog.wzb.eu/2016/07/13/accurate-part-of-speech-tagging-of-german-texts-with-nltk/"""

import nltk
import numpy as np
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger
import pickle

corp = nltk.corpus.ConllCorpusReader(
    '.',
    'tiger_release_aug07.corrected.16012013.conll09',
    ['ignore', 'words', 'ignore', 'ignore', 'pos'],
    encoding='utf-8')
tagged_sents = corp.tagged_sents()
#random_perm = np.random.permutation(len(tagged_sents))
#tagged_sents = [tagged_sents[i] for i in random_perm]
#split_percentage = 0.1
#split_size = int(len(tagged_sents)*split_percentage)
#train_sents, test_sents = tagged_sents[split_size:],tagged_sents[:split_size]
tagger = ClassifierBasedGermanTagger(train=tagged_sents)
with open('nltk_german_classifier.pickle', 'wb') as f:
    pickle.dump(tagger, f, protocol=2)
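One caveat when reloading this file: pickle stores the tagger's class by module path, so the ClassifierBasedGermanTagger package must be importable in the loading environment as well; a minimal sketch:

import pickle
# The package must be importable, or pickle.load will fail with a ModuleNotFoundError.
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger

with open('nltk_german_classifier.pickle', 'rb') as f:
    tagger = pickle.load(f)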
Example #6
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger
import nltk
import random
import pickle

if __name__ == "__main__":
    corp = nltk.corpus.ConllCorpusReader(
        '.',
        'tiger_release_aug07.corrected.16012013.conll09',
        ['ignore', 'words', 'ignore', 'ignore', 'pos'],
        encoding='utf-8')
    tagged_sents = corp.tagged_sents()
    # to shuffle, materialize the lazy corpus view first: tagged_sents = list(tagged_sents)
    #random.shuffle(tagged_sents)

    # set a split size: use 90% for training, 10% for testing
    split_perc = 0.1
    split_size = int(len(tagged_sents) * split_perc)
    train_sents, test_sents = tagged_sents[split_size:], tagged_sents[:split_size]
    tagger = ClassifierBasedGermanTagger(train=train_sents)
    accuracy = tagger.evaluate(test_sents)
    print('Accuracy:', accuracy)
    print(tagger.tag(['Das', 'ist', 'ein', 'einfacher', 'Test']))
    with open('nltk_german_classifier_data.pickle', 'wb') as f:
        pickle.dump(tagger, f, protocol=2)
    print("SUCCESS!")