def _execute_pos_tagger_training(check_pos_tagger_file_exists):
    """Train a ClassifierBasedGermanTagger on the TIGER corpus and pickle it.

    Derived from
    https://datascience.blog.wzb.eu/2016/07/13/accurate-part-of-speech-tagging-of-german-texts-with-nltk/
    and https://github.com/ptnplanet/NLTK-Contributions

    Args:
        check_pos_tagger_file_exists: path the pickled tagger is written to.

    Returns:
        The trained tagger instance.
    """
    # All dependencies are imported locally so the surrounding module does not
    # need them at import time. The original body imported nltk and the tagger
    # class here but silently relied on `random` and `pickle` coming from
    # module scope — import them locally as well so the function is
    # self-contained.
    import pickle
    import random

    import nltk

    from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger

    print(
        "Download corpus from https://www.ims.uni-stuttgart.de/documents/ressourcen/korpora/TIGERCorpus/download/start.html!"
    )
    # TIGER release is a CoNLL-2009 file: column 2 holds the word form,
    # column 5 the POS tag; all other columns are ignored.
    corp = nltk.corpus.ConllCorpusReader(
        '.', 'tiger_release_aug07.corrected.16012013.conll09',
        ['ignore', 'words', 'ignore', 'ignore', 'pos'],
        encoding='utf-8')

    # Materialize and shuffle so the train/test split is random.
    tagged_sents = list(corp.tagged_sents())
    random.shuffle(tagged_sents)

    # set a split size: use 90% for training, 10% for testing
    split_perc = 0.1
    split_size = int(len(tagged_sents) * split_perc)
    train_sents, test_sents = tagged_sents[split_size:], tagged_sents[:split_size]

    tagger = ClassifierBasedGermanTagger(train=train_sents)
    # NOTE(review): `evaluate` is deprecated in newer NLTK releases in favour
    # of `accuracy` — kept as-is for compatibility with the pinned version.
    print('Trained tagger result: ', tagger.evaluate(test_sents))

    # protocol=2 keeps the pickle loadable from Python 2 as well.
    with open(check_pos_tagger_file_exists, 'wb') as f:
        pickle.dump(tagger, f, protocol=2)
    return tagger
def ingest_train_data():
    """Read the TIGER corpus, retrain the German POS tagger and pickle it.

    Rebinds the module-level ``tagger`` from the CoNLL-09 TIGER export,
    serialises the trained model to disk, and reports success as JSON.
    """
    global tagger

    pickle_path = 'SerializedTagger.pickle'

    # TIGER CoNLL-09 column layout: word form in column 2, POS tag in
    # column 5; everything else is ignored.
    reader = ConllCorpusReader(
        '../templates',
        'tiger_release_aug07.corrected.16012013.conll09',
        ['ignore', 'words', 'ignore', 'ignore', 'pos'],
        encoding='utf-8')
    sentences = reader.tagged_sents()

    # Hold out the first 25% of the corpus; train on the remaining 75%.
    holdout = int(len(sentences) * 0.25)
    train_sents = sentences[holdout:]
    test_sents = sentences[:holdout]

    tagger = ClassifierBasedGermanTagger(train=train_sents)

    with open(pickle_path, 'wb') as output:
        pickle.dump(tagger, output)

    return json.dumps({'message': 'data ingested'})
# Log the start time of the training run.
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Read the TIGER corpus (CoNLL-09 export: word form in column 2,
# POS tag in column 5; other columns ignored).
corp = nltk.corpus.reader.conll.ConllCorpusReader(
    '.', 'tiger_release_aug07.corrected.16012013.conll09',
    ['ignore', 'words', 'ignore', 'ignore', 'pos'],
    encoding='utf-8')

# Materialize and shuffle the sentences so the split below is random.
tagged_sents = list(corp.tagged_sents())
random.shuffle(tagged_sents)

# Hold out 10% of the sentences for evaluation, train on the rest.
split_perc = 0.1
split_size = int(len(tagged_sents) * split_perc)
train_sents = tagged_sents[split_size:]
test_sents = tagged_sents[:split_size]

# Train the classifier-based German tagger.
tagger = ClassifierBasedGermanTagger(train=train_sents)

# Report accuracy on the held-out sentences.
accuracy = tagger.evaluate(test_sents)
print(accuracy)

# Persist the model (protocol=2 keeps it loadable from Python 2 too).
with open('nltk_german_classifier_data.pickle', 'wb') as f:
    pickle.dump(tagger, f, protocol=2)

# Log the end time of the run.
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE) '''
# NOTE(review): everything up to the closing ''' above appears to be the tail
# of a triple-quoted block whose opening quote lies outside this excerpt
# (COLUMN_TYPES matches a line from nltk's ConllCorpusReader source) — the
# executable code starts here. TODO confirm against the full file.

# Choose the root
root = '/output/'  # For a server like FloydHub
# root = '/Users/Till/Dropbox/Deep Learning Udacity/deep-learning/KommasetzungAusweichordner'

fileid = 'tiger_release_aug07.corrected.16012013.conll09'
# Column layout: word form in column 2, POS tag in column 5.
columntypes = ['ignore', 'words', 'ignore', 'ignore', 'pos']

# Load corpus
corp = nltk.corpus.ConllCorpusReader(root, fileid, columntypes, encoding='utf8')

# Train on whole corpus (normally it has an accuracy around 94% - 96% depending on text)
tagger = ClassifierBasedGermanTagger(train=corp.tagged_sents())
# tagger = UnigramTagger(corp.tagged_sents())

# In[8]:

# Part-of-speech tag the token list `woerter` (German for "words");
# presumably the pre-tokenised input defined elsewhere in the file — verify.
# The * 1.0 factor keeps the whole list; it looks like a leftover knob for
# tagging only a fraction of the data.
idx = int(len(woerter) * 1.0)
w_classes = tagger.tag(woerter[:idx])

# In[9]:

print(w_classes[:10])

# In[10]:

# Accumulator for per-word classes, filled after this excerpt ends.
woerter_classes = []
""" part of the script is adapted from https://datascience.blog.wzb.eu/2016/07/13/accurate-part-of-speech-tagging-of-german-texts-with-nltk/""" import nltk import numpy as np from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger import pickle corp = nltk.corpus.ConllCorpusReader( '.', 'tiger_release_aug07.corrected.16012013.conll09', ['ignore', 'words', 'ignore', 'ignore', 'pos'], encoding='utf-8') tagged_sents = corp.tagged_sents() #random_perm = np.random.permutation(len(tagged_sents)) #tagged_sents = [tagged_sents[i] for i in random_perm] #split_percentage = 0.1 #split_size = int(len(tagged_sents)*split_percentage) #train_sents, test_sents = tagged_sents[split_size:],tagged_sents[:split_size] tagger = ClassifierBasedGermanTagger(train=tagged_sents) with open('nltk_german_classifier.pickle', 'wb') as f: pickle.dump(tagger, f, protocol=2)
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger
import nltk
import random
import pickle

if __name__ == "__main__":
    # TIGER CoNLL-09 export: word form in column 2, POS tag in column 5;
    # remaining columns ignored.
    corp = nltk.corpus.ConllCorpusReader(
        '.', 'tiger_release_aug07.corrected.16012013.conll09',
        ['ignore', 'words', 'ignore', 'ignore', 'pos'],
        encoding='utf-8')
    tagged_sents = corp.tagged_sents()
    # Shuffling is intentionally left disabled, so the split is deterministic.
    # random.shuffle(tagged_sents)

    # set a split size: use 90% for training, 10% for testing
    holdout = int(len(tagged_sents) * 0.1)
    train_sents = tagged_sents[holdout:]
    test_sents = tagged_sents[:holdout]

    tagger = ClassifierBasedGermanTagger(train=train_sents)
    # Evaluate on the held-out 10% (result computed but not printed,
    # matching the original script).
    accuracy = tagger.evaluate(test_sents)

    # Smoke-test the trained model on a short sentence.
    print(tagger.tag(['Das', 'ist', 'ein', 'einfacher', 'Test']))

    # protocol=2 keeps the pickle loadable from Python 2 as well.
    with open('nltk_german_classifier_data.pickle', 'wb') as f:
        pickle.dump(tagger, f, protocol=2)

    print("SUCCESS!")