from pattern.de import singularize, conjugate, predicative


def lemma_via_patternlib(token, pos):
    if pos == 'NP':  # singularize noun
        return singularize(token)
    elif pos.startswith('V'):  # get infinitive of verb
        return conjugate(token)
    elif pos.startswith('ADJ') or pos.startswith('ADV'):  # get base form of adjective or adverb
        return predicative(token)
    return token
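# Minimal usage sketch (assumption: pattern.de is installed; the sample
# tokens and POS tags below are illustrative, not taken from the original):
if __name__ == '__main__':
    for token, pos in [(u'Häuser', 'NP'), (u'neugierige', 'ADJ'), (u'ging', 'V')]:
        print('%s -> %s' % (token, lemma_via_patternlib(token, pos)))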
def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("großer" => "groß").
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, pred, attr in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
        if tag == "a":
            if de.predicative(attr) == pred:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.98)
    print("pattern.de.predicative()")
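# Interactive check of the behaviour the test above asserts ("großer" => "groß");
# a sketch, assuming pattern.de is importable at the prompt:
# >>> from pattern.de import predicative
# >>> predicative(u"großer")   # expected base form: u"groß"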
import nltk
import treetaggerwrapper as ttw
from pattern.de import parse, split, predicative

# pprint(tt_en.tag('Does this thing work?'))
tagger = ttw.TreeTagger(TAGLANG='de', TAGDIR='/home/niklas/treetagger/')
# satz = u'Dies ist ein Testsatz.'
# print(type(satz))
# satzu = satz.decode('utf-8')
# tags = tagger.tag_text(satz)
# pprint.pprint(tags)

# read the raw text and parse it with pattern.de, using the STTS tagset
datei = open('196.txt', 'r')
dat = datei.read()
s = parse(dat, tagset='STTS')
s = split(s)
print(s.sentences[0])
print(predicative('neugierige'))

with open('196.txt', 'r') as openfile:
    for line in openfile:
        # note: nltk.tag.brill.BrillTagger expects an initial tagger and a
        # list of transformation rules, not a raw text line, so the original
        # call is left commented out:
        # nltk.tag.brill.BrillTagger(line)
        pass

# datu = dat.decode('utf-8')
# print(tagger.tag_text(dat))
# print(datu)
# tags = tagger.TagText(datu)
# for tag in tags:
#     print(tag)

datei.close()

# for dirpath, dirs, files in os.walk('../temp'):
#     for filename in fnmatch.filter(files, '*.txt'):
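# A minimal sketch of walking the STTS-tagged parse built above, under the
# assumption that pattern's split() returns a Text whose Sentence objects
# expose .words with .string and .type attributes (this loop is illustrative,
# not part of the original script):
for sentence in s.sentences:
    for word in sentence.words:
        print('%s/%s' % (word.string, word.type))  # token and its STTS tag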
# run with PYTHONIOENCODING=utf-8
import sys
import re
import codecs
# not used, but possibly interesting: http://www.nltk.org/
# http://www.clips.ua.ac.be/pages/pattern-de
from pattern.de import lemma, tag, predicative, singularize

# possible parts of speech:
# PRP$, FW, VBN, WDT, JJ, WP, DT, RP, NN, TO, PRP,
# RB, NNS, NNP, VB, WRB, CC, LS, CD, IN, MD, UH
part_of_speech_command = {
    'PRP$': lambda word: predicative(word),  # possessive pronouns
    'VBN': lambda word: lemma(word),         # verbs (past participle)
    'DT': lambda word: predicative(word),    # determiners
    'VB': lambda word: lemma(word),          # verbs
    'NN': lambda word: singularize(word),    # nouns
    'JJ': lambda word: predicative(word)     # adjectives
}

pattern_word = re.compile('[a-zA-Z]')
pattern_punctuation = re.compile(ur'[—\-|«»…–<>]')


def transform(tagword):
    word = tagword[0]
    part = tagword[1]
    # if part == 'VBN':
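# Hypothetical completion sketch: the original transform() body is truncated
# above, so this separate helper only illustrates one way to dispatch on
# part_of_speech_command using pattern.de's tag(); the names and the sample
# sentence below are assumptions, not the original code.
def transform_sketch(tagword):
    word, part = tagword
    normalize = part_of_speech_command.get(part)
    return normalize(word) if normalize is not None else word

# e.g.: [transform_sketch(tw) for tw in tag(u'Die neugierigen Kinder lasen alte Bücher')]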