예제 #1
0
def lemma_via_patternlib(token, pos):
    """Return a base form of ``token`` selected by its part-of-speech tag.

    ``pos`` equal to ``'NP'`` routes the token through ``singularize``;
    a tag starting with ``'V'`` routes it through ``conjugate``; a tag
    starting with ``'ADJ'`` or ``'ADV'`` routes it through ``predicative``.
    Any other tag returns ``token`` unchanged.
    """
    # Noun: reduce to singular.
    if pos == 'NP':
        return singularize(token)

    # Verb: reduce to the infinitive.
    if pos.startswith('V'):
        return conjugate(token)

    # Adjective or adverb: reduce to the base form.
    if pos.startswith(('ADJ', 'ADV')):
        return predicative(token)

    return token
예제 #2
0
 def test_predicative(self):
     """Assert the accuracy of the predicative algorithm ("großer" => "groß").

     Only corpus rows whose tag is "a" are scored; the share of rows for
     which ``de.predicative(attr)`` equals ``pred`` must exceed 0.98.
     """
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-de-celex.csv")
     correct = total = 0
     for tag, pred, attr in Datasheet.load(corpus):
         if tag != "a":
             continue
         total += 1
         if de.predicative(attr) == pred:
             correct += 1
     self.assertTrue(float(correct) / total > 0.98)
     print("pattern.de.predicative()")
예제 #3
0
 def test_predicative(self):
     """Assert the accuracy of the predicative algorithm ("großer" => "groß").

     Each corpus row unpacks to ``(tag, pred, attr)``. Rows tagged "a" are
     scored: ``i`` counts rows where ``de.predicative(attr)`` equals ``pred``
     and ``n`` counts all scored rows. The ratio must exceed 0.98.
     """
     from pattern.db import Datasheet
     i, n = 0, 0
     for tag, pred, attr in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
         if tag == "a":
             if de.predicative(attr) == pred:
                 i += 1
             n += 1
     self.assertTrue(float(i) / n > 0.98)
     # Call form with a single argument: prints the same text under Python 2
     # and, unlike the print statement, is also valid syntax under Python 3.
     print("pattern.de.predicative()")
예제 #4
0
# pprint(tt_en.tag('Does this thing work?'))

tagger = ttw.TreeTagger(TAGLANG='de', TAGDIR='/home/niklas/treetagger/')
# satz = u'Dies ist ein Testsatz.'
# print type(satz)
# satzu = satz.decode('utf-8')
# tags = tagger.tag_text(satz)
# pprint.pprint(tags)

# Read the whole text up front. The context manager closes the handle even if
# a later step raises, and before the same file is reopened further down.
with open('196.txt', 'r') as datei:
    dat = datei.read()

# Parse the text with the STTS tagset, split the result, and show the first
# sentence. Single-argument print(...) behaves the same on Python 2 and 3.
s = parse(dat, tagset='STTS')
s = split(s)
print(s.sentences[0])
print(predicative('neugierige'))

with open('196.txt', 'r') as openfile:
    for line in openfile:
        # NOTE(review): the object built here is discarded and each raw line is
        # passed as the constructor's first argument — presumably an unfinished
        # experiment; confirm the intended BrillTagger usage.
        nltk.tag.brill.BrillTagger(line)

# datu = dat.decode('utf-8')
# print tagger.tag_text(dat)
# print datu
# tags = tagger.TagText(datu)
# # for tag in tags:
# #     print tag

# for dirpath, dirs, files in os.walk('../temp'):
#     for filename in fnmatch.filter(files, '*.txt'):
예제 #5
0
# run with PYTHONIOENCODING=utf-8

import sys
import re
import codecs

# not used, but possibly interesting http://www.nltk.org/

# http://www.clips.ua.ac.be/pages/pattern-de
from pattern.de import lemma, tag, predicative, singularize

# possible parts of speech:
# PRP$, FW, VBN, WDT, JJ, WP, DT, RP, NN, TO, PRP,
# RB, NNS, NNP, VB, WRB, CC, LS, CD, IN, MD, UH
#
# Maps a part-of-speech tag to the pattern.de function applied to a word
# carrying that tag. Tags from the list above that have no entry here
# (e.g. NNS, RB) have no handler in this table.
part_of_speech_command = {
    'PRP$': lambda word: predicative(word),  # pronouns
    'VBN': lambda word: lemma(word),  # verbs
    'DT': lambda word: predicative(word),  # pronouns
    'VB': lambda word: lemma(word),  # verbs
    'NN': lambda word: singularize(word),  # nouns
    'JJ': lambda word: predicative(word)  # NOTE(review): original note said "prepositions"; JJ looks like the adjective tag — confirm
}

# Matches a single ASCII letter.
pattern_word = re.compile('[a-zA-Z]')
# Matches a single dash, hyphen, pipe, guillemet, ellipsis or angle bracket.
# Written as u'...' with a doubled backslash instead of ur'...': the ur prefix
# is a syntax error on Python 3, while both spellings produce the same pattern
# text (backslash followed by hyphen) on Python 2.
pattern_punctuation = re.compile(u'[—\\-|«»…–<>]')


def transform(tagword):
    word = tagword[0]
    part = tagword[1]
    # if part == 'VBN':