def preprocess(self):
    pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
    tweets_tkn, tweets_proc, labels = pp.process()
    assert len(tweets_tkn) == len(tweets_proc) == len(labels)

    # filter stopwords + normalize tokens
    lemmatizer = IWNLPWrapper(
        lemmatizer_path='daten/IWNLP.Lemmatizer_20170501.json')
    german_stopwords = set(stopwords.words('german'))  # load once, not per token
    lemmatized_tokens = []
    for tokens in tweets_tkn:
        tweet = []
        for token in tokens:
            if token.lower() in german_stopwords:
                continue
            try:
                # fall back to the surface form if IWNLP knows no lemma
                lemma = lemmatizer.lemmatize_plain(token, ignore_case=True)
                tweet.append(lemma[0] if lemma else token)
            except Exception as e:
                print(e)
        lemmatized_tokens.append(tweet)
    assert len(lemmatized_tokens) == len(tweets_proc) == len(labels)

    # write preprocessing results to file
    with open('./daten/tokenized_tweets_normalized.txt', 'w') as tokenized_writer, \
            open('./daten/preprocessed_tweets_normalized.txt', 'w') as preproc_writer:
        for x in range(len(lemmatized_tokens)):
            t_tweet = " ".join(lemmatized_tokens[x])
            p_tweet = " ".join(
                "{}/{}".format(tok, tag) for tok, tag in tweets_proc[x])
            tokenized_writer.write(t_tweet + "\t" + labels[x] + "\n")
            preproc_writer.write(p_tweet + "\t" + labels[x] + "\n")
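The core of this step is the lemmatize-or-fallback pattern around IWNLPWrapper. A minimal, self-contained sketch of just that pattern, independent of the autosarkasmus Pipeline (the JSON path and the example tokens are assumptions; the NLTK stopword corpus must have been fetched once via nltk.download('stopwords')):

from iwnlp.iwnlp_wrapper import IWNLPWrapper
from nltk.corpus import stopwords

lemmatizer = IWNLPWrapper(lemmatizer_path='daten/IWNLP.Lemmatizer_20170501.json')
german_stopwords = set(stopwords.words('german'))

tokens = ['Die', 'Gartenhäuser', 'gespielt']  # hypothetical input
normalized = []
for token in tokens:
    if token.lower() in german_stopwords:  # 'Die' is dropped here
        continue
    lemma = lemmatizer.lemmatize_plain(token, ignore_case=True)
    normalized.append(lemma[0] if lemma else token)  # keep token on a miss
print(normalized)  # e.g. ['Gartenhaus', 'spielen']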
from spacy.tokens import Token
from iwnlp.iwnlp_wrapper import IWNLPWrapper


class spaCyIWNLP(object):
    def __init__(self, lemmatizer_path, use_plain_lemmatization=False,
                 ignore_case=False):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.use_plain_lemmatization = use_plain_lemmatization
        self.ignore_case = ignore_case
        # register with a default rather than a getter: spaCy forbids
        # assigning to an extension that was registered with a getter only,
        # and __call__ below sets the attribute explicitly
        Token.set_extension('iwnlp_lemmas', default=None, force=True)

    def __call__(self, doc):
        for token in doc:
            token._.iwnlp_lemmas = self.get_lemmas(token)
        return doc

    def get_lemmas(self, token):
        if self.use_plain_lemmatization:
            return self.lemmatizer.lemmatize_plain(
                token.text, ignore_case=self.ignore_case)
        return self.lemmatizer.lemmatize(
            token.text, pos_universal_google=token.pos_)
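A usage sketch for the component, assuming the spaCy 2.x add_pipe(component) API and that a German model and the IWNLP JSON are installed locally:

import spacy

nlp = spacy.load('de_core_news_sm')
iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')
nlp.add_pipe(iwnlp)  # spaCy 2.x style; spaCy 3.x registers components by name

doc = nlp('Wir mögen Fußballspiele')
for token in doc:
    # iwnlp_lemmas is a list of candidate lemmas, or None on a miss
    print(token.text, token._.iwnlp_lemmas)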
from spacy.tokens import Token
from iwnlp.iwnlp_wrapper import IWNLPWrapper

# NOTE: the POS constants used below (ADJ, ADV, ..., PHRASE, NPHRASE) are
# assumed to be plain strings matching spaCy's token.pos_ values; PHRASE and
# NPHRASE are project-specific tags, not part of spaCy's universal tag set.


class LemmatizerPlus(object):
    def __init__(self, lemmatizer_path, nlp):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.stringstore = nlp.vocab.strings
        # self.matcher = PhraseMatcher(nlp.vocab)
        # register with a default rather than a getter: spaCy forbids
        # assigning to an extension registered with a getter only
        Token.set_extension('iwnlp_lemmas', default=None, force=True)
        self.lookup = {
            ('fast', ADV): 'fast',
        }

    def __call__(self, doc):
        for token in doc:
            token._.iwnlp_lemmas = self.lemmatize(token)
        return doc

    def lemmatize(self, token):
        """
        TODO: This doc is slightly outdated.

        Uses the IWNLP lemmatizer with a few enhancements for compound nouns
        and nouns with uncommon capitalization. Can also be used to lemmatize
        tokens with different POS tags. Do not use this function to lemmatize
        phrases.

        :param token: whitespace-stripped single token (str)
        :return: str  # TODO: tuple of type (str, bool)
                 value[0]: the lemma of the token if one can be derived, else None.
                 # TODO: value[1]: True if the token can be retrieved from the
                 # Wiktionary database as is, else False.
        """
        text = token.text.strip()
        pos = token.pos_

        # nothing to lemmatize here
        if pos in {PHRASE, NPHRASE, PUNCT, SPACE, SYM}:
            return text
        # lemmatizations are odd on DET and NUM, so better leave them alone
        if pos in {DET, NUM}:
            return None
        # Wiktionary has no POS PROPN
        if pos == PROPN:
            pos = NOUN

        # first look up the token for the given POS in the cache
        if (text, pos) in self.lookup:
            return self.lookup[(text, pos)]

        value = None
        # default IWNLP lemmatization
        lemm = self.lemmatizer.lemmatize(text, pos)
        # default lemmatization hit?
        if lemm:
            value = lemm[0]
        # default lemmatization miss?
        # apply some rules to derive a lemma from the original token (nouns only)
        elif pos == NOUN:
            # first try default noun capitalization
            lemm = self.lemmatizer.lemmatize(text.title(), pos)
            if lemm:
                value = lemm[0]
            else:
                # still no results: try all noun suffixes
                # TODO: search for a more efficient implementation
                text_low = text.lower()
                tolerance = 3
                for i in range(1, len(text) - tolerance):
                    # looks ugly, but avoids full capitalization
                    text_edit = text_low[i].upper() + text_low[i + 1:]
                    lemm = self.lemmatizer.lemmatize(text_edit, pos)
                    if lemm:
                        value = (text[:i] + lemm[0]).title()
                        break
        # last try: plain lemmatization for all remaining POS tags
        else:
            lemm = self.lemmatizer.lemmatize_plain(text, ignore_case=True)
            if lemm:
                value = lemm[0]

        if value and pos in {ADJ, ADP, ADV, AUX, CCONJ, CONJ, INTJ, PART,
                             PRON, SCONJ, VERB}:
            value = value.lower()

        if value:
            self.stringstore.add(value)
        self.lookup[(text, pos)] = value
        return value
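LemmatizerPlus wires into a pipeline the same way as spaCyIWNLP above. A sketch, again assuming spaCy 2.x and a locally available German model and IWNLP JSON; the sample sentence is only illustrative:

import spacy

nlp = spacy.load('de_core_news_sm')
lemmatizer_plus = LemmatizerPlus('data/IWNLP.Lemmatizer_20170501.json', nlp)
nlp.add_pipe(lemmatizer_plus)

doc = nlp('Die Gartenhäuser stehen hinter dem Zaun')
for token in doc:
    # compound nouns like 'Gartenhäuser' are covered by the suffix search
    print(token.text, token.pos_, token._.iwnlp_lemmas)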
import unittest

from iwnlp.iwnlp_wrapper import IWNLPWrapper


class IWNLPWrapperTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.iwnlp = IWNLPWrapper(
            lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')

    def test_lemmatize_plain_example1(self):
        predicted = self.iwnlp.lemmatize_plain('Hallo')
        self.assertEqual(predicted, ['Hallo'])

    def test_lemmatize_plain_example2(self):
        predicted = self.iwnlp.lemmatize_plain('Hallo', ignore_case=False)
        self.assertEqual(predicted, ['Hallo'])

    def test_lemmatize_plain_example3(self):
        predicted = self.iwnlp.lemmatize_plain('birne', ignore_case=False)
        self.assertEqual(predicted, None)

    def test_lemmatize_plain_example4(self):
        predicted = self.iwnlp.lemmatize_plain('birne', ignore_case=True)
        self.assertEqual(predicted, ['Birne'])

    def test_lemmatize_plain_example5(self):
        predicted = self.iwnlp.lemmatize_plain('gespielt')
        self.assertEqual(predicted, ['spielen'])

    def test_lemmatize_plain_example6(self):
        predicted = self.iwnlp.lemmatize_plain('schnell')
        self.assertCountEqual(predicted, ['schnellen', 'schnell'])

    def test_lemmatize_plain_example7(self):
        predicted = self.iwnlp.lemmatize_plain('Gartenhäuser')
        self.assertEqual(predicted, ['Gartenhaus'])

    def test_contains_entry_example1(self):
        self.assertEqual(self.iwnlp.contains_entry('Birne'), True)

    def test_contains_entry_example2(self):
        self.assertEqual(
            self.iwnlp.contains_entry('birne', ignore_case=False), False)

    def test_contains_entry_example3(self):
        self.assertEqual(
            self.iwnlp.contains_entry('birne', ignore_case=True), True)

    def test_contains_entry_example4(self):
        self.assertEqual(self.iwnlp.contains_entry('groko'), False)

    def test_contains_entry_example5(self):
        self.assertEqual(self.iwnlp.contains_entry('GroKo'), True)

    def test_contains_entry_example6(self):
        self.assertEqual(
            self.iwnlp.contains_entry('groko', ignore_case=True), True)

    def test_contains_entry_example7(self):
        self.assertEqual(
            self.iwnlp.contains_entry('groko', pos='Noun'), False)

    def test_contains_entry_example8(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='X'), False)

    def test_contains_entry_example9(self):
        self.assertEqual(
            self.iwnlp.contains_entry('groko', pos='AdjectivalDeclension'),
            False)

    def test_contains_entry_example10(self):
        self.assertEqual(
            self.iwnlp.contains_entry('groko', pos=["Noun", "X"],
                                      ignore_case=True), True)

    def test_lemmatize_example1(self):
        predicted = self.iwnlp.lemmatize('Lkws', pos_universal_google='NOUN')
        self.assertEqual(predicted, ['Lkw'])

    def test_lemmatize_example2(self):
        predicted = self.iwnlp.lemmatize('gespielt',
                                         pos_universal_google='VERB')
        self.assertEqual(predicted, ['spielen'])

    def test_get_lemmas_example1(self):
        predicted = self.iwnlp.get_lemmas('groko', pos=["Noun", "X"],
                                          ignore_case=True)
        self.assertEqual(predicted, ['GroKo'])
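To execute the suite directly (the IWNLP JSON must already be downloaded to data/, as setUpClass above assumes), append a standard unittest entry point to the test module:

if __name__ == '__main__':
    unittest.main()

Alternatively, python -m unittest discover picks the tests up without the guard.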