def __init__(self):
    with Timer() as self.model_load_time:
        from iwnlp.iwnlp_wrapper import IWNLPWrapper
        from stts2upos import conv_table

        data_loc = "/opt/iwnlp/IWNLP.Lemmatizer_20181001.json"
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=data_loc)

    def myprocessor(myinput):
        mydoc = string2doc(myinput)
        for sent in mydoc:
            for tok in sent:
                try:
                    matching_lemmas = self.lemmatizer.lemmatize(
                        tok.word, conv_table.get(tok.xpos))
                    if matching_lemmas is None:
                        tok.lemma = "_"
                    # elif len(matching_lemmas) > 1:
                    #     print("lots o lemmas!", matching_lemmas)
                    else:
                        # unclear how to select the best alternative,
                        # so just use the first item in the list
                        tok.lemma = matching_lemmas[0]
                except ValueError:
                    tok.lemma = "_"
                # don't repeat gold POS in the output
                tok.hide_fields(HIDDEN_FIELDS)
        return mydoc

    self.processor = myprocessor

class Lemmatizer(BaseEstimator):
    def __init__(self, lang):
        self.lang = lang
        self.nlp = spacy.load(lang)
        current_dir = os.path.dirname(__file__)
        self.iwnlp = IWNLPWrapper(
            lemmatizer_path=current_dir + '/../resources/IWNLP.Lemmatizer_20170501.json')

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        ret_list = []
        for row in X:
            doc = self.nlp(row)
            # workaround until a German lemmatizer is integrated into spaCy
            if self.lang == 'de':
                new_row = self._lemmatize_german(doc)
            else:
                new_row = ' '.join([word.lemma_ for word in doc])
            ret_list.append(new_row)
        return ret_list

    def _lemmatize_german(self, doc):
        new_row = ''
        for word in doc:
            lemmatized = self.iwnlp.lemmatize(str(word), word.pos_)
            if lemmatized is not None:
                new_row += ' ' + lemmatized[0]
            else:
                new_row += ' ' + str(word)
        return new_row

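# A minimal usage sketch for the scikit-learn-style Lemmatizer above. It assumes
# the spaCy 'de' model and the IWNLP JSON file sit at the paths the constructor
# expects; the sample sentence is illustrative only.
texts = ["Die Katzen spielten im Garten."]
lemmatizer = Lemmatizer(lang='de')
lemmas = lemmatizer.fit(texts).transform(texts)
print(lemmas)  # one whitespace-joined string of lemmas per input row
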
def preprocess(self):
    tokenizedTweets_writer = open(
        './daten/tokenized_tweets_normalized.txt', 'w')
    preprocTweets_writer = open(
        './daten/preprocessed_tweets_normalized.txt', 'w')

    pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
    tweets_tkn, tweets_proc, labels = pp.process()
    assert len(tweets_tkn) == len(tweets_proc) == len(labels)

    # filter stopwords + normalize tokens
    lemmatizer = IWNLPWrapper(
        lemmatizer_path='daten/IWNLP.Lemmatizer_20170501.json')
    lemmatized_tokens = []
    for x in range(len(tweets_tkn)):
        tweet = []
        for token in tweets_tkn[x]:
            if token.lower() in stopwords.words('german'):
                continue
            try:
                lemma = lemmatizer.lemmatize_plain(token, ignore_case=True)
                if lemma:
                    tweet.append(lemma[0])
                else:
                    tweet.append(token)
            except Exception as e:
                print(e)
        lemmatized_tokens.append(tweet)
    assert len(lemmatized_tokens) == len(tweets_proc) == len(labels)

    # write preprocessing results to file
    for x in range(len(lemmatized_tokens)):
        t_tweet = " ".join(lemmatized_tokens[x])
        p_tweet = " ".join(str(tok) + "/" + str(pos)
                           for tok, pos in tweets_proc[x])
        label = labels[x]
        tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n")
        preprocTweets_writer.write(p_tweet + "\t" + label + "\n")

    tokenizedTweets_writer.close()
    preprocTweets_writer.close()

class spaCyIWNLP(object):
    def __init__(self, lemmatizer_path, use_plain_lemmatization=False,
                 ignore_case=False):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.use_plain_lemmatization = use_plain_lemmatization
        self.ignore_case = ignore_case
        # register a writable extension; __call__ assigns the lemmas below
        # (an extension registered with a getter would be read-only)
        Token.set_extension('iwnlp_lemmas', default=None, force=True)

    def __call__(self, doc):
        for token in doc:
            token._.iwnlp_lemmas = self.get_lemmas(token)
        return doc

    def get_lemmas(self, token):
        if self.use_plain_lemmatization:
            return self.lemmatizer.lemmatize_plain(
                token.text, ignore_case=self.ignore_case)
        else:
            return self.lemmatizer.lemmatize(
                token.text, pos_universal_google=token.pos_)

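# Usage sketch for the pipeline component above, in spaCy 2.x style (add_pipe
# with a component instance); the model name and lemmatizer path are assumptions.
import spacy

nlp = spacy.load('de')
nlp.add_pipe(spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json'))
doc = nlp('Wir mögen Fußballspiele')
for token in doc:
    print(token.text, token._.iwnlp_lemmas)  # e.g. mögen -> ['mögen']
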
class SpacyWrapper(object):
    def __init__(self):
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)
        self.lemmatizer = IWNLPWrapper(
            lemmatizer_path='data/IWNLP/IWNLP.Lemmatizer_20170501.json')
        self.sentiws = SentiWSWrapper(sentiws_path='data/sentiws')
        self.logger.debug('Loading Spacy model')
        self.nlp = spacy.load('de')
        self.logger.debug('Spacy model loaded')

    def process_sentence(self, sentence):
        result = self.nlp(sentence)
        tokens = []
        dependencies = []
        for token in result:
            iwnlp_lemma = self.lemmatizer.lemmatize(
                token.text, pos_universal_google=token.pos_)
            sentiws = self.sentiws.determine(
                token.text, pos_universal_google=token.pos_)
            token_model = Token(token.i + 1,
                                text=token.text,
                                spacy_pos_stts=token.tag_,
                                spacy_pos_universal_google=token.pos_,
                                iwnlp_lemma=iwnlp_lemma,
                                spacy_ner_type=token.ent_type_,
                                spacy_ner_iob=token.ent_iob_,
                                spacy_is_punct=token.is_punct,
                                spacy_is_space=token.is_space,
                                spacy_like_num=token.like_num,
                                spacy_like_url=token.like_url,
                                spacy_shape=token.shape_,
                                polarity_sentiws=sentiws)
            tokens.append(token_model)
            dependency_model = Dependency(token.i + 1, token.dep_,
                                          token.head.i + 1)
            dependencies.append(dependency_model)
            # print(token_model.token_index_in_sentence,
            #       token_model.text.encode('utf-8'),
            #       format_iwnlp_lemma(token_model.iwnlp_lemma),
            #       token_model.spacy_pos_stts,
            #       token_model.spacy_pos_universal_google,
            #       token_model.spacy_ner_type, token_model.spacy_ner_iob)
        return {'tokens': tokens, 'dependencies': dependencies}

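# Hedged usage sketch for SpacyWrapper.process_sentence; it assumes the IWNLP
# and SentiWS data files exist at the hard-coded paths and that the custom
# Token model exposes its constructor keywords as attributes.
wrapper = SpacyWrapper()
parsed = wrapper.process_sentence('Das Essen war hervorragend.')
for t in parsed['tokens']:
    print(t.text, t.iwnlp_lemma, t.polarity_sentiws)
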
@classmethod
def load(cls, lemmatizer_path):
    lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
    return cls(lemmatizer)

class LemmatizerPlus(object):
    def __init__(self, lemmatizer_path, nlp):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.stringstore = nlp.vocab.strings
        # self.matcher = PhraseMatcher(nlp.vocab)
        # register a writable extension; __call__ assigns the lemmas below
        # (an extension registered with a getter would be read-only)
        Token.set_extension('iwnlp_lemmas', default=None, force=True)
        self.lookup = {
            ('fast', ADV): 'fast',
        }

    def __call__(self, doc):
        for token in doc:
            token._.iwnlp_lemmas = self.lemmatize(token)
        return doc

    def lemmatize(self, token):
        """
        TODO: This doc is slightly outdated
        This function uses the IWNLP lemmatizer with a few enhancements for
        compound nouns and nouns with uncommon capitalization. It can also be
        used to lemmatize tokens with different POS tags. Do not use this
        function to lemmatize phrases.
        :param token: whitespace-stripped single token (str)
        :return: str
        # TODO: tuple of type (str, bool)
        # value[0]: The lemma of the token if a lemma can be derived, else None.
        # value[1]: True if the token can be retrieved from the Wiktionary
        #           database as is, else False.
        """
        text = token.text.strip()
        pos = token.pos_

        # nothing to lemmatize here
        if pos in {PHRASE, NPHRASE, PUNCT, SPACE, SYM}:
            return text
        # lemmatizations are odd on DET and NUM, so better leave them alone
        if pos in {DET, NUM}:
            return None
        # Wiktionary has no POS PROPN
        if pos == PROPN:
            pos = NOUN

        # first look up the token for the given POS in the dictionary
        if (text, pos) in self.lookup:
            return self.lookup[(text, pos)]

        value = None
        # default IWNLP lemmatization
        lemm = self.lemmatizer.lemmatize(text, pos)
        # default lemmatization hit?
        if lemm:
            value = lemm[0]
        # default lemmatization miss?
        # apply some rules to derive a lemma from the original token (nouns only)
        elif pos == NOUN:
            # first try default noun capitalization
            lemm = self.lemmatizer.lemmatize(text.title(), pos)
            if lemm:
                value = lemm[0]
            else:
                # still no results: try all noun suffixes
                # TODO: search for a more efficient implementation
                text_low = text.lower()
                tolerance = 3
                for i in range(1, len(text) - tolerance):
                    # looks ugly, but avoids full capitalization
                    text_edit = text_low[i].upper() + text_low[i + 1:]
                    lemm = self.lemmatizer.lemmatize(text_edit, pos)
                    if lemm:
                        value = (text[:i] + lemm[0]).title()
                        break
        # last try: plain lemmatization for all remaining POS tags
        else:
            lemm = self.lemmatizer.lemmatize_plain(text, ignore_case=True)
            if lemm:
                value = lemm[0]

        if value and pos in {ADJ, ADP, ADV, AUX, CCONJ, CONJ, INTJ, PART,
                             PRON, SCONJ, VERB}:
            value = value.lower()

        if value:
            self.stringstore.add(value)
            self.lookup[(text, pos)] = value
        return value

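# Usage sketch for LemmatizerPlus; the POS constants (ADV, NOUN, PHRASE, ...)
# referenced by the class are assumed to be defined elsewhere in its module,
# and the model name and lemmatizer path are assumptions.
import spacy

nlp = spacy.load('de')
nlp.add_pipe(LemmatizerPlus('data/IWNLP.Lemmatizer_20170501.json', nlp))
doc = nlp('Die Gartenhäuser wurden schnell verkauft.')
print([token._.iwnlp_lemmas for token in doc])
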
class Preprocess:
    # for lemmatization of German text
    nlp = spacy.load('de')

    # IWNLP German lemmatizations:
    dirname = os.path.dirname(__file__)
    iwnlp_file = os.path.join(dirname, 'data/IWNLP.Lemmatizer_20181001.json')
    # iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20181001.json', ignore_case=True)
    lemmatizer = IWNLPWrapper(lemmatizer_path=iwnlp_file)

    # add custom tokenizer
    nlp.tokenizer = custom_tokenizer(nlp)
    '''
    try:
        # add pipes
        nlp.add_pipe(iwnlp)
        # nlp.add_pipe(__set_custom_boundaries, before='parser')
    except Exception:
        pass
    '''

    stopwords_to_remove_from_default_set = [
        "schlecht", "mensch", "menschen", "beispiel", "gott", "jahr",
        "jahre", "jahren", "nicht", "uhr"
    ]
    for stopword in stopwords_to_remove_from_default_set:
        nlp.vocab[stopword].is_stop = False

    # spaCy token tags that will be removed during preprocessing
    tags_to_remove = [
        '$(', '$,', '$.', 'APPR', 'APPO', 'APPRART', 'APZR', 'ART', 'ITJ',
        'KOKOM', 'KON', 'KOUI', 'KOUS',  # 'CARD',
        'PDS', 'PAV', 'PROAV', 'PDAT', 'PIAT', 'PIDAT', 'PIS', 'PPER',
        'PPOSAT', 'PPOSS', 'PRELAT', 'PRELS', 'PRF', 'PTKA',  # 'PTKANT',
        'PTKVZ', 'PTKZU', 'PWAT', 'PWAV', 'PWS', 'TRUNC', 'XY', 'SP', 'WRP'
    ]

    def __init__(self, text, split_in_sentences=True, with_pos=False):
        '''
        :param text: input text
        :param split_in_sentences: split the text into sentences --> sub-arrays
            for sentences in the Preprocess result
        :param with_pos: if True, yield triples of (<start pos in original
            text>, <end pos in original text>, token), else only tokens
        '''
        self.text = text
        self.nlp_text = self.nlp(text)
        self.maintain_indeces = []
        self.noun_chunks = self.get_noun_chunks(cleaned=True, flattened=True)
        self.maintain_indeces.extend(index for index in self.noun_chunks
                                     if index not in self.maintain_indeces)
        self.named_entities = self.get_named_entities(flattened=True)
        self.maintain_indeces.extend(index for index in self.named_entities
                                     if index not in self.maintain_indeces)
        self.maintain_indeces.sort()
        self.preprocessed = self.preprocess(sentence_split=split_in_sentences,
                                            with_pos=with_pos)

    def __get_lemma(self, token):
        '''
        take the IWNLP lemma if one exists, else the spaCy lemma
        :param token: spaCy token
        :return: lemmatization
        '''
        # lemma_iwnlp_list = token._.iwnlp_lemmas
        lemma_iwnlp_list = self.lemmatizer.lemmatize_plain(token.text,
                                                           ignore_case=False)
        if lemma_iwnlp_list:
            lemma_iwnlp = lemma_iwnlp_list[0]
            # print(token, ":::", lemma_iwnlp_list[0])
            return lemma_iwnlp
        return token.lemma_

    def get_named_entities(self, only_indeces=True, flattened=False):
        '''
        return array of named entities (PER: person, LOC: location, ORG: named
        corporate, governmental, or other organizational entity, MISC:
        miscellaneous entities, e.g. events, nationalities, products, or
        works of art)
        :param only_indeces:
        :param flattened: return a 1d array only, else related entities are in
            sub-arrays
        :return: array with named entities
        '''
        if flattened:
            named_ents = [
                word.i if only_indeces else (word.i, word, ents.label_)
                for ents in self.nlp_text.ents for word in ents
            ]
        else:
            named_ents = [[
                word.i if only_indeces else (word.i, word, ents.label_)
                for word in ents
            ] for ents in self.nlp_text.ents]
        return named_ents

    def get_noun_chunks(self, only_indices=True, cleaned=True, flattened=False):
        '''
        return array of noun chunks/noun phrases of the text object
        :param only_indices:
        :param cleaned: noun phrases without stopwords and punctuation
        :param flattened: return a 1d array only, else related phrases are in
            sub-arrays
        :return: array with noun phrases
        '''
        # noun_words = [(word.i, word) for ent in text.noun_chunks for word in ent]
        # noun_words = [[(word.i, word) for word in ent] for ent in text.noun_chunks]
        if flattened:
            if cleaned:
                noun_words = [
                    word.i if only_indices else (word.i, word)
                    for ent in self.nlp_text.noun_chunks for word in ent
                    if self.__is_valid_token(word)
                ]
            else:
                noun_words = [
                    word.i if only_indices else (word.i, word)
                    for ent in self.nlp_text.noun_chunks for word in ent
                ]
        else:
            if cleaned:
                noun_words = [[
                    word.i if only_indices else (word.i, word)
                    for word in ent if self.__is_valid_token(word)
                ] for ent in self.nlp_text.noun_chunks]
            else:
                noun_words = [[
                    word.i if only_indices else (word.i, word)
                    for word in ent
                ] for ent in self.nlp_text.noun_chunks]
        return noun_words

    def __is_valid_token(self, token):
        '''
        checks whether a token is valid: no stopword, punctuation, or whitespace
        :param token: spaCy token
        :return: bool
        '''
        # nlp(token.lower_)[0] works around a spaCy bug: e.g. "Der" would not
        # be recognized as a stopword, but "der" would
        if (not self.nlp(token.lower_)[0].is_stop and not token.is_punct
                and not token.is_space):
            return True
        return False

    def __tokenize_words(self, doc, with_pos=False):
        '''
        tokenizes the text and removes unimportant tokens
        :param doc: input spaCy doc
        :param with_pos: if True, yield triples of (<start pos in original
            text>, <end pos in original text>, token), else only tokens
        :return: 1d array of tokens
        '''
        tokenized_text = [
            (token.idx, token.idx + len(token), self.__get_lemma(token).lower())
            if with_pos else self.__get_lemma(token).lower()
            for token in doc
            if (self.__is_valid_token(token)
                and token.tag_ not in self.tags_to_remove)
            or token.i in self.maintain_indeces
        ]
        return tokenized_text

    def __tokenize_to_list_sentences(self, with_pos=False):
        '''
        tokenizes the text and removes unimportant tokens, split by sentences
        :param with_pos: if True, yield triples of (<start pos in original
            text>, <end pos in original text>, token), else only tokens
        :return: 2d array of tokens in sub-arrays (sentences)
        '''
        filtered_text = []
        for sentence in self.nlp_text.sents:
            filtered_sentence = self.__tokenize_words(sentence,
                                                      with_pos=with_pos)
            filtered_text.append(filtered_sentence)
        return filtered_text

    def preprocess(self, sentence_split=True, with_pos=False):
        '''
        preprocess the text and remove unimportant tokens
        :param sentence_split: split by sentences
        :param with_pos: if True, yield triples of (<start pos in original
            text>, <end pos in original text>, token), else only tokens
        :return: 1d or 2d array with the preprocessed text
        '''
        if sentence_split:
            preprocessed_text = self.__tokenize_to_list_sentences(
                with_pos=with_pos)
        else:
            preprocessed_text = self.__tokenize_words(self.nlp_text,
                                                      with_pos=with_pos)
        return preprocessed_text

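# Hedged example of driving the Preprocess class above; with
# split_in_sentences=True the result is one sub-list of lemmatized, filtered
# tokens per sentence (exact output depends on the spaCy model and stopword list).
pre = Preprocess('Die Katze schläft. Der Hund bellt laut.',
                 split_in_sentences=True, with_pos=False)
print(pre.preprocessed)  # e.g. [['katze', 'schlafen'], ['hund', 'bellen', 'laut']]
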
class IWNLPWrapperTest(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.iwnlp = IWNLPWrapper(
            lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')

    def test_lemmatize_plain_example1(self):
        predicted = self.iwnlp.lemmatize_plain('Hallo')
        self.assertEqual(predicted, ['Hallo'])

    def test_lemmatize_plain_example2(self):
        predicted = self.iwnlp.lemmatize_plain('Hallo', ignore_case=False)
        self.assertEqual(predicted, ['Hallo'])

    def test_lemmatize_plain_example3(self):
        predicted = self.iwnlp.lemmatize_plain('birne', ignore_case=False)
        self.assertEqual(predicted, None)

    def test_lemmatize_plain_example4(self):
        predicted = self.iwnlp.lemmatize_plain('birne', ignore_case=True)
        self.assertEqual(predicted, ['Birne'])

    def test_lemmatize_plain_example5(self):
        predicted = self.iwnlp.lemmatize_plain('gespielt')
        self.assertEqual(predicted, ['spielen'])

    def test_lemmatize_plain_example6(self):
        predicted = self.iwnlp.lemmatize_plain('schnell')
        self.assertCountEqual(predicted, ['schnellen', 'schnell'])

    def test_lemmatize_plain_example7(self):
        predicted = self.iwnlp.lemmatize_plain('Gartenhäuser')
        self.assertEqual(predicted, ['Gartenhaus'])

    def test_contains_entry_example1(self):
        self.assertEqual(self.iwnlp.contains_entry('Birne'), True)

    def test_contains_entry_example2(self):
        self.assertEqual(self.iwnlp.contains_entry('birne', ignore_case=False), False)

    def test_contains_entry_example3(self):
        self.assertEqual(self.iwnlp.contains_entry('birne', ignore_case=True), True)

    def test_contains_entry_example4(self):
        self.assertEqual(self.iwnlp.contains_entry('groko'), False)

    def test_contains_entry_example5(self):
        self.assertEqual(self.iwnlp.contains_entry('GroKo'), True)

    def test_contains_entry_example6(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', ignore_case=True), True)

    def test_contains_entry_example7(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='Noun'), False)

    def test_contains_entry_example8(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='X'), False)

    def test_contains_entry_example9(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='AdjectivalDeclension'), False)

    def test_contains_entry_example10(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos=["Noun", "X"], ignore_case=True), True)

    def test_lemmatize_example1(self):
        predicted = self.iwnlp.lemmatize('Lkws', pos_universal_google='NOUN')
        self.assertEqual(predicted, ['Lkw'])

    def test_lemmatize_example2(self):
        predicted = self.iwnlp.lemmatize('gespielt', pos_universal_google='VERB')
        self.assertEqual(predicted, ['spielen'])

    def test_get_lemmas_example1(self):
        predicted = self.iwnlp.get_lemmas('groko', pos=["Noun", "X"], ignore_case=True)
        self.assertEqual(predicted, ['GroKo'])

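# The suite above can be run directly, assuming the IWNLP JSON file is present
# under data/:
if __name__ == '__main__':
    unittest.main()
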
from iwnlp.iwnlp_wrapper import IWNLPWrapper

output = 'output/Baselist.txt'
path = "Baselist.txt"

lemmatizer = IWNLPWrapper(lemmatizer_path='IWNLP.Lemmatizer_20170501.json')

with open(path, 'r') as read_file:
    data = read_file.read().splitlines()

tokens = []
lemmatized = []
tags = []
for line in data:
    parts = line.split('|')
    tokens.append(parts[0].lower())
    tags.append(parts[1])

for token, tag in zip(tokens, tags):
    lemma = lemmatizer.lemmatize(token, pos_universal_google=tag)
    lemmatized.append(lemma)

# for token, lemma in zip(tokens, lemmatized):
#     print(token, lemma)

with open(output, 'w') as write_file:
    for i, lemma in enumerate(lemmatized):
        if lemma is None:
            write_file.write(tokens[i] + '|' + tags[i] + "\n")
        else:
            # the original snippet breaks off here; writing the first lemma
            # alongside token and tag is an assumed completion
            write_file.write(tokens[i] + '|' + tags[i] + '|' + lemma[0] + "\n")

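# Format note (inferred from the split('|') parsing above, not from any spec):
# Baselist.txt is expected to hold one token|TAG pair per line, where TAG is a
# universal POS tag, e.g.:
#   gespielt|VERB
#   häuser|NOUN
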