def __init__(self, BASEDIR, session_only=False, cycle_time=1):
    super().__init__(BASEDIR, session_only, cycle_time)
    self.name = 'contentrank'
    mapper = Mapping()
    self.rec_mapping = mapper.get_header_rec()
    self.event_mapping = mapper.get_header_event()
    self.update_mapping = mapper.get_header_update()
    self.item_id_idx = self.rec_mapping.index('ITEM_SOURCE')
    self.publisher_id_idx = self.rec_mapping.index('PUBLISHER')
    self.recs_idx = self.event_mapping.index('recs')
    self.limit_idx = self.rec_mapping.index('limit')
    self.title_idx = self.update_mapping.index('title')
    self.text_idx = self.update_mapping.index('text')
    self.update_id_idx = self.update_mapping.index('id')
    self.update_domainid_idx = self.update_mapping.index('domainid')
    self.germanStemmer = GermanStemmer(ignore_stopwords=True)
    self.stopwords = stopwords.words('german')
    self.stems = {}  # (item, [stem, stem, stem])
    self.correct = 0
    self.total_events = 0
    self.nrrows = 0
    self.counts = {}
def cosine_preprocess(texts, pickle_name, pickle_folder='pickle'):
    pickle_path = os.path.join(pickle_folder, pickle_name)
    # Return from disk if possible for efficiency reasons
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)
    # Create the stemmer and stopword list once instead of per text
    stemmer = GermanStemmer()
    words = stopwords.words('german')
    processed = []
    for text in tqdm(texts):
        tokens = [
            stemmer.stem(token) for token in word_tokenize(text)
            if token not in words
        ]
        processed.append(' '.join(tokens))
    # Pickle the output
    if not os.path.exists(pickle_folder):
        os.makedirs(pickle_folder)
    with open(pickle_path, 'wb') as f:
        pickle.dump(processed, f)
    return processed
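# Hypothetical usage sketch for cosine_preprocess above (not part of the original
# source); assumes the NLTK 'punkt' and 'stopwords' resources are available and
# that os, pickle, tqdm, word_tokenize, stopwords and GermanStemmer are imported
# as in the function itself.
import nltk

nltk.download('punkt')
nltk.download('stopwords')

texts = ["Die Katzen laufen schnell.", "Der Hund schläft."]
# First call stems the texts and caches the result under pickle/demo.p;
# subsequent calls with the same pickle_name return the cached list from disk.
processed = cosine_preprocess(texts, pickle_name='demo.p')
print(processed)  # one stemmed, stopword-filtered string per input text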
def evaluate_dnn(path: str):
    with open(os.path.join(path, "tag_to_int.json"), "rt") as f:
        tag_to_int = json.load(f)
    with open(os.path.join(path, "int_to_tag.json"), "rt") as f:
        int_to_tag = json.load(f)
    cv = pickle.load(open(os.path.join(path, "cv.p"), "rb"))
    stemmer = GermanStemmer()
    model_name = "dnn_intent_classification.h5"
    model = load_model(os.path.join(path, model_name))
    with open(os.path.join("Data", "commands", "Test", "testingdata.json"), "rt") as f:
        val_data = json.load(f)
    X = []
    y = []
    for tag, commands in val_data.items():
        for command in commands:
            command = " ".join(stemmer.stem(c) for c in sorted(word_tokenize(command)))
            X.append(transform_command_BoW(command, cv))
            y.append(tag_to_int[tag])
    X = np.array(X)
    y = np.array(y)
    predictions = model.predict(X)
    predicted_indices = np.argmax(predictions, 1)
    print("acc: ", accuracy_score(y, predicted_indices))
    cm = confusion_matrix(y, predicted_indices)
    cm = pd.DataFrame(cm, index=int_to_tag.values(), columns=int_to_tag.values())
    print(cm)
    return (accuracy_score(y, predicted_indices), cm)
def build_stems(pattern: str, category: Category,
                elements: List[Tuple[Category, Set[str]]],
                total_stems: Set[str]) -> Set[str]:
    """
    Builds a set of stems for all words used in the pattern.

    Args:
        pattern: The pattern to tokenize and stem.
        category: The category of the pattern.
        elements: A mutable list of (category, stems) pairs that the new
            stems will be appended to.
        total_stems: The set of total stems before this function was invoked.
            Will not be mutated.

    Returns:
        The union of total_stems and the stems found in the pattern.
    """
    # Tokenize pattern into words
    words = nltk.word_tokenize(pattern)
    # Get stems for the pattern's words, as a set to avoid duplicates
    stemmer = GermanStemmer()
    stems: Set[str] = {stemmer.stem(w.lower()) for w in words}
    # Associate the new stems with the category on the pattern list
    elements.append((category, stems))
    # Add stems to the total set of stems, needed for conversion to a numeric
    # TensorFlow training array. Return a new union instead of using "|=",
    # which would mutate the caller's set and contradict the documented contract.
    return total_stems | stems
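# Hypothetical usage sketch for build_stems above (not from the original source);
# the category is a plain string here purely for illustration, and nltk plus the
# 'punkt' tokenizer data are assumed to be available.
elements = []
total_stems = set()
total_stems = build_stems("Wie wird das Wetter morgen?", "weather", elements, total_stems)
print(elements)     # e.g. [('weather', {'wie', 'wird', 'das', 'wett', 'morg', '?'})]
print(total_stems)  # same stems, accumulated across all patterns processed so far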
def __init__(self, config):
    self.config = config
    if config.stem:
        if config.lang == 'en':
            self.stemmer = PorterStemmer()
        elif config.lang == 'de':
            self.stemmer = GermanStemmer()
        else:
            self.stemmer = IdStemmer()
def _check_NE_yeah(gram):
    tag = entities.get(" ".join(gram), "O")
    if tag == "O":
        if len(gram) == 2:
            first, last = gram
            if first in vornamen and last in nachnamen:
                tag = "PER"
    if tag == "O":
        try:
            tag = entities.get(
                " ".join([GermanStemmer().stem(g) for g in gram]), "O")
        except UnicodeDecodeError:
            # Fall back to explicitly decoding byte-string input before stemming
            tag = entities.get(
                " ".join([
                    GermanStemmer().stem(g.decode(encoding="UTF-8"))
                    for g in gram
                ]), "O")
    return tag
def ner_features(sentence, i, history):
    # TODO: try using TreeTagger's POS tag
    wordO = sentence[i]
    word = wordO.string
    pos = wordO.pos
    stemmed = GermanStemmer().stem(word)
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
        last = "<START>"
        prevstemmed = "<START>"
    else:
        last = history[-1]
        prevword = sentence[i - 1].string
        prevpos = sentence[i - 1].pos
        prevstemmed = GermanStemmer().stem(sentence[i - 1].string)
    chunk = []
    if not wordO.chunk:
        chunk.append("START")
        knowledge_sources = "O"
    else:
        knowledge_sources = check_NE(convert(wordO.string), wordO.chunk)
        chunk = [w.string for w in wordO.chunk]
    stem_is_word = stemmed == word.lower()
    knowledge_sources_stemmed = _check_NE_yeah([stemmed])
    return {
        "knowledge": knowledge_sources,
        "knowledge_lemma": knowledge_sources_stemmed,
        "history": "+".join(history)[-2:],
        "pos": pos,
        "word": word,
        "stemmed": stemmed
    }
def remove_stop_words(msg):
    # remove stop words and stem the remaining words
    stemmer = GermanStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(msg)
    stop_words = set(stopwords.words('german'))
    words_filtered = []
    for w in words:
        if w not in stop_words:
            words_filtered.append(stemmer.stem(w))
    return words_filtered
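# Hypothetical usage sketch for remove_stop_words above (not from the original
# source); assumes the NLTK 'stopwords' corpus is downloaded and that
# GermanStemmer, RegexpTokenizer and stopwords are imported as in the function.
msg = "wir bestellen heute einen neuen Drucker"
print(remove_stop_words(msg))
# German stopwords such as "wir" and "einen" are dropped and the remaining
# words are stemmed, e.g. 'bestellen' -> 'bestell', 'Drucker' -> 'druck'.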
def __init__(self):
    self.tweets = 0
    self.related_tweets = 0
    self.stopwords = {}
    self.stemmers = {}
    self.stemmers["es"] = SpanishStemmer()
    self.stemmers["en"] = PorterStemmer()
    self.stemmers["fr"] = FrenchStemmer()
    self.stemmers["de"] = GermanStemmer()
    self.stopwords["es"] = self.load_stopwords_file("spanish_stopwords.txt")
    self.stopwords["en"] = self.load_stopwords_file("english_stopwords.txt")
    self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
    # Use the same language key as the stemmers dict ("de", not "ge")
    self.stopwords["de"] = self.load_stopwords_file("german_stopwords.txt")
    self.output_file = open(sys.argv[2], 'a')
def __init__(self, essay: str, name: str, gazetteer_version: int = 1):
    """
    Initializes the Stringmatcher. Takes the essay text, a name for the results
    folder and the gazetteer version that should be used.

    :param essay: the essay text that is to be processed
    :param name: name of the essay, used for the results folder
    :param gazetteer_version: the gazetteer version that should be used. See the
        above defined dict "version_subfolder" for what values are possible
    """
    # Initialize data structures
    self.essay = essay
    self.essay_name = name
    self.gazetteer_version = gazetteer_version
    self.tokens_without_stopwords = []
    self.found_entities = dict()
    self.stemmer = GermanStemmer()
    self.fastText_model = None
    self.spacy_model = None
    self.file_path = RESULTS_PATH + name
    if not os.path.exists(self.file_path):
        os.makedirs(self.file_path)
    # retrieve the gazetteers that should be used for annotation
    self.gazetteers = sorted([
        f for f in os.listdir(PATH_GAZETTEERS + version_subfolder[gazetteer_version])
        if os.path.isfile(PATH_GAZETTEERS + version_subfolder[gazetteer_version] + f)
    ])
    print("Used gazetteer version: %s" % gazetteer_version)
    # retrieve gazetteers with already preprocessed entries if available
    # (for efficiency reasons) or create new ones
    if os.path.isfile(PATH_GAZETTEERS + "tokenized_gazetteers"):
        self.tokenized_gazetteers = pickle.load(
            open(PATH_GAZETTEERS + "tokenized_gazetteers", "rb"))
    else:
        self.tokenized_gazetteers = dict()
    changed = False
    for gazetteer_filename in self.gazetteers:
        # if there is not already a tokenized version of this gazetteer, tokenize it
        if gazetteer_filename not in self.tokenized_gazetteers.keys():
            self.tokenized_gazetteers[gazetteer_filename] = self.tokenize_gazetteer(
                gazetteer_filename)
            changed = True
    if changed:
        pickle.dump(self.tokenized_gazetteers,
                    open(PATH_GAZETTEERS + "tokenized_gazetteers", "wb"))
def __init__(self, lang, strip_accents=None, ngram_range=(1, 1), max_df=1.0,
             min_df=1, stop_words=None):
    if lang == 'de':
        self.stemmer = GermanStemmer()
    else:
        self.stemmer = EnglishStemmer()
    super(self.__class__, self).__init__(stop_words=stop_words,
                                         strip_accents=strip_accents,
                                         ngram_range=ngram_range,
                                         max_df=max_df,
                                         min_df=min_df)
def stemWord(self, word, lng):
    '''Separates the word's changeable part with a '|' for wordfast'''
    if lng == 'ru':
        stemmer = RussianStemmer()
    elif lng == 'en':
        stemmer = PorterStemmer()
    elif lng == 'de':
        stemmer = GermanStemmer()
    else:
        print('Language error. Exiting...')
        sys.exit(1)
    word = word.lower()  # otherwise the stemmer fails
    if len(word) <= 3:
        return word
    elif len(word) == len(stemmer.stem(word)):
        return "{0}|{1}".format(word[:-1], word[-1])
    else:
        return "{0}|{1}".format(word[:len(stemmer.stem(word))],
                                word[len(stemmer.stem(word)):])
def _preprocess(text, mode=None):
    '''helper function to preprocess text. returns List of Sentences'''
    sentences = split_single(text)
    if mode:
        nlp = spacy.load('de_core_news_sm')
        if mode == 'lemmatize':
            sentences = [
                Sentence(' '.join(token.lemma_ for token in nlp(s)))
                for s in sentences
            ]
        elif mode == 'stem':
            stemmer = GermanStemmer()
            sentences = [
                Sentence(' '.join(stemmer.stem(token.text) for token in nlp(s)))
                for s in sentences
            ]
    else:
        sentences = [Sentence(s, use_tokenizer=True) for s in sentences]
    return sentences
def clean_text(text):
    """
    Keeps only alphabetic characters, lowercases the text, removes German
    stopwords and stems the remaining words.

    :param text: raw input text
    :return: cleaned, stemmed text as a single string
    """
    # stopwords = set(nltk.corpus.stopwords.words('german'))
    file_path = r'etc/models/german.txt'
    with open(file_path) as file:
        file_data = file.read()
    stopwords = file_data.split('\n')
    gs = GermanStemmer()
    text_cleaned = re.sub('[^a-zA-Z]', ' ', text)  # Keep only alphabet and space characters
    text_cleaned = text_cleaned.lower()  # All characters to lowercase
    text_cleaned = text_cleaned.split()  # Split into a list of words (split on whitespace)
    text_cleaned = [
        gs.stem(word) for word in text_cleaned if word not in stopwords
    ]
    text_cleaned = ' '.join(text_cleaned)
    return text_cleaned
def text_cleaner(text):
    use_GermanStemmer = False
    tokens = False
    # Remove username handles
    # -? do we need the user names
    text = remove_handles(text)
    # Remove punctuation marks
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)
    # replace the umlauts
    # =============================================================================
    # text = re.sub('ä', 'ae', text)
    # text = re.sub('ö', 'oe', text)
    # text = re.sub('ü', 'ue', text)
    # text = re.sub('Ä', 'Ae', text)
    # text = re.sub('Ö', 'Oe', text)
    # text = re.sub('Ü', 'Ue', text)
    # text = re.sub('ß', 'ss', text)
    # =============================================================================
    # remove the numbers
    text = re.sub(r'[0-9]+', '', text)
    # Remove emojis and any other characters outside the German alphabet
    german_char = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ"
    text = ''.join(c for c in text if c in german_char)
    tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True)
    if tokens:
        return tokenizer.tokenize(text)
    elif use_GermanStemmer:
        stemmer = GermanStemmer()
        return [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    else:
        return text
def set_stemmer(stemmer_language):
    if stemmer_language == "GER":
        stemmers = GermanStemmer()
    else:
        stemmers = EnglishStemmer()
    return stemmers
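# Hypothetical usage sketch for set_stemmer above (not from the original source).
stemmer = set_stemmer("GER")
print(stemmer.stem("Bücher"))   # German Snowball stem, e.g. 'buch'
stemmer = set_stemmer("EN")
print(stemmer.stem("running"))  # English Snowball stem: 'run'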
sorted_d = np.sort([int(x["Veröffentlichungsdatum"].split("-")[0]) for x in d])
year_indices = {}
for ind, ind_year in enumerate(
        sorted([np.where(sorted_d == x)[0][0] for x in set(sorted_d)])):
    year_indices.update({list(range(2010, 2019 + 1))[ind]: ind_year})
year_indices[2020] = None  # IMPORTANT!

# 70 k times 588 k is big, sizing down therefore
year = 2010
d = d[year_indices[year]:year_indices[year + 1]]  # d[:500]

nltk.download("stopwords")
stop_words_en = stopwords.words('english')
stemmer = GermanStemmer()  # Cistem()

with open("stop_full.pkl", "rb") as f:
    stop_words = pickle.load(f)
stop_words = [x.strip() for x in stop_words] + stop_words_en


def preprocess(text):
    text = text.lower().split()
    # text = [w.split(".")[0].split(",")[0].split(":")[0].split(";")[0] for w in text]
    text = " ".join(text)
    # regex for all characters that are NOT A-Z, a-z, German umlauts/ß and space " "
    remove_punctuation_regex = re.compile(r"[^A-ZÄÖÜäöüßa-z ]")
    text = re.sub(remove_punctuation_regex, "", text)
    text = text.split()
import logging

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from boilerpipe.extract import Extractor
from nltk.stem.snowball import GermanStemmer
from nltk import word_tokenize
import nltk.data
import os
import re

logger = logging.getLogger(__name__)
logging.getLogger('pdfminer').setLevel(logging.CRITICAL)

satztokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
stemmer = GermanStemmer()
stoppwörter = []
'''Loads the stopword list'''
with open('traindata/german', 'r') as f:
    for line in f:
        wort = line.split('\n')[0]
        stoppwörter.append(wort.lower())


def preprocess(text):
    '''Filter rules to normalize the text.'''
    try:
        text = re.sub(r"/innen|\*innen|/-innen", "innen",
                      text)  # Unifies different gender-inclusive spellings
        text = re.sub(r"-\s*\n", "", text)  # Removes hyphenation
def __init__(self):
    self.stemmer = GermanStemmer()
test_df.reset_index(inplace=True)
print test_df.isnull().sum()
print 'Unique restaurants: {}'.format(len(data['restaurant_id'].unique()))
print 'Unique menu_category: {}'.format(len(data['menu_category'].unique()))
print 'Unique product_name: {}'.format(len(data['product_name'].unique()))
print 'Unique ingredients: {}'.format(len(data['ingredients'].unique()))
print test_df.shape

encode_menu = test_df['menu_category'].str.encode('ascii', errors='ignore')
print len(encode_menu.unique())
encode_menu.replace({r'[^a-zA-Z0-9\s,]': ''}, regex=True, inplace=True)
print len(encode_menu.unique())
encode_menu = encode_menu.apply(lambda x: GermanStemmer().stem(x))
print len(encode_menu.unique())

encode_name = test_df['product_name'].str.encode('ascii', errors='ignore')
print len(encode_name.unique())
encode_name.replace({r'[^a-zA-Z0-9\s,]': ''}, regex=True, inplace=True)
print len(encode_name.unique())
encode_name = encode_name.apply(lambda x: GermanStemmer().stem(x))
print len(encode_name.unique())

# X = pd.concat([encode_menu, encode_name, test_df['restaurant_id'].astype('str')], axis=1)
# le = preprocessing.LabelEncoder()
# X_2 = X.apply(le.fit_transform)
# print X_2.head()
# print X_2.shape
class StringHandler:
    _STEMMER = GermanStemmer()
    _P_SIMILARITY_THRESHOLD: float = 0.9

    def __init__(self, string_series: pd.Series):
        self._ds = string_series.str.lower()
        self.ds_origin = string_series

    def optimize(self):
        self.remove_noise()
        self.split_text()
        self.build_sentence()
        self.stem_words()
        # self.correct_spelling()

    def reset(self):
        self.ds = self.ds_origin.copy()

    # string manipulation ##################################
    def stem_words(self):
        self.ds = self.ds.apply(StringHandler.stem_sentence)

    def split_text(self):
        self.ds = self.ds.str.split(' ')

    def remove_noise(self):
        # remove leftover isolated substrings that are not words/digits
        self.ds = self.ds.str.replace(r'[^a-zA-Z0-9]', ' ')

    def build_sentence(self):
        self.ds = self.ds.apply(lambda x: ' '.join(word.strip() for word in x if word))

    # nlp manipulation ##################################
    def correct_spelling(self):
        uniques = self.get_unique_series
        uniques.apply(lambda x: list(i for i in uniques
                                     if i != x and SequenceMatcher(None, x, i).ratio() > 0.9))

    @classmethod
    def stem_sentence(cls, sentence: str, split_char: str = ' '):
        return ' '.join(cls._STEMMER.stem(word) for word in sentence.split(split_char))

    # properties ##################################
    @property
    def get_unique_series(self):
        return pd.Series(self.ds.unique()).sort_values().reset_index(drop=True)

    @property
    def ds(self):
        return self._ds

    @ds.setter
    def ds(self, ds: pd.Series):
        if isinstance(ds, pd.Series) and not ds.empty:
            self._ds = ds
        else:
            raise TypeError('Wrong variable type or empty series')
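# Hypothetical usage sketch for StringHandler above (not from the original
# source); assumes pandas, GermanStemmer and SequenceMatcher are imported as
# the class requires.
raw = pd.Series(['Rote Äpfel!!', 'rote  Aepfel', 'Grüne Birnen'])
handler = StringHandler(raw)
handler.optimize()                # lowercase, strip noise, re-join and stem each entry
print(handler.ds)                 # normalized, stemmed strings
print(handler.get_unique_series)  # sorted unique values of the processed series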
        ]
        res.append("\n".join(lines))
    return res


# In[3]:


def subwords(word):
    return [word[:2], word[2:]]


# In[27]:


stem = GermanStemmer().stem

cnt_vect_splits = [
    ("short", lambda doc: [line for line in doc if len(line) <= 1], {}),
    ("long", lambda doc: [line for line in doc if len(line) > 1], {}),
    ("subwords", lambda doc: [
        list(map(stem, concat(subwords(word) for word in line))) for line in doc
    ], {
        "ngram_range": (1, 1)
    }),
]

doc_funcs = [
    ("num_char", lambda doc: len(re.findall("[A-Za-zäöüÄÖÜß]", doc))),
]
import nltk
import sys
from string import punctuation
import re
from nltk.stem.snowball import GermanStemmer

reload(sys)
sys.setdefaultencoding('utf-8')

# pre-processing tools
sents_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# sents_tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')
stemmerEn = nltk.PorterStemmer()  # uses nltk Porter stemmer
wnl = nltk.WordNetLemmatizer()
stemmerDe = GermanStemmer()  # uses nltk Snowball stemmer for German


def split_into_sentences(text):
    import re
    caps = "([A-Z])"
    prefixes = "(Mr|St|Mrs|Ms|Dr|dr|etc|vs|doc|art|no|inc|mr)[.]"
    suffixes = "(Inc|Ltd|Jr|Sr|Co|gdp|hon)"
    starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = "([A-Za-z][.][A-Za-z][.](?:[A-Za-z][.])?)"
    websites = "[.](com|net|org|io|gov|de|fr|il|mk)"
    dates = "(\d\d?)\.(\s+(januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember|jahrestag))"
    # dates = "(\d\d?)\."
    www = "(www)\."
    times = "(\d\d?)\.(\s?\d\d?)"
    full_date = "(\d\d?)\.(\s?\d\d?)\.(\s?\d\d\d?\d?)"
import re
import preprocess_files
from nltk.stem.snowball import GermanStemmer

gs = GermanStemmer()
punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''


def match_synms(tokens):
    syn_dict = preprocess_files.read_synms_list()
    for t in tokens:
        for (idx, val) in enumerate(t):
            if val in syn_dict:
                t[idx] = syn_dict[val]
    return tokens


def _remove_punctuation(tokens):
    tokens_filt = []
    for gT in tokens:
        if gT not in punctuations:
            tokens_filt.append(gT)
    return tokens_filt


def _remove_stopwords(tokens):
    '''Remove stop words from an array of tokens'''
    stopWords = ['the', 'to', '-', 'pr', 'der', 'is', 'of', 'die', 'in',
                 'and', 'und', '–', '•', '✔', '●', 'a']
def get_stem_relations(sentences, gn):
    """Gets verb-noun relations between two sentences.

    Returns array of word pairs between two sentences.
    """
    # Init word pairs
    word_pairs = []
    # Init stemmer
    stemmer = GermanStemmer(ignore_stopwords=True)

    # Loop over every sentence
    for val, sentence in enumerate(sentences):
        # Is the current sentence not the last one? If so, carry on
        if val != (len(sentences) - 1):
            # Get stems of all words in the next sentence
            # (materialized as a list so "in" and .index() work on Python 3)
            stems_next_sentence = [stemmer.stem(x['lemma'])
                                   for x in sentences[val + 1]]

            # Nouns in next sentence
            nouns_next_sentence = [
                word['lemma'] for word in sentences[val + 1] if word['noun']
            ]

            # Nouns of current sentence
            words_current_sentence = [word for word in sentence if word['noun']]

            # Loop over every word in current sentence
            for word in sentences[val]:
                # Stem of current word
                stem_current_word = stemmer.stem(word['lemma'])

                # Is the stemmed word in the next sentence? Great.
                # If the word is a lame 'sein', ignore it
                if (stem_current_word in stems_next_sentence) and word['lemma'] != 'sein':
                    # Get index of stem that is related to current word
                    index_word_next_sentence = stems_next_sentence.index(stem_current_word)

                    # Corresponding word in next sentence
                    corresponding_word = sentences[val + 1][index_word_next_sentence]

                    # Only add word pairs if verb or noun
                    if word['noun'] or word['verb']:
                        # Get dictionary of word in next sentence
                        dict_next = sentences[val + 1][index_word_next_sentence]

                        # We do not want to combine words that have the same
                        # grammatical function. A noun should not be combined
                        # with a noun. We are only interested in verb-noun relations.
                        if word['verb'] and dict_next['noun']:
                            # Get all combinations of the corresponding noun in the
                            # next sentence with all nouns in the current sentence
                            for wordCurrent in words_current_sentence:
                                # Append to list
                                word_pairs.append({
                                    'source': {
                                        'word': corresponding_word['orth'],
                                        'lemma': corresponding_word['lemma'],
                                        'sentence': val
                                    },
                                    'target': {
                                        'word': wordCurrent['orth'],
                                        'lemma': wordCurrent['lemma'],
                                        'sentence': val + 1
                                    },
                                    'device': 'verb noun relation'
                                })
                        # Current word is a noun and corresponding word is a verb
                        elif word['noun'] and dict_next['verb']:
                            # Get all combinations of the noun in this sentence
                            # with the nouns in the next sentence
                            for wordNext in sentences[val + 1]:
                                # Only pair with nouns
                                if wordNext['noun']:
                                    # Append to list
                                    word_pairs.append({
                                        'source': {
                                            'word': word['orth'],
                                            'lemma': word['lemma'],
                                            'sentence': val
                                        },
                                        'target': {
                                            'word': wordNext['orth'],
                                            'lemma': wordNext['lemma'],
                                            'sentence': val + 1
                                        },
                                        'device': 'noun verb relation'
                                    })

    return word_pairs
def stem_words(self, words):
    stemmer = GermanStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words
def load_stemmer(self):
    self._stemmer = None
    if self._stemming_lang == Language.GERMAN:
        self._stemmer = GermanStemmer()
    else:
        self._stemmer = EnglishStemmer()
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import GermanStemmer
import os
import json
import nltk
import pandas as pd
import numpy as np

stemmer = GermanStemmer(ignore_stopwords=True)

CONFLICT_OUTPUT_PATH = os.path.join("Output")
CREATE_VOCABULARY = os.path.join("Output")


def combine_data_panning(dirpath: str, output_name: str = None):
    ACTION = "actions"
    TAG = "tag"
    COMMANDS = "commands"
    # with open(os.path.join("Data", "stopwords.txt"), "rt") as f:
    #     stopwords = set(f.read().splitlines())
    document_pathes = [os.path.join(dirpath, x) for x in os.listdir(dirpath)]
    new_data = {}
    for i, document in enumerate(document_pathes):
        with open(document, "rt") as f:
            commands = json.load(f)
        repeat = set()
        for action in commands[ACTION]:
from nlingua.stemmers import GermanSnowballStemmer
from nltk.stem.snowball import GermanStemmer
import codecs

if __name__ == '__main__':
    l = []
    with codecs.open("german_words.txt", encoding="utf-8", mode="r") as f:
        words = f.readlines()
    words = [x[:-1] for x in words]
    correct = 0
    stemmer = GermanSnowballStemmer()
    stemmer2 = GermanStemmer()
    for word in words:
        a = stemmer.stem(word)
        b = stemmer2.stem(word)
        if a == b:
            correct += 1
        else:
            print(word, a, b)
    print(f"{correct}/{len(words)} correct")