import lemmy


def merge_entities(df):
    """Merge similar entities based on their lemmatized form in order to
    reduce spelling/capitalization/inflection variations.
    """
    lemmatizer = lemmy.load("sv")
    lemmatize = lambda x: lemmatizer.lemmatize("PROPN", x)[0].lower()
    remove = []
    for i in df.index:
        i_w = df["word"][i]
        if len(i_w) < 3:
            continue
        i_l = lemmatize(i_w)
        for j in df.index[i + 1:]:
            j_w = df["word"][j]
            # Continue with the next outer word once the first letter changes
            # (assumes the frame is sorted alphabetically by "word")
            if not i_w.lower()[0] == j_w.lower()[0]:
                break
            j_l = lemmatize(j_w)
            if i_l == j_l or i_w == j_l[0]:
                # Fold the duplicate entity's articles into the first row
                df.at[i, "article_ids"] += df.at[j, "article_ids"]
                remove += [j_w]
    deduplicated = df[df["word"].apply(lambda x: x not in remove)]
    return deduplicated
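# A minimal usage sketch for merge_entities (an assumption, not part of the
# original code): the input frame is taken to have one row per entity with a
# "word" column and an "article_ids" list column, sorted alphabetically by
# "word" so that the first-letter break in the inner loop works. The example
# data below is made up.
import pandas as pd

entities = pd.DataFrame({
    "word": ["Stockholm", "Stockholms", "Uppsala"],
    "article_ids": [[1, 2], [3], [4]],
}).sort_values("word").reset_index(drop=True)

merged = merge_entities(entities)
# If the Swedish lemmatizer maps "Stockholms" to the same lemma as
# "Stockholm", its article_ids are appended to the "Stockholm" row and the
# duplicate row is dropped.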
import lemmy
from nltk.stem.snowball import DanishStemmer


def word_normalize(s: str, method: str = "l") -> str:
    """Splits a string and stems or lemmatizes every word, except acronyms
    (all-uppercase tokens), which are kept as-is.
    """
    if method not in ["s", "l"]:
        raise ValueError("Method must be either 's' or 'l' for either "
                         "stemming or lemmatizing")
    # TODO: change this to match language
    lemmatizer = lemmy.load("da")
    stemmer = DanishStemmer()
    words = s.split(" ")
    norm_words = []
    for w in words:
        if w.isupper():
            norm_words.append(w)
        else:
            if method == "l":
                # lemmatize() returns a list of candidate lemmas
                w = lemmatizer.lemmatize("", w)
                norm_words.extend(w)
            else:
                w = stemmer.stem(w)
                norm_words.append(w)
    return " ".join(norm_words)
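# Hedged usage sketch (assumes the lemmy Danish model and nltk are installed;
# the example words are made up). lemmatize(pos_tag, word) returns a *list*
# of candidate lemmas, and with an empty POS tag the list can hold more than
# one candidate, which is presumably why word_normalize() extends norm_words
# rather than appending a single string.
lemmatizer = lemmy.load("da")
print(lemmatizer.lemmatize("", "hundene"))  # e.g. ['hund'], depending on the model
print(word_normalize("hundene løber mod DR", method="l"))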
import lemmy
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer
from textblob import TextBlob


def preprocess_text(text):
    # text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = str(text).lower().strip()
    # caveat: this might conflict with the English text
    da_stop_words = stopwords.words('danish')
    stemmer = DanishStemmer()
    lemmatizer = lemmy.load("da")
    # remove plurals
    textblob = TextBlob(text)
    singles = [stemmer.stem(word) for word in textblob.words]
    # remove Danish stopwords
    no_stop_words = [word for word in singles if word not in da_stop_words]
    # join text so it can be lemmatized
    joined_text = " ".join(no_stop_words)
    # lemmatization (note: lemmy lemmatizes one token at a time, so the joined
    # string is treated as a single word and the first candidate is returned)
    final_text = lemmatizer.lemmatize("", joined_text)
    return final_text[0]
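# Hedged usage note (an assumption, not part of the original snippet): running
# preprocess_text() requires the nltk stop-word corpus, and typically the punkt
# tokenizer used by TextBlob, to be downloaded first. The example sentence is
# made up.
import nltk

nltk.download("stopwords")
nltk.download("punkt")

print(preprocess_text("Hundene løber hurtigt gennem de store parker"))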
import joblib
import re
import dill
import lemmy

STOP_WORDS = joblib.load('../data/stops.pkl')
lemmatizer = lemmy.load('da')


def tokenizer(blob, stop_words=STOP_WORDS, remove_digits=True):
    if stop_words is None:
        stop_words = {}
    blob = blob.lower()
    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)
    text = re.sub(r'[\W]+', ' ', blob)
    # reduce 3+ repeated characters, e.g. hellllo -> hello, jaaaa -> jaa
    repetitions = re.compile(r'(.)\1{2,}')
    text = repetitions.sub(r'\1\1', text)
    # collapse 2+ repeated words, e.g. hej hej hej -> hej
    repetitions = re.compile(r'\b(\w+)\s+(\1\s*)+\b')
    text = repetitions.sub(r'\1 ', text)
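    # --- The original snippet ends above, before any tokens are returned. ---
    # A hedged sketch of a possible continuation (an assumption, not the
    # original author's code): split the normalised text, drop stop words and
    # optionally digits, lemmatize what is left, and re-attach any emoticons
    # found in the raw input.
    emoticon_tokens = emoticon_re.findall(blob)
    tokens = [t for t in text.split() if t not in stop_words]
    if remove_digits:
        tokens = [t for t in tokens if not t.isdigit()]
    tokens = [lemmatizer.lemmatize("", t)[0] for t in tokens]
    return tokens + emoticon_tokens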
def lemmatizer(request):
    return lemmy.load('da')
import re
import lemmy
import lemmy.pipe
import nltk
from polyglot.text import Text
import pycld2 as cld2
import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import stopwordsiso as stopwords

lemmatizer = lemmy.load("da")

# Stop words + custom additions
stopwordlist = stopwords.stopwords("da")
stopwordlist.update([
    'du', 'og', 'til', 'kan', 'vores', 'brug', 'dine', 'første', 'ved',
    'find', 'dit', 'mere', 'blevet', 'tager', 'søg', 'http', 'dk', 'søg',
    'læs'
])

# Open the file and lower-case the text
with open("pfa.txt", "r") as file:
    text = file.read().lower()

# Remove numbers from text
text = ''.join([i for i in text if not i.isdigit()])

# Remove all special characters
text = re.sub(r'[-()\"#_/@;:<>{}`+=~|.!?,]', ' ', text)
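# --- The snippet ends here, before the word cloud is actually built. ---
# A hedged sketch of how it might continue (an assumption based on the imports
# above): lemmatize the remaining tokens, drop stop words and very short
# tokens, then render a WordCloud with matplotlib.
tokens = [lemmatizer.lemmatize("", w)[0] for w in text.split()]
tokens = [w for w in tokens if w not in stopwordlist and len(w) > 2]

wc = WordCloud(width=800, height=400, background_color="white",
               stopwords=stopwordlist).generate(" ".join(tokens))
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()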
def lemmatizer(request):
    return lemmy.load("da")
def stem_lem(words, documents, stem_or_lem: bool = False):
    """
    Updates a word list and a corpus to use stemmed or lemmatized words.
    :param stem_or_lem: bool indicating whether to apply stemming or
        lemmatization. True is stem, False is lem.
    :param documents: a list of sentences (strings of words separated by spaces)
    :param words: a list of words
    :return: a gensim Dictionary restricted to the updated word list, and the
        updated documents, where all words have been replaced by their
        stemmed/lemmatized versions.
    """
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if stem_or_lem:
        # Stemming
        stemmer = DanishStemmer()
        # Update word list to use stemmed words
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            stem = stemmer.stem(word)
            if stem != word:
                if word not in remove:
                    remove.append(word)
                if stem not in add and stem not in stop_words:
                    add.append(stem)
                if word not in translator and stem not in stop_words:
                    translator[word] = stem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    else:
        lemmer = lemmy.load("da")
        # build up dictionary that translates old words into their new versions
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            lem = lemmer.lemmatize("", word)
            other = [x for x in lem if x != word]
            if len(other) > 0:
                if word not in lem and word not in remove:
                    remove.append(word)
                # add all lem options if they are not stopwords
                add.extend(
                    [x for x in lem if x not in stop_words and x not in add])
                if word not in translator and lem not in stop_words:
                    lem = " ".join(lem)
                    translator[word] = lem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    # update corpus to use stemmed/lemmatized words
    for x in tqdm(range(len(documents))):
        sentence = documents[x]
        for i in range(len(sentence)):
            word = sentence[i]
            if word in translator:
                sentence[i] = translator[word]
        # re-join and re-split so multi-word translations become separate tokens
        sentence = ' '.join(sentence)
        sentence = sentence.split(' ')
        documents[x] = sentence
    diction = gensim.corpora.Dictionary(documents)
    d_words = diction.token2id
    good_ids = [d_words[x] for x in words]
    diction.filter_tokens(good_ids=good_ids)
    diction.compactify()
    return diction, documents
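# Hedged usage sketch (an assumption, not part of the original code): `words`
# and `documents` are taken to be already defined as described in the
# docstring, and the project's `utility` module plus word_datasets/stopwords.csv
# are available. Downstream, the returned gensim Dictionary and updated
# documents can be turned into bag-of-words vectors with doc2bow.
diction, documents = stem_lem(words, documents, stem_or_lem=False)
bows = [diction.doc2bow(doc) for doc in documents]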