Example #1
def merge_entities(df):
    """Merges similar entities based on their lemmatized form with the purpose
    to reduce spelling/capitalization/inflection variations.
    """
    lemmatizer = lemmy.load("sv")
    # take the first lemma candidate for a proper noun and lower-case it
    lemmatize = lambda x: lemmatizer.lemmatize("PROPN", x)[0].lower()
    remove = []

    for i in df.index:
        i_w = df["word"][i]
        if len(i_w) < 3:
            continue
        i_l = lemmatize(i_w)

        for j in df.index[i + 1:]:
            j_w = df["word"][j]

            # Stop the inner scan once the first letter changes
            # (only safe if df["word"] is sorted alphabetically)
            if not i_w.lower()[0] == j_w.lower()[0]:
                break
            j_l = lemmatize(j_w)

            # merge j into i when the lemmas match, or when word i already
            # equals the lemma of word j
            if i_l == j_l or i_w.lower() == j_l:
                df.at[i, "article_ids"] += df.at[j, "article_ids"]
                remove.append(j_w)

    deduplicated = df[df["word"].apply(lambda x: x not in remove)]

    return deduplicated
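
A minimal usage sketch, assuming merge_entities, "import lemmy" and "import pandas as pd" are in scope and the Swedish lemmy model is available; the frame should be sorted by "word" so the first-letter early exit behaves as intended (the entity strings and article ids below are made up):

import lemmy
import pandas as pd

df = pd.DataFrame({
    "word": ["bilar", "bilarna", "hus"],   # hypothetical entity strings
    "article_ids": [[1], [2], [3]],        # article ids mentioning each entity
}).sort_values("word").reset_index(drop=True)

merged = merge_entities(df)
print(merged)  # rows whose lemmas collide are folded into the first occurrence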
Example #2
def word_normalize(s: str, method: str = "l") -> str:
    """
    Splits a string and lemmatizes or stems every word, except acronyms
    (all-uppercase tokens), which are kept unchanged
    """
    if method not in ["s", "l"]:
        raise ValueError("Method must be either 's' or 'l' for either "
                         "stemming or lemmatizing")

    # TODO: change this to match language
    lemmatizer = lemmy.load("da")
    stemmer = DanishStemmer()

    words = s.split(" ")

    norm_words = []
    for w in words:
        if w.isupper():
            norm_words.append(w)
        else:
            if method == "l":
                w = lemmatizer.lemmatize("", w)
                norm_words.extend(w)
            else:
                w = stemmer.stem(w)
                norm_words.append(w)

    return " ".join(norm_words)
Example #3
def preprocess_text(text):
    # text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = str(text).lower().strip()

    # caveat: this might conflict with the english text
    da_stop_words = stopwords.words('danish')
    stemmer = DanishStemmer()
    lemmatizer = lemmy.load("da")

    # stem words (this also collapses plurals and other inflections)
    textblob = TextBlob(text)
    singles = [stemmer.stem(word) for word in textblob.words]

    # remove danish stopwords
    no_stop_words = [word for word in singles if word not in da_stop_words]

    # lemmatize each word individually (lemmy expects single tokens),
    # then join the results back into one string
    lemmatized = [lemmatizer.lemmatize("", word)[0] for word in no_stop_words]

    return " ".join(lemmatized)
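
A hedged call example; it assumes TextBlob, nltk's Danish stopword corpus, DanishStemmer and lemmy are all available in the surrounding module, and the sentence is made up. Note that stemming runs before stopword removal, so stemmed forms that no longer match entries in the stopword list will slip through:

print(preprocess_text("Hundene løber hurtigt i parkerne"))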
Example #4
import joblib
import re
import dill
import lemmy


STOP_WORDS = joblib.load('../data/stops.pkl')
lemmatizer = lemmy.load('da')

def tokenizer(blob, stop_words=STOP_WORDS, remove_digits=True):
    
    if stop_words is None:
        stop_words = set()
    
    blob = blob.lower()
    
    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)
    
    text = re.sub(r'[\W]+', ' ', blob)
    
    # reduce 3+ repeated characters to 2, e.g. hellllo -> hello, jaaaa -> jaa
    repetitions = re.compile(r'(.)\1{2,}')
    text = repetitions.sub(r'\1\1', text)
    
    # remove 2+ repetitive words e.g. hej hej hej -> hej
    
    repetitions = re.compile(r'\b(\w+)\s+(\1\s*)+\b')
    text = repetitions.sub(r'\1 ', text)
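
The two repetition rules can be checked in isolation; a small standalone sketch (the sample string is made up):

import re

sample = "hellllo hej hej hej"
sample = re.sub(r'(.)\1{2,}', r'\1\1', sample)            # -> "hello hej hej hej"
sample = re.sub(r'\b(\w+)\s+(\1\s*)+\b', r'\1 ', sample)  # -> "hello hej "
print(sample)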
    
Example #5
def lemmatizer(request):
    return lemmy.load('da')
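
The request argument suggests this is a pytest fixture; a sketch of how such a fixture might be declared and consumed (the decorator, scope and test below are assumptions, not shown in the source):

import pytest
import lemmy

@pytest.fixture(scope="session")
def lemmatizer(request):
    # load the Danish lemmy model once per test session
    return lemmy.load('da')

def test_returns_lemma_candidates(lemmatizer):
    # lemmy returns a list of candidate lemmas for a (POS tag, word) pair
    assert isinstance(lemmatizer.lemmatize("NOUN", "hundene"), list)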
Example #6
import re
import lemmy
import lemmy.pipe
import nltk
from polyglot.text import Text
import pycld2 as cld2
import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import stopwordsiso as stopwords

lemmatizer = lemmy.load("da")

# Stop words + custom additions
stopwordlist = stopwords.stopwords("da")
stopwordlist.update([
    'du', 'og', 'til', 'kan', 'vores', 'brug', 'dine', 'første', 'ved', 'find',
    'dit', 'mere', 'blevet', 'tager', 'søg', 'http', 'dk', 'læs'
])

# Open file and lower case letters
with open("pfa.txt", "r") as file:
    text = file.read().lower()

# Remove numbers from text
text = ''.join([i for i in text if not i.isdigit()])

# Remove all special characters
text = re.sub(r'[-()\"#_/@;:<>{}`+=~|.!?,]', ' ', text)
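
The snippet ends after character cleanup; a possible continuation under the same imports, with the tokenization and word-cloud settings being assumptions:

# Tokenize, drop stop words, and lemmatize each remaining token
tokens = [w for w in text.split() if w and w not in stopwordlist]
lemmas = [lemmatizer.lemmatize("", w)[0] for w in tokens]

# Build and display a word cloud from the lemmatized text
wordcloud = WordCloud(background_color="white").generate(" ".join(lemmas))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()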
Example #7
def lemmatizer(request):
    return lemmy.load("da")
Example #8
def stem_lem(words, documents, stem_or_lem: bool = False):
    """
    Updates a word list and a corpus to use stemmed words.
    :param stem_or_lem: bool indicating whether to apply stemming or lemmatizer. True is stem, False is lem.
    :param corpus: a list of sentences (strings of words separated by spaces)
    :param words: a list of words
    :return: new corpus and words list, were all words have been replaced by stemmed/lemmetized versions.
    """
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if stem_or_lem:
        # Stemming
        stemmer = DanishStemmer()
        # Update word list to use stemmed words
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            stem = stemmer.stem(word)
            if stem != word:
                if word not in remove:
                    remove.append(word)
                if stem not in add and stem not in stop_words:
                    add.append(stem)
                if word not in translator and stem not in stop_words:
                    translator[word] = stem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    else:
        lemmer = lemmy.load("da")
        # build up dictionary that translates old words into their new versions
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            lem = lemmer.lemmatize("", word)
            other = [x for x in lem if x != word]
            if len(other) > 0:
                if word not in lem and word not in remove:
                    remove.append(word)
                # add all lem options if they are not stopwords
                add.extend(
                    [x for x in lem if x not in stop_words and x not in add])
                # join the lemma options that are not stop words into the
                # replacement string for this word
                options = [x for x in lem if x not in stop_words]
                if word not in translator and options:
                    translator[word] = " ".join(options)
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])

    # update corpus to use the stemmed/lemmatized words
    for x in tqdm(range(len(documents))):
        sentence = documents[x]
        for i in range(len(sentence)):
            word = sentence[i]
            if word in translator:
                sentence[i] = translator[word]
        # a replacement may be a multi-word string; re-join and re-split
        # so that every lemma option becomes its own token
        sentence = ' '.join(sentence)
        sentence = sentence.split(' ')
        documents[x] = sentence

    # build a gensim dictionary over the updated corpus and keep only
    # the tokens that appear in the updated word list
    diction = gensim.corpora.Dictionary(documents)
    d_words = diction.token2id
    good_ids = [d_words[x] for x in words]
    diction.filter_tokens(good_ids=good_ids)
    diction.compactify()

    return diction, documents
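
A minimal usage sketch; it assumes the surrounding module imports gensim, tqdm, lemmy, nltk's stopwords, DanishStemmer and the utility helper referenced above, and that word_datasets/stopwords.csv exists. The sample word list and tokenized documents are made up:

words = ["hunde", "huset", "løber"]
documents = [["hunde", "løber"], ["huset", "er", "stort"]]

diction, documents = stem_lem(words, documents, stem_or_lem=False)  # False -> lemmatize
print(diction.token2id)
print(documents)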