Example #1
    def stem_using_stempel(self, stem_type="default", words=None):
        if stem_type == "polimorf":
            stemmer = StempelStemmer.polimorf()
        else:
            stemmer = StempelStemmer.default()
        if words is None:
            words = self.words
        stem_words = [stemmer.stem(w) for w in words]
        return stem_words
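The method above belongs to a class that is not shown; it simply picks one of pystempel's two bundled stemming tables and stems a word list. A minimal standalone sketch of the same idea, with made-up input words:

from stempel import StempelStemmer

words = ["jabłkami", "studentów"]
stemmer = StempelStemmer.polimorf()  # or StempelStemmer.default()
print([stemmer.stem(w) for w in words])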
Example #2
import os

def get_java_stemmer(stemmer_table_fpath, jar_fpath):
    os.environ['CLASSPATH'] = jar_fpath

    from jnius import autoclass

    FileInputStream = autoclass('java.io.FileInputStream')
    StempelStemmer = autoclass(
        'org.apache.lucene.analysis.stempel.StempelStemmer')
    stemmerTrie = StempelStemmer.load(FileInputStream(stemmer_table_fpath))
    return StempelStemmer(stemmerTrie)
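Example #2 drives the original Java Stempel implementation (Lucene's StempelStemmer) through pyjnius; CLASSPATH has to point at the Lucene analyzers jar before jnius is imported, which is why the import sits inside the function. A hedged usage sketch, with made-up file names:

# Hypothetical paths; neither file is part of the example above.
jar = 'lucene-analyzers-stempel-8.11.2.jar'
table = 'stemmer_20000.tbl'

stemmer = get_java_stemmer(table, jar)
result = stemmer.stem('jabłkami')
# Lucene's stem() returns a StringBuilder, or null (None via pyjnius) when no stem is found
print(result.toString() if result is not None else None)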
Example #3
    def __init__(self):
        self.DATASET_PATH = DATA_CONFIG["data_path"]
        self.corpus = None
        self.bayes = None
        self.svm_model = None
        self.stemmer = StempelStemmer.polimorf()
        self.processor = BasicProcessor()
        self.tfidf = TfidfVectorizer(max_features=5000)
        self.encoder = LabelEncoder()
        self.stop_words = self.read_file(
            DATA_CONFIG["stopwords_path"]).split('\n')
        # mapping of request codes to their full (Polish) request names
        self.codes_map = {
            "POWTRZ":
            "Wniosek o powtórzenie roku studiów/powtarzanie przedmiotu",
            "PRZEP": "Wniosek o przepisanie oceny",
            "WYKR": "Wniosek o wykreślenie z listy studentów",
            "IPS": "Wniosek o Indywidualny Program Studiów",
            "ECTS": "Wniosek o kontynuowanie studiów z deficytem punktów ECTS",
            "INZ": "Rejestracja pracy inżynierskiej",
            "DZIEKAN": "Podanie do dziekana",
            "PRAKT": "Wniosek o zgodę na odbycie praktyki studenckiej",
            "WARUN": "Wniosek o wpis warunkowy",
            "REAKT": "Wniosek o reaktywację studiów",
            "LIC": "Rejestracja pracy licencjackiej"
        }
        self.get_data()
        self.train_model()
Example #4
    def add_keyword_if_not_exists(self, word):
        if word[0] != "\"":
            stemmer = StempelStemmer.polimorf()
            word = stemmer.stem(word)
        else:
            word = word[1:-1]
        if self.get_keyword(word) is None:
            return Keyword(word=word).save()
        else:
            return self.get_keyword(word)
Example #5
import re

import numpy as np
import pandas as pd
from stempel import StempelStemmer


def cleanall(df):

    #converting given variable to Series for further cleaning
    df = pd.Series(df)

    #making all letters lowercase, to avoid further issues regarding case sensitivity
    df = df.str.lower()

    #declaring a helper function to find and remove certain patterns using regex
    def remove_pattern(text, pattern):

        #re.findall() finds all occurrences of the pattern, e.g. @user, and puts them in a list
        r = re.findall(pattern, text)

        #re.sub() removes each matched occurrence from the text
        for i in r:
            text = re.sub(i, "", text)

        return text

    #removing the '@user' pattern through a vectorized function
    df = np.vectorize(remove_pattern)(df, r"@[\w]*")

    #removing the 'https://t.co/' pattern through a vectorized function
    #since our dataset contains tweets with links, we need to remove links as well
    df = np.vectorize(remove_pattern)(df, r"https://t.co/[\w]*")

    df = pd.Series(df)

    #replacing symbols and punctuation with spaces, leaving all the Polish special characters in place
    df = df.str.replace(r"[^\w]", " ", regex=True)

    #getting rid of all the numbers in the tweets
    df = df.str.replace(r"[0-9]", " ", regex=True)

    #removing the '#hashtag' pattern, since hashtags are rarely used in our dataset and it makes more sense to just get rid of them
    df = np.vectorize(remove_pattern)(df, r"#[\w]*")

    df = pd.Series(df)

    #tokenizing the tweets
    df = df.apply(lambda x: x.split())

    #reading a .txt list of stopwords into a python list (stopwords were taken from https://github.com/bieli/stopwords)
    with open('../lib/polishstopwords.txt', 'r', encoding='utf-8') as stopwords:
        stop = stopwords.read().splitlines()

    #getting rid of the stop words
    for i in range(len(df)):
        df[i] = [j for j in df[i] if j not in stop]

    #stemming the tweets (stripping the suffixes) using the pystempel library
    ps = StempelStemmer.polimorf()
    df = df.apply(lambda x: [ps.stem(i) for i in x])

    #the Polish stemmer sometimes returns None instead of a stem, which prevents stitching the tokens back into strings
    #we need to remove all None values in order to proceed
    for i in range(len(df)):
        df[i] = [j for j in df[i] if j]

    #stitching the tweets back together
    for i in range(len(df)):
        df[i] = ' '.join(df[i])

    return df
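A short usage sketch for cleanall (the two sample tweets are invented, and the relative '../lib/polishstopwords.txt' path read inside the function has to exist for this to run):

tweets = ["Świetny mecz! @user https://t.co/abc123",
          "Nie polecam tego produktu #rozczarowanie"]

cleaned = cleanall(tweets)
print(cleaned.tolist())
# each tweet comes back lowercased, stripped of mentions/links/punctuation/digits,
# with stopwords removed and the remaining tokens stemmed and re-joined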
Example #6
import sys
sys.path.insert(0, "../../VoiceAssistant")

# check which operating system we are running on
import platform
isLinux = 'Linux' == platform.system()

import tkinter as tk
from tkinter import font as tk_font
import command_manager
import threading
from UI.HomePage.home_page_layout import HomePage
from UI.Login.login_layout import LoginPage
from UI.Register.resgister_layout import RegisterPage
from stempel import StempelStemmer
stemmer = StempelStemmer.polimorf()


class Main(tk.Tk):
    def __init__(self, *args, **kwargs):
        tk.Tk.__init__(self, *args, **kwargs)

        self.title_font = tk_font.Font(family='Helvetica',
                                       size=18,
                                       weight="bold",
                                       slant="italic")

        container = tk.Frame(self)
        container.pack(side="top", fill="both", expand=True)
        container.grid_rowconfigure(0, weight=1)
        container.grid_columnconfigure(0, weight=1)
Example #7
import json

import nltk
from stempel import StempelStemmer

with open('resources/generated/news_data.json', 'r') as f:
    data = json.load(f)

all_words = []
all_articles = []
all_categories = []

for article in data['articles']:
    art = str(article['description']).lower()
    tokens = nltk.wordpunct_tokenize(art)
    all_words.extend(tokens)
    all_articles.append(art.lower())
    all_categories.append(article['category'])


stemmer = StempelStemmer.default()
all_words = [stemmer.stem(word) for word in all_words]

all_words = list(set(all_words))
if None in all_words:
    all_words.remove(None)
all_words = sorted(all_words)

with open('resources/generated/input_layer_words.txt', 'w') as datafile:
    json.dump(all_words, datafile)

unique_categories = ['sports', 'health', 'business', 'entertainment', 'technology']

x = []
y = []

for article in all_articles:
Example #8
from stempel import StempelStemmer

def test_polimorf():
    stemmer = StempelStemmer.from_file('../data/polimorf/stemmer_polimorf.tbl.gz')
    assert stemmer.stem('jabłkami') == 'jabłko'
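The same assertion should also hold when the polimorf table is obtained through the convenience constructor used in the other examples; a hedged variant (assuming pystempel can locate or download its bundled table):

def test_polimorf_builtin():
    stemmer = StempelStemmer.polimorf()
    assert stemmer.stem('jabłkami') == 'jabłko'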
Example #9
    def __init__(self, text_df):

        self.text_df = text_df
        self.stemmer = StempelStemmer.default()
Example #10
def get_python_stemmer(stemmer_table_fpath):
    from stempel import StempelStemmer
    return StempelStemmer.from_file(stemmer_table_fpath)
Example #11
    def get_stemmed_word(self, word):
        stemmer = StempelStemmer.polimorf()
        return stemmer.stem(word)
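Taken together, the examples obtain a pystempel stemmer in three ways; a short hedged recap (the explicit file path is made up) that also guards against the None return noted in Example #5:

from stempel import StempelStemmer

default_stemmer = StempelStemmer.default()      # bundled default table
polimorf_stemmer = StempelStemmer.polimorf()    # PoliMorf-based table
file_stemmer = StempelStemmer.from_file('stemmer_polimorf.tbl.gz')  # explicit table file (hypothetical path)

# stem() may return None, so fall back to the original token before joining
tokens = ["jabłkami", "studentów"]
print([polimorf_stemmer.stem(t) or t for t in tokens])

Reusing a single stemmer instance, as Examples #3 and #6 do, avoids reloading the table on every call, which Example #11 would otherwise incur for each word.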
Example #12
import spacy
import platform
import functools
import KeyExt.config
from keybert import KeyBERT
from string import punctuation
from nltk.stem import SnowballStemmer
from stempel import StempelStemmer

# Initialize all required stemmers once.
stemmers = {
    'english': SnowballStemmer('english'),
    'french': SnowballStemmer('french'),
    'spanish': SnowballStemmer('spanish'),
    'portuguese': SnowballStemmer('portuguese'),
    'polish': StempelStemmer.default()
}


def load_models():
    """
    Function which loads the English spaCy NLP model and the KeyBERT model.
    This only needs to run once, since each model takes a few seconds to load.
    """
    return (spacy.load('en_core_web_sm'),
            KeyBERT('distiluse-base-multilingual-cased-v2'))


def preprocess(lis, language):
    """
    Function which applies stemming to a