Пример #1
0
 def stem_using_stempel(self, stem_type="default", words=None):
     """Stem words with a Stempel stemmer and return the results as a list.

     ``stem_type`` equal to ``"polimorf"`` selects
     ``StempelStemmer.polimorf()``; any other value selects
     ``StempelStemmer.default()``. When ``words`` is ``None`` the
     instance's ``self.words`` is stemmed instead.
     """
     use_polimorf = stem_type == "polimorf"
     factory = StempelStemmer.polimorf if use_polimorf else StempelStemmer.default
     stemmer = factory()
     source = self.words if words is None else words
     return list(map(stemmer.stem, source))
Пример #2
0
# Load the scraped news articles. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('resources/generated/news_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

all_articles = []
all_categories = []
unique_tokens = set()

for article in data['articles']:
    # Lower-case once; the same normalised text feeds both the vocabulary
    # and the stored article list.
    art = str(article['description']).lower()
    tokens = nltk.wordpunct_tokenize(art)
    unique_tokens.update(tokens)
    all_articles.append(art)
    all_categories.append(article['category'])


stemmer = StempelStemmer.default()
# Stem each distinct token only once; duplicates would collapse in the set
# anyway, so the resulting vocabulary is unchanged.
stems = {stemmer.stem(word) for word in unique_tokens}

# The vocabulary must not contain None (it can neither be sorted against
# strings nor used as a word). discard() is a no-op when None is absent,
# whereas list.remove(None) would raise ValueError in that case.
stems.discard(None)
all_words = sorted(stems)

# Persist the sorted vocabulary as a JSON array.
with open('resources/generated/input_layer_words.txt', 'w', encoding='utf-8') as datafile:
    json.dump(all_words, datafile)

unique_categories = ['sports', 'health', 'business', 'entertainment', 'technology']

x = []
y = []

for article in all_articles:
Пример #3
0
    def __init__(self, text_df):
        """Keep ``text_df`` on the instance and create a default Stempel stemmer."""
        self.text_df = text_df
        self.stemmer = StempelStemmer.default()
Пример #4
0
import spacy
import platform
import functools
import KeyExt.config
from keybert import KeyBERT
from string import punctuation
from nltk.stem import SnowballStemmer
from stempel import StempelStemmer

# Language-name -> stemmer lookup, built a single time when the module loads.
# The Snowball-backed languages share one constructor; Polish uses Stempel.
stemmers = {
    language: SnowballStemmer(language)
    for language in ('english', 'french', 'spanish', 'portuguese')
}
stemmers['polish'] = StempelStemmer.default()


def load_models():
    """
    Load the spaCy 'en_core_web_sm' pipeline and the KeyBERT model.

    Returns a 2-tuple ``(spacy_pipeline, keybert_model)``. Per the original
    author's note, the models take a few seconds to load, so this is meant
    to be called once.
    """
    nlp_model = spacy.load('en_core_web_sm')
    keyword_model = KeyBERT('distiluse-base-multilingual-cased-v2')
    return nlp_model, keyword_model


def preprocess(lis, language):
    """
    Function which applies stemming to a