def stem_using_stempel(self, stem_type="default", words=None):
    # Pick the requested Stempel dictionary backend.
    if stem_type == "polimorf":
        stemmer = StempelStemmer.polimorf()
    else:
        stemmer = StempelStemmer.default()
    # Fall back to the instance's own word list when none is given.
    if words is None:
        words = self.words
    stem_words = [stemmer.stem(w) for w in words]
    return stem_words
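A minimal sketch of the underlying StempelStemmer API that this method wraps; the sample Polish words are illustrative only and not taken from the original source.

from stempel import StempelStemmer

default_stemmer = StempelStemmer.default()    # bundled default dictionary
polimorf_stemmer = StempelStemmer.polimorf()  # Polimorf-based dictionary

for word in ["książkami", "pisałem", "najlepszego"]:
    # Each stemmer maps an inflected form to its stem (or None for unknown tokens).
    print(word, default_stemmer.stem(word), polimorf_stemmer.stem(word))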
import json
import nltk
from stempel import StempelStemmer

# Tokenize every article description and collect raw tokens, texts and categories.
with open('resources/generated/news_data.json', 'r') as f:
    data = json.load(f)

all_words = []
all_articles = []
all_categories = []
for article in data['articles']:
    art = str(article['description']).lower()
    tokens = nltk.wordpunct_tokenize(art)
    all_words.extend(tokens)
    all_articles.append(art)
    all_categories.append(article['category'])

# Stem and deduplicate the vocabulary; StempelStemmer can return None for some tokens,
# so drop None before sorting instead of calling remove() unconditionally.
stemmer = StempelStemmer.default()
all_words = [stemmer.stem(word) for word in all_words]
all_words = sorted(set(all_words) - {None})

# Persist the vocabulary that defines the network's input layer.
with open('resources/generated/input_layer_words.txt', 'w') as datafile:
    json.dump(all_words, datafile)

unique_categories = ['sports', 'health', 'business', 'entertainment', 'technology']
x = []
y = []
for article in all_articles:
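The loop above is truncated in the source. Separately from that, here is a hedged sketch of how a single article could be encoded against the saved vocabulary as a binary bag-of-words vector; the helper name encode_article and the sample sentence are assumptions, not the original continuation.

import json
import nltk
from stempel import StempelStemmer

def encode_article(text, vocabulary, stemmer):
    """Return a 0/1 vector with one position per vocabulary word."""
    stems = {stemmer.stem(token) for token in nltk.wordpunct_tokenize(text.lower())}
    return [1 if word in stems else 0 for word in vocabulary]

with open('resources/generated/input_layer_words.txt', 'r') as datafile:
    vocabulary = json.load(datafile)

vector = encode_article("Reprezentacja Polski wygrała mecz", vocabulary, StempelStemmer.default())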
def __init__(self, text_df):
    self.text_df = text_df
    self.stemmer = StempelStemmer.default()
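A hedged sketch of the kind of class this constructor could belong to; the class name TextPreprocessor, the 'text' column, and the stem_column method are assumptions made for illustration.

import pandas as pd
from stempel import StempelStemmer

class TextPreprocessor:
    def __init__(self, text_df):
        self.text_df = text_df
        self.stemmer = StempelStemmer.default()

    def stem_column(self, column='text'):
        # Stem every whitespace-separated token in the given column.
        return self.text_df[column].apply(
            lambda text: [self.stemmer.stem(tok) for tok in str(text).split()]
        )

df = pd.DataFrame({'text': ['Ala ma kota', 'Koty lubią mleko']})
print(TextPreprocessor(df).stem_column())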
import spacy
import platform
import functools
import KeyExt.config
from keybert import KeyBERT
from string import punctuation
from nltk.stem import SnowballStemmer
from stempel import StempelStemmer

# Initialize all required stemmers once.
stemmers = {
    'english': SnowballStemmer('english'),
    'french': SnowballStemmer('french'),
    'spanish': SnowballStemmer('spanish'),
    'portuguese': SnowballStemmer('portuguese'),
    'polish': StempelStemmer.default()
}


def load_models():
    """
    Load the English spaCy model and the KeyBERT model.
    This needs to run once, since all models take a few seconds to load.
    """
    return (
        spacy.load('en_core_web_sm'),
        KeyBERT('distiluse-base-multilingual-cased-v2')
    )


def preprocess(lis, language):
    """
    Function which applies stemming to a
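The preprocess function is cut off in the source. As a separate, hedged sketch (not the original KeyExt implementation), this shows how the stemmers dict defined above could be used to stem a list of tokens for a given language; stem_tokens is a hypothetical helper name.

def stem_tokens(tokens, language):
    stemmer = stemmers[language]
    if language == 'polish':
        # StempelStemmer may return None for unknown tokens, so fall back to the token itself.
        return [stemmer.stem(tok) or tok for tok in tokens]
    return [stemmer.stem(tok) for tok in tokens]

print(stem_tokens(['running', 'cats'], 'english'))
print(stem_tokens(['książkami', 'kotami'], 'polish'))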