import string
import re
import logging

import spacy
import transformers
import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
import dask.dataframe as dd  # needed for dd.read_csv / the dd.DataFrame return type below

logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

nltk.download("punkt")
lemmatizer = WordNetLemmatizer()
stemmer = EnglishStemmer()

# inspired by
# https://prrao87.github.io/blog/spacy/nlp/performance/2020/05/02/spacy-multiprocess.html
pattern = re.compile(r"[A-Za-z0-9\-]{3,25}")
nlp = spacy.load("en_core_web_trf",
                 disable=["parser", "ner", "tagger", "attribute_ruler"])
# nlp.add_pipe("sentencizer")
stopwords = nlp.Defaults.stop_words
summarizer = transformers.pipeline("summarization")


def load_data_blocks(huge=False) -> dd.DataFrame:
    data_blocks = dd.read_csv(
        "all-the-news-2-1.csv" if huge else "articles*.csv",
        blocksize="8MB",
import pickle
from nltk.stem import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

X_train_feat = vect.fit_transform(X_train, y_train)
X_train_feat = X_train_feat.toarray()

### Label Encoder
with open('./label_encoder.p', 'rb') as pickle_in:
    le = pickle.load(pickle_in)

### Stop words and stemmer
stop_words = pickle.load(open('./stopwords.p', 'rb'))
semantic_words = pickle.load(open('./whitelist_dicts/semantic_words_py34.p', 'rb'))
porter = PorterStemmer()
snowball = EnglishStemmer()

### Tokenizers
tokenizer = lambda text: text.split()
tokenizer_porter = lambda text: [porter.stem(word) for word in text.split()]
tokenizer_snowball = lambda text: [snowball.stem(word) for word in text.split()]
tokenizer_whitelist = lambda text: [word for word in text.split() if word in semantic_words]
tokenizer_porter_wl = lambda text: [porter.stem(word) for word in text.split() if word in semantic_words]
tokenizer_snowball_wl = lambda text: [snowball.stem(word) for word in text.split() if word in semantic_words]

### Vectorizer
vect = TfidfVectorizer(binary=False, stop_words=stop_words, ngram_range=(1, 1),
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import nltk
import string
import re

nltk.download('stopwords')
nltk.download('punkt')  # word_tokenize needs the punkt tokenizer models
ps = EnglishStemmer()


def data_tokenize_clean(text):
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    tokens = word_tokenize(text)  # tokenize the comment
    filtered = set()
    for word in tokens:
        word = ps.stem(word)
        word = word.lower()
        filtered.add(word)
    return filtered


class DataCleaner:
    def __init__(self):
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        self.ps = EnglishStemmer()
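A minimal usage sketch for data_tokenize_clean, assuming the punkt and stopwords data have been downloaded as above; the sample sentence is made up for illustration:

# Hypothetical example: stems and lowercases each token after stripping punctuation.
tokens = data_tokenize_clean("Cats are running faster than the dogs!")
print(tokens)  # a set of stemmed, lowercased tokens, e.g. {'cat', 'run', 'dog', ...}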
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def stemmed_words_count(doc):
    """Analyzer callable: tokenize with CountVectorizer's default analyzer, then stem."""
    stemmer = EnglishStemmer()
    analyzer = CountVectorizer().build_analyzer()
    return (stemmer.stem(w) for w in analyzer(doc))


def stemmed_words_tfidf(doc):
    """Analyzer callable: tokenize with TfidfVectorizer's default analyzer, then stem."""
    stemmer = EnglishStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return (stemmer.stem(w) for w in analyzer(doc))
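A quick sketch of how these analyzer callables might be plugged into scikit-learn vectorizers, assuming a small in-memory corpus (the example documents are made up):

# Hypothetical usage: pass the callable as the vectorizer's analyzer.
docs = ["the cats are running", "a cat ran home"]
counts = CountVectorizer(analyzer=stemmed_words_count).fit_transform(docs)
tfidf = TfidfVectorizer(analyzer=stemmed_words_tfidf).fit_transform(docs)
# Both matrices are built over stemmed tokens, so "cats" and "cat" share one column.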
import pickle

import nltk
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

import helpers
import hash_replacers

# =========================================================================== #
# ============================ GLOBAL VARIABLES ============================= #
# =========================================================================== #
STEMMER = EnglishStemmer(ignore_stopwords=True)

# =========================================================================== #
# ================================ FUNCTIONS ================================ #
# =========================================================================== #
def tokenize(text):
    """
    Tokenize passed text by replacing repeating letters and stemming.

    :param text: string to be tokenized
    :return: tokenized and pre-processed text
    """
    # Tokenize
    tokens = nltk.word_tokenize(text)
def token_file(self, folder_path):
    # file = open("tf.txt", "a")
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = EnglishStemmer()
    wordsdict = {}
    i = 0
    word_position = 0
    body = ""
    title = ""
    h1 = ""
    h2 = ""
    h3 = ""
    bold = ""
    strong = ""
    data = open(folder_path).read()

    # it counts title, h1, h2, h3, b, strong again
    for i in BeautifulSoup(data, "lxml").find_all("html"):
        body = i.text + body + " "
    body_t = tokenizer.tokenize(body)

    # title in this project is quoted by <P> </P>, only counts the first one
    for i in BeautifulSoup(data, "lxml").find_all("p"):
        title = i.text + title + " "
        break
    title_t = tokenizer.tokenize(title)

    for i in BeautifulSoup(data, "lxml").find_all("h1"):
        h1 = i.text + h1 + " "
    h1_t = tokenizer.tokenize(h1)

    for i in BeautifulSoup(data, "lxml").find_all("h2"):
        h2 = i.text + h2 + " "
    h2_t = tokenizer.tokenize(h2)

    for i in BeautifulSoup(data, "lxml").find_all("h3"):
        h3 = i.text + h3 + " "  # fixed: accumulate h3 text, not h1
    h3_t = tokenizer.tokenize(h3)

    for i in BeautifulSoup(data, "lxml").find_all("b"):
        bold = i.text + bold + " "
    bold_t = tokenizer.tokenize(bold)

    for i in BeautifulSoup(data, "lxml").find_all("strong"):
        strong = i.text + strong + " "
    strong_t = tokenizer.tokenize(strong)

    print("Folder: ", folder_path)

    # Base term frequencies and positions from the document body
    for word in body_t:
        if word.lower() in wordsdict.keys():
            wordsdict[word.lower()][0] += 1
            wordsdict[word.lower()].append(word_position)
            word_position += 1
        else:
            wordsdict[word.lower()] = []
            wordsdict[word.lower()].append(1)
            wordsdict[word.lower()].append(word_position)
            word_position += 1

    # Boost words that also appear in the title / headings / emphasis tags
    counter = 0
    if len(title_t) > 0:
        for word in title_t:
            wordsdict[word.lower()][0] += 9
            counter += 1
            if counter == 10:
                break
    if len(h1_t) > 0:
        for word in h1_t:
            wordsdict[word.lower()][0] += 4
    if len(h2_t) > 0:
        for word in h2_t:
            wordsdict[word.lower()][0] += 4
    if len(h3_t) > 0:
        for word in h3_t:
            wordsdict[word.lower()][0] += 4
    if len(bold_t) > 0:
        for word in bold_t:
            wordsdict[word.lower()][0] += 2
    if len(strong_t) > 0:
        for word in strong_t:
            wordsdict[word.lower()][0] += 2
    return wordsdict
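A hedged sketch of invoking token_file, assuming it lives on an indexer class (called Indexer here purely as a placeholder) and that sample.html is a local HTML file:

# Hypothetical usage: weight-boosted term frequencies for one document.
weights = Indexer().token_file("sample.html")  # Indexer is a made-up class name
# Each entry maps a lowercased term to [boosted count, position, position, ...]
for term, info in sorted(weights.items(), key=lambda kv: kv[1][0], reverse=True)[:10]:
    print(term, info[0])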
import nltk
from nltk import word_tokenize
from collections import defaultdict
from nltk.stem.snowball import EnglishStemmer  # Assuming we're working with English
import pandas as pd
from tqdm import tqdm
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import ntpath
from indexing import path_leaf, Index

index = Index(nltk.word_tokenize,
              EnglishStemmer(),
              nltk.corpus.stopwords.words('english'))

corpus_train = [
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/train/storyzy_en_train.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/train/storyzy_fr_train.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/train/storyzy_yt_train.tsv"
]
corpus_test1 = [
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test1/storyzy_en_test1.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test1/storyzy_fr_test1.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test1/storyzy_yt_test1.tsv"
]
corpus_test2 = [
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test2/storyzy_en_test2.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test2/storyzy_fr_test2.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test2/storyzy_yt_test2.tsv"
import os

import nltk
from nltk.tokenize import MWETokenizer
from nltk.stem.snowball import EnglishStemmer


class Sentiment:
    mwe_tknzr = MWETokenizer(separator=' ')  # Multi-word-expression tokenizer
    stemmer = EnglishStemmer()               # Word stemmer
    scores = {}                              # dict of words and their scores

    def __init__(self, corpus):
        self.load_data(corpus)

    def check_mwe(self, word):
        """If there are multiple words in the line, add the multi-word expression to the tokenizer."""
        tokens = word.split()
        if len(tokens) != 1:
            self.mwe_tknzr.add_mwe(tokens)

    def load_data(self, directory):
        """Extract words and their scores from the desired corpus."""
        # Read data in from files
        with open(os.path.join(directory, 'positives.txt')) as f:
            for line in f.read().splitlines():
                row = line.rstrip('\n').split(',')
                word = row[0]
                self.check_mwe(word)
                intensity = float(row[-1])
                self.scores[word] = intensity
        with open(os.path.join(directory, 'negatives.txt')) as f:
            for line in f.read().splitlines():
                row = line.rstrip('\n').split(',')
                word = row[0]
                self.check_mwe(word)
                intensity = float(row[-1])
                self.scores[word] = intensity

    def extract_words(self, sentence):
        """Convert a sentence into a set of tokens, taking multi-word expressions into account."""
        # Create simple tokens of all words in the sentence
        words = [word.lower() for word in nltk.word_tokenize(sentence)
                 if any(c.isalpha() for c in word)]
        # Merge the tokens into multi-word expressions, if any
        tokens = self.mwe_tknzr.tokenize(words)
        # print(tokens)
        return set(tokens)

    def compute_score(self, sentence):
        """Calculate the sentiment score for the given sentence."""
        document_words = self.extract_words(sentence)
        if not document_words:  # avoid division by zero on sentences with no alphabetic tokens
            return 0
        score = 0
        for word in document_words:
            grade = self.scores.get(word.lower(), 0)
            if grade == 0:
                # If the word isn't in the scores dict, try the stemmed version
                # (cars becomes car, abandoned becomes abandon, etc.)
                grade = self.scores.get(self.stemmer.stem(word.lower()), 0)
            score += grade
        # Convert the score into a -1 to 1 scale
        score = score / len(document_words)
        # print(score)
        return score

    def get_sentiment(self, sentence):
        """Classify the sentence as positive or negative."""
        score = self.compute_score(sentence)
        if score > 0:
            return ("Positive", score)
        else:
            return ("Negative", score)

    def start(self):
        print("Keep entering sentences to get a sentiment estimation from the AI")
        print("Type 'exit' to quit")
        while True:
            s = input("Sentence: ")
            if s == 'exit':
                break
            print(self.get_sentiment(s))
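A brief usage sketch, assuming a directory (here called data/, an illustrative name) containing the positives.txt and negatives.txt files in the word,score layout that load_data expects:

# Hypothetical usage of the Sentiment class above.
sentiment = Sentiment('data')
label, score = sentiment.get_sentiment("I really enjoyed this movie")
print(label, score)
# sentiment.start()  # or run the interactive prompt loop instead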
text_rem = [x for x in text_3 if x not in text_4]

## we're going to use a similar format to apply various stemming/lemmatizing/synonym algorithms
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.stem import PorterStemmer
pt = PorterStemmer()
from nltk.stem.snowball import EnglishStemmer
sb = EnglishStemmer()
from nltk.stem.wordnet import WordNetLemmatizer
wn = WordNetLemmatizer()

## let's examine the word 'better'
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')
wn.lemmatize('families', 'n')
##
## applying the Porter stemmer to the Gettysburg Address
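For reference, a hedged sketch of what the last few calls typically return in NLTK: the rule-based stemmers leave the irregular comparative untouched, while the POS-aware WordNet lemmatizer maps it to its lemma.

# Typical outputs (requires nltk.download('wordnet') for the lemmatizer).
print(pt.stem('better'))              # 'better'
print(sb.stem('better'))              # 'better'
print(wn.lemmatize('better', 'a'))    # 'good'
print(wn.lemmatize('families', 'n'))  # 'family'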
    Returns the final performance on a held-out test set that was used
    neither for training nor for validation.
    """
    text, target, classes, classes_reverse = recover_from_files(eval_filepath)
    # Convert class labels to numeric
    y_test = [classes_reverse[x] for x in target]
    # Vectorize data
    x_test = pipe.transform(text)
    print("Final test")
    for key in scoring:
        print(key, scoring[key](clf, x_test, y_test))


stemmer = EnglishStemmer(ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """
    Version of CountVectorizer that applies stemming to the words,
    reducing the size of the vocabulary.
    """

    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]


if __name__ == "__main__":
    text, target, classes, classes_reverse = recover_from_files(
        design_filepath)
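A minimal sketch of how StemmedCountVectorizer could be used, assuming a small in-memory corpus (the documents below are made up):

# Hypothetical example: the stemmed analyzer folds inflected forms together.
docs = ["the runners were running", "a runner runs"]
vec = StemmedCountVectorizer()
X = vec.fit_transform(docs)
print(sorted(vec.vocabulary_))  # stemmed terms, e.g. ['run', 'runner', 'the', 'were']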
def __init__(self):
    self.__snowball = EnglishStemmer()
    self.counts = defaultdict(lambda: defaultdict(int))
def apply(self, df):
    # Stems each cell value as a single token; assumes df holds one text column,
    # since the flattened values are rebuilt with the original column labels.
    stemmer = EnglishStemmer()
    return pd.DataFrame([stemmer.stem(text) for text in df.values.ravel()],
                        columns=df.columns)
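A quick sketch of calling this transform, assuming the enclosing class exposes apply() as written and the frame has a single column of word tokens (the class and column names here are made up):

# Hypothetical usage: stem a one-column DataFrame of tokens.
import pandas as pd
tokens = pd.DataFrame({"token": ["running", "cats", "easily"]})
stemmed = SomeStemTransform().apply(tokens)  # SomeStemTransform is a placeholder name
print(stemmed["token"].tolist())             # e.g. ['run', 'cat', 'easili']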
from nltk.stem.snowball import FrenchStemmer, EnglishStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import RegexpTokenizer
#from nltk.tokenize import ToktokTokenizer
from nltk.probability import FreqDist

# import libs to detect language
import utils.nltk_detect_lang as nltk_detect_lang
import utils.nltk_common as nltk_common
import utils.tok as tok

# named stemmers
stemmer_fr = FrenchStemmer()
stemmer_en = EnglishStemmer()

# Load tokenizer
# You can choose whichever is most efficient; WordPunctTokenizer also works well
#tokenizer = TreebankWordTokenizer()
#tokenizer = WordPunctTokenizer()
#tokenizer = RegexpTokenizer(r'\w+')
#tokenizer = ToktokTokenizer()
tokenizer = tok.ToktokTokenizer()


# Get a sorted list of frequently occurring words
def freqDist(tokens):
    dist = {}
    for t in tokens:
        if len(t) == 1:
            continue