Example #1
    def __init__(self):
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        self.ps = EnglishStemmer()
Example #2
import dask.dataframe as dd  # dd.read_csv / dd.DataFrame are used below
import string
import spacy
import transformers
import tqdm
import re
import logging
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer

logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

nltk.download("punkt")
lemmatizer = WordNetLemmatizer()
stemmer = EnglishStemmer()

# inspired by
# https://prrao87.github.io/blog/spacy/nlp/performance/2020/05/02/spacy-multiprocess.html
pattern = re.compile(r"[A-Za-z0-9\-]{3,25}")
nlp = spacy.load("en_core_web_trf",
                 disable=["parser", "ner", "tagger", "attribute_ruler"])
# nlp.add_pipe("sentencizer")
stopwords = nlp.Defaults.stop_words
summarizer = transformers.pipeline("summarization")


def load_data_blocks(huge=False) -> dd.DataFrame:
    data_blocks = dd.read_csv(
        "all-the-news-2-1.csv" if huge else "articles*.csv",
        blocksize="8MB",
Example #3
X_train_feat = vect.fit_transform(X_train, y_train)
X_train_feat = X_train_feat.toarray()

### Label Encoder

with open('./label_encoder.p', 'rb') as pickle_in:
    le = pickle.load(pickle_in)

### Stop words and stemmer

with open('./stopwords.p', 'rb') as f:
    stop_words = pickle.load(f)
with open('./whitelist_dicts/semantic_words_py34.p', 'rb') as f:
    semantic_words = pickle.load(f)
porter = PorterStemmer()
snowball = EnglishStemmer()

### Tokenizer

tokenizer = lambda text: text.split()
tokenizer_porter = lambda text: [porter.stem(word) for word in text.split()]
tokenizer_snowball = lambda text: [snowball.stem(word) for word in text.split()]
tokenizer_whitelist = lambda text: [word for word in text.split() if word in semantic_words]
tokenizer_porter_wl = lambda text: [porter.stem(word) for word in text.split() if word in semantic_words]
tokenizer_snowball_wl = lambda text: [snowball.stem(word) for word in text.split() if word in semantic_words]

### Vectorizer

vect = TfidfVectorizer(binary=False,
                       stop_words=stop_words,
                       ngram_range=(1,1),
Example #4
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import nltk
import string
import re

nltk.download('stopwords')
nltk.download('punkt')  # word_tokenize below needs the punkt tokenizer data
ps = EnglishStemmer()


def data_tokenize_clean(text):
    text = text.translate(str.maketrans(
        '', '', string.punctuation))  # remove punctuations
    tokens = word_tokenize(text)  # tokenize the comment
    filtered = set()

    for word in tokens:
        word = ps.stem(word)
        word = word.lower()

        filtered.add(word)

    return filtered


class DataCleaner:
    def __init__(self):
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        self.ps = EnglishStemmer()
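
A quick usage sketch for the data_tokenize_clean helper above, assuming the imports and downloads from this example; the sample sentence and the printed stems are illustrative only, since the exact output depends on the Snowball rules.

tokens = data_tokenize_clean("The cats are running")
print(tokens)  # a set of lowercased stems, roughly {'the', 'cat', 'are', 'run'}
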
Example #5
def stemmed_words_count(doc):
    stemmer = EnglishStemmer()
    analyzer = CountVectorizer().build_analyzer()
    return (stemmer.stem(w) for w in analyzer(doc))
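
A hedged usage sketch for the analyzer above: scikit-learn's CountVectorizer accepts a callable analyzer, so the function can be plugged in directly. The imports and sample documents here are illustrative, not part of the original example.

from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer

docs = ["The cats were running", "A cat runs"]        # made-up documents
vect = CountVectorizer(analyzer=stemmed_words_count)  # plug in the stemming analyzer
X = vect.fit_transform(docs)
print(vect.get_feature_names_out())                   # stemmed vocabulary, e.g. 'cat', 'run'
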
Example #6
def stemmed_words_tfidf(doc):
    stemmer = EnglishStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return (stemmer.stem(w) for w in analyzer(doc))
Example #7
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

import helpers
import hash_replacers

import nltk  # needed for nltk.word_tokenize in tokenize() below
import pickle


# =========================================================================== #
# ============================ GLOBAL VARIABLES ============================= #
# =========================================================================== #

STEMMER = EnglishStemmer(ignore_stopwords=True)


# =========================================================================== #
# ================================ FUNCTIONS ================================ #
# =========================================================================== #

def tokenize(text):
    """
    Tokenize passed text by replacing repeating letters and stemming.
    :param text: string to be tokenized
    :return: tokenized and pre-processed text
    """
    # Tokenize
    tokens = nltk.word_tokenize(text)    
    
Example #8
	def token_file(self, folder_path):
		# file = open("tf.txt", "a")
		tokenizer = RegexpTokenizer(r'\w+')
		stemmer = EnglishStemmer()
		wordsdict = {}
		i = 0
		word_position = 0

		body = ""
		title = ""
		h1 = ""
		h2 = ""
		h3 = ""
		bold = ""
		strong = ""
		with open(folder_path) as f:
			data = f.read()
		# the <html> body text below also repeats the title, h1, h2, h3, b and strong text
		for i in BeautifulSoup(data, "lxml").find_all("html"):
			body = i.text + body + " "
		body_t = tokenizer.tokenize(body)
		# the title in this project is wrapped in <p></p> tags; only the first one is counted
		for i in BeautifulSoup(data, "lxml").find_all("p"):
			title = i.text + title + " "
			break
		title_t = tokenizer.tokenize(title)
		for i in BeautifulSoup(data, "lxml").find_all("h1"):
			h1 = i.text + h1 + " "
		h1_t = tokenizer.tokenize(h1)
		for i in BeautifulSoup(data, "lxml").find_all("h2"):
			h2 = i.text + h2 + " "
		h2_t = tokenizer.tokenize(h2)
		for i in BeautifulSoup(data, "lxml").find_all("h3"):
			h3 = i.text + h3 + " "
		h3_t = tokenizer.tokenize(h3)
		for i in BeautifulSoup(data, "lxml").find_all("b"):
			bold = i.text + bold + " "
		bold_t = tokenizer.tokenize(bold)
		for i in BeautifulSoup(data, "lxml").find_all("strong"):
			strong = i.text + strong + " "
		strong_t = tokenizer.tokenize(strong)


		print("Folder: ", folder_path)
		
		for word in body_t:
			if word.lower() in wordsdict.keys():
				wordsdict[word.lower()][0] += 1
				wordsdict[word.lower()].append(word_position)
				word_position += 1
			else:
				wordsdict[word.lower()] = []
				wordsdict[word.lower()].append(1)
				wordsdict[word.lower()].append(word_position)
				word_position += 1

		counter = 0
		if len(title_t) > 0:		
			for word in title_t:
				wordsdict[word.lower()][0] += 9
				counter += 1
				if counter == 10:
					break

		if len(h1_t) > 0:
			for word in h1_t:
				wordsdict[word.lower()][0] += 4
		if len(h2_t) > 0:
			for word in h2_t:
				wordsdict[word.lower()][0] += 4
		if len(h3_t) > 0:
			for word in h3_t:
				wordsdict[word.lower()][0] += 4
		if len(bold_t) > 0:
			for word in bold_t:
				wordsdict[word.lower()][0] += 2
		if len(strong_t) > 0:
			for word in strong_t:
				wordsdict[word.lower()][0] += 2

		return wordsdict
Example #9
import nltk
from nltk import word_tokenize
from collections import defaultdict
from nltk.stem.snowball import EnglishStemmer  # Assuming we're working with English
import pandas as pd
from tqdm import tqdm
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import ntpath

from indexing import path_leaf, Index

index = Index(nltk.word_tokenize, EnglishStemmer(),
              nltk.corpus.stopwords.words('english'))

corpus_train = [
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/train/storyzy_en_train.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/train/storyzy_fr_train.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/train/storyzy_yt_train.tsv"
]

corpus_test1 = [
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test1/storyzy_en_test1.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test1/storyzy_fr_test1.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test1/storyzy_yt_test1.tsv"
]

corpus_test2 = [
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test2/storyzy_en_test2.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test2/storyzy_fr_test2.tsv",
    "/home/thiziri/Documents/DOCTORAT/EVENTS/HACKATON_CORIA18/test2/storyzy_yt_test2.tsv"
Example #10
class Sentiment:
    mwe_tknzr = MWETokenizer(separator=' ') # Multi-word-expression tokenizer
    stemmer = EnglishStemmer() # Word stemmer
    scores = {} # dict of words and their scores

    def __init__(self, corpus):
        self.load_data(corpus)

    
    def check_mwe(self, word):
        """If there are multiple words in the line, add the multi-word-expression to the tokenizer"""
        tokens = word.split()
        if len(tokens) != 1:
            self.mwe_tknzr.add_mwe(tokens)


    def load_data(self, directory):
        """Extract words and their scores from the desired corpus"""
        # Read data in from files
        with open(os.path.join(directory, 'positives.txt')) as f:
            for line in f.read().splitlines():
                
                row = line.rstrip('\n').split(',')
                word = row[0]
                self.check_mwe(word)
                intensity = float(row[-1])
                
                self.scores[word] = intensity

        with open(os.path.join(directory, 'negatives.txt')) as f:
            for line in f.read().splitlines():

                row = line.rstrip('\n').split(',')
                word = row[0]
                self.check_mwe(word)
                intensity = float(row[-1])
                
                self.scores[word] = intensity


    def extract_words(self, sentence):
        """Convert a sentence into a set of tokens taking in to account multi-word-expressions"""
        # Create simple tokens of all words in the sentence
        words = [word.lower() for word in nltk.word_tokenize(sentence) if any(c.isalpha() for c in word)]
        # Split the tokens into multi-word-expressions, if any
        tokens = self.mwe_tknzr.tokenize(words)
        # print(tokens)
        return set(tokens)


    def compute_score(self, sentence):
        """Calculate the sentiment score for the given sentence"""
        document_words = self.extract_words(sentence)
        if not document_words:
            # No alphabetic tokens were found, so treat the sentence as neutral
            return 0
        score = 0
        for word in document_words:
            grade = self.scores.get(word.lower(), 0)
            if grade == 0:
                # If the word isn't in the scores dict, try its stemmed form
                # (cars becomes car, abandoned becomes abandon, etc.)
                grade = self.scores.get(self.stemmer.stem(word.lower()), 0)
            score += grade
        # Average the per-word grades to keep the score roughly in a -1 to 1 range
        score = score / len(document_words)
        # print(score)
        return score


    def get_sentiment(self, sentence):
        """Classify the sentence to be positive or negative"""
        score = self.compute_score(sentence)
        if score > 0:
            return ("Positive", score)
        else:
            return ("Negative", score)
    
    def start(self):
        print("Keep entering sentences to get a sentiment estimation from the AI")
        print("Type 'exit' to quit")
        while True:
            s = input("Sentence: ")
            if s == 'exit':
                break
            print(self.get_sentiment(s))
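
A small usage sketch for the Sentiment class above. The "lexicon" directory name is hypothetical; per load_data, it must contain positives.txt and negatives.txt with comma-separated word,score lines.

analyzer = Sentiment("lexicon")                        # hypothetical corpus directory
print(analyzer.get_sentiment("What a wonderful day"))  # ("Positive", score) if the words are in the lexicon
analyzer.start()                                       # interactive loop; type 'exit' to quit
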
Example #11
text_rem = [x for x in text_3 if x not in text_4]

## We're going to use a similar format to apply various stemming/lemmatizing/synonym algorithms

from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()

from nltk.stem import PorterStemmer

pt = PorterStemmer()

from nltk.stem.snowball import EnglishStemmer

sb = EnglishStemmer()

from nltk.stem.wordnet import WordNetLemmatizer

wn = WordNetLemmatizer()

## Let's examine the word 'better'
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')

wn.lemmatize('families', 'n')

##
## Applying the Porter stemmer to the Gettysburg Address
Example #12
    Returns the final performance on a held-out test set that was used neither for training nor for validation
    """
    text, target, classes, classes_reverse = recover_from_files(eval_filepath)

    # Convert class to numeric
    y_test = [classes_reverse[x] for x in target]

    # Vectorize data
    x_test = pipe.transform(text)

    print("Final test")
    for key in scoring:
        print(key, scoring[key](clf, x_test, y_test))


stemmer = EnglishStemmer(ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """
    A CountVectorizer variant that applies stemming to the words, reducing the vocabulary size
    """
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]


if __name__ == "__main__":
    text, target, classes, classes_reverse = recover_from_files(
        design_filepath)
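
A minimal usage sketch for the StemmedCountVectorizer defined above, with made-up documents; it relies on the module-level stemmer from this example.

docs = ["The systems were failing", "A system fails"]  # illustrative documents
vect = StemmedCountVectorizer(stop_words="english")
X = vect.fit_transform(docs)
print(vect.get_feature_names_out())                    # stemmed vocabulary, e.g. 'fail', 'system'
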
Example #13
    def __init__(self):
        self.__snowball = EnglishStemmer()
        self.counts = defaultdict(lambda: defaultdict(int))
Example #14
    def apply(self, df):
        stemmer = EnglishStemmer()
        return pd.DataFrame([stemmer.stem(text) for text in df.values.ravel()],
                            columns=df.columns)
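
To illustrate what the apply method above does, here is a hedged, self-contained sketch that wraps it in a hypothetical StemColumn class; the sample words are invented.

import pandas as pd
from nltk.stem.snowball import EnglishStemmer

class StemColumn:
    def apply(self, df):
        stemmer = EnglishStemmer()
        return pd.DataFrame([stemmer.stem(text) for text in df.values.ravel()],
                            columns=df.columns)

df = pd.DataFrame({"word": ["running", "flies", "better"]})
print(StemColumn().apply(df))  # each cell replaced by its stem, e.g. 'running' -> 'run'
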
Example #15
from nltk.stem.snowball import FrenchStemmer, EnglishStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import RegexpTokenizer
#from nltk.tokenize import ToktokTokenizer

from nltk.probability import FreqDist

#import lib to detect language
import utils.nltk_detect_lang as nltk_detect_lang
import utils.nltk_common as nltk_common

import utils.tok as tok

# Language-specific stemmers
stemmer_fr = FrenchStemmer()
stemmer_en = EnglishStemmer()

# Load the tokenizer
# Choose whichever is most efficient; WordPunct works well in practice
#tokenizer = TreebankWordTokenizer()
#tokenizer = WordPunctTokenizer()
#tokenizer = RegexpTokenizer(r'\w+')
#tokenizer = ToktokTokenizer()
tokenizer = tok.ToktokTokenizer()


# Get a sorted list of repeated words
def freqDist(tokens):
    dist = {}
    for t in tokens:
        if len(t) == 1: continue