def vocab_features_X(word_lsts, word2Id, num_of_annotators=1):
    # Convert each sentence to a row of stemmed-word ids (-1 for out-of-vocabulary
    # words) and repeat the row once per annotator.
    x = []
    for i in range(len(word_lsts)):
        row = [stemmer().stem(word.lower()) for word in word_lsts[i]]
        row = [word2Id[word] if word in word2Id else -1 for word in row]
        x += [row] * num_of_annotators
    return x
def get_bio_freqs_word(bio_freqs, word_lsts):
    # Sum the two "|"-separated counts of every token and accumulate them per
    # stemmed, lower-cased word.
    bio_freqs_word = defaultdict(int)
    for i in range(len(bio_freqs)):
        for j in range(len(bio_freqs[i])):
            bio = bio_freqs[i][j].split("|")
            word_stem = stemmer().stem(word_lsts[i][j].lower())
            bio_freqs_word[word_stem] += int(bio[0]) + int(bio[1])
    return bio_freqs_word
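# Hedged usage sketch for get_bio_freqs_word (not from the original source).
# It assumes each bio_freqs entry is a "count|count" string, inferred only from
# the split("|") above, and that `stemmer` is a no-argument stemmer class such
# as NLTK's PorterStemmer, since the helper instantiates it as stemmer().
from collections import defaultdict
from nltk.stem.porter import PorterStemmer as stemmer

word_lsts = [["New", "York", "city"]]
bio_freqs = [["3|1", "0|4", "0|0"]]
print(get_bio_freqs_word(bio_freqs, word_lsts))
# e.g. defaultdict(<class 'int'>, {'new': 4, 'york': 4, 'citi': 0})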
def setUp(self):
    # Shared test fixtures: preprocessed documents, vocabulary and term frequencies.
    self.df = pd.read_csv('./tests/data.csv')
    self.st_words = stopwords.words('english')
    self.stemmer = stemmer('english')
    self.tmp_docs = [
        txtp.full_preprocess(d, set(self.st_words)) for d in self.df.values
    ]
    self.v = txtp.preproc_cpp.generate_vocab(self.tmp_docs)
    self.tf = txtp.preproc_cpp.tf_cpp(self.tmp_docs, self.v)
def get_bold_occurrences_word(word_vocab, word_probs):
    """Count, per stemmed word, how often it is the highest-probability word of a sentence."""
    bold_occurrences_word = defaultdict(int)
    for i in range(len(word_vocab)):
        max_indx, max_value = -1, 0
        for j in range(len(word_vocab[i])):
            word_prob = float(word_probs[i][j])
            if max_value < word_prob:
                max_value, max_indx = word_prob, j
        bold_occurrences_word[stemmer().stem(word_vocab[i][max_indx].lower())] += 1
    return bold_occurrences_word
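# Hedged toy example for get_bold_occurrences_word (not part of the original
# code). The nested per-sentence layout of word_vocab/word_probs and the
# no-argument stemmer() (e.g. PorterStemmer) are assumptions.
from collections import defaultdict
from nltk.stem.porter import PorterStemmer as stemmer

word_vocab = [["good", "movie"], ["bad", "plot"]]
word_probs = [["0.2", "0.8"], ["0.9", "0.1"]]
print(get_bold_occurrences_word(word_vocab, word_probs))
# e.g. defaultdict(<class 'int'>, {'movi': 1, 'bad': 1})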
def AvaliarConteudo(self, regras, mensagem):
    stem = stemmer()
    mensagem_f = [stem.stem(msg) for msg in Resposta.__flatten_tupla(mensagem)]
    print(regras)
    print(mensagem)
    # Drop the wildcard rules
    regras_validas = [regra for regra in regras if regra[0] != "*"]
    palavras = []
    for regra in regras_validas:
        for palavra in regra:
            p = stem.stem(palavra)
            if p in mensagem_f:
                print(regra, mensagem)
                palavras.append(regra)
                break  # The rule has already been satisfied
    return len(palavras) == len(regras_validas)
def cleanWord(chaine):
    # Strip punctuation
    chaine = re.sub(r"[^\w\d\s]+", " ", chaine)
    # Split into words (tokenization)
    tokens = word_tokenize(chaine)
    liste = []
    for i in tokens:
        # Lower-case the word
        i = i.lower()
        # Drop stop words
        if i in STOPWORDS:
            continue
        # Stem the word
        stemming = stemmer()
        i = stemming.stem(i)
        liste.append(i)
    # Return the resulting words as a single string
    return " ".join(liste)
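# A minimal wiring sketch so cleanWord can run stand-alone; the concrete
# bindings below are assumptions (the original module defines its own re,
# word_tokenize, STOPWORDS and stemmer, which it calls with no arguments).
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer as stemmer

STOPWORDS = set(stopwords.words('english'))
print(cleanWord("The cats are running, and the dogs sleep!"))
# e.g. "cat run dog sleep"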
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer as stemmer
from nltk.stem.porter import *
import numpy as np
import nltk
from gensim import corpora, models
from pprint import pprint
from itertools import chain

stemmer = stemmer('english')


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text))


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result


processed_docs = []
data = pd.read_csv('meme1sentiment.csv', error_bad_lines=False)
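# Hedged continuation sketch (not from the original script): run preprocess()
# over the CSV and fit a small LDA model with gensim. The column name 'text'
# and the model hyper-parameters are assumptions.
for doc in data['text'].astype(str):
    processed_docs.append(preprocess(doc))

dictionary = corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
lda_model = models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary, passes=2)
pprint(lda_model.print_topics(num_words=5))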
def get_occurrences_word(word_vocab):
    # Count how often each stemmed, lower-cased word appears across all sentences.
    words_vocab = [stemmer().stem(word.lower())
                   for innerlist in word_vocab for word in innerlist]
    occurrences_word = defaultdict(int)
    for word in words_vocab:
        occurrences_word[word] += 1
    return occurrences_word
def build_vocab(word_lsts):
    # Build id<->word mappings over the set of stemmed, lower-cased words.
    words = [word for sent in word_lsts for word in sent]
    words = set([stemmer().stem(word.lower()) for word in words])
    Id2word = dict(enumerate(sorted(words)))
    word2Id = dict(map(reversed, Id2word.items()))
    return Id2word, word2Id
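# Illustrative glue (not in the original source) showing build_vocab together
# with vocab_features_X. As above, `stemmer` is assumed to be a no-argument
# stemmer class such as NLTK's PorterStemmer.
from nltk.stem.porter import PorterStemmer as stemmer

sentences = [["Dogs", "bark"], ["Cats", "sleep"]]
Id2word, word2Id = build_vocab(sentences)
features = vocab_features_X(sentences, word2Id, num_of_annotators=2)
print(Id2word)   # e.g. {0: 'bark', 1: 'cat', 2: 'dog', 3: 'sleep'}
print(features)  # each sentence's id row repeated twice, once per annotator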
def get_dataId2word(word_lsts, words_id):
    # Map every word id back to its stemmed, lower-cased surface form.
    ids = [id for sent in words_id for id in sent]
    words = [word for sent in word_lsts for word in sent]
    words = [stemmer().stem(word.lower()) for word in words]
    return dict(zip(ids, words))
from nltk.tokenize import sent_tokenize as sents
from nltk.stem import SnowballStemmer as stemmer
from nltk.stem import WordNetLemmatizer as lemma
from nltk.corpus import stopwords as stop
from nltk import pos_tag as pos
from nltk import RegexpParser as regex_parse
from os import listdir
from pprint import pprint
from math import *
from sklearn.cluster import Ward
import matplotlib.pyplot as plt
from pulp import *

STOP_WORDS = set(stop.words('english'))
GOOD_POS = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS'}  # ,'VB','VBD','VBG','VBN','VBP','VBZ'}
SNOW = stemmer('english')
LEMMATE = lemma()
CHUNK = regex_parse('CONCEPT: {<JJ>*<NN>(<CC|IN>?<NN>)?}')  # concept tagging
FORCE_TAGS = {'prof': 'NP'}
NPTEL_STOP_WORDS = {
    'time', 'slide', 'refer', 'x', 'one', 'equal', 'minus', 'u', 'also',
    'plus', 'see', 'like', 'mean', 'get', 'let', 'value', 'say', 'point',
    'two', 'ha', 'n', 'y', 'case', 'd', 'going', 'go', 'c', 'know', '1',
    '2', '3', 'take', 'r', 'p', 'look', 'would', 'wa', 'v', 'term', 'k',
    'first', 'f', 'thing', 'may', 'e', 'particular', 'example', 'different',
    'find', 'problem', 'actually', 'use', 'therefore', 'given', 'm', 'give',
    'way', 'come', 'z', 'called', 'need', 'want', '4', 'write', 'change',
    'l', 'h', 'another', 'using', 'right', 'second', 'used', 'kind', 'much',
    'important', 'g', 'le', 'side', 'doe', 'well', 'constant', 'method',
    'part', 'j', 'something', 'alpha', 'beta', 'gamma', 'delta', 'could',
    'q', 'next', 'variable', 'course', 'lecture', 'three', 'put', 'done',
    'output', 'high',
import numpy as np
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer as stemmer
import nltk
from nltk.corpus import stopwords
from collections import Counter
import nltk.stem

# Special vocabulary module
from shoyu import vocabulary_hdp as vocab

# In[69]:

stemmer = stemmer("english")

# In[70]:

def preprocess(doc):
    # Strip punctuation, lower-case, split on spaces, then lemmatize (as verbs) and stem.
    return [
        stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v'))
        for w in doc.translate(str.maketrans(
            '', '', string.punctuation)).lower().split(' ')
    ]

def rm_stopwords_and_short_words(words):
    results = []
    for i in words: