Example #1
def vocab_features_X(word_lsts, word2Id, num_of_annotators=1):
    x = []
    for i in range(len(word_lsts)):
        row = [stemmer().stem(word.lower()) for word in word_lsts[i]]
        row = [word2Id[word] if word in word2Id else -1 for word in row]
        x += [row] * num_of_annotators
    return x
Example #2
def get_bio_freqs_word(bio_freqs, word_lsts):
    bio_freqs_word = defaultdict(int)
    for i in range(len(bio_freqs)):
        for j in range(len(bio_freqs[i])):
            bio = bio_freqs[i][j].split("|")
            word_stem = stemmer().stem(word_lsts[i][j].lower())
            bio_freqs_word[word_stem] += (int(bio[0]) + int(bio[1]))
    return bio_freqs_word
Example #3
def setUp(self):
    self.df = pd.read_csv('./tests/data.csv')
    self.st_words = stopwords.words('english')
    self.stemmer = stemmer('english')
    self.tmp_docs = [
        txtp.full_preprocess(d, set(self.st_words)) for d in self.df.values
    ]
    self.v = txtp.preproc_cpp.generate_vocab(self.tmp_docs)
    self.tf = txtp.preproc_cpp.tf_cpp(self.tmp_docs, self.v)
Example #4
def get_bold_occurrences_word(word_vocab, word_probs):
    """return word occurences with  maximum probabilty in each senteces """
    bold_occurrences_word = defaultdict(int)
    for i in range(len(word_vocab)):
        max_indx, max_value = -1, 0
        for j in range(len(word_vocab[i])):
            word_prob = float(word_probs[i][j])
            if max_value < word_prob:
                max_value, max_indx = word_prob, j
        bold_occurrences_word[stemmer().stem(word_vocab[i][max_indx].lower())] += 1
    return bold_occurrences_word
Example #5
    def AvaliarConteudo(self, regras, mensagem):
        stem = stemmer()

        mensagem_f = [stem.stem(msg) for msg in Resposta.__flatten_tupla(mensagem)]

        print(regras)
        print(mensagem)

        regras_validas = [regra for regra in regras if regra[0] != "*"]  # drop the wildcard rules

        palavras = []
        for regra in regras_validas:
            for palavra in regra:
                p = stem.stem(palavra)
                if p in mensagem_f:
                    print(regra, mensagem)
                    palavras.append(regra)
                    break  # this rule has already been satisfied

        return len(palavras) == len(regras_validas)
Example #6
def cleanWord(chaine):
    # Remove punctuation
    chaine = re.sub(r"[^\w\d\s]+", " ", chaine)

    # Split into words (tokenization)
    tokens = word_tokenize(chaine)

    # Lower-case, drop stop words and stem the remaining tokens
    stemming = stemmer()
    liste = []
    for i in tokens:
        i = i.lower()
        # Skip stop words
        if i in STOPWORDS:
            continue
        liste.append(stemming.stem(i))

    # Return the resulting words as a single string
    return " ".join(liste)
Example #7
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer as stemmer
from nltk.stem.porter import *
import numpy as np
import nltk
from gensim import corpora, models
from pprint import pprint
from itertools import chain

stemmer = stemmer('english')


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text))


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                token) > 3:
            result.append(lemmatize_stemming(token))
    return result


processed_docs = []
data = pd.read_csv('meme1sentiment.csv', error_bad_lines=False)
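A minimal usage sketch for preprocess above (not part of the original snippet; the sample text and the expected stems are illustrative, and the exact output depends on the installed gensim/NLTK data):

sample = preprocess("Compilers translate source programs")
# stop words and tokens of length <= 3 are dropped, the rest are lemmatized
# and then stemmed, giving roughly ['compil', 'translat', 'sourc', 'program']
print(sample)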
Example #8
def get_occurrences_word(word_vocab):
    words_vocab = [stemmer().stem(word.lower()) for innerlist in word_vocab for word in innerlist]
    occurrences_word = defaultdict(int)
    for word in words_vocab:
        occurrences_word[word] += 1
    return occurrences_word
Example #9
def build_vocab(word_lsts):
    words = [word for sent in word_lsts for word in sent]
    words = set([stemmer().stem(word.lower()) for word in words])
    Id2word = dict(enumerate(sorted(words)))
    word2Id = dict(map(reversed, Id2word.items()))
    return Id2word, word2Id
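A brief usage sketch for build_vocab (not from the original source; since imports are omitted in this snippet, it assumes stemmer is bound to a stemmer class that takes no constructor arguments, e.g. nltk.stem.PorterStemmer imported as stemmer; a SnowballStemmer would need a language argument):

sentences = [["Dogs", "run"], ["The", "dog", "ran"]]
Id2word, word2Id = build_vocab(sentences)
# Id2word maps consecutive integer ids to the sorted, lower-cased stems;
# word2Id is the inverse lookup used by vocab_features_X in Example #1.
print(word2Id)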
Example #10
def get_dataId2word(word_lsts, words_id):
    ids = [id for sent in words_id for id in sent]
    words = [word for sent in word_lsts for word in sent]
    words = ([stemmer().stem(word.lower()) for word in words])
    return dict(zip(ids, words))
Example #11
from nltk.corpus import stopwords as stop  # assumed import: `stop` is used for STOP_WORDS below
from nltk.tokenize import sent_tokenize as sents
from nltk.stem import SnowballStemmer as stemmer
from nltk.stem import WordNetLemmatizer as lemma
from nltk import pos_tag as pos
from nltk import RegexpParser as regex_parse
from os import listdir
from pprint import pprint
from math import *
from sklearn.cluster import Ward
import matplotlib.pyplot as plt
from pulp import *

STOP_WORDS = set(stop.words('english'))
GOOD_POS = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP',
            'NNPS'}  #,'VB','VBD','VBG','VBN','VBP','VBZ'}
SNOW = stemmer('english')
LEMMATE = lemma()
CHUNK = regex_parse('CONCEPT: {<JJ>*<NN>(<CC|IN>?<NN>)?}')  #concept tagging
FORCE_TAGS = {'prof': 'NP'}
NPTEL_STOP_WORDS = {
    'time', 'slide', 'refer', 'x', 'one', 'equal', 'minus', 'u', 'also',
    'plus', 'see', 'like', 'mean', 'get', 'let', 'value', 'say', 'point',
    'two', 'ha', 'n', 'y', 'case', 'd', 'going', 'go', 'c', 'know', '1', '2',
    '3', 'take', 'r', 'p', 'look', 'would', 'wa', 'v', 'term', 'k', 'first',
    'f', 'thing', 'may', 'e', 'particular', 'example', 'different', 'find',
    'problem', 'actually', 'use', 'therefore', 'given', 'm', 'give', 'way',
    'come', 'z', 'called', 'need', 'want', '4', 'write', 'change', 'l', 'h',
    'another', 'using', 'right', 'second', 'used', 'kind', 'much', 'important',
    'g', 'le', 'side', 'doe', 'well', 'constant', 'method', 'part', 'j',
    'something', 'alpha', 'beta', 'gamma', 'delta', 'could', 'q', 'next',
    'variable', 'course', 'lecture', 'three', 'put', 'done', 'output', 'high',
Example #12
import numpy as np
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer as stemmer
import nltk
from nltk.corpus import stopwords
from collections import Counter
import nltk.stem

# Special vocabulary module from shoyu
import vocabulary_hdp as vocab

# In[69]:

stemmer = stemmer("english")

# In[70]:


def preprocess(doc):
    return [
        stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v'))
        for w in doc.translate(str.maketrans(
            '', '', string.punctuation)).lower().split(' ')
    ]
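A short usage sketch of preprocess (illustrative only, not from the original file; exact lemmas and stems depend on the installed WordNet data):

tokens = preprocess("Dogs are running, quickly!")
# punctuation is stripped, the text is lower-cased and split on spaces,
# then every word is lemmatized as a verb and stemmed,
# yielding roughly ['dog', 'be', 'run', 'quick']
print(tokens)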


def rm_stopwords_and_short_words(words):
    results = []
    for i in words: