files_path = "..\\corpus\\SFU_Spanish_Review_Corpus\\lavadoras"
errs = 0

pol_dict = {}
inputt = open('diccionario_polaridades.pk1', 'rb')
pol_dict = load(inputt)
inputt.close()


def lemmatize_sent(sent):
    # Tokenize, drop non-alphabetic tokens, then lemmatize.
    return utils.lemmatize_text(
        utils.remove_unalphabetic_words(nltk.word_tokenize(sent)))


# Split every review file into sentences; newlines are treated as
# sentence breaks by turning them into periods first.
sents = []
for file_name in utils.find_all_files_in_path('*.txt', files_path):
    sents += nltk.sent_tokenize(
        open(file_name).read().replace('\n\n', '.').replace('\n', '.'))

print('aspect', 'pos', 'neg')
# 'aspects' is not defined in the listing; it is assumed to be a list of
# aspect terms (e.g. frequent noun lemmas like those printed by Example #3).
for aspect in aspects:
    aspect_sent_avg_pos_count = 0
    aspect_sent_avg_neg_count = 0
    sents_of_aspect_count = 0
    for sent in sents:
        lemmatized_sent = lemmatize_sent(sent)
        if (aspect in sent) or (aspect in lemmatized_sent):
            sent_pos_count = 0
            sent_neg_count = 0
            sents_of_aspect_count += 1
            for word in lemmatized_sent:
                # Truncated in the listing; plausible completion, assuming
                # pol_dict maps lemmas to 'pos'/'neg' labels:
                if pol_dict.get(word) == 'pos':
                    sent_pos_count += 1
                elif pol_dict.get(word) == 'neg':
                    sent_neg_count += 1
            aspect_sent_avg_pos_count += sent_pos_count
            aspect_sent_avg_neg_count += sent_neg_count
    if sents_of_aspect_count:
        print(aspect, aspect_sent_avg_pos_count / sents_of_aspect_count,
              aspect_sent_avg_neg_count / sents_of_aspect_count)
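
# --- Added usage sketch (not in the original listing) ---
# A toy, self-contained check of the counting logic above, assuming the
# polarity lexicon maps lemmas to 'pos'/'neg' labels; the names below
# are hypothetical.
toy_pol_dict = {'bueno': 'pos', 'malo': 'neg'}
toy_sent = ['el', 'lavadora', 'ser', 'bueno', 'pero', 'malo']
print(sum(1 for w in toy_sent if toy_pol_dict.get(w) == 'pos'),
      sum(1 for w in toy_sent if toy_pol_dict.get(w) == 'neg'))  # -> 1 1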
Example #2
"""
import utils
from _pickle import load
from bs4 import BeautifulSoup as Soup
import gensim
import gensim.corpora as corpora

ruta_archivos = "..\\corpus\\corpusCine\\corpusCriticasCine"
sustantivos = []
errs = 0

# Pre-trained Spanish unigram tagger (loaded here but not used in the
# portion of this example shown in the listing).
inputt = open('UnigramTagger_cess_esp.pkl', 'rb')
unigram_tagger = load(inputt)
inputt.close()

# Parse each movie-review XML file and keep the normalized text of its
# <review> element; count files that fail to parse.
data_lemmatized = []
for xml_file_name in utils.find_all_files_in_path('*.xml', ruta_archivos):
    try:
        handler = open(xml_file_name).read()
        soup = Soup(handler, 'lxml')
        review = soup.find('review')
        data_lemmatized.append(utils.normalize_text(review.get_text()))
    except Exception:
        errs += 1

# Each entry of data_lemmatized is assumed to be a list of tokens
# (whatever utils.normalize_text returns), as corpora.Dictionary expects.
id2word = corpora.Dictionary(data_lemmatized)
corpus = [id2word.doc2bow(text) for text in data_lemmatized]

# Build the LDA model (the call is truncated in the listing; only the
# arguments shown there are kept here).
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=8)
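
# --- Added usage sketch (not in the original listing) ---
# Inspect the learned topics and score them with gensim's CoherenceModel;
# both calls are standard gensim API.
from gensim.models import CoherenceModel

for topic_id, topic in lda_model.print_topics(num_words=8):
    print(topic_id, topic)
coherence = CoherenceModel(model=lda_model, texts=data_lemmatized,
                           dictionary=id2word,
                           coherence='c_v').get_coherence()
print('coherence (c_v):', coherence)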
Example #3
"""
Created on Sun Jun  2 13:58:15 2019

@author: Turing
"""
import utils
import nltk
from _pickle import load
from nltk.probability import FreqDist

ruta_archivos = "..\\corpus\\SFU_Spanish_Review_Corpus\\lavadoras"
sustantivos = []

# Pre-trained Spanish unigram tagger (CESS-ESP tagset, lowercase tags).
inputt = open('UnigramTagger_cess_esp.pkl', 'rb')
unigram_tagger = load(inputt)
inputt.close()

for file_name in utils.find_all_files_in_path('*.txt', ruta_archivos):
    # Note: only the last sentence of each review file is tagged here.
    oraciones = nltk.sent_tokenize(open(file_name).read().replace('\n', '.'))
    palabras_etiquetas = unigram_tagger.tag(nltk.word_tokenize(oraciones[-1]))
    # CESS-ESP noun tags start with 'n'; the tagger returns None for
    # unseen words, so guard before calling startswith.
    sustantivos_archivo = [
        sustantivo for sustantivo, tag in palabras_etiquetas
        if tag is not None and tag.startswith('n')
    ]
    sustantivos_archivo = utils.lemmatize_text(sustantivos_archivo)
    sustantivos += sustantivos_archivo

fd = FreqDist(sustantivos)
# Ten most frequent noun lemmas across the reviews.
print([word for word, freq in fd.most_common(10)])
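
# --- Added sketch (not in the original listing) ---
# How a pickle like 'UnigramTagger_cess_esp.pkl' can be produced from
# NLTK's CESS-ESP treebank (requires nltk.download('cess_esp')). CESS-ESP
# tags are lowercase, which is why noun tags above start with 'n'.
from _pickle import dump
from nltk.corpus import cess_esp
with open('UnigramTagger_cess_esp.pkl', 'wb') as out:
    dump(nltk.UnigramTagger(cess_esp.tagged_sents()), out)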
