files_path = "..\\corpus\\SFU_Spanish_Review_Corpus\\lavadoras" errs = 0 pol_dict = {} inputt = open('diccionario_polaridades.pk1', 'rb') pol_dict = load(inputt) inputt.close() def lemmatize_sent(sent): return utils.lemmatize_text( utils.remove_unalphabetic_words(nltk.word_tokenize(sent))) sents = [] for file_name in utils.find_all_files_in_path('*.txt', files_path): sents += nltk.sent_tokenize( open(file_name).read().replace('\n\n', '.').replace('\n', '.')) print('aspect', 'pos', 'neg') for aspect in aspects: aspect_sent_avg_pos_count = 0 aspect_sent_avg_neg_count = 0 sents_of_aspect_count = 0 for sent in sents: lemmatized_sent = lemmatize_sent(sent) if (aspect in sent) or (aspect in lemmatized_sent): sent_pos_count = 0 sent_neg_count = 0 sents_of_aspect_count += 1 for word in lemmatized_sent:
""" import utils from _pickle import load from bs4 import BeautifulSoup as Soup import gensim import gensim.corpora as corpora ruta_archivos = "..\\corpus\\corpusCine\\corpusCriticasCine" sustantivos = [] inputt = open('UnigramTagger_cess_esp.pkl','rb') unigram_tagger = load(inputt) inputt.close() errs = 0 data_lemmatized = [] for xml_file_name in utils.find_all_files_in_path('*.xml',ruta_archivos): try: handler = open(xml_file_name).read() soup = Soup(handler,'lxml') review = soup.find('review') data_lemmatized.append(utils.normalize_text(review.get_text())) except: errs += 1 id2word = corpora.Dictionary(data_lemmatized) corpus = [id2word.doc2bow(text) for text in data_lemmatized] # Build LDA model lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=8,
"""
Created on Sun Jun 2 13:58:15 2019

@author: Turing
"""
import utils
import nltk
from _pickle import load
from nltk.probability import FreqDist

ruta_archivos = "..\\corpus\\SFU_Spanish_Review_Corpus\\lavadoras"
sustantivos = []

# Load the unigram POS tagger trained on the CESS-ESP treebank.
inputt = open('UnigramTagger_cess_esp.pkl', 'rb')
unigram_tagger = load(inputt)
inputt.close()

for file_name in utils.find_all_files_in_path('*.txt', ruta_archivos):
    oraciones = nltk.sent_tokenize(open(file_name).read().replace('\n', '.'))
    # Note: only the last sentence of each file is tagged here.
    palabras_etiquetas = unigram_tagger.tag(nltk.word_tokenize(oraciones[-1]))
    # Keep the nouns (CESS-ESP noun tags start with 'n'); words unseen by
    # the tagger get tag None, so guard before calling startswith.
    sustantivos_archivo = [
        sustantivo for sustantivo, tag in palabras_etiquetas
        if tag and tag.startswith('n')
    ]
    sustantivos_archivo = utils.lemmatize_text(sustantivos_archivo)
    sustantivos += sustantivos_archivo

#print(sustantivos)
# Report the ten most frequent noun lemmas in the corpus folder.
fd = FreqDist(sustantivos)
print([word for word, freq in fd.most_common(10)])

sustantivos = []
errs = 0
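
# A minimal sketch of how a tagger like UnigramTagger_cess_esp.pkl could be
# produced (an assumption; the training script is not part of this section):
# train NLTK's UnigramTagger on the CESS-ESP treebank and pickle it.
import nltk
from nltk.corpus import cess_esp
from nltk.tag import UnigramTagger
from _pickle import dump

nltk.download('cess_esp')  # fetch the Spanish treebank once
tagger = UnigramTagger(cess_esp.tagged_sents())
with open('UnigramTagger_cess_esp.pkl', 'wb') as out:
    dump(tagger, out)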