from nltk.corpus import sentence_polarity as sent_pol
from sklearn.model_selection import train_test_split


def data_preparation_train_test():
    """Build the training and test sets.

    Returns:
        train_sents: list of training sentences (each a list of words).
        test_sents: list of test sentences (each a list of words).
        train_labels: list of integer classes (0: negative, 1: positive) for the training sentences.
        test_labels: list of integer classes (0: negative, 1: positive) for the test sentences.
    """
    # The NLTK documentation describes how this corpus is organized.
    positive_sents = sent_pol.sents(categories="pos")
    n_pos_sents = len(positive_sents)
    negative_sents = sent_pol.sents(categories="neg")
    n_neg_sents = len(negative_sents)
    # db_indexes: one position per sentence in the corpus.
    db_indexes = [i for i in range(n_pos_sents + n_neg_sents)]
    db_sents = positive_sents + negative_sents
    # db_labels: one opinion label per sentence, aligned position-by-position with db_indexes.
    db_labels = [1] * n_pos_sents + [0] * n_neg_sents
    train_indexes, test_indexes, train_labels, test_labels = train_test_split(
        db_indexes, db_labels, test_size=0.2, shuffle=True, stratify=db_labels)
    train_sents = [db_sents[i] for i in train_indexes]
    test_sents = [db_sents[i] for i in test_indexes]
    return (train_sents, test_sents, train_labels, test_labels)
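# Usage sketch (not part of the original snippet; assumes the corpus has been
# fetched via nltk.download('sentence_polarity')):
if __name__ == "__main__":
    train_sents, test_sents, train_labels, test_labels = data_preparation_train_test()
    print(len(train_sents), len(test_sents))  # stratified 80/20 split of the 10662 sentences
    print(train_labels[:5])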
from nltk.corpus import sentence_polarity


def getSentPolarities():
    # extractWords is a helper defined elsewhere in the original project.
    p = sentence_polarity.sents(categories='pos')
    n = sentence_polarity.sents(categories='neg')
    neg_sents = [(extractWords(sentence), 'neg') for sentence in n]
    pos_sents = [(extractWords(sentence), 'pos') for sentence in p]
    return (neg_sents, pos_sents)
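# extractWords is not shown in the snippet; a minimal sketch, assuming it
# simply lowercases alphabetic tokens:
def extractWords(sentence):
    return [w.lower() for w in sentence if w.isalpha()]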
# Method of a sentiment-testing class; assumes `import os` and
# `from nltk.corpus import opinion_lexicon, sentence_polarity` at module level.
def __init__(self):
    """Constructor."""
    self.positive_sentences = []
    self.negative_sentences = []
    response1 = input(
        'Do you want to test sentiment with local text data? (Y/N) ')
    if response1.lower() in ('y', 'yes'):
        positive_file = input(
            'Input the path for the positive sentiment data: ')
        negative_file = input(
            'Input the path for the negative sentiment data: ')
        if os.path.exists(positive_file):
            # read positive sentences
            with open(positive_file, "r") as reader:
                self.positive_sentences = reader.readlines()
            self.positive_sentences = [
                sent.rstrip() for sent in self.positive_sentences
            ]
        if os.path.exists(negative_file):
            # read negative sentences
            with open(negative_file, "r") as reader:
                self.negative_sentences = reader.readlines()
            self.negative_sentences = [
                sent.rstrip() for sent in self.negative_sentences
            ]
    else:
        # The full corpus has 5331 positive and 5331 negative sentences;
        # scoring all of them would require a very large lexicon, so only
        # the first 10 sentences per category are used here.
        response2 = input(
            'Do you want to test sentiment with data in sentence_polarity? (Y/N) ')
        if response2.lower() in ('y', 'yes'):
            # negative words
            self.negative_lexica = opinion_lexicon.negative()
            self.negative_lexica_size = len(self.negative_lexica)
            # positive words
            self.positive_lexica = opinion_lexicon.positive()
            self.positive_lexica_size = len(self.positive_lexica)
            # sentence sentiment categories
            self.senti_categories = sentence_polarity.categories()
            # negative sentiment sentences (first 10 only)
            self.negative_sentences = sentence_polarity.sents(
                categories=['neg'])[:10]
            self.negative_sentences = [
                ' '.join(sent) for sent in self.negative_sentences
            ]
            self.negative_sentences_size = len(self.negative_sentences)
            # positive sentiment sentences (first 10 only)
            self.positive_sentences = sentence_polarity.sents(
                categories=['pos'])[:10]
            self.positive_sentences = [
                ' '.join(sent) for sent in self.positive_sentences
            ]
            self.positive_sentences_size = len(self.positive_sentences)
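# Usage sketch: the class this constructor belongs to is not shown; the name
# SentimentTester below is hypothetical, used only to make the method callable.
class SentimentTester:
    pass


SentimentTester.__init__ = __init__   # attach the constructor defined above
tester = SentimentTester()            # prompts interactively for the data source
print(len(tester.positive_sentences), len(tester.negative_sentences))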
def load_sentence_polarity():
    from nltk.corpus import sentence_polarity
    # Vocab is an external helper class, built from the full corpus so that
    # train and test share one vocabulary. Note the label convention used
    # here: 0 = positive, 1 = negative.
    vocab = Vocab.build(sentence_polarity.sents())
    train_data = [(vocab.convert_tokens_to_idx(sentence), 0)
                  for sentence in sentence_polarity.sents(categories="pos")[:4000]] \
        + [(vocab.convert_tokens_to_idx(sentence), 1)
           for sentence in sentence_polarity.sents(categories="neg")[:4000]]
    test_data = [(vocab.convert_tokens_to_idx(sentence), 0)
                 for sentence in sentence_polarity.sents(categories="pos")[4000:]] \
        + [(vocab.convert_tokens_to_idx(sentence), 1)
           for sentence in sentence_polarity.sents(categories="neg")[4000:]]
    return train_data, test_data, vocab
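# Vocab is not shown in the snippet; a minimal sketch matching the two calls
# used above (Vocab.build and convert_tokens_to_idx) might look like this:
class Vocab:
    def __init__(self, tokens):
        self.idx_to_token = list(tokens)
        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}

    @classmethod
    def build(cls, sents, unk="<unk>"):
        # collect the unique tokens and reserve index 0 for unknown words
        uniq = {token for sent in sents for token in sent}
        return cls([unk] + sorted(uniq))

    def convert_tokens_to_idx(self, tokens):
        # unknown tokens map to index 0 (<unk>)
        return [self.token_to_idx.get(t, 0) for t in tokens]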
    # (tail of a preprocess() function: lemmatize POS-tagged tokens, dropping
    # single-character words)
    t = [
        lemmatizer.lemmatize(w[0], get_pos(w[1])) for w in pos_tags
        if len(w[0]) > 1
    ]
    # Combine them into a string
    # t = ' '.join(t)
    return t


# print(dataset[0][0])
# print(preprocess(dataset[0][0]))

# Get the sentence corpus and look at some sentences
sentences = sentence_polarity.sents()

documents = []
for cat in sentence_polarity.categories():
    for sent in sentence_polarity.sents(categories=cat):
        documents.append((preprocess(' '.join(sent)), cat))
# print(documents)
# documents = [(sent, cat) for cat in sentence_polarity.categories()
#              for sent in sentence_polarity.sents(categories=cat)]

random.shuffle(documents)

all_words_list = [word for (sent, cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
# get the 200 most frequently appearing keywords in the corpus
word_items = all_words.most_common(200)
# print(word_items)
word_features = [word for (word, count) in word_items]
# print(word_features)
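# get_pos is not shown in the snippet; a minimal sketch, under the usual
# assumption that it maps Penn Treebank tags to WordNet POS constants:
from nltk.corpus import wordnet


def get_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN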
    # (tail of a tokenize() function; assumes `import string` and
    # `from nltk.stem import WordNetLemmatizer` at module level)
    no_punc = []
    for review in tokenized:
        line = "".join(char for char in review
                       if char not in string.punctuation)
        no_punc.append(line)
    tokens = lemmatize(no_punc)
    return tokens


def lemmatize(tokens):
    lmtzr = WordNetLemmatizer()
    lemma = [lmtzr.lemmatize(t) for t in tokens]
    return lemma


reviews = reviews.apply(lambda x: tokenize(x))

from nltk.corpus import sentence_polarity
import random

sentences = sentence_polarity.sents()
# note: the loop variable is renamed to `cat` so it no longer shadows the
# `reviews` Series above
documents = [(sent, cat) for cat in sentence_polarity.categories()
             for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)

all_words_list = [word for (sent, cat) in documents for word in sent]

import nltk
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(100)
word_features = [word for (word, freq) in word_items]


def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
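# Usage sketch: turn the shuffled (sentence, category) pairs into NLTK
# featuresets and train a classifier, mirroring what the later snippets do.
# The 1000-item test cut is an arbitrary illustrative choice.
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))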
# Author: Joyce Woznica
# Lab - Week 8
#
import nltk
# get movie reviews
from nltk.corpus import sentence_polarity
import random

# get the sentence corpus and look at some sentences
# had to download nltk sentence_polarity
nltk.download('sentence_polarity')
sentences = sentence_polarity.sents()
print(len(sentences))
print(type(sentences))
print(sentence_polarity.categories())

# sentences are already tokenized; print the first four sentences
for sent in sentences[:4]:
    print(sent)

# look at the sentences by category to see how many are positive and negative
pos_sents = sentence_polarity.sents(categories='pos')
print(len(pos_sents))
neg_sents = sentence_polarity.sents(categories='neg')
print(len(neg_sents))

## set up the movie review sentences for classification
# create a list of documents; each document is one sentence as a list of
# words, paired with its category
documents = [(sent, cat) for cat in sentence_polarity.categories()
             for sent in sentence_polarity.sents(categories=cat)]
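# A natural next step, also shown in the other snippets: shuffle the documents
# so positive and negative sentences are interleaved before any train/test split.
random.shuffle(documents)
print(documents[0])  # (list of words, 'pos' or 'neg')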
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import sentence_polarity

print(sentence_polarity.sents())
print(sentence_polarity.categories())
print(sentence_polarity.sents()[1])
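# Quick per-category count (the corpus ships 5331 sentences per category;
# assumes it has been fetched via nltk.download('sentence_polarity')):
for cat in sentence_polarity.categories():
    print(cat, len(sentence_polarity.sents(categories=cat)))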
from nltk.corpus import opinion_lexicon, sentence_polarity

print(len(opinion_lexicon.words()))

## negative lexicon
print('negative lexicon:')
print(opinion_lexicon.negative()[:4])
print(len(opinion_lexicon.negative()))

## positive lexicon
print('positive lexicon:')
print(opinion_lexicon.positive()[:4])
print(len(opinion_lexicon.positive()))
print()
print('-------------------------------------------------------')

# sentence polarity
print('all sentences:')
print(sentence_polarity.sents())
print(len(sentence_polarity.sents()))
print('sentence categories:')
print(sentence_polarity.categories())
print('examples:')
print(sentence_polarity.sents()[0])
print(sentence_polarity.sents()[10661])
print(len(sentence_polarity.sents()))
print()
print('-------------------------------------------------------')

# negative sentences in sentence_polarity
print('negative sentences:')
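# The snippet is cut off here; based on the pattern above, the continuation
# presumably prints the negative-category sentences, e.g.:
print(sentence_polarity.sents(categories='neg')[:2])
print(len(sentence_polarity.sents(categories='neg')))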
"""
Script makes use of :class:`grakel.WeisfeilerLehman`, :class:`grakel.VertexHistogram`
"""
from __future__ import print_function
print(__doc__)

import numpy as np
import time

from nltk import word_tokenize
from nltk.corpus import sentence_polarity

from grakel.kernels import WeisfeilerLehman, VertexHistogram
from grakel import Graph

sents = sentence_polarity.sents()
sents = [sent for sent in sents if len(sent) > 1]
n_sents = 3000
sents = sents[:n_sents]
print("Loaded %d sentences\n" % n_sents)

print("Creating word co-occurrence networks\n")
word_networks = list()
for sent in sents:
    node_labels = dict()
    tokens_to_ids = dict()
    for token in sent:
        if token not in tokens_to_ids:
            tokens_to_ids[token] = len(tokens_to_ids)
            node_labels[tokens_to_ids[token]] = token
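    # (Continuation sketch, an assumption: the original is cut off mid-loop.
    # Connect consecutive tokens with undirected edges, build a grakel Graph
    # from the edge list, then compare the networks with the Weisfeiler-Lehman
    # subtree kernel.)
    edges = set()
    for i in range(len(sent) - 1):
        edges.add((tokens_to_ids[sent[i]], tokens_to_ids[sent[i + 1]]))
        edges.add((tokens_to_ids[sent[i + 1]], tokens_to_ids[sent[i]]))
    word_networks.append(Graph(list(edges), node_labels=node_labels))

print("Computing the Weisfeiler-Lehman subtree kernel\n")
start = time.time()
wl = WeisfeilerLehman(n_iter=2, base_graph_kernel=VertexHistogram, normalize=True)
K = wl.fit_transform(word_networks)
print("Kernel matrix of shape %s computed in %.2f sec" % (K.shape, time.time() - start))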
# (printmeasures, st, document_features, refneg/refpos, etc. are defined
# earlier in the original script)
printmeasures('neg', refneg, testneg)
printmeasures('pos', refpos, testpos)

# Using the movie review corpus and the top words from the Amazon baby reviews
all_sent = nltk.word_tokenize(st)
all_sent_list = [nltk.word_tokenize(sent) for sent in all_sent]  # unused
words = [w.lower() for w in all_sent if w.isalpha()]
stopwords = nltk.corpus.stopwords.words('english')
all_words_list = [word for word in words if word not in stopwords]
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(2000)
word_features = [word for (word, freq) in word_items]

sentences = sentence_polarity.sents()
documents = [(sent, cat) for cat in sentence_polarity.categories()
             for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)

featuresets = [(document_features(d, word_features), c) for (d, c) in documents]
training = int(len(featuresets) * 0.5)
train_set, test_set = featuresets[training:], featuresets[:training]
classifier3 = nltk.NaiveBayesClassifier.train(train_set)
print('Testing Accuracy: ', nltk.classify.accuracy(classifier3, test_set))
# 0.63421 // 0.6450

reflist = []
testlist = []
for (features, label) in test_set:
    reflist.append(label)
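    # (Continuation sketch, an assumption: the loop is cut off here. The usual
    # pattern, matching the printmeasures calls at the top of the snippet, is
    # to collect the classifier's prediction for each test item.)
    testlist.append(classifier3.classify(features))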