#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import product_reviews_1

camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
review = camera_reviews[0]
print(review.sents()[0])
print(review.features())
print(product_reviews_1.features('Canon_G3.txt'))

n_reviews = len([(feat, score) for (feat, score) in product_reviews_1.features('Canon_G3.txt')
                 if feat == 'picture'])
tot = sum([int(score) for (feat, score) in product_reviews_1.features('Canon_G3.txt')
           if feat == 'picture'])
# We use float for backward compatibility with division in Python 2.7
mean = float(tot) / n_reviews
print(n_reviews, tot, mean)
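# A minimal follow-up sketch (not from the original snippet): it generalises the
# mean-score calculation above to every feature in a file, assuming only what
# the snippet already shows -- features() yields (feature, score) string pairs
# whose scores parse with int().
from collections import defaultdict

from nltk.corpus import product_reviews_1


def mean_feature_scores(fileid):
    """Return {feature: mean score} for one review file."""
    totals = defaultdict(int)
    counts = defaultdict(int)
    for feat, score in product_reviews_1.features(fileid):
        totals[feat] += int(score)
        counts[feat] += 1
    return {feat: totals[feat] / counts[feat] for feat in totals}


# Usage: mean score of every feature mentioned in the Canon G3 reviews.
for feat, mean_score in sorted(mean_feature_scores('Canon_G3.txt').items()):
    print(feat, round(mean_score, 2))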
import random

import nltk
import requests
from bs4 import BeautifulSoup
from django.http import HttpResponse, JsonResponse
from nltk import tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import product_reviews_1
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def review_func(request):
    if request.POST:
        if int(request.POST['website']) == 1:
            #### AMAZON.COM SCRAPING ####
            url = request.POST.get('url', False)
            r_ob = requests.get(url)
            gaussian = BeautifulSoup(r_ob.content)
            rev_list = gaussian.find_all("span", {"class": "MHRHead"})
            if not rev_list:
                rev_list = gaussian.find_all("div", {"class": "a-section"})
            result = [i.text for i in rev_list]
            full = str(result)
        else:
            #### FLIPKART SCRAPING ####
            url = request.POST.get('url', False)
            r_ob = requests.get(url)
            gaussian = BeautifulSoup(r_ob.content)
            rev_list = gaussian.find_all("span", {"class": "review-text-full"})
            result = [i.text for i in rev_list]
            full = str(result)
    else:
        #### TEST URL TAKEN FOR TEST PURPOSES: GET REQUEST ####
        # url = "http://www.amazon.com/Nokia-Lumia-900-Black-16GB/dp/B007P5NHJO"
        url = "http://www.flipkart.com/nokia-lumia-630-dual-sim/p/itme7zdakdtxxmdy?pid=MOBDW52BQYEQNQHG&al=rr4jU3t8xiLfxoSVreiBF8ldugMWZuE7Qdj0IGOOVqtGps%2B5%2BbFNcBLbBYY0ImV%2FholPQluhdKA%3D&ref=L%3A-7980544905708093331&srno=b_1"
        r_ob = requests.get(url)
        gaussian = BeautifulSoup(r_ob.content)
        rev_list = gaussian.find_all("span", {"class": "review-text-full"})
        result = [i.text for i in rev_list]
        full = str(result)

    ########### SENTIMENT ANALYSIS ###########
    sentences = tokenize.sent_tokenize(full)
    sid = SentimentIntensityAnalyzer()
    opinion = []
    sumz = [0.0, 0.0, 0.0, 0.0]
    tot = 0
    for sentence in sentences:
        n = 0
        tot += 1
        ss = sid.polarity_scores(sentence)
        for k in sorted(ss):
            sumz[n] += ss[k]
            n = n + 1
    final = {}
    avg = [x / tot for x in sumz]
    final['senti'] = avg

    ########## FEATURE ANALYSIS AND EXTRACTION STARTS HERE ##########
    # Insert your own labelled dataset in the product_reviews_1 folder of NLTK.
    v = product_reviews_1.features('gauss_data_set.txt')
    gauss = []
    for k in v:
        if int(k[1]) > 0:
            ans = (k[0].split(), '1')
        elif int(k[1]) < 0:
            ans = (k[0].split(), '-1')
        else:
            continue  # skip neutral scores so a stale `ans` is never appended
        gauss.append(ans)
    random.shuffle(gauss)

    all_words = nltk.FreqDist(w.lower() for w in full.split())
    word_features = list(all_words)[:2000]  # dict views are not sliceable in Python 3

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['%s' % word] = (word in document_words)
        return features

    ########### TRAINING AND USING THE NAIVE BAYES CLASSIFIER ###########
    featuresets = [(document_features(d), c) for (d, c) in gauss]
    train_set = featuresets
    classifier = NaiveBayesClassifier.train(train_set)

    data1 = []
    cpdist = classifier._feature_probdist
    for (fname, fval) in classifier.most_informative_features(10):

        def labelprob(l):
            return cpdist[l, fname].prob(fval)

        labels = sorted([l for l in classifier._labels
                         if fval in cpdist[l, fname].samples()], key=labelprob)
        if len(labels) == 1:
            continue
        l0 = labels[0]
        l1 = labels[-1]
        if cpdist[l0, fname].prob(fval) == 0:
            ratio = 'INF'
        else:
            ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) /
                               cpdist[l0, fname].prob(fval))
        x = (fname, fval, l1, ratio.strip())
        ########### REMOVING NOISE BELOW ###########
        if (str(x[0]).upper() != 'PHONE' and str(x[0]).upper() != 'MOBILE'
                and str(x[0]).upper() != 'CUSTOMER'):
            data1.append(x)

    if data1:
        ########### REMOVING REDUNDANT DATA GENERATED ###########
        data2 = []
        for p in data1:
            if p[0] not in [q[0] for q in data2]:
                data2.append(p)
        sol = []
        # name_list = [q[0] for q in data1]
        for k in data2:
            resp = {}
            resp['name'] = str(k[0])
            resp['val'] = k[1]
            resp['l'] = k[2]
            resp['ratio'] = k[3]
            sol.append(resp)
        final['feature'] = sol
        return JsonResponse(final, safe=False)
    else:
        return HttpResponse('0')
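# A self-contained sketch (not part of the original view) of the sentiment-
# averaging step above. It makes explicit what the positional `sumz` list hides:
# iterating over sorted(polarity_scores(...)) visits the VADER keys in the order
# 'compound', 'neg', 'neu', 'pos', so avg[0] is the mean compound score, avg[1]
# the mean negative score, and so on. The sample text is a placeholder, not data
# scraped from either site.
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

text = "The camera is great. The battery life is disappointing."
sid = SentimentIntensityAnalyzer()
sentences = tokenize.sent_tokenize(text)

sums = {'compound': 0.0, 'neg': 0.0, 'neu': 0.0, 'pos': 0.0}
for sentence in sentences:
    for key, value in sid.polarity_scores(sentence).items():
        sums[key] += value

avg = {key: value / len(sentences) for key, value in sums.items()}
print(avg)  # mean compound/neg/neu/pos scores over all sentences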
import random
from collections import defaultdict

import nltk
from nltk.classify.api import ClassifierI
from nltk.corpus import product_reviews_1
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs

v = product_reviews_1.features('data.txt')
res = []
for k in v:
    if int(k[1]) > 0:
        ans = (k[0].split(), '1')
    elif int(k[1]) < 0:
        ans = (k[0].split(), '-1')
    else:
        continue  # skip neutral scores so a stale `ans` is never appended
    res.append(ans)
random.shuffle(res)

all_words = nltk.FreqDist(w.lower() for w in product_reviews_1.words('data.txt'))
word_features = list(all_words)[:2000]  # dict views are not sliceable in Python 3


def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['%s' % word] = (word in document_words)
    return features


featuresets = [(document_features(d), c) for (d, c) in res]
train_set, test_set = featuresets[50:], featuresets[:50]
classifier = nltk.NaiveBayesClassifier.train(train_set)
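# A short, hedged continuation of the script above: the split reserves the first
# 50 feature sets as test_set but never uses them, so this is one plausible way
# to evaluate the held-out data. Both calls are standard NLTK classifier APIs.
print('accuracy on the 50 held-out instances:',
      nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)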
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import product_reviews_1


def word_feats(words):
    return dict([(word, True) for word in words])


# Feature strings such as 'picture quality' are split into words so that
# word_feats() builds word-level (rather than character-level) features.
negfeatures = [(word_feats(f[0].split()), 'neg')
               for f in product_reviews_1.features() if f[1][0] == "-"]
posfeatures = [(word_feats(f[0].split()), 'pos')
               for f in product_reviews_1.features() if f[1][0] == "+"]
print(len(negfeatures))
print(len(posfeatures))

negcutoff = int(len(negfeatures) * 3 / 4)
poscutoff = int(len(posfeatures) * 3 / 4)

trainfeats = negfeatures[:negcutoff] + posfeatures[:poscutoff]
testfeats = negfeatures[negcutoff:] + posfeatures[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
# classifier.show_most_informative_features()

# Classify the feature terms of the first review in the corpus.
first_review_words = [w for (feat, score) in product_reviews_1.reviews()[0].features()
                      for w in feat.split()]
print(str(classifier.classify(word_feats(first_review_words))))

word_list = []
for review in product_reviews_1.reviews():
    for sent in review.sents():
        print(sent)
        for word in sent:
            word_list.append(word)
tagged_word_list = nltk.pos_tag(word_list)
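# A hedged sketch of one possible use for `tagged_word_list`, which the snippet
# above builds but never reads: keep the noun tokens (Penn Treebank tags
# starting with 'NN') and count them as rough candidate aspect terms. The
# cut-off of 20 most common nouns is an arbitrary illustration, not a value
# from the original code.
from nltk import FreqDist

noun_counts = FreqDist(word.lower() for (word, tag) in tagged_word_list
                       if tag.startswith('NN'))
candidate_aspects = [word for (word, count) in noun_counts.most_common(20)]
print(candidate_aspects)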
def get_aspects(self):
    return product_reviews_1.features()
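# A hedged usage sketch: get_aspects() above is a method of some class not shown
# here, so the ReviewAnalyzer wrapper below is a placeholder invented for this
# illustration. It shows how a caller might tally how often each aspect is
# scored across the whole product_reviews_1 corpus.
from collections import Counter

from nltk.corpus import product_reviews_1


class ReviewAnalyzer:
    def get_aspects(self):
        return product_reviews_1.features()


mention_counts = Counter(feat for (feat, score) in ReviewAnalyzer().get_aspects())
print(mention_counts.most_common(10))  # the ten most frequently scored aspects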