#!/usr/bin/python
# -*- coding: utf-8 -*-

# Explore the NLTK product review corpus: print a sample sentence and the
# (feature, score) opinion annotations at review and file granularity.
from nltk.corpus import product_reviews_1
# All reviews from the Canon G3 file of the corpus.
camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
review = camera_reviews[0]
# First tokenized sentence of the first review.
print(review.sents()[0])
# (feature, score) pairs annotated on this single review.
print(review.features())
# (feature, score) pairs for every review in the file.
print(product_reviews_1.features('Canon_G3.txt'))

# Collect every score attached to the 'picture' feature in a single pass
# (the original iterated product_reviews_1.features() twice for the same data).
picture_scores = [
    int(score)
    for feat, score in product_reviews_1.features('Canon_G3.txt')
    if feat == 'picture'
]
n_reviews = len(picture_scores)
tot = sum(picture_scores)
# True division is exact in Python 3; guard against an empty result set,
# which previously raised ZeroDivisionError.
mean = tot / n_reviews if n_reviews else 0.0
print(n_reviews, tot, mean)
# ---- Example #2 (示例#2) ----
# 0
def review_func(request):
    """Scrape product reviews, then run sentiment and feature analysis.

    POST with ``website == 1`` scrapes Amazon, any other value scrapes
    Flipkart; a GET request falls back to a hard-coded Flipkart test URL.

    Returns:
        JsonResponse with ``{'senti': [...], 'feature': [...]}`` on success,
        or ``HttpResponse('0')`` when no informative feature survives the
        noise filter.
    """
    if request.POST:
        url = request.POST.get('url', False)
        # NOTE(review): url may be False if the field is missing; requests
        # would then raise — confirm upstream validation.
        if int(request.POST['website']) == 1:
            #### AMAZON.COM SCRAPING ####
            # Explicit parser + timeout: the original relied on bs4's parser
            # auto-detection and could hang forever on a dead host.
            soup = BeautifulSoup(requests.get(url, timeout=30).content,
                                 "html.parser")
            rev_list = soup.find_all("span", {"class": "MHRHead"})
            if not rev_list:
                # Fall back to generic page sections.
                rev_list = soup.find_all("div", {"class": "a-section"})
        else:
            ###### FLIPKART SCRAPING ####
            soup = BeautifulSoup(requests.get(url, timeout=30).content,
                                 "html.parser")
            rev_list = soup.find_all("span", {"class": "review-text-full"})
    else:
        ### TEST URL TAKEN FOR TEST PURPOSE : GET REQUEST ###
        # url="http://www.amazon.com/Nokia-Lumia-900-Black-16GB/dp/B007P5NHJO"
        url = "http://www.flipkart.com/nokia-lumia-630-dual-sim/p/itme7zdakdtxxmdy?pid=MOBDW52BQYEQNQHG&al=rr4jU3t8xiLfxoSVreiBF8ldugMWZuE7Qdj0IGOOVqtGps%2B5%2BbFNcBLbBYY0ImV%2FholPQluhdKA%3D&ref=L%3A-7980544905708093331&srno=b_1"
        soup = BeautifulSoup(requests.get(url, timeout=30).content,
                             "html.parser")
        rev_list = soup.find_all("span", {"class": "review-text-full"})
    # Stringified list of review texts, as in the original implementation.
    full = str([tag.text for tag in rev_list])

    # ---------- Sentiment analysis ----------
    sentences = tokenize.sent_tokenize(full)
    sid = SentimentIntensityAnalyzer()
    # Running totals of the four polarity scores, accumulated in sorted-key
    # order (presumably compound/neg/neu/pos — verify against the analyzer).
    sumz = [0.0, 0.0, 0.0, 0.0]
    tot = 0
    for sentence in sentences:
        tot += 1
        scores = sid.polarity_scores(sentence)
        for idx, key in enumerate(sorted(scores)):
            sumz[idx] += scores[key]

    final = {}
    # Guard: zero sentences previously raised ZeroDivisionError.
    final['senti'] = [total / tot for total in sumz] if tot else sumz

    # ---------- Feature analysis and extraction ----------
    ## INSERT YOUR OWN LABELLED DATASET IN PRODUCT_REVIEWS_1 FOLDER OF NLTK
    gauss = []
    for feat, score in product_reviews_1.features('gauss_data_set.txt'):
        score = int(score)
        if score > 0:
            gauss.append((feat.split(), '1'))
        elif score < 0:
            gauss.append((feat.split(), '-1'))
        # score == 0 is skipped: the original re-appended a stale `ans`
        # here (or raised NameError if the first entry was neutral).
    random.shuffle(gauss)

    all_words = nltk.FreqDist(w.lower() for w in full.split())
    # dict views are not sliceable in Python 3; take the 2000 most frequent
    # words instead of `all_words.keys()[:2000]`, which raised TypeError.
    word_features = [w for w, _ in all_words.most_common(2000)]

    def document_features(document):
        # Bag-of-words presence features over the 2000-word vocabulary.
        document_words = set(document)
        return {word: (word in document_words) for word in word_features}

    # ---------- Train and query the Naive Bayes classifier ----------
    train_set = [(document_features(doc), label) for doc, label in gauss]
    classifier = NaiveBayesClassifier.train(train_set)

    noise_words = ('PHONE', 'MOBILE', 'CUSTOMER')
    data1 = []
    cpdist = classifier._feature_probdist  # NOTE(review): private NLTK API

    for fname, fval in classifier.most_informative_features(10):

        def labelprob(label):
            return cpdist[label, fname].prob(fval)

        labels = sorted(
            (l for l in classifier._labels
             if fval in cpdist[l, fname].samples()),
            key=labelprob)
        if len(labels) == 1:
            continue
        l0, l1 = labels[0], labels[-1]
        if cpdist[l0, fname].prob(fval) == 0:
            ratio = 'INF'
        else:
            ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) /
                               cpdist[l0, fname].prob(fval))
        # Drop noise features before collecting the result.
        if str(fname).upper() not in noise_words:
            data1.append((fname, fval, l1, ratio.strip()))

    if not data1:
        return HttpResponse('0')

    # De-duplicate by feature name, preserving first occurrence.
    sol = []
    seen = set()
    for fname, fval, label, ratio in data1:
        if fname not in seen:
            seen.add(fname)
            sol.append({'name': str(fname), 'val': fval,
                        'l': label, 'ratio': ratio})
    final['feature'] = sol
    return JsonResponse(final, safe=False)
# ---- Example #3 (示例#3) ----
# 0
import nltk
import random
from collections import defaultdict
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI
from nltk.corpus import product_reviews_1

# Build labelled (token-list, polarity) pairs from the annotated corpus.
res = []
for feat, score in product_reviews_1.features('data.txt'):
    score = int(score)
    if score > 0:
        res.append((feat.split(), '1'))
    elif score < 0:
        res.append((feat.split(), '-1'))
    # score == 0 is skipped: the original re-appended a stale `ans` here
    # (or raised NameError if the very first entry was neutral).
random.shuffle(res)

all_words = nltk.FreqDist(w.lower() for w in product_reviews_1.words('data.txt'))
# dict views are not sliceable in Python 3; take the 2000 most frequent
# words instead of `all_words.keys()[:2000]`, which raises TypeError.
word_features = [w for w, _ in all_words.most_common(2000)]

def document_features(document, vocabulary=None):
    """Return bag-of-words presence features for *document*.

    Args:
        document: iterable of word tokens.
        vocabulary: optional iterable of feature words; defaults to the
            module-level ``word_features`` (backward compatible with the
            original one-argument signature).

    Returns:
        dict mapping each vocabulary word to True if it occurs in
        *document*, else False.
    """
    vocab = word_features if vocabulary is None else vocabulary
    document_words = set(document)
    return {word: (word in document_words) for word in vocab}

# Vectorise every labelled document, then hold out the first 50 pairs for
# evaluation and train a Naive Bayes model on the remainder.
featuresets = [(document_features(doc), label) for doc, label in res]
test_set = featuresets[:50]
train_set = featuresets[50:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import product_reviews_1

def word_feats(words):
    """Map every word in *words* to True (NLTK bag-of-words feature dict).

    Replaces the ``dict([(w, True) ...])`` construction with the idiomatic
    dict comprehension; duplicates collapse to a single key either way.
    """
    return {word: True for word in words}

# Partition the labelled corpus by the sign prefix of each score string.
all_feats = product_reviews_1.features()
negfeatures = [(word_feats(f[0]), 'neg') for f in all_feats if f[1][0] == "-"]
posfeatures = [(word_feats(f[0]), 'pos') for f in all_feats if f[1][0] == "+"]

print(len(negfeatures))
print(len(posfeatures))

# Hold out the last quarter of each class for evaluation.
negcutoff = int(len(negfeatures) * 3 / 4)
poscutoff = int(len(posfeatures) * 3 / 4)

trainfeats = negfeatures[:negcutoff] + posfeatures[:poscutoff]
testfeats = negfeatures[negcutoff:] + posfeatures[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
#classifier.show_most_informative_features()
print(str(classifier.classify(word_feats(product_reviews_1.reviews()[0].features()))))

# Flatten every sentence of every review into one token list,
# echoing each sentence as it is consumed, then POS-tag the tokens.
word_list = []
for review in product_reviews_1.reviews():
    for sent in review.sents():
        print(sent)
        word_list.extend(sent)
tagged_word_list = nltk.pos_tag(word_list)
# ---- Example #5 (示例#5) ----
# 0
def review_func(request):
    """Scrape product reviews, then run sentiment and feature analysis.

    POST with ``website == 1`` scrapes Amazon, any other value scrapes
    Flipkart; a GET request falls back to a hard-coded Flipkart test URL.

    Returns:
        JsonResponse with ``{'senti': [...], 'feature': [...]}`` on success,
        or ``HttpResponse('0')`` when no informative feature survives the
        noise filter.
    """
    if request.POST:
        url = request.POST.get('url', False)
        # NOTE(review): url may be False if the field is missing; requests
        # would then raise — confirm upstream validation.
        if int(request.POST['website']) == 1:
            #### AMAZON.COM SCRAPING ####
            # Explicit parser + timeout: the original relied on bs4's parser
            # auto-detection and could hang forever on a dead host.
            soup = BeautifulSoup(requests.get(url, timeout=30).content,
                                 "html.parser")
            rev_list = soup.find_all("span", {"class": "MHRHead"})
            if not rev_list:
                # Fall back to generic page sections.
                rev_list = soup.find_all("div", {"class": "a-section"})
        else:
            ###### FLIPKART SCRAPING ####
            soup = BeautifulSoup(requests.get(url, timeout=30).content,
                                 "html.parser")
            rev_list = soup.find_all("span", {"class": "review-text-full"})
    else:
        ### TEST URL TAKEN FOR TEST PURPOSE : GET REQUEST ###
        # url="http://www.amazon.com/Nokia-Lumia-900-Black-16GB/dp/B007P5NHJO"
        url = "http://www.flipkart.com/nokia-lumia-630-dual-sim/p/itme7zdakdtxxmdy?pid=MOBDW52BQYEQNQHG&al=rr4jU3t8xiLfxoSVreiBF8ldugMWZuE7Qdj0IGOOVqtGps%2B5%2BbFNcBLbBYY0ImV%2FholPQluhdKA%3D&ref=L%3A-7980544905708093331&srno=b_1"
        soup = BeautifulSoup(requests.get(url, timeout=30).content,
                             "html.parser")
        rev_list = soup.find_all("span", {"class": "review-text-full"})
    # Stringified list of review texts, as in the original implementation.
    full = str([tag.text for tag in rev_list])

    # ---------- Sentiment analysis ----------
    sentences = tokenize.sent_tokenize(full)
    sid = SentimentIntensityAnalyzer()
    # Running totals of the four polarity scores, accumulated in sorted-key
    # order (presumably compound/neg/neu/pos — verify against the analyzer).
    sumz = [0.0, 0.0, 0.0, 0.0]
    tot = 0
    for sentence in sentences:
        tot += 1
        scores = sid.polarity_scores(sentence)
        for idx, key in enumerate(sorted(scores)):
            sumz[idx] += scores[key]

    final = {}
    # Guard: zero sentences previously raised ZeroDivisionError.
    final['senti'] = [total / tot for total in sumz] if tot else sumz

    # ---------- Feature analysis and extraction ----------
    ## INSERT YOUR OWN LABELLED DATASET IN PRODUCT_REVIEWS_1 FOLDER OF NLTK
    gauss = []
    for feat, score in product_reviews_1.features('gauss_data_set.txt'):
        score = int(score)
        if score > 0:
            gauss.append((feat.split(), '1'))
        elif score < 0:
            gauss.append((feat.split(), '-1'))
        # score == 0 is skipped: the original re-appended a stale `ans`
        # here (or raised NameError if the first entry was neutral).
    random.shuffle(gauss)

    all_words = nltk.FreqDist(w.lower() for w in full.split())
    # dict views are not sliceable in Python 3; take the 2000 most frequent
    # words instead of `all_words.keys()[:2000]`, which raised TypeError.
    word_features = [w for w, _ in all_words.most_common(2000)]

    def document_features(document):
        # Bag-of-words presence features over the 2000-word vocabulary.
        document_words = set(document)
        return {word: (word in document_words) for word in word_features}

    # ---------- Train and query the Naive Bayes classifier ----------
    train_set = [(document_features(doc), label) for doc, label in gauss]
    classifier = NaiveBayesClassifier.train(train_set)

    noise_words = ('PHONE', 'MOBILE', 'CUSTOMER')
    data1 = []
    cpdist = classifier._feature_probdist  # NOTE(review): private NLTK API

    for fname, fval in classifier.most_informative_features(10):

        def labelprob(label):
            return cpdist[label, fname].prob(fval)

        labels = sorted(
            (l for l in classifier._labels
             if fval in cpdist[l, fname].samples()),
            key=labelprob)
        if len(labels) == 1:
            continue
        l0, l1 = labels[0], labels[-1]
        if cpdist[l0, fname].prob(fval) == 0:
            ratio = 'INF'
        else:
            ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) /
                               cpdist[l0, fname].prob(fval))
        # Drop noise features before collecting the result.
        if str(fname).upper() not in noise_words:
            data1.append((fname, fval, l1, ratio.strip()))

    if not data1:
        return HttpResponse('0')

    # De-duplicate by feature name, preserving first occurrence.
    sol = []
    seen = set()
    for fname, fval, label, ratio in data1:
        if fname not in seen:
            seen.add(fname)
            sol.append({'name': str(fname), 'val': fval,
                        'l': label, 'ratio': ratio})
    final['feature'] = sol
    return JsonResponse(final, safe=False)
# ---- Example #6 (示例#6) ----
# 0
 def get_aspects(self):
     """Return all (feature, score) opinion pairs from product_reviews_1.

     NOTE(review): relies on a module-level `product_reviews_1` import
     outside this fragment; `self` is unused.
     """
     return product_reviews_1.features()