示例#1
0
def calculateChisquare():
	"""Score every lexicon term with a chi-square statistic against class 0.

	For each term in the lexicon table a 2x2 contingency table is built:

	    A -- class-0 reviews whose whitespace-split text contains the term
	    B -- the term's lexicon count minus A
	    C -- class-0 reviews that do not contain the term
	    D -- the remaining reviews (outside class 0, without the term)

	The score is N * (A*D - C*B)^2 / ((A+C) * (B+D) * (A+B) * (C+D)).
	Scores are written to 'chi_0.json' through saveJson; nothing is returned.
	"""
	print('Calculating chi...')
	reviews = yelpReview.readExperimentData()
	cReviews = classfiedReviews(reviews)
	lexicons = loadLexiconTable()

	# Corpus size used by the formula. Presumably this equals len(reviews);
	# TODO confirm before replacing the literal.
	totalReviews = 5000
	# C and D must be derived from the same class that A was counted in.
	# The previous code mixed in len(cReviews[4]) here, which skews the
	# table (and can make C negative) whenever the classes differ in size.
	classSize = len(cReviews[0])

	# Split every class-0 review once instead of once per lexicon term, and
	# keep the words in a set so each membership test is O(1).
	classWordSets = [set(review['text'].split()) for review in cReviews[0]]

	# NOTE(review): terms are matched against whitespace-split text here; if
	# the lexicon counts were produced with a different tokenizer, A and B
	# are not strictly comparable -- verify against the lexicon builder.
	chiScores = {}
	for i, (key, value) in enumerate(lexicons.items(), 1):
		A = sum(1 for words in classWordSets if key in words)
		B = value - A
		C = classSize - A
		D = totalReviews - classSize - B
		denominator = (A + C) * (B + D) * (A + B) * (C + D)
		if denominator == 0:
			# A zero marginal (e.g. a term in every review, or in none)
			# leaves the statistic undefined; score it as uninformative
			# rather than aborting the whole run with ZeroDivisionError.
			x = 0.0
		else:
			diff = A * D - C * B
			x = float(totalReviews) * diff * diff / denominator
		chiScores[key] = x
		if i % 50 == 0:
			print(i)  # progress indicator
	saveJson(chiScores, 'chi_0.json')
	print('Done.')
示例#2
0
def svmTrainFor2(lexicons):
	"""Fit an SVR on a two-valued (low/high) version of the star ratings.

	Each review's zero-based rating (stars - 1) is relabelled: 3 and 4
	become 1, 0 and 1 become 0, and reviews rated 2 are left out. Any other
	value is kept as-is. The single input feature per review is the cosine
	distance between the module-level vectors[0] and the review's vector
	from getReviewVector.

	lexicons is forwarded unchanged to getReviewVector.
	Returns the fitted svm.SVR instance.
	"""
	print('Training SVM...')
	allReviews = yelpReview.readExperimentData()
	combinedFeature = loadFeature(FEATURE0) + loadFeature(FEATURE4)
	samples = []
	labels = []
	for review in allReviews:
		rating = review['stars'] - 1
		if rating == 2:
			# Middle rating: excluded from the two-valued training set.
			continue
		if rating in (3, 4):
			label = 1
		elif rating in (0, 1):
			label = 0
		else:
			label = rating
		reviewVector = getReviewVector(review, combinedFeature, lexicons)
		samples.append([distance.cosine(vectors[0], reviewVector)])
		labels.append(label)
	print('%d %d' % (len(samples), len(labels)))
	model = svm.SVR()
	model.fit(samples, labels)
	print('Done.')
	return model
示例#3
0
def svmTrain(lexicons):
	"""Fit an SVR that predicts the zero-based star rating of a review.

	The single input feature per review is the cosine distance between the
	module-level vectors[0] and the review's vector from getReviewVector;
	the target is stars - 1.

	lexicons is forwarded unchanged to getReviewVector.
	Returns the fitted svm.SVR instance.
	"""
	print('Training SVM...')
	allReviews = yelpReview.readExperimentData()
	combinedFeature = loadFeature(FEATURE0) + loadFeature(FEATURE4)
	samples = []
	targets = []
	for review in allReviews:
		reviewVector = getReviewVector(review, combinedFeature, lexicons)
		targets.append(review['stars'] - 1)
		samples.append([distance.cosine(vectors[0], reviewVector)])
	print('%d %d' % (len(samples), len(targets)))
	model = svm.SVR()
	model.fit(samples, targets)
	print('Done.')
	return model
示例#4
0
def buildVectors(lexicons):
	"""Accumulate per-class review vectors and plot their distance to class 0.

	For each class index 0..4, every review's vector (from getReviewVector)
	is added element-wise into the module-level vectors[index]. The cosine
	distances from vectors[0] to vectors[1]..vectors[4] are then printed
	and plotted, with a leading 0 as the first plotted point.

	lexicons is forwarded unchanged to getReviewVector.
	Returns nothing; the results live in the module-level vectors.
	"""
	print('Building vectors...')
	grouped = classfiedReviews(yelpReview.readExperimentData())
	combinedFeature = loadFeature(FEATURE0) + loadFeature(FEATURE4)
	for classIdx in range(5):
		print(classIdx)
		for review in grouped[classIdx]:
			reviewVector = getReviewVector(review, combinedFeature, lexicons)
			vectors[classIdx] = numpy.add(vectors[classIdx], reviewVector)

	base = vectors[0]
	distances = [distance.cosine(base, vectors[k]) for k in range(1, 5)]

	print('%s %s %s %s' % tuple(distances))
	plt.plot([0] + distances)
	plt.show()
	print('Done.')
def buildLexiconTable():
	"""Count, for every adjective, the number of reviews it appears in.

	Each review's text is tokenized and POS-tagged with nltk. Tokens tagged
	'JJ', 'JJR' or 'JJS' are counted at most once per review, so each table
	value is a review count rather than a raw occurrence count. The table
	is written through saveJson to LEXICONTABLE; nothing is returned.
	"""
	print('Generating Lexicon Tables...')
	adjectiveTags = ('JJ', 'JJR', 'JJS')
	lexiconTable = {}
	reviews = yelpReview.readExperimentData()
	for i, review in enumerate(reviews, 1):
		tokens = nltk.word_tokenize(review['text'])
		# Words already counted for this review, so repeats are skipped.
		seen = set()
		for word, tag in nltk.pos_tag(tokens):
			if tag not in adjectiveTags or word in seen:
				continue
			seen.add(word)
			# `in` / dict.get replace dict.has_key, which is deprecated
			# and no longer exists in Python 3.
			lexiconTable[word] = lexiconTable.get(word, 0) + 1
		if i % 100 == 0:
			print(i)  # progress indicator
	saveJson(lexiconTable, LEXICONTABLE)
	print('Done.')