Example No. 1
import operator

def get_vocabulary_list(reviews):
	"""
	Returns the (word, frequency) tuples of the vocabulary,
	sorted by frequency in descending order
	"""
	freq_map = {}

	total = 0
	for review in reviews:
		tokens = preprocess(review)

		# preprocess returns -1 when the review is not in English
		if tokens == -1:
			continue

		total += 1
		for token in tokens:
			if token in freq_map:
				freq_map[token] += 1
			else:
				freq_map[token] = 1

	print('Number of English reviews:', total)

	# Sort freq_map's (word, frequency) tuples by frequency, most frequent first
	sorted_freq_map = sorted(freq_map.items(),
	                         key=operator.itemgetter(1),
	                         reverse=True)

	return sorted_freq_map
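preprocess is a project helper that is not shown in these examples. A minimal sketch of how get_vocabulary_list might be exercised, with a hypothetical stand-in for preprocess that tokenizes naively and treats any non-ASCII text as "not English":

# Hypothetical stand-in for the project's preprocess helper
def preprocess(review):
	tokens = review.lower().split()
	# Crude language check: reject reviews containing non-ASCII tokens
	return tokens if all(token.isascii() for token in tokens) else -1

reviews = ['Great course great teacher', 'Curso ótimo', 'Great content']
print(get_vocabulary_list(reviews))
# Number of English reviews: 2
# [('great', 3), ('course', 1), ('teacher', 1), ('content', 1)]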
Example No. 2
	def unigram(self, review, ID, label, total, num_of_reviews):
		"""
		Extracts the unigram features of a review and stores them in the
		database; returns 1 if the review was processed, 0 otherwise
		"""
		# Progress indicator, rewritten in place on each call
		print(total, '----', total * 100 // num_of_reviews, '%', end='\r')
		tokens = preprocess(review)

		# preprocess returns -1 when the review is not in English
		if tokens == -1:
			return 0

		word_indices = self.get_word_indices(tokens)
		feature_vector = self.get_feature_vector(word_indices)

		# Store the feature vector and its label in RethinkDB, keyed by the review's ID
		r.table('X_' + self.model).insert({
			'id': ID,
			'data': feature_vector
		}).run(connection)

		r.table('y_' + self.model).insert({
			'id': ID,
			'data': label
		}).run(connection)

		return 1
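r and connection here are module-level names from the RethinkDB Python driver. A minimal sketch of the surrounding setup and a driver loop, assuming the classic (pre-2.4) rethinkdb package and a hypothetical FeatureExtractor class that owns unigram:

import rethinkdb as r

connection = r.connect(host='localhost', port=28015)

# Hypothetical driver loop: extract features for every stored review
# and count how many were actually in English
reviews = list(r.table('combined_reviews_with_labels').run(connection))
extractor = FeatureExtractor(model='unigram')  # hypothetical class owning unigram()
processed = 0
for i, review in enumerate(reviews):
	result = extractor.unigram(review['data'], review['id'], review['label'], i, len(reviews))
	processed += result
print('Extracted features for', processed, 'English reviews')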
Example No. 3
def word2vec_word_embedding(min_count, rows, n_components):
	"""
	Returns the t-SNE dimension-reduced word embeddings, with n_components
	as the new number of dimensions
	"""
	data = {
		'X': [],
		'y': [],
		'z': [],
		'words': []
	}

	# Only the raw texts are used here; load_files treats each
	# subdirectory of the reviews folder as a class label
	reviews = load_files(dir_path + '/../../data/reviews/not_corrected').data

	# Skip non-English reviews, for which preprocess returns -1
	preprocessed_reviews = [preprocess(review.decode('utf-8')) for review in reviews]
	preprocessed_reviews = [tokens for tokens in preprocessed_reviews if tokens != -1]

	# gensim < 4.0 API; gensim >= 4.0 renames size to vector_size
	# and index2word to index_to_key
	model = gensim.models.Word2Vec(preprocessed_reviews, min_count=min_count, size=400)

	X = []

	for word in model.wv.index2word:
		X.append(model.wv[word])

	X = np.array(X)

	X_reduced = TSNE(n_components=n_components, init='pca', random_state=0).fit_transform(X)

	data['X'] = X_reduced[:rows, 0].tolist()
	data['y'] = X_reduced[:rows, 1].tolist()
	# Truncate to the same number of rows as the coordinates
	data['words'] = model.wv.index2word[:rows]

	if n_components == 3:
		data['z'] = X_reduced[:rows, 2].tolist()

	return jsonify(data)
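jsonify only works inside a Flask request context, so this function presumably backs an HTTP endpoint. A sketch of how it might be wired up; the route path and app object are assumptions, not the project's actual routing:

from flask import Flask

app = Flask(__name__)

# Hypothetical route: GET /embeddings/5/500/3 would return the first 500
# words of a min_count=5 Word2Vec model, reduced to 3 dimensions
@app.route('/embeddings/<int:min_count>/<int:rows>/<int:n_components>')
def embeddings(min_count, rows, n_components):
	return word2vec_word_embedding(min_count, rows, n_components)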
Example No. 4
def get_all_course_review_words_overall(course_slug):
	"""
	Retrieves all the preprocessed words
	"""
	data = {}

	cursor = r.table('reviews').filter({
				'id': course_slug
			}).run(connection)

	reviews = []
	for document in cursor:
		reviews.extend(document['data'])

	if len(reviews) == 0:
		return jsonify(data)

	words = []
	for review in reviews:
		tokens = preprocess(review)

		# preprocess returns -1 when the review is not in English
		if tokens == -1:
			continue

		words.extend(tokens)

	data['word_mapping'] = count_by(words)

	return jsonify(data)
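count_by is another project helper that never appears in these examples. Given that its result is serialized as word_mapping, it most likely maps each word to its occurrence count. A minimal sketch, assuming exactly that shape:

from collections import Counter

def count_by(words):
	# Assumed shape of the helper's output: {'great': 3, 'course': 1, ...}
	return dict(Counter(words))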
Example No. 5
def get_wordcloud():
	"""
	Retrieves all the words and its corresponding occurences on the whole dataset (English words only)
	"""
	data = {}

	cursor = r.table('combined_reviews_with_labels').limit(10000).run(connection)

	reviews = []
	for document in cursor:
		reviews.append(document)

	overall_words = []
	very_positive_words = []
	positive_words = []
	neutral_words = []
	negative_words = []
	very_negative_words = []

	for review in reviews:
		tokens = preprocess(review['data'])

		# preprocess returns -1 when the review is not in English
		if tokens == -1:
			continue

		overall_words.extend(tokens)

		if review['label'] == 5:
			very_positive_words.extend(tokens)
		elif review['label'] == 4:
			positive_words.extend(tokens)
		elif review['label'] == 3:
			neutral_words.extend(tokens)
		elif review['label'] == 2:
			negative_words.extend(tokens)
		elif review['label'] == 1:
			very_negative_words.extend(tokens)

	data['overall'] = count_by(overall_words)
	data['very_positive'] = count_by(very_positive_words)
	data['positive'] = count_by(positive_words)
	data['neutral'] = count_by(neutral_words)
	data['negative'] = count_by(negative_words)
	data['very_negative'] = count_by(very_negative_words)

	return jsonify(data)
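The if/elif chain above works, but the five rating buckets can also be selected through a label-to-list mapping. A sketch of the same grouping, assuming every label is an integer from 1 to 5:

buckets = {
	5: very_positive_words,
	4: positive_words,
	3: neutral_words,
	2: negative_words,
	1: very_negative_words
}

for review in reviews:
	tokens = preprocess(review['data'])
	if tokens == -1:
		continue
	overall_words.extend(tokens)
	# Same grouping as the if/elif chain; buckets.get(label, []) would
	# guard against labels outside 1-5
	buckets[review['label']].extend(tokens)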
Example No. 6
def get_all_course_review_words_neutral(course_slug):
	"""
	Retrieves all the preprocessed neutral words
	"""
	data = {}

	cursor = r.table('reviews').filter({
				'id': course_slug
			}).run(connection)

	reviews = []
	for document in cursor:
		reviews.extend(document['data'])

	if len(reviews) == 0:
		return jsonify(data)

	overall_words = []
	for review in reviews:
		tokens = preprocess(review)

		if tokens == -1:
			continue

		# Keep only the words TextBlob rates as objective (subjectivity 0)
		# or neutral (polarity 0)
		words = []
		for word in tokens:
			tb = TextBlob(word)

			if tb.sentiment.subjectivity == 0 or tb.sentiment.polarity == 0:
				words.append(word)

		overall_words.extend(words)

	data['word_mapping'] = count_by(overall_words)

	return jsonify(data)
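A quick way to see what this filter keeps is to run TextBlob's sentiment API on a few sample words (the words below are arbitrary):

from textblob import TextBlob

for word in ['great', 'terrible', 'lecture', 'video']:
	sentiment = TextBlob(word).sentiment
	neutral = sentiment.subjectivity == 0 or sentiment.polarity == 0
	print(word, sentiment.polarity, sentiment.subjectivity, 'kept' if neutral else 'dropped')
# Plain content words like 'lecture' and 'video' score 0.0 on both axes
# and are kept; opinion words like 'great' and 'terrible' carry nonzero
# polarity and subjectivity and are dropped.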