def cluster_reviews(reviews):
    """
    Classifies a list of reviews into specific and generic using K-means
    with two clusters over the normalized review metrics.

    The cluster whose metric values add up to the larger total is reported
    as specific (0) and the other one as generic (1).

    :param reviews: a list of reviews. Each review must contain the text of
    the review and the part-of-speech tags for every word
    :type reviews: list[Review]
    :return: a list of integers of the same size as the list of reviews, in
    which each position contains a 0 if that review is specific or a 1 if
    that review is generic
    :rtype: list[int]
    """
    metrics = np.zeros((len(reviews), NUM_FEATURES))
    for index, review in enumerate(reviews):
        metrics[index] = review_metrics_extractor.get_review_metrics(review)
    # The return value is not used, so the helper is relied upon to update
    # ``metrics`` itself.
    review_metrics_extractor.normalize_matrix_by_columns(metrics)

    k_means = KMeans(n_clusters=2)
    k_means.fit(metrics)
    labels = k_means.labels_

    record_clusters = split_list_by_labels(metrics, labels)
    # Plain ``sum`` instead of ``reduce`` so the function does not depend on
    # the Python 2-only builtin.
    cluster0_sum = sum(sum(row) for row in record_clusters[0])
    cluster1_sum = sum(sum(row) for row in record_clusters[1])

    if cluster0_sum < cluster1_sum:
        # Cluster 0 holds the generic reviews, so the tags are inverted
        return [1 if element == 0 else 0 for element in labels]

    # Always hand back a plain list of ints (as documented), regardless of
    # whether the tags had to be inverted.
    return [int(element) for element in labels]
def transform(records):
    """
    Turns the reviews into numpy structures that scikit-learn can consume.

    One row of metrics is extracted per record, the matrix is normalized
    column-wise using its own per-column minimum and maximum, and a boolean
    vector is built from the ``'specific'`` field of every record.

    :type records: list[dict]
    :param records: a list of dictionaries with the reviews
    :return: a tuple ``(x_matrix, y_vector)`` with the normalized metrics
    matrix and a boolean vector that is True where
    ``record['specific'] == 'yes'``
    """
    extractor = review_metrics_extractor
    feature_count = len(extractor.get_review_metrics(records[0]))

    x_matrix = numpy.zeros((len(records), feature_count))
    for row, record in enumerate(records):
        x_matrix[row] = extractor.get_review_metrics(record)

    # Column ranges are taken before normalizing and passed to the helper
    column_min = x_matrix.min(axis=0)
    column_max = x_matrix.max(axis=0)
    extractor.normalize_matrix_by_columns(x_matrix, column_min, column_max)

    is_specific = [record['specific'] == 'yes' for record in records]
    y_vector = numpy.array(is_specific)

    return x_matrix, y_vector
def predict(self, reviews):
    """
    Predicts a label for each review with the stored classifier.

    The metrics of every review are extracted, normalized column-wise with
    ``self.min_values`` and ``self.max_values`` and handed to
    ``self.classifier.predict``.

    :param reviews: the reviews to classify
    :return: whatever ``self.classifier.predict`` returns for the
    normalized metrics matrix
    """
    feature_matrix = numpy.zeros((len(reviews), self.num_features))
    for row, review in enumerate(reviews):
        feature_matrix[row] = \
            review_metrics_extractor.get_review_metrics(review)

    review_metrics_extractor.normalize_matrix_by_columns(
        feature_matrix, self.min_values, self.max_values)

    predictions = self.classifier.predict(feature_matrix)
    return predictions
def transform(self, records):
    """
    Turns the reviews into numpy structures that scikit-learn can consume.

    As a side effect this stores ``self.num_features`` (length of the
    metrics of the first record) and ``self.min_values`` /
    ``self.max_values`` (per-column minimum and maximum of the metrics
    before normalization).

    :type records: list[dict]
    :param records: a list of dictionaries with the reviews
    :return: a matrix with the independent variables (X) and a vector with
    the dependent variables (y); y is True where
    ``record['specific'] == 'yes'``
    """
    extractor = review_metrics_extractor
    self.num_features = len(extractor.get_review_metrics(records[0]))

    feature_matrix = numpy.zeros((len(records), self.num_features))
    for row, record in enumerate(records):
        feature_matrix[row] = extractor.get_review_metrics(record)

    # Remember the column ranges so they are available on the instance
    self.min_values = feature_matrix.min(axis=0)
    self.max_values = feature_matrix.max(axis=0)
    extractor.normalize_matrix_by_columns(
        feature_matrix, self.min_values, self.max_values)

    is_specific = [record['specific'] == 'yes' for record in records]
    return feature_matrix, numpy.array(is_specific)
def train(self, records, reviews=None):
    """
    Fits ``self.classifier`` with the metrics of the given reviews.

    Stores the per-column minimum and maximum of the metrics in
    ``self.min_values`` and ``self.max_values`` before normalizing.

    :param records: dictionaries with (at least) the keys ``'text'`` and
    ``'specific'``
    :param reviews: reviews matching ``records`` one to one; when omitted,
    a ``Review`` is built from the ``'text'`` of every record
    :raise ValueError: if ``records`` and ``reviews`` differ in length
    """
    if reviews is None:
        reviews = [Review(record['text']) for record in records]

    if len(records) != len(reviews):
        raise ValueError(
            'The size of the records and reviews arrays must be the same')

    feature_matrix = numpy.zeros((len(reviews), self.num_features))
    for row, review in enumerate(reviews):
        feature_matrix[row] = \
            review_metrics_extractor.get_review_metrics(review)

    self.min_values = feature_matrix.min(axis=0)
    self.max_values = feature_matrix.max(axis=0)
    review_metrics_extractor.normalize_matrix_by_columns(
        feature_matrix, self.min_values, self.max_values)

    is_specific = [record['specific'] == 'yes' for record in records]
    self.classifier.fit(feature_matrix, numpy.array(is_specific))
def plot(records):
    """
    Draws the reviews and a logistic regression decision boundary on the
    current matplotlib figure.

    The first two metric columns are used as the x and y coordinates. The
    boundary is computed from the first two coefficients only.
    NOTE(review): this presumably expects exactly two metrics per review;
    confirm against ``review_metrics_extractor.get_review_metrics``.

    :type records: list[dict]
    :param records: the reviews; each one must have the key ``'specific'``,
    where the value ``'yes'`` marks a specific review
    """
    num_features = len(
        review_metrics_extractor.get_review_metrics(records[0]))
    metrics = numpy.zeros((len(records), num_features))
    for index, record in enumerate(records):
        metrics[index] = review_metrics_extractor.get_review_metrics(record)
    review_metrics_extractor.normalize_matrix_by_columns(metrics)

    # True marks a specific review, False a generic one
    labels = numpy.array([record['specific'] == 'yes' for record in records])

    clf = LogisticRegression(C=100)
    clf.fit(metrics, labels)
    coef = clf.coef_[0]
    intercept = clf.intercept_
    print('coef', coef)

    # Boundary: coef[0] * x + coef[1] * y + intercept = 0, solved for y
    xvals = numpy.linspace(0, 1.0, 2)
    yvals = -(coef[0] * xvals + intercept[0]) / coef[1]
    plt.plot(xvals, yvals, color='g', label='Decision boundary')
    plt.xlabel("log number of words (normalized)")
    plt.ylabel("log number of verbs in past tense (normalized)")

    # Outcome 0 (False) selects the reviews that are NOT specific, so it has
    # to carry the 'Generic reviews' legend and outcome 1 the specific one.
    series = [
        (0, 'o', 'b', 'Generic reviews'),
        (1, 'x', 'r', 'Specific reviews'),
    ]
    for outcome, marker, colour, legend in series:
        selected = labels == outcome
        plt.scatter(
            metrics[:, 0][selected], metrics[:, 1][selected],
            c=colour, marker=marker, label=legend)

    plt.legend(loc='lower left', numpoints=1, ncol=3, fontsize=8,
               bbox_to_anchor=(0, 0))
def predict(self, records):
    """
    Predicts a label for each record with the stored classifier.

    The metrics of every record are extracted, normalized column-wise with
    ``self.min_values`` and ``self.max_values`` and handed to
    ``self.classifier.predict``.

    :param records: the records to classify
    :return: whatever ``self.classifier.predict`` returns for the
    normalized metrics matrix
    """
    extractor = review_metrics_extractor

    feature_matrix = numpy.zeros((len(records), self.num_features))
    for row, record in enumerate(records):
        feature_matrix[row] = extractor.get_review_metrics(record)

    extractor.normalize_matrix_by_columns(
        feature_matrix, self.min_values, self.max_values)

    return self.classifier.predict(feature_matrix)
def score(self, records):
    """
    Scores the stored classifier against the labels of the given records.

    The metrics are normalized with ``self.min_values`` and
    ``self.max_values``; the expected labels are True where
    ``record['specific'] == 'yes'``.

    :type records: list[dict]
    :param records: the labelled records to evaluate on
    :return: the value returned by ``self.classifier.score``
    """
    extractor = review_metrics_extractor

    feature_matrix = numpy.zeros((len(records), self.num_features))
    for row, record in enumerate(records):
        feature_matrix[row] = extractor.get_review_metrics(record)

    extractor.normalize_matrix_by_columns(
        feature_matrix, self.min_values, self.max_values)

    is_specific = [record['specific'] == 'yes' for record in records]
    expected = numpy.array(is_specific)

    return self.classifier.score(feature_matrix, expected)
def plot(records):
    """
    Draws the reviews and a logistic regression decision boundary on the
    current matplotlib figure.

    The first two metric columns are used as the x and y coordinates. The
    boundary is computed from the first two coefficients only.
    NOTE(review): this presumably expects exactly two metrics per review;
    confirm against ``review_metrics_extractor.get_review_metrics``.

    :type records: list[dict]
    :param records: the reviews; each one must have the key ``'specific'``,
    where the value ``'yes'`` marks a specific review
    """
    num_features = len(
        review_metrics_extractor.get_review_metrics(records[0]))
    metrics = numpy.zeros((len(records), num_features))
    for index, record in enumerate(records):
        metrics[index] = review_metrics_extractor.get_review_metrics(record)
    review_metrics_extractor.normalize_matrix_by_columns(metrics)

    # True marks a specific review, False a generic one
    labels = numpy.array([record['specific'] == 'yes' for record in records])

    clf = LogisticRegression(C=100)
    clf.fit(metrics, labels)
    coef = clf.coef_[0]
    intercept = clf.intercept_
    print('coef', coef)

    # Boundary: coef[0] * x + coef[1] * y + intercept = 0, solved for y
    xvals = numpy.linspace(0, 1.0, 2)
    yvals = -(coef[0] * xvals + intercept[0]) / coef[1]
    plt.plot(xvals, yvals, color='g', label='Decision boundary')
    plt.xlabel("log number of words (normalized)")
    plt.ylabel("log number of verbs in past tense (normalized)")

    # Outcome 0 (False) selects the reviews that are NOT specific, so it has
    # to carry the 'Generic reviews' legend and outcome 1 the specific one.
    series = [
        (0, 'o', 'b', 'Generic reviews'),
        (1, 'x', 'r', 'Specific reviews'),
    ]
    for outcome, marker, colour, legend in series:
        selected = labels == outcome
        plt.scatter(
            metrics[:, 0][selected], metrics[:, 1][selected],
            c=colour, marker=marker, label=legend)

    plt.legend(loc='lower left', numpoints=1, ncol=3, fontsize=8,
               bbox_to_anchor=(0, 0))
def main():
    """
    Loads the classified reviews, prints the class balance, evaluates a set
    of classifiers with 5-fold cross validation and plots the data.

    For every classifier the mean and the standard deviation of the fold
    scores are printed.
    """
    item_type = 'hotel'
    # item_type = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_file = my_folder + 'classified_' + item_type + '_reviews.json'
    binary_reviews_file = \
        my_folder + 'classified_' + item_type + '_reviews.pkl'
    my_records = ETLUtils.load_json_file(my_file)

    # NOTE: pickle.load can execute arbitrary code; only point this at
    # files produced by a trusted source.
    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    num_features = 2

    my_metrics = numpy.zeros((len(my_reviews), num_features))
    for index, review in enumerate(my_reviews):
        my_metrics[index] = \
            review_metrics_extractor.get_review_metrics(review)

    review_metrics_extractor.normalize_matrix_by_columns(my_metrics)

    count_specific = 0
    count_generic = 0
    for record in my_records:
        if record['specific'] == 'yes':
            count_specific += 1
        if record['specific'] == 'no':
            count_generic += 1

    print('count_specific: %d' % count_specific)
    print('count_generic: %d' % count_generic)
    # The ratios are scaled by 100 so the printed value really is the
    # percentage announced by the trailing '%' sign.
    print('specific percentage: %f%%' %
          (100.0 * count_specific / len(my_records)))
    print('generic percentage: %f%%' %
          (100.0 * count_generic / len(my_records)))

    my_labels = numpy.array(
        [record['specific'] == 'yes' for record in my_records])

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        # DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf'),
        SVC(C=1.0, kernel='linear'),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(),
        LinearSVC()
    ]
    scores = [[] for _ in range(len(classifiers))]

    Xtrans = my_metrics

    cv = KFold(n=len(my_metrics), n_folds=5)

    # Every classifier is fitted and scored once per fold
    for clf, clf_scores in zip(classifiers, scores):
        for train, test in cv:
            x_train, y_train = Xtrans[train], my_labels[train]
            x_test, y_test = Xtrans[test], my_labels[test]

            clf.fit(x_train, y_train)
            clf_scores.append(clf.score(x_test, y_test))

    for score in scores:
        print("Mean(scores)=%.5f\tStddev(scores)=%.5f" %
              (numpy.mean(score), numpy.std(score)))

    # NOTE(review): this passes a metrics matrix and a label vector, while
    # the plot() defined in this file takes a single ``records`` argument;
    # confirm which plot() is meant to be in scope here.
    plot(my_metrics, my_labels)