def get_categories(file_path):
    """
    Returns the field names of the first business record once the
    'categories' column has been transposed and the unwanted fields have
    been dropped.

    The records are loaded with ETLUtils.load_json_file, passed through
    ETLUtils.add_transpose_list_column('categories', ...) and then through
    BusinessETL.drop_unwanted_fields; the keys that remain on the first
    record are returned.

    NOTE(review): presumably the remaining keys are the category names (one
    column per category) -- confirm against ETLUtils and BusinessETL.

    :param file_path: the path of the JSON file that contains the businesses
    :return: a list with the keys of the first processed record, or an empty
    list when the file yields no records
    """
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    # Without records there is no first row to read the keys from; report
    # "no categories" instead of failing with an IndexError.
    if len(records) == 0:
        return []

    # Build a real list so the result can be indexed by position on any
    # Python version (on Python 3, dict.keys() is a non-indexable view).
    return list(records[0].keys())
# NOTE(review): the line below is a whitespace-collapsed fragment. Its
# original line breaks and indentation are lost and it is cut off at both
# ends, so it cannot run as written (everything to the right of the first
# '#' is parsed as a comment). Python 2 syntax: print statement, xrange.
#
# Visible content, in order:
#   1. Tail of a function whose header is not visible: for every index i
#      below num_categories where binary_list[i] == 1 it appends
#      categories[i] to category_list, then (presumably on the following
#      original line) returns category_list.
#   2. Script: builds business_file_path from data_folder, calls
#      BusinessETL.create_category_matrix and
#      BusinessETL.create_category_sets on it, prints
#      'Data pre-processing done', obtains my_labels from
#      Clusterer.cluster_data(my_matrix, 'dbscan'), reads my_categories via
#      get_categories and allocates one empty list per distinct label in
#      clusters. The cluster_and_evaluate_data calls are commented out.
#   3. A loop over the indices of my_labels whose body is not visible.
#
# NOTE(review): clusters has len(set(my_labels)) slots; if the loop body
# indexes clusters by label value, any negative label would wrap around to
# the end of the list -- the body is not visible here, confirm.
for i in xrange(num_categories): if binary_list[i] == 1: category_list.append(categories[i]) # print category_list return category_list data_folder = '../../../../../../datasets/yelp_phoenix_academic_dataset/' business_file_path = data_folder + 'yelp_academic_dataset_business.json' my_matrix = BusinessETL.create_category_matrix(business_file_path) my_sets = BusinessETL.create_category_sets(business_file_path) print 'Data pre-processing done' # Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-scikit') # Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-nltk') # Clusterer.cluster_and_evaluate_data(my_matrix, 'mean-shift') # Clusterer.cluster_and_evaluate_data(my_matrix, 'ward') # Clusterer.cluster_and_evaluate_data(my_matrix, 'dbscan') my_labels = Clusterer.cluster_data(my_matrix, 'dbscan') my_categories = get_categories(business_file_path) size = len(set(my_labels)) clusters = [[] for i in range(size)] for i in xrange(len(my_labels)):
# NOTE(review): the line below is a whitespace-collapsed fragment. Its
# original line breaks and indentation are lost and it starts in the middle
# of a method, so it cannot run as written (everything to the right of the
# first '#' is parsed as a comment).
#
# Visible content, in order:
#   1. Tail of a method whose header is not visible: appends review to
#      filtered_reviews when review['business_id'] is in business_ids, then
#      returns filtered_reviews.
#   2. sort_records(records, field, reverse=False), a static method:
#      returns a new list with the records sorted by the value stored under
#      `field` (via itemgetter); `reverse` is forwarded to sorted().
#   3. Script: records the start time, instantiates ReviewETL, obtains the
#      business ids from BusinessETL.get_business_ids(my_business_file,
#      'Hotels'), loads the reviews file, filters the reviews with
#      ReviewETL.filter_reviews_by_business_slow, saves the result with
#      ETLUtils.save_json_file and prints the elapsed time in seconds.
#
# NOTE(review): my_restaurant_reviews and my_restaurants_file hold data for
# the 'Hotels' category (the output file name ends in review_hotels.json);
# the variable names are misleading.
# NOTE(review): review_etl is assigned but not used in the visible code.
if review['business_id'] in business_ids: filtered_reviews.append(review) return filtered_reviews @staticmethod def sort_records(records, field, reverse=False): return sorted(records, key=itemgetter(field), reverse=reverse) start = time.time() review_etl = ReviewETL() my_business_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json" my_reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json" my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels') my_reviews = ETLUtils.load_json_file(my_reviews_file) # print(len(ReviewETL.filter_reviews_by_business(my_reviews, my_business_ids, 'text'))) my_restaurant_reviews = ReviewETL.filter_reviews_by_business_slow(my_reviews, my_business_ids) my_restaurants_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json" ETLUtils.save_json_file(my_restaurants_file, my_restaurant_reviews) # my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id') # print(len(my_sorted_reviews)) # main() end = time.time() total_time = end - start print("Total time = %f seconds" % total_time)
# NOTE(review): the line below is a whitespace-collapsed fragment. Its
# original line breaks and indentation are lost and it is cut off at both
# ends, so it cannot run as written (everything to the right of the first
# '#' is parsed as a comment). Python 2 syntax: print statement, xrange.
#
# Visible content, in order:
#   1. Tail of a function whose header is not visible: returns
#      category_list.
#   2. count_categories(cluster_list): for each cluster, chains the
#      cluster's elements together with itertools.chain(*cluster) and
#      counts the resulting items with Counter; returns the list of
#      Counters, one per cluster, in the order of cluster_list.
#      Each element of a cluster must therefore be iterable.
#   3. Script: builds business_file_path from data_folder, calls
#      BusinessETL.create_category_matrix and
#      BusinessETL.create_category_sets on it, prints
#      'Data pre-processing done', obtains my_labels from
#      Clusterer.cluster_data(my_matrix, 'dbscan'), reads my_categories via
#      get_categories and allocates one empty list per distinct label in
#      clusters. The cluster_and_evaluate_data calls are commented out.
#   4. A loop over the indices of my_labels whose body is not visible.
#
# NOTE(review): clusters has len(set(my_labels)) slots; if the loop body
# indexes clusters by label value, any negative label would wrap around to
# the end of the list -- the body is not visible here, confirm.
return category_list def count_categories(cluster_list): counted_clusters = [] for cluster in cluster_list: counted_clusters.append(Counter(list(itertools.chain(*cluster)))) return counted_clusters data_folder = '../../../../../../datasets/yelp_phoenix_academic_dataset/' business_file_path = data_folder + 'yelp_academic_dataset_business.json' my_matrix = BusinessETL.create_category_matrix(business_file_path) my_sets = BusinessETL.create_category_sets(business_file_path) print 'Data pre-processing done' # Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-scikit') # Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-nltk') # Clusterer.cluster_and_evaluate_data(my_matrix, 'mean-shift') # Clusterer.cluster_and_evaluate_data(my_matrix, 'ward') # Clusterer.cluster_and_evaluate_data(my_matrix, 'dbscan') my_labels = Clusterer.cluster_data(my_matrix, 'dbscan') my_categories = get_categories(business_file_path) size = len(set(my_labels)) clusters = [[] for i in range(size)] for i in xrange(len(my_labels)):