# Relative location of the Yelp Phoenix academic dataset and its business file.
data_folder = '../../../../../../datasets/yelp_phoenix_academic_dataset/'
business_file_path = data_folder + 'yelp_academic_dataset_business.json'

# Build the category matrix and the category sets from the business file.
my_matrix = BusinessETL.create_category_matrix(business_file_path)
my_sets = BusinessETL.create_category_sets(business_file_path)
print('Data pre-processing done')

# Other clustering back-ends that can be evaluated on the same matrix:
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-scikit')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-nltk')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'mean-shift')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'ward')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'dbscan')

my_labels = Clusterer.cluster_data(my_matrix, 'dbscan')
my_categories = get_categories(business_file_path)

# One bucket per distinct label value returned by the clusterer.
size = len(set(my_labels))
clusters = [[] for _ in range(size)]

for i, label in enumerate(my_labels):
    # Rows labelled -1 go into the last bucket; every other label is used
    # directly as the bucket index.
    # NOTE(review): -1 is presumably the DBSCAN "noise" label -- confirm
    # against Clusterer.cluster_data.
    bucket = size - 1 if label == -1 else int(label)
    clusters[bucket].append(binary_to_categories(my_matrix[i], my_categories))
    # print my_labels[i]

# Clusterer.linkage(my_matrix[:3000])
# Clusterer.gaac(my_matrix[:500][:50])

sets = []
def clustering(file_path):
    """Cluster and evaluate the tips found in ``file_path``.

    The file is first passed to ``TipTfidf.tf_idf_tips``; whatever that call
    returns is then handed to ``Clusterer.cluster_and_evaluate_data`` using
    the ``'k-means-scikit'`` algorithm.

    :param file_path: path forwarded unchanged to ``TipTfidf.tf_idf_tips``
        (presumably a tips dataset file -- confirm against that helper).
    :return: ``None``; the result of the evaluation call is not returned.
    """
    tip_vectors = TipTfidf.tf_idf_tips(file_path)
    Clusterer.cluster_and_evaluate_data(tip_vectors, 'k-means-scikit')
# Relative location of the Yelp Phoenix academic dataset and its business file.
data_folder = '../../../../../../datasets/yelp_phoenix_academic_dataset/'
business_file_path = data_folder + 'yelp_academic_dataset_business.json'

# Build the category matrix and the category sets from the business file.
my_matrix = BusinessETL.create_category_matrix(business_file_path)
my_sets = BusinessETL.create_category_sets(business_file_path)
print('Data pre-processing done')

# Other clustering back-ends that can be evaluated on the same matrix:
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-scikit')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-nltk')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'mean-shift')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'ward')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'dbscan')

my_labels = Clusterer.cluster_data(my_matrix, 'dbscan')
my_categories = get_categories(business_file_path)

# One bucket per distinct label value returned by the clusterer.
size = len(set(my_labels))
clusters = [[] for _ in range(size)]

for i, label in enumerate(my_labels):
    # Rows labelled -1 go into the last bucket; every other label is used
    # directly as the bucket index.
    # NOTE(review): -1 is presumably the DBSCAN "noise" label -- confirm
    # against Clusterer.cluster_data.
    bucket = size - 1 if label == -1 else int(label)
    clusters[bucket].append(binary_to_categories(my_matrix[i], my_categories))
    # print my_labels[i]

# Clusterer.linkage(my_matrix[:3000])
# Clusterer.gaac(my_matrix[:500][:50])

# counts = count_categories(clusters)