def k_elbow_plot(fpath, max_k=10):
    """
    Plot the "elbow" chart showing the relationship between the clustering
    model's cost (the k-modes analogue of SSE) and the chosen number of
    clusters. This helps select the most appropriate k: the last value of k
    that still yields a substantial drop in error (the "tip of the elbow").

    :param fpath: path to the processed dataset (see the description of
        argv[1] at the top of the script)
    :param max_k: maximum k to use when producing the chart
    """
    if not path.isfile(fpath):
        print("Error: could not find specified CSV dataset.")
        return
    if max_k <= 0:
        print("Error: k must be a positive integer.")
        return
    data = refactor_data_frame(pd.read_csv(fpath))
    errors = []
    for k in range(1, max_k + 1):
        kmodes = KModes(n_clusters=k, random_state=42, n_init=1, init="random")
        kmodes.fit(data)
        errors.append(kmodes.cost_)
        print("DONE WITH K=" + str(k))
    plt.figure(figsize=(16, 8))
    plt.plot(range(1, max_k + 1), errors, 'bo-')
    plt.xlabel('#Clusters (K)')
    plt.ylabel('Error (0/1)')
    plt.title("K/error trade-off for dataset " + path.basename(fpath))
    plt.show()
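# The helper refactor_data_frame used above is assumed but not defined in this
# snippet. A minimal, hypothetical sketch of what it plausibly does
# (label-encode object columns so KModes receives integer category codes);
# the real helper may differ:
from sklearn import preprocessing


def refactor_data_frame(df):
    # Hypothetical sketch: encode every object/string column as integer
    # category codes; numeric columns pass through unchanged.
    df = df.copy()
    for col in df.select_dtypes(include="object").columns:
        df[col] = preprocessing.LabelEncoder().fit_transform(df[col])
    return df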
def test_kmodes_init_soybean(self):
    init_vals = np.array(
        [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
          0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
         [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
          0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
          1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
          0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
    kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
    result = kmodes_init.fit_predict(SOYBEAN)
    expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    assert_cluster_splits_equal(result, expected)

    # 5 initial centroids, 4 n_clusters
    init_vals = np.array(
        [[0, 1],
         [4, 0],
         [4, 0],
         [3, 0],
         [3, 0]])
    kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_init.fit(SOYBEAN)

    # wrong number of attributes
    init_vals = np.array([0, 1, 2, 3])
    kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_init.fit(SOYBEAN)
def clustering(user_session_subset, chicago_clustering, chicago_clustering_labels):
    # THIS WILL EXTRACT THE CHICAGO_ZERO_AND_ONE RESTAURANTS THAT MATCH THE
    # ONES FOUND IN THE INITIAL SUBSET (required for clustering)
    # get count of user session subset
    user_session_subset_count = pd.crosstab(
        index=user_session_subset['Restaurant_ID'], columns="count")
    mask = np.zeros(len(chicago_clustering), dtype=bool)
    mask[user_session_subset_count['count'].index.values.astype(int)] = True
    chicago_clustering_labels = chicago_clustering_labels[mask]
    chicago_clustering = chicago_clustering[mask]

    # method: Huang; number of clusters: 3; verbose=1 means textual output
    # (0 is no output)
    kmodes_huang = KModes(n_clusters=3, init='Huang', verbose=0, n_init=20)
    kmodes_huang.fit(chicago_clustering)

    # this joins the restaurant name
    cluster_results = np.column_stack(
        (chicago_clustering_labels, kmodes_huang.labels_))
    # convert numpy matrix to pandas dataframe
    cluster_result_df = pd.DataFrame(cluster_results)
    cluster_result_df.columns = ['Restaurant', 'Cluster']

    # JOIN THE CLUSTERING RESULTS WITH user_session_subset_count TO GET OUR
    # FINAL RESULTS
    # remove existing indices so the new ones line up and the dataframes can
    # be joined
    cluster_result_df.reset_index(drop=True, inplace=True)
    user_session_subset_count.reset_index(drop=True, inplace=True)
    # join the cluster results with the restaurant counts
    clusters_with_counts = pd.concat(
        [cluster_result_df, user_session_subset_count], axis=1)
    return clusters_with_counts
def cluster_and_output(k, matrix, clust_type, inputpath, outdir):
    # use the k passed in as a parameter; n_init is still read from the
    # module-level CLI args
    km = KModes(n_clusters=k, cat_dissim=conflict_dissim, init='huang',
                n_init=args.n, verbose=1)
    km.fit(matrix)

    from collections import defaultdict
    cluster_groups = defaultdict(list)
    for j in range(matrix.shape[0]):
        cluster_groups[km.labels_[j]].append(j)

    tot_rows = 0
    for cluster in cluster_groups:
        tot_rows += len(cluster_groups[cluster])

    filename = os.path.splitext(os.path.basename(inputpath))[0]
    outfile = os.path.join(outdir, filename)

    centroids = km.cluster_centroids_
    out_matrix = list()
    for ix_c, c in enumerate(centroids):
        if ix_c in cluster_groups:
            x = list(map(int, list(map(round, c))))
            out_matrix.append(x)
    out_matrix = np.transpose(np.array(out_matrix))
    print(out_matrix.shape)
    print(len(cluster_groups))

    np.savetxt('{}_celluloid.matrix'.format(outfile), out_matrix, fmt='%d',
               delimiter=' ')
    with open('{}_celluloid_clusters.txt'.format(outfile), 'w+') as file_out:
        for cluster in sorted(cluster_groups):
            file_out.write('{0}\t"{1}"\n'.format(
                cluster,
                ','.join([str(x + 1) for x in cluster_groups[cluster]])))
    with open('{}_celluloid.mutations'.format(outfile), 'w+') as file_out:
        for cluster in sorted(cluster_groups):
            file_out.write('{0}\n'.format(','.join(
                [str(x + 1) for x in cluster_groups[cluster]])))
    print('Done.')
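# conflict_dissim is passed as cat_dissim above but is not defined in this
# snippet. A custom kmodes dissimilarity must accept a 2-D array of centroids
# and a single data point and return one distance per centroid, like
# kmodes.util.dissim.matching_dissim. A hypothetical placeholder with the
# right signature (the real conflict_dissim weights disagreements differently):
import numpy as np


def conflict_dissim(a, b, **_):
    # Placeholder: plain matching dissimilarity, i.e. the count of
    # mismatching attributes between each centroid row in `a` and point `b`.
    return np.sum(a != b, axis=1)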
def exec_kmodes(df, choices_obj):
    # cluster only the categorical columns selected by the user
    cats_not_scaled = [header for header in choices_obj['categorical']]
    X = df[cats_not_scaled].astype(str)
    k = int(input("Number of clusters:\n > "))
    kmodes_cao = KModes(n_clusters=k, init='Cao', verbose=1)
    kmodes_cao.fit(X.values)

    # Print cluster centroids of the trained model.
    print('k-modes (Cao) centroids:')
    print(kmodes_cao.cluster_centroids_)
    # Print training statistics
    print('Final training cost: {}'.format(kmodes_cao.cost_))
    print('Training iterations: {}'.format(kmodes_cao.n_iter_))
def test_kmodes_predict_soybean(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def test_kmodes_random_soybean(self):
    kmodes_random = KModes(n_clusters=4, init='random', verbose=2,
                           random_state=42)
    result = kmodes_random.fit(SOYBEAN)
    self.assertIsInstance(result, KModes)
def test_kmodes_predict_soybean_ng(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2,
                        cat_dissim=ng_dissim)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint8))
def test_kmodes_predict_soybean_jaccard_dissim_label(self):
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                          cat_dissim=jaccard_dissim_label, random_state=42)
    kmodes_huang = kmodes_huang.fit(TEST_DATA)
    result = kmodes_huang.fit_predict(TEST_DATA_PREDICT)
    expected = np.array([1, 0, 1, 2])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def test_kmodes_fit_predict(self):
    """Test whether the fit_predict interface works the same as fit and predict."""
    kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
    sample_weight = [0.5] * TEST_DATA.shape[0]
    data1 = kmodes.fit_predict(TEST_DATA, sample_weight=sample_weight)
    data2 = kmodes.fit(TEST_DATA, sample_weight=sample_weight).predict(TEST_DATA)
    assert_cluster_splits_equal(data1, data2)
def test_k_modes_sample_weight_unchanged(self):
    """Test whether the centroid definition remains unchanged under uniform scaling."""
    kmodes_baseline = KModes(n_clusters=4, init='Cao', random_state=42)
    model_baseline = kmodes_baseline.fit(SOYBEAN)
    expected = set(tuple(row) for row in model_baseline.cluster_centroids_)

    for weight in [.5, 1, 1., 2]:
        sample_weight = [weight] * SOYBEAN.shape[0]
        kmodes_weighted = KModes(n_clusters=4, init='Cao', random_state=42)
        model_weighted = kmodes_weighted.fit(SOYBEAN, sample_weight=sample_weight)
        factual = set(tuple(row) for row in model_weighted.cluster_centroids_)

        # Centroids might be ordered differently. To compare the centroids,
        # we first sort them.
        tuple_pairs = zip(sorted(expected), sorted(factual))
        for tuple_expected, tuple_factual in tuple_pairs:
            self.assertAlmostEqual(tuple_expected, tuple_factual)
def test_pickle_fitted(self):
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                          random_state=42)
    model = kmodes_huang.fit(SOYBEAN)
    serialized = pickle.dumps(model)
    self.assertTrue(isinstance(pickle.loads(serialized), model.__class__))
def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                          cat_dissim=jaccard_dissim_binary, random_state=42)
    # binary encoded variables are required
    bin_variables = SOYBEAN.astype(bool).astype(int)
    kmodes_huang = kmodes_huang.fit(bin_variables)
    # binary encoded variables are required for prediction as well
    bin_variables_pred = SOYBEAN2.astype(bool).astype(int)
    result = kmodes_huang.fit_predict(bin_variables_pred)
    expected = np.array([0, 1, 2, 3])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def test_kmodes_sample_weights_all_but_one_zero(self):
    """Test whether the centroid collapses to the single data point with non-zero weight."""
    kmodes = KModes(n_clusters=1, init='Cao', random_state=42)
    n_samples = 10
    for indicator in range(n_samples):
        sample_weight = np.zeros(n_samples)
        sample_weight[indicator] = 1
        model = kmodes.fit(TEST_DATA[:n_samples, :],
                           sample_weight=sample_weight)
        self.assertTrue(
            (model.cluster_centroids_[0, :] == TEST_DATA[indicator, :]).all())
def f(game, modes, K, N, in_colour, seed):
    print("Running clustering...")
    with NumpySeed(seed):
        dset = StaticAtariDataset(game=game, after_warp=not in_colour)
        X = dset.x
        if N:
            X = X[:N, ...]
        else:
            N = X.shape[0]
        if not in_colour:
            X = X[..., 0]
        image_shape = X.shape[1:]
        X = X.reshape(N, -1)

        if modes:
            km = KModes(n_clusters=K, init='Huang', n_init=1, verbose=1)
            km.fit(X)
            centroids = km.cluster_centroids_
            centroids = centroids.reshape(K, *image_shape)
            discrete_centroids = centroids
            centroids = centroids / 255.
            labels = km.labels_
        else:
            # sklearn's k_means returns (centroids, labels, inertia)
            centroids, labels, _ = k_means(X / 255., K)
            discrete_centroids = np.uint8(np.floor(centroids * 255))
            centroids = np.maximum(centroids, 1e-6)
            centroids = np.minimum(centroids, 1 - 1e-6)
            centroids = centroids.reshape(K, *image_shape)
            labels = np.array(labels)

        X = X.reshape(N, *image_shape)
        print("Done.")
        return centroids, discrete_centroids, labels, X
def test_kmodes_empty_init_cluster_soybean(self):
    # Check that the clustering does not crash in case of an empty cluster.
    init_vals = np.array(
        [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
          0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
         [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
          0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
          1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
          0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
    kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
    result = kmodes_init.fit(SOYBEAN)
    self.assertIsInstance(result, KModes)
xls_file = pd.ExcelFile(
    "..\\source\\traffic_violations_selected_features_delete_missing.xlsx")
# get excel sheet - object type: pandas dataframe
pd_traffic_violations = xls_file.parse('Hoja1')
# select features to model
pd_traffic_violations_to_model = pd_traffic_violations[[
    'CODIGO INFRACCION', 'TIPO DE VIA', 'LUGAR DE INTERVENCION',
    'EMPRESA DE TRANSPORTE'
]]
# instantiate k-modes object - 180 clusters
kmodes = KModes(n_clusters=180, init='Cao', verbose=1)
# fit the model
kmodes.fit(pd_traffic_violations_to_model)
# cluster centroids of the model
print(kmodes.cluster_centroids_)
# training statistics
print(kmodes.cost_)
print(kmodes.n_iter_)
# create new cluster column in the pandas dataframe
pd_traffic_violations['CLUSTER'] = kmodes.labels_
# save the labeled dataframe to .csv
pd_traffic_violations.to_csv(
    '..\\clustering\\kmodes_clustering_traffic_violations.csv',
    index=False, header=True)
# wcss = []
# for i in range(1, 30):
#     kmodes = KModes(n_clusters=i, init='Huang', n_init=5, verbose=1)
#     kmodes.fit(data1)
#     wcss.append(kmodes.cost_)  # track the clustering cost, not the centroids
# plt.plot(range(1, 30), wcss)
# plt.title("The elbow method")
# plt.xlabel("The number of clusters")
# plt.ylabel("WCSS")
# plt.show()
# wcss

"""**Kmode Model Creation and prediction**"""

km = KModes(n_clusters=23, init='Huang', n_init=5, verbose=1)
km = km.fit(data1)
clusters = km.predict(data1)

# Print the cluster centroids
print(km.cluster_centroids_)

"""**Storing My Prediction to CSV file**"""

k = pd.DataFrame()
k['output'] = clusters
k.to_csv("outpt.csv")
# print(housing_binary)
# print(len(housing_binary))
# print(u_housing[i_housing])
# print(len(u_housing[i_housing]))

# LIFT
km = KModes(n_clusters=5)
# kmeans = KMeans(n_clusters=5)
X = np.vstack((i_plaintiff, i_judgment_type, i_judgment_method, d_a, g_num))
X = np.transpose(X)
X = np.hstack((X, low))
# print(X)
km.fit(X)
y_km = km.predict(X)
# print(X[0, :])
# print(X[1, :])
plt.scatter(latitude, longitude, c=y_km, s=50, cmap='winter')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()
# print(y_km)
units = np.array(df['units'])
number = np.nonzero(units)
print(number)
number = np.array(number)
le = {}
if to_encode is None:
    df_enc = df.copy(deep=True)
else:
    df_enc = df.copy(deep=True)
    for fname in to_encode:
        le[fname] = preprocessing.LabelEncoder()
        df_enc[fname] = le[fname].fit_transform(df_enc[fname])

# run KModes once for each candidate cluster count
n_iter = n_max_clusters - n_min_clusters + 1
cost = np.zeros(n_iter)
for i in range(n_iter):
    km = KModes(n_clusters=n_min_clusters + i, n_init=1, verbose=0)
    km.fit(df_enc)
    cost[i] = km.cost_

# locate the elbow
kl = KneeLocator(range(n_min_clusters, n_max_clusters + 1), cost,
                 curve="convex", direction="decreasing")
n_clusters = kl.elbow

# generate the final kmodes fit
km = KModes(n_clusters=n_clusters, n_init=1, verbose=0)
clusters = km.fit_predict(df_enc)

if to_encode is not None:
    df_renc = df_enc.copy()
    for fname in to_encode:
        df_renc[fname] = le[fname].inverse_transform(df_renc[fname])
    df_ind_res = df_renc.reset_index()
# THIS WILL EXTRACT THE CHICAGO_ZERO_AND_ONE RESTAURANTS THAT MATCH THE ONES
# FOUND IN THE INITIAL SUBSET (required for clustering)
# get count of user session subset
user_session_subset_count = pd.crosstab(
    index=user_session_subset['Restaurant_ID'], columns="count")
mask = np.zeros(len(chicago_clustering), dtype=bool)
mask[user_session_subset_count['count'].index.values.astype(int)] = True
chicago_clustering_labels = chicago_clustering_labels[mask]
chicago_clustering = chicago_clustering[mask]

# CLUSTERING
# method: Huang; number of clusters: 3; verbose=1 means textual output
# (0 is no output)
kmodes_huang = KModes(n_clusters=3, init='Huang', verbose=0)
kmodes_huang.fit(chicago_clustering)

# this joins the restaurant name
cluster_results = np.column_stack(
    (chicago_clustering_labels, kmodes_huang.labels_))
# convert numpy matrix to pandas dataframe
cluster_result_df = pd.DataFrame(cluster_results)
cluster_result_df.columns = ['Restaurant', 'Cluster']

# JOIN THE CLUSTERING RESULTS WITH user_session_subset_count TO GET OUR
# FINAL RESULTS
# remove existing indices so the new ones line up and the dataframes can
# be joined
cluster_result_df.reset_index(drop=True, inplace=True)
user_session_subset_count.reset_index(drop=True, inplace=True)
# join the cluster results with the restaurant counts
def fit(qual_id, count):
    # More than 500 documents results in slow training
    if count >= 500:
        count = 500

    # Query for the passed qual_id with incorrect answers.
    # May take a lengthy amount of time. Recommend optimizing the query.
    # print("Querying for", qual_id)
    data = collection.find({"qual_id": qual_id, "correct": False})[:count]
    # print("Query complete.")

    # Compile dictionary of all possible features in the given list of records
    # print("Compiling dictionary of features.")
    features = {}
    for doc in data:
        doc_features = {}
        if doc['response'] is None:
            continue
        doc_features = retrieveKeys(doc['response'], doc_features)
        features = mergeFeatures(doc_features, features)
    # print("Feature compilation complete.")

    # Count number of features
    length = countFeatures(features)
    if length == 0:
        return

    # Reuse queried documents.
    data = data.rewind()

    # Append missing features to all records and assign a common benign value.
    # The current benign value is an empty string.
    # print("Appending features to documents.")
    student_data = np.array([])
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = np.array([])
            temp = addFeatures(features, temp, doc['response'])
            if len(student_data) == 0:
                student_data = np.append(student_data, temp)
                student_data = np.reshape(student_data, (-1, length))
            else:
                student_data = np.append(student_data, [temp], axis=0)
    # print("Finished appending features to documents.")

    # Perform k-modes clustering
    # print("Clustering...")
    clusters = len(student_data)
    # This k-modes implementation can't generate more than 255 centroids
    if clusters > 255:
        clusters = 255
    km = KModes(n_clusters=clusters, init='Cao', n_init=4, verbose=False)
    # print("Finished.")
    km.fit(student_data)

    # Print important information from the clustering.
    # Centroids are the values common to each cluster.
    centroids = km.cluster_centroids_
    # print("Centroids")
    # print(centroids)

    # Labels is a list indicating which cluster each record belongs to
    labels = km.labels_
    # print("Labels")
    # print(labels)

    # Cost is a value indicating possible error in the clusters. The ideal
    # value is 0.0. If the value is greater than 0.0, then the maximum number
    # of clusters was generated and some responses were assigned to an inexact
    # cluster. This would result in the largest cluster containing documents
    # it shouldn't. Recommend re-clustering with fewer documents or more
    # clusters if possible.
    cost = km.cost_
    # print("Cost")
    # print(cost)

    # Print the 5 largest cluster labels and the number of records per cluster.
    most_common = Counter(labels).most_common(5)
    # print("Most populated centroids")
    # print(most_common)

    # Generate the cluster dictionary to be inserted into the centroid_db.
    # Qual_id: qual_id of the given documents
    # Features: dictionary of all possible features in the passed documents
    # Centroids: list of generated centroids
    # Cluster_sizes: number of documents in each cluster
    # Behavioral_traits: behavioral traits associated with at least one
    #   document assigned to the given centroid
    # Screenshot_urls: a screenshot from one document within each cluster
    # Centroids and behavioral_traits have the same lengths. The behavioral
    # traits at a given index of behavioral_traits are associated with the
    # same index of centroids.
    post = {
        'qual_id': qual_id,
        'features': features,
        'centroids': {},
        'cluster_sizes': {},
        'behavioral_traits': {},
        'screenshot_urls': {}
    }
    for i in Counter(labels).most_common(len(centroids)):
        if str(i[0]) not in post['cluster_sizes']:
            post['cluster_sizes'][str(i[0])] = str(i[1])
    for i in range(len(centroids.tolist())):
        if str(i) not in post['centroids']:
            post['centroids'][str(i)] = centroids.tolist()[i]

    # Reuse queried documents.
    data = data.rewind()
    label = 0
    for doc in data:
        if doc['response'] is None:
            continue
        elif str(labels[label]) not in post['screenshot_urls']:
            post['screenshot_urls'][str(labels[label])] = doc['screenshot_url']
            label += 1
        else:
            label += 1

    # Reuse queried documents.
    data = data.rewind()
    # Add associated behavioral traits to the cluster dictionary.
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = np.array([])
            temp = addFeatures(features, temp, doc['response'])
            temp = np.reshape(temp, (-1, length))
            label = km.predict(temp)[0]
            if str(label) not in post['behavioral_traits']:
                post['behavioral_traits'][str(label)] = doc['behavioral_traits']

    # Add the generated cluster dictionary to the centroid_db.
    # If a record shares the same qual_id as the generated cluster dictionary,
    # the stored record will be overwritten.
    # print("Posting centroids to database centroids.")
    centroid_db.replace_one({'qual_id': qual_id}, post, upsert=True)
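# As the comments in fit() note, km.cost_ is the summed dissimilarity of every
# record to its assigned centroid. A sketch for sanity-checking it from the
# fitted attributes, assuming the model was trained with the library's
# default matching dissimilarity:
import numpy as np
from kmodes.util.dissim import matching_dissim


def recompute_cost(km, data):
    # Sum the attribute mismatches between each record and the centroid of
    # its assigned cluster; this should equal km.cost_ for the default dissim.
    data = np.asarray(data)
    total = 0
    for point, label in zip(data, km.labels_):
        total += matching_dissim(km.cluster_centroids_[label:label + 1],
                                 point)[0]
    return total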
data_to_cluster = data[[
    'Project Resource Category', 'Project Subject Category Tree',
    'Project Subject Subcategory Tree', 'Project Type', 'School Metro Type',
    'Region', 'Project Grade Level Category'
]]

### Find Optimal Clusters ###
n_clusters = np.arange(2, 1003, 100)
costs = []
for n in n_clusters:
    print("Working on {} clusters.".format(n))
    kproto = KModes(n_clusters=n, init='random', verbose=False)
    # the unscaled data can be passed in directly: unlike KPrototypes,
    # KModes treats every column as categorical
    cluster_obj = kproto.fit(data_to_cluster)
    labels = cluster_obj.labels_
    cost = cluster_obj.cost_
    costs.append(cost)

# Plot cost by number of clusters
optimum_k = 100
fig, ax = plt.subplots()
plt.title("Cost vs. Number of Clusters - Random Centroid Initializations")
plt.plot(n_clusters, costs, linestyle='--', marker='o')
plt.axvline(x=optimum_k, color='black', linestyle='--',
            label='Best Number of Clusters: {}'.format(optimum_k))
plt.xlabel('Number of Clusters')
plt.ylabel('Cost')
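# Rather than hard-coding optimum_k, the elbow of the cost curve can be
# located programmatically. A sketch assuming the kneed package is available
# and reusing the n_clusters/costs arrays computed above:
from kneed import KneeLocator

kl = KneeLocator(list(n_clusters), costs, curve="convex",
                 direction="decreasing")
optimum_k = kl.elbow  # may be None if no clear elbow is found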
cslice_counts.tail()

# In[83]:

cluster_range = range(1, 11)

# In[84]:

for n_clusters in cluster_range:
    km = KModes(n_clusters, init='Huang', n_init=10, verbose=1)
    km.fit(cslice)

# In[86]:

# Plot costs by number of clusters
plt.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
         [17513.0, 15391.0, 13947.0, 13507.0, 13236.0, 12803.0, 12625.0,
          12467.0, 12292.0, 12101.0])
plt.xlabel('Clusters')
plt.ylabel('Costs')
plt.axis([0, 11, 12000.0, 18000.0])
plt.show()

# ## Evaluate 3 clusters
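# In the notebook above the costs are transcribed by hand from the verbose
# output. A variant of the loop that collects km.cost_ directly (assuming the
# same cslice data) avoids the manual step:
costs = []
for n_clusters in cluster_range:
    km = KModes(n_clusters, init='Huang', n_init=10, verbose=0)
    km.fit(cslice)
    costs.append(km.cost_)  # final training cost for this cluster count

plt.plot(list(cluster_range), costs)
plt.xlabel('Clusters')
plt.ylabel('Costs')
plt.show()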
def test_kmodes_epoch_costs(self):
    kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
    kmodes.fit(SOYBEAN)
    self.assertEqual(kmodes.epoch_costs_, [206.0, 204.0, 199.0, 199.0])
showDistance1()
print()

## e)
dataFrame['Type'] = dataFrame['Type'].astype('category')
dataFrame['Origin'] = dataFrame['Origin'].astype('category')
dataFrame['DriveTrain'] = dataFrame['DriveTrain'].astype('category')
dataFrame['Cylinders'] = dataFrame['Cylinders'].astype('category')
cat_col = dataFrame.select_dtypes(['category']).columns
df = dataFrame[cat_col].apply(lambda x: x.cat.codes)

km = KModes(n_clusters=3, init='Huang', random_state=555)
clusters = km.fit(df)
cents = km.cluster_centroids_
predict_results = km.predict(df)
unique, counts = np.unique(predict_results, return_counts=True)
num_obs_in_each_cluster = dict(zip(unique, counts))


def showResult(i):
    print("The number of observations in cluster 1: %d"
          % num_obs_in_each_cluster[i])
    print("The number of observations in cluster 2: %d"
          % num_obs_in_each_cluster[i + 1])
    print("The number of observations in cluster 3: %d"
          % num_obs_in_each_cluster[i + 2])
def test_kmodes_random_soybean(self):
    kmodes_random = KModes(n_clusters=4, init='random', verbose=2)
    result = kmodes_random.fit(SOYBEAN)
    self.assertIsInstance(result, KModes)
x = np.genfromtxt('test.csv',
                  delimiter=',')[:, 0:]  # test.csv (opening reconstructed;
                                         # path assumed from the inline comment)
y = np.genfromtxt('data_category/dataset_extract.csv', dtype=str,
                  delimiter=',', usecols=(0,))
# data_category/dataset_extract.csv
print(x.shape)
print(y.shape)
dataNum = y.shape[0]

n_clusters = [100, 300, 500, 600, 1000]
for nc in n_clusters:
    kmodes_huang = KModes(n_clusters=nc, cat_dissim=multimatch_dissim,
                          init='Huang', verbose=0)
    kmodes_huang.fit(x)
    # with open('summary' + str(nc) + '.txt', 'w') as f:
    #     # Print cluster centroids of the trained model.
    #     f.write('k-modes (Huang) centroids:')
    #     print(kmodes_huang.cluster_centroids_)
    # Print training statistics
    print('For number of clusters ', nc)
    print('Final training cost: {}'.format(kmodes_huang.cost_))
    print('Training iterations: {}'.format(kmodes_huang.n_iter_))
    print('Save tables:')
    np.savetxt('labels' + str(nc) + '.out', kmodes_huang.labels_, fmt='%i',
               delimiter=',')
def fit(qual_id, count):
    # More than 500 documents results in slow training
    if count >= 500:
        count = 500

    # Query for the passed qual_id with incorrect answers.
    # May take a lengthy amount of time. Recommend optimizing the query.
    if FLAG_VERBOSE:
        print("Querying for", qual_id)
    data = collection.find({"qual_id": qual_id, "correct": False})[:count]
    if FLAG_VERBOSE:
        print("Query complete.")

    # Compile dictionary of all possible features in the given list of records
    if FLAG_VERBOSE:
        print("Compiling dictionary of features.")
    num_examples = 0
    num_empty = 0
    features = {}
    for doc in data:
        doc_features = {}
        if doc['response'] is None:
            num_empty += 1
            continue
        doc_features = retrieveKeys(doc['response'], doc_features)
        features = mergeFeatures(doc_features, features, "")
        num_examples += 1
    if FLAG_VERBOSE:
        print("Feature compilation complete.")

    # Count number of features
    num_features = countFeatures(features)
    if FLAG_VERBOSE:
        print("*** Number of features: {}".format(num_features))
        print(
            "*** Number of non-empty records for [Q_ID:{}]: {}. (dropped {} with empty resp)"
            .format(qual_id, num_examples, num_empty))
    if num_features == 0:
        return

    # Reuse queried documents.
    data = data.rewind()

    # Append missing features to all records and assign a common benign value.
    # The current benign value is an empty string.
    # print("Appending features to documents.")
    # faster to create a zeroed np array first, rather than appending
    student_data = np.zeros((num_examples, num_features), dtype='<U32')
    i = 0
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = addFeatures(features, [], doc['response'])
            student_data[i, :] = temp
            i += 1
    if FLAG_VERBOSE:
        print("Finished appending features to documents.")
        print(student_data)
    # print("*** Features: ***")
    # pprint(interpretFeatures(features, []))
    # print feature vectors
    # print("*** FEATURE VECTOR: ***")
    # i = 0
    # for row in student_data:
    #     print("[{}]: {}".format(i, row))
    #     i += 1
    # print(repr(student_data))

    # Perform k-modes clustering
    print("Clustering...")
    clusters = NUM_CLUSTERS
    # This k-modes implementation can't generate more than 255 centroids
    if clusters > 255:
        clusters = 255
    if clusters > len(student_data):
        clusters = len(student_data)
    km = KModes(n_clusters=clusters, init='Cao', n_init=4, verbose=False)
    km.fit(student_data)
    print("Finished.")

    # Print important information from the clustering.
    # Centroids are the values common to each cluster.
    centroids = km.cluster_centroids_
    if FLAG_VERBOSE:
        print("*** CENTROIDS: ***")
        print(centroids)

    # Labels is a list indicating which cluster each record belongs to
    labels = km.labels_
    if FLAG_VERBOSE:
        print("*** LABELS: ***")
        print(labels)

    # Cost is a value indicating possible error in the clusters; the ideal
    # value is 0.0
    if FLAG_VERBOSE:
        cost = km.cost_
        print("*** COST: ***")
        print(cost)

    # Print the 5 largest cluster labels and the number of records per cluster.
    if FLAG_VERBOSE:
        most_common = Counter(labels).most_common(5)
        print("Most populated centroids")
        print(most_common)

    # Generate the cluster dictionary to be inserted into the centroid_db.
    # Qual_id: qual_id of the given documents
    # Features: dictionary of all possible features in the passed documents
    # Centroids: list of generated centroids
    # Behavioral_traits: behavioral traits associated with at least one
    #   document assigned to the given centroid
    # Centroids and behavioral_traits have the same lengths. The behavioral
    # traits at a given index of behavioral_traits are associated with the
    # same index of centroids.
    if FLAG_USE_CENTROID_DB:
        post = {
            'qual_id': qual_id,
            'features': features,
            'centroids': centroids.tolist(),
            'behavioral_traits': {}
        }

        # Reuse queried documents.
        data = data.rewind()
        # Add associated behavioral traits to the cluster dictionary.
        for doc in data:
            if doc['response'] is None:
                continue
            else:
                temp = np.array([])
                temp = addFeatures(features, temp, doc['response'])
                temp = np.reshape(temp, (-1, num_features))
                label = km.predict(temp)[0]
                if str(label) not in post['behavioral_traits']:
                    post['behavioral_traits'][str(label)] = doc['behavioral_traits']

        # Add the generated cluster dictionary to the centroid_db.
        # If a record shares the same qual_id as the generated cluster
        # dictionary, the stored record will be overwritten.
        print("Posting centroids to database centroids.")
        centroid_db.replace_one({'qual_id': qual_id}, post, upsert=True)
        print(qual_id, "complete.")
        print()

    if FLAG_DO_ANALYSIS:
        # perform some automatic EDA on the largest clusters and save
        # collect ids of examples
        data = data.rewind()
        X_ids = []
        for doc in data:
            if doc['response'] is None:
                continue
            else:
                X_ids.append(doc['_id'])
        out_dir = ANALYS_OUT_DIR
        if out_dir is None:
            out_dir = "./out/" + str(qual_id)
        analys = cluster_analyzer(collection, out_dir)
        analys.analyze(student_data, labels, centroids, X_ids, qual_id,
                       interpretFeatures(features, []))
#!/usr/bin/env python
import numpy as np
from kmodes.kmodes import KModes

# reproduce results on small soybean data set
x = np.genfromtxt('soybean.csv', dtype=int, delimiter=',')[:, :-1]
y = np.genfromtxt('soybean.csv', dtype=str, delimiter=',', usecols=(35, ))

kmodes_huang = KModes(n_clusters=4, init='Huang', verbose=1)
kmodes_huang.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Huang) centroids:')
print(kmodes_huang.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_huang.cost_))
print('Training iterations: {}'.format(kmodes_huang.n_iter_))

kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=1)
kmodes_cao.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Cao) centroids:')
print(kmodes_cao.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_cao.cost_))
print('Training iterations: {}'.format(kmodes_cao.n_iter_))

print('Results tables:')
for result in (kmodes_huang, kmodes_cao):
    # Cross-tabulate the four true soybean disease classes (D1-D4) against
    # the four clusters found by each model.
    classtable = np.zeros((4, 4), dtype=int)
    for ii, _ in enumerate(y):
        classtable[int(y[ii][-1]) - 1, result.labels_[ii]] += 1

    print("\n")
    print("    | Cl. 1 | Cl. 2 | Cl. 3 | Cl. 4 |")
    print("----|-------|-------|-------|-------|")
    for ii in range(4):
        prargs = tuple([ii + 1] + list(classtable[ii, :]))
        print(" D{0} |    {1:>2} |    {2:>2} |    {3:>2} |    {4:>2} |"
              .format(*prargs))
def trainModelAndValidate(train, test):
    count = 0
    # select the required columns
    per = pd.DataFrame(np.c_[train.iloc[:, 31:73]])

    # k-modes clustering with an initial cluster count of 500
    km = KModes(n_clusters=500, max_iter=1000, init='Huang', n_init=2,
                n_jobs=-1)
    print("Cost of K clusters")
    m1 = km.fit(per)
    # print the cost of clustering
    print("500 clusters:", m1.cost_)

    # reduce the clusters gradually until the cost is minimized
    mdl1 = m1.cluster_centroids_
    km1 = KModes(n_clusters=250, max_iter=1000, init='Huang', n_init=2,
                 n_jobs=-1)
    m2 = km1.fit(mdl1)
    # print(m2.cluster_centroids_)
    print("250 clusters:", m2.cost_)

    mdl2 = m2.cluster_centroids_
    km2 = KModes(n_clusters=125, max_iter=1000, init='Huang', n_init=2,
                 n_jobs=-1)
    m3 = km2.fit(mdl2)
    # print(m3.cluster_centroids_)
    print("125 clusters:", m3.cost_)

    mdl3 = m3.cluster_centroids_
    km3 = KModes(n_clusters=62, max_iter=1000, init='Huang', n_init=2,
                 n_jobs=-1)
    m4 = km3.fit(mdl3)
    # print(m4.cluster_centroids_)
    print("62 clusters:", m4.cost_)

    mdl4 = m4.cluster_centroids_
    km4 = KModes(n_clusters=31, max_iter=1000, init='Huang', n_init=2,
                 n_jobs=-1)
    m5 = km4.fit(mdl4)
    # print(m5.cluster_centroids_)
    print("31 clusters:", m5.cost_)

    mdl5 = m5.cluster_centroids_
    km5 = KModes(n_clusters=15, max_iter=1000, init='Huang', n_init=2,
                 n_jobs=-1)
    m6 = km5.fit(mdl5)
    # print(m6.cluster_centroids_)
    print("15 clusters:", m6.cost_)

    mdl6 = m6.cluster_centroids_
    km6 = KModes(n_clusters=10, max_iter=1000, init='Cao', n_init=2,
                 n_jobs=-1)
    m7 = km6.fit(mdl6)
    # print(m7.cluster_centroids_)
    print("10 clusters:", m7.cost_)

    mdl7 = m7.cluster_centroids_
    km7 = KModes(n_clusters=8, max_iter=1000, init='Cao', n_init=2, n_jobs=-1)
    m8 = km7.fit(mdl7)
    mfin_clust = m8.cluster_centroids_
    print("8 clusters:", m8.cost_)
    print()

    # The minimum cost is obtained when the number of clusters is 8
    mfin = km7.fit_predict(per)
    fin = pd.DataFrame(mfin)
    # print(mfin_clust)
    # select the required columns
    df1 = train.iloc[:, 20:73]
    # add a new column which holds the final classification
    df1['clusters'] = mfin
    # To find the similarity between users, group the users who belong to
    # the same cluster
    df_fin = df1.groupby(['clusters'])
    fin_0 = df_fin.get_group(0)
    # print(np.std(fin_0['Horror']))
    fin_1 = df_fin.get_group(1)
    # print(np.std(fin_1['Horror']))
    fin_2 = df_fin.get_group(2)
    fin_3 = df_fin.get_group(3)
    fin_4 = df_fin.get_group(4)
    fin_5 = df_fin.get_group(5)
    fin_6 = df_fin.get_group(6)
    fin_7 = df_fin.get_group(7)

    # convert the centroids of a cluster into a list
    mfin_clust = list(mfin_clust)
    for row in range(test.shape[0]):
        row_hobby = list(test.iloc[row, 31:73])
        row_genre = list(test.iloc[row, 20:31])
        # Euclidean distance between the row and the centroid of each
        # cluster; the distances are stored in a dictionary keyed by
        # cluster number
        distance = {}
        for c in range(0, 8):
            distance[c] = math.sqrt(
                sum([(a - b) ** 2 for a, b in zip(mfin_clust[c], row_hobby)]))
        # the minimum distance is found using the values of the dictionary
        min_clust = min(distance, key=distance.get)
        # the user is classified into that cluster
        df_clust = df_fin.get_group(min_clust)
        # similarity between the row and df_clust:
        # drop the movie-genre columns, since the similarity between users
        # is calculated from the hobby preferences
        df_clust2 = df_clust.drop(['Horror', 'Romantic', 'Comedy', 'Thriller',
                                   'Sci-fi', 'War', 'Fantasy/Fairy tales',
                                   'Western', 'Animated', 'Documentary',
                                   'Action'], axis=1)
        # insert a new column called Index so that each user has a unique
        # identity
        ind = list(range(0, len(df_clust2)))
        # add the Index column to the dataframe
        df_clust2.insert(0, 'Index', ind)
        # dictionary to store the user-user similarity
        xz_dict = {}
        for j in range(0, len(df_clust2)):
            # the list contains the column headers and the preferences of
            # the j-th row
            xz = list(df_clust2.iloc[j, :].items())
            # print(xz)
            xz1 = []
            # append only the preferences to a new list
            for i in range(1, 43):
                xz1.append(xz[i][1])
            # print(xz1)
            simi = sim(xz1, row_hobby)
            # store the user similarity in a dictionary
            xz_dict[j] = simi
        # find the 5 users who are most similar to the new user
        top_5 = sorted(xz_dict, key=xz_dict.get, reverse=True)[:5]

        # dictionaries to store the rating for each genre based on user
        # similarities, and the actual rating
        fin_rec = {}
        actual = {}
        # for each genre
        for k in range(1, 12):
            actual[k] = row_genre[k - 1]
            user_rating = []
            sum_sim = 0
            rec = 0
            # append the ratings of the similar users into a list for this
            # particular genre
            for i in top_5:
                user_rating.append(df_clust.iloc[i, k:k + 1].item())
            # calculate the rating for the new user, weighting each similar
            # user's rating by that user's similarity
            for rating, user in zip(user_rating, top_5):
                rec = rec + (rating * xz_dict[user])
                sum_sim = sum_sim + xz_dict[user]
            # store the rating in the dictionary
            fin_rec[k] = rec / sum_sim

        # select the top 3 genres by predicted and by actual rating
        top_3 = sorted(fin_rec, key=fin_rec.get, reverse=True)[:3]
        top_3_actual = sorted(actual, key=actual.get, reverse=True)[:3]
        # thus recommend the genres to the user
        for l in top_3:
            if l in top_3_actual:
                count += 1

    print("Accuracy", count / (3 * test.shape[0]))
    return count / (3 * test.shape[0])
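# The similarity helper sim(xz1, row_hobby) used above is not defined in this
# excerpt. A plausible minimal sketch, assuming cosine similarity between two
# equal-length preference vectors (the original project may use a different
# measure, e.g. Pearson correlation):
import math


def sim(u, v):
    # Cosine similarity; returns 0.0 when either vector has zero norm.
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v) if norm_u and norm_v else 0.0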