def KModesRatio(self):
    '''
    Type: K-Modes
    Y-axis: No Reaction
    X-axis: Reaction
    '''
    if self.authenticated:
        from kmodes.kmodes import KModes as KMo
        algorithm = KMo(n_clusters=2)
        categories = algorithm.fit_predict(self.allCoord)
        print(algorithm.cluster_centroids_)
        plt.scatter(self.allCoord[categories == 0, 0],
                    self.allCoord[categories == 0, 1], c="green")
        plt.scatter(self.allCoord[categories == 1, 0],
                    self.allCoord[categories == 1, 1], c="red")
        plt.scatter(algorithm.cluster_centroids_[:, 0],
                    algorithm.cluster_centroids_[:, 1], c="black", marker="*")
        for i, txt in enumerate(self.labels):
            plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
        plt.ylabel("NO REACTION")
        plt.xlabel("REACTION")
        plt.annotate("NO INFLAMMATION", algorithm.cluster_centroids_[0])
        plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centroids_[1])
        plt.title("K-Modes: Reaction, No Reaction")
        plt.show()
def k_modes(questions):
    temp = []
    for col in df.columns:
        temp.append(col)
    for val in questions:
        foo = 'Q' + str(val) + '-0'
        print(str(val) + ' - ' + mapping[foo][0])

    headers = []
    for q in questions:
        head = 'Q' + str(q) + '-'
        for val in temp:
            if head in val:
                headers.append(val)

    km = KModes(n_clusters=2)
    clusters = km.fit_predict(df[headers])

    columns = []
    for centroid in km.cluster_centroids_:
        temp = []
        for i in range(0, len(centroid)):
            if centroid[i] == 1:
                temp.append(headers[i])
        columns.append(temp)

    for column in columns:
        l = [mapping[i][1] for i in column]
        print(column)
        print(l)
def makeClusters(data, year, numClusters):
    km = KModes(n_clusters=numClusters, init="Cao", n_init=1, verbose=1)
    # Keep the subset as a DataFrame (no .values) so its column names can be
    # reused for the centroid DataFrame below.
    subsetData = data[data["Year"] == year].drop(["Year", "Community Area", "Beat"], axis=1)
    fitClusters = km.fit_predict(subsetData)
    clustersCentroidsData = pd.DataFrame(km.cluster_centroids_)
    clustersCentroidsData.columns = subsetData.columns
    return fitClusters, clustersCentroidsData
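# Hedged usage sketch (added; not part of the original source). "demo" is a tiny
# synthetic stand-in for the crime dataset makeClusters() seems to expect; every
# column name other than "Year", "Community Area" and "Beat" is a placeholder.
def _demo_makeClusters():
    import pandas as pd
    demo = pd.DataFrame({
        "Year": [2015, 2015, 2015, 2016],
        "Community Area": [1, 2, 3, 4],
        "Beat": [111, 222, 333, 444],
        "Primary Type": ["THEFT", "BATTERY", "THEFT", "ASSAULT"],
        "Arrest": ["Y", "N", "N", "Y"],
    })
    labels, centroids = makeClusters(demo, 2015, numClusters=2)
    print(labels)
    print(centroids)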
def cluster():
    # create a DataFrame to hold the categorical data
    df = pd.DataFrame(data)
    # remove all features not appropriate for clustering
    df = df.drop(['CVD_ID', 'Date_Published', 'Date_Modified', 'Vendor', 'Product'], axis=1)

    km = KModes(n_clusters=NUMBER_OF_CLUSTERS, init=CLUSTERING_ALGORITHM, verbose=0)
    km.fit_predict(df)
    centroids = km.cluster_centroids_
    labels = km.labels_
    cost = km.cost_

    # add counts to this dataframe
    l = pd.DataFrame(centroids, columns=df.columns)

    # add assigned cluster to record
    df['Cluster'] = labels
    clusters = pd.DataFrame(df.groupby('Cluster')['Cluster'].count())
    clusters.rename(columns={'Cluster': 'Cluster_Count'}, inplace=True)
    cnt = []
    for i in range(0, len(clusters)):
        cnt.append(clusters.iloc[i][0])
    l['Count'] = cnt

    print("\nTotal Cost of Selected Clustering Hyperparameters: ", cost)
    print("Number of Clusters: ", NUMBER_OF_CLUSTERS)
    print("Algorithm: ", CLUSTERING_ALGORITHM)
    print("Cluster data is printed to Final_Clusters.csv.")
    print(l.sort_values('Count', ascending=False))
    l.sort_values('Count', ascending=False).to_csv("Data/Final_Clusters.csv", index=False)
def test_kmodes_predict_soybean(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def clustering(user_session_subset, chicago_clustering, chicago_clustering_labels):
    # THIS WILL EXTRACT THE CHICAGO_ZERO_AND_ONE RESTAURANTS THAT MATCH WITH THE
    # ONES FOUND FROM THE INITIAL SUBSET (required for clustering)
    # get count of user session subset
    user_session_subset_count = pd.crosstab(
        index=user_session_subset['Restaurant_ID'], columns="count")
    mask = np.zeros(len(chicago_clustering), dtype=bool)
    mask[user_session_subset_count['count'].index.values.astype(int)] = True
    chicago_clustering_labels = chicago_clustering_labels[mask]
    chicago_clustering = chicago_clustering[mask]

    # method - Huang, number of clusters - 3, verbose=1 means textual output (0 is no output)
    kmodes_huang = KModes(n_clusters=3, init='Huang', verbose=0, n_init=20)
    kmodes_huang.fit(chicago_clustering)

    # this joins the restaurant name
    cluster_results = np.column_stack(
        (chicago_clustering_labels, kmodes_huang.labels_))

    # convert numpy matrix to pandas dataframe
    cluster_result_df = pd.DataFrame(cluster_results)
    cluster_result_df.columns = ['Restaurant', 'Cluster']

    # JOIN THE CLUSTERING RESULTS WITH user_session_subset_count TO GET OUR FINAL RESULTS
    # remove existing indices so the new ones line up and the DataFrames can be joined
    cluster_result_df.reset_index(drop=True, inplace=True)
    user_session_subset_count.reset_index(drop=True, inplace=True)

    # join the cluster results with the restaurant counts
    clusters_with_counts = pd.concat(
        [cluster_result_df, user_session_subset_count], axis=1)
    return clusters_with_counts
def k_elbow_plot(fpath, max_k=10):
    """
    Plot the "elbow" chart showing how the clustering model's cost (SSE) relates to
    the chosen number of clusters. This helps select the most appropriate k: the
    last value of k that still produces a good drop in the error (the "tip of the
    elbow").

    :param fpath: path of the processed dataset (see the description of argv[1] at the top of the script)
    :param max_k: maximum value of k to use when producing the chart
    """
    if not path.isfile(fpath):
        print("Error: could not find specified CSV dataset.")
        return
    if max_k <= 0:
        print("Error: k must be a positive integer.")
        return

    data = refactor_data_frame(pd.read_csv(fpath))
    errors = []
    for k in range(1, max_k + 1):
        kmodes = KModes(n_clusters=k, random_state=42, n_init=1, init="random")
        kmodes.fit(data)
        errors.append(kmodes.cost_)
        print("DONE WITH K=" + str(k))

    plt.figure(figsize=(16, 8))
    plt.plot(range(1, max_k + 1), errors, 'bo-')
    plt.xlabel('#Clusters (K)')
    plt.ylabel('Error (0/1)')
    plt.title("K vs. error for dataset " + path.basename(fpath))
    plt.show()
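# Hedged usage sketch (added; not in the original source). The CSV path is an
# invented placeholder; k_elbow_plot() itself guards against a missing file, and it
# relies on a refactor_data_frame() helper defined elsewhere in the original script.
def _demo_k_elbow_plot():
    k_elbow_plot("data/processed_dataset.csv", max_k=8)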
def run_cluster(self):
    columns = self.board_game_data.columns.tolist()
    columns = [
        c for c in columns if c not in [
            'board_game_id', 'name', 'year', 'minplayer', 'maxplayer',
            'playingtime', 'avgratings', 'designer', 'category', 'mechanic',
            'publisher', 'age', 'rank'
        ]
    ]
    print(columns)
    cluster_df = self.board_game_data[columns]

    km = KModes(n_clusters=15, init='Huang', n_init=10, verbose=1)
    clusters = km.fit_predict(cluster_df)
    print(km.cluster_centroids_)

    centroids = km.cluster_centroids_
    for i in range(centroids.shape[0]):
        if sum(centroids[i, :]) == 0:
            print("\ncluster " + str(i) + ": ")
            print("no cluster")
        else:
            print("\ncluster " + str(i) + ": ")
            cent = centroids[i, :]
            for j in cluster_df.columns[np.nonzero(cent)]:
                print(j)
def test_kmodes_ninit(self):
    kmodes = KModes(n_init=10, init='Huang')
    self.assertEqual(kmodes.n_init, 10)

    kmodes = KModes(n_init=10)
    self.assertEqual(kmodes.n_init, 1)

    kmodes = KModes(n_init=10, init=np.array([1, 1]))
    self.assertEqual(kmodes.n_init, 1)
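# Hedged illustration (added; not part of the original tests): the behaviour the
# test above asserts is that KModes resets n_init to 1 at construction time whenever
# the initialization is deterministic (the default 'Cao', or explicit centroids),
# since re-running such an initialization would add nothing.
def _demo_n_init_reset():
    km = KModes(n_clusters=4, init='Cao', n_init=10)
    print(km.n_init)  # expected to print 1, per the test above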
def run_kmodes(n_clusters=4):
    km_huang = KModes(n_clusters=n_clusters, init="Huang", verbose=1, n_init=2, max_iter=10)

    csv_data = pd.read_csv("kmodes_input.csv")
    input_data = csv_data.iloc[:, 1:]
    roadmap_id = csv_data.iloc[:, 0]

    clusters = km_huang.fit_predict(input_data)

    cluster_df = pd.DataFrame(clusters)
    cluster_df.columns = ["cluster_predicted"]
    cluster_df["roadmap_id"] = roadmap_id

    # # Make the number of rows in cluster_data match the roadmap ids so rows can be
    # # accessed directly without searching.
    # # Create as many rows as the total number of roadmaps, plus row 0.
    # continuous_id_df = pd.DataFrame(list(range(roadmap_id[roadmap_id.index[-1]] + 1)))
    # continuous_id_df.columns = ["roadmap_id"]
    #
    # cluster_df = pd.merge(cluster_df, continuous_id_df, how="right", on="roadmap_id")

    print(cluster_df)

    # save as csv
    cluster_df.to_csv("clustering_result.csv", sep=",", na_rep="NaN", index=False)
def test_kmodes_predict_soybean_ng(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint8))
def test_kmodes_random_soybean(self):
    kmodes_random = KModes(n_clusters=4, init='random', verbose=2, random_state=42)
    result = kmodes_random.fit(SOYBEAN)
    self.assertIsInstance(result, KModes)
def fit(self, data, verbose=0):
    best_scores = dict(zip(self.metric_names, -np.ones(len(self.metrics))))
    best_clusters = []
    score = dict()
    clustering_options = self.clustering_options

    for n_clusters in range(self.min_clusters, self.max_clusters + 1, self.step):
        clustering_options["n_clusters"] = n_clusters
        km = KModes(**self.clustering_options)
        clusters = km.fit_predict(data)

        for name, metric in zip(self.metric_names, self.metrics):
            if name == "Incluster distances":
                score[name] = metric(np.array(data), clusters,
                                     metric=matching_dissim,
                                     centroids=km.cluster_centroids_)
            else:
                score[name] = metric(np.array(data), clusters, metric=matching_dissim)

        if score["Silhouette"] > best_scores["Silhouette"]:
            best_clusters = copy(clusters)
            best_scores = copy(score)
            self.centroids = copy(km.cluster_centroids_)
            self.km = deepcopy(km)

    self.best_scores = best_scores
    return best_clusters, best_scores
def kmode_calculation(self, data):
    """
    This function calculates the centroids using the k-modes algorithm.
    It takes in the cleaned data and returns:
        - Column element mapping dictionary
        - Centroids
        - The output data with classification
    """
    col_dict = {}
    for col in data.columns:
        data[col] = data[col].astype('category')
        col_dict.update({col: dict(enumerate(data[col].cat.categories))})

    # Get all the cols in the DataFrame
    cols = [col for col in data.columns]

    # Transform all values into categorical and numerical values
    for col in cols:
        data[col] = data[col].astype('category')
        data[col] = data[col].cat.codes

    # Run k-modes using the algorithm
    kmodes_method = KModes(n_clusters=self.n_cluster, init=self.init_method,
                           n_init=self.n_iter, verbose=1)
    kmode_result = kmodes_method.fit_predict(data[cols])

    # Attach the output label for each data point
    data['classification'] = pd.Series(kmode_result, index=data.index)
    return col_dict, kmodes_method.cluster_centroids_, data
def KModePercentTotal(self):
    '''
    Type: K-Modes
    Y-axis: % Reactions
    X-axis: # Observations
    '''
    if self.authenticated:
        from kmodes.kmodes import KModes as KMo
        algorithm = KMo(n_clusters=2)
        # partPercent = np.array([np.array([x, percent]) for j in self.stuff for _, x, _, percent in j])
        categories = algorithm.fit_predict(self.percentTotal)
        plt.scatter(self.percentTotal[categories == 0, 0],
                    self.percentTotal[categories == 0, 1], c="green")
        plt.scatter(self.percentTotal[categories == 1, 0],
                    self.percentTotal[categories == 1, 1], c="red")
        plt.scatter(algorithm.cluster_centroids_[:, 0],
                    algorithm.cluster_centroids_[:, 1], c="black", marker="*")
        for i, txt in enumerate(self.labels):
            plt.annotate(txt, (self.percentTotal[i][0], self.percentTotal[i][1]))
        plt.ylabel("PERCENT")
        plt.xlabel("TOTAL")
        plt.annotate("NO INFLAMMATION", algorithm.cluster_centroids_[0])
        plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centroids_[1])
        plt.title("K-Modes: # Observations, % Reactions")
        plt.show()
def fit_kModes(data, n_cluster=2, N_trials=10):
    kmo = KModes(n_clusters=n_cluster, n_init=N_trials, init='Huang', random_state=616)
    clusters = kmo.fit_predict(data)
    cluster_feature_weights = kmo.cluster_centroids_
    return clusters, cluster_feature_weights
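# Hedged usage sketch (added; not in the original source): a toy categorical
# DataFrame run through fit_kModes(); the column names and values are invented
# purely for illustration.
def _demo_fit_kModes():
    import pandas as pd
    toy = pd.DataFrame({
        "color": ["red", "red", "blue", "blue", "green", "red"],
        "size": ["S", "M", "S", "L", "L", "M"],
        "shape": ["round", "round", "square", "square", "round", "square"],
    })
    labels, centroids = fit_kModes(toy, n_cluster=2, N_trials=5)
    print(labels)
    print(centroids)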
def clusterBitVec(data, max_clusters=5):
    best_k = findKBitVec(data, max_clusters)
    if best_k == 0:
        return 0, []
    else:
        kmodes = KModes(best_k)
        labels = kmodes.fit_predict(data)
        return best_k, labels
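# Hypothetical stand-in (added; NOT the project's own implementation, which is not
# shown here): one possible findKBitVec() that picks the k in [2, max_clusters] with
# the largest relative drop in k-modes cost, and returns 0 when the data has too few
# distinct rows to cluster. Provided only so clusterBitVec() above can be exercised.
def findKBitVec(data, max_clusters):
    data = np.asarray(data)
    n_unique = len(np.unique(data, axis=0))
    if n_unique < 2:
        return 0
    upper = min(max_clusters, n_unique)
    costs = {}
    for k in range(1, upper + 1):
        km = KModes(n_clusters=k, n_init=2, verbose=0)
        km.fit(data)
        costs[k] = km.cost_
    # "elbow" heuristic: largest relative improvement over the previous k
    best_k = max(range(2, upper + 1),
                 key=lambda k: (costs[k - 1] - costs[k]) / max(costs[k - 1], 1))
    return best_k
# e.g. best_k, labels = clusterBitVec(np.random.randint(0, 2, (50, 8)), max_clusters=5)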
def test_kmodes_cao_soybean(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    result = kmodes_cao.fit_predict(SOYBEAN)
    expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint8))
def test_kmodes_fit_predict(self):
    """Test whether fit_predict interface works the same as fit and predict."""
    kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
    sample_weight = [0.5] * TEST_DATA.shape[0]
    data1 = kmodes.fit_predict(TEST_DATA, sample_weight=sample_weight)
    data2 = kmodes.fit(TEST_DATA, sample_weight=sample_weight).predict(TEST_DATA)
    assert_cluster_splits_equal(data1, data2)
def test_kmodes_predict_soybean_jaccard_dissim_label(self):
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                          cat_dissim=jaccard_dissim_label, random_state=42)
    kmodes_huang = kmodes_huang.fit(TEST_DATA)
    result = kmodes_huang.fit_predict(TEST_DATA_PREDICT)
    expected = np.array([1, 0, 1, 2])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def test_kmodes_cao_soybean_ng(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
    result = kmodes_cao.fit_predict(SOYBEAN)
    expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def test_kmodes_huang_soybean_parallel(self):
    kmodes_huang = KModes(n_clusters=4, n_init=4, init='Huang', verbose=2,
                          random_state=42, n_jobs=4)
    result = kmodes_huang.fit_predict(SOYBEAN)
    expected = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def test_kmodes_huang_soybean_ng(self):
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                          cat_dissim=ng_dissim, random_state=42)
    result = kmodes_huang.fit_predict(SOYBEAN)
    expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def clusterCreationKmode(request):
    # random categorical data
    data = np.random.choice(20, (100, 10))

    km = KModes(n_clusters=4, init='Huang', n_init=5, verbose=1)
    clusters = km.fit_predict(data)
    # A Django view must accept the request argument; the centroid array is
    # converted to a string so the response body is readable.
    return HttpResponse(str(km.cluster_centroids_))
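# Hedged wiring sketch (added; not from the original project): one possible URLconf
# entry exposing the view above; the route string is an invented placeholder.
from django.urls import path

urlpatterns = [
    path("kmodes-clusters/", clusterCreationKmode),
]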
def test_kmodes_huang_soybean(self):
    np.random.seed(42)
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2)
    result = kmodes_huang.fit_predict(SOYBEAN)
    expected = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                         2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint8))
def test_kmodes_nunique_nclusters(self):
    data = np.array([[0, 1],
                     [0, 1],
                     [0, 1],
                     [0, 2],
                     [0, 2],
                     [0, 2]])
    np.random.seed(42)
    kmodes_cao = KModes(n_clusters=6, init='Cao', verbose=2)
    result = kmodes_cao.fit_predict(data, categorical=[1])
    expected = np.array([0, 0, 0, 1, 1, 1])
    assert_cluster_splits_equal(result, expected)
    np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
                                  np.array([[0, 1], [0, 2]]))
def do_clustering(newDF, number_cluster):
    clusters = []
    randomm = randint(2, 10)
    rand_clusters = randint(number_cluster, 2 * number_cluster)
    km = KModes(n_clusters=4, init='random', n_init=randomm, verbose=0)
    km.fit_predict(newDF)
    clusters = list(km.labels_)
    print(len(clusters))
    return clusters
def test_pickle_fitted(self):
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                          random_state=42)
    model = kmodes_huang.fit(SOYBEAN)
    serialized = pickle.dumps(model)
    self.assertTrue(isinstance(pickle.loads(serialized), model.__class__))
def kmodes_samping(df):
    km = KModes(n_clusters=100, init='Huang', n_init=5, verbose=1, n_jobs=-1)
    # model = KPrototypes(n_clusters=100, init='Huang', n_init=5, verbose=1, n_jobs=1)
    data = df[[
        'PANDAID', 'JOBSTATUS', 'COMPUTINGSITE', 'FINAL_STATUS', 'IS_SCOUT',
        'DURATION'
    ]].values
    clusters = km.fit_predict(data)
    centers = [row[0] for row in km.cluster_centroids_]
    return df[df['PANDAID'].isin(centers)]
def do_clustering(newDF, number_cluster):
    clusters = []
    randomm = randint(20, 100)
    km = KModes(n_clusters=number_cluster, init='Huang', n_init=randomm, verbose=0)
    km.fit_predict(newDF)
    clusters = list(km.labels_)
    return clusters
def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                          cat_dissim=jaccard_dissim_binary, random_state=42)
    # binary encoded variables are required
    bin_variables = SOYBEAN.astype(bool).astype(int)
    result = kmodes_huang.fit_predict(bin_variables)
    expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         3, 1, 1, 3, 3, 1, 1, 1, 1, 3,
                         1, 1, 3, 1, 3, 3, 1, 3, 3, 3, 1, 1, 3, 1, 3, 1, 1])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def kmode(data, ncluster, n_init, verbose):
    # k-modes for categorical data; if no data is supplied, fall back to
    # random categorical data for demonstration purposes
    if data is None:
        data = np.random.choice(20, (100, 10))
    km = KModes(n_clusters=ncluster, init='Huang', n_init=n_init, verbose=verbose)
    clusters = km.fit_predict(data)
    return clusters
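# Hedged usage sketch (added; not in the original source): runs kmode() on a small
# synthetic categorical matrix; the shape and value range are arbitrary choices.
def _demo_kmode():
    demo = np.random.choice(5, (30, 4))
    labels = kmode(demo, ncluster=3, n_init=2, verbose=0)
    print(labels)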
def test_kmodes_empty_init_cluster_soybean(self):
    # Check if the clustering does not crash in case of an empty cluster.
    init_vals = np.array(
        [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2, 0,
          0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
         [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3, 0,
          0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0, 1,
          1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1, 0,
          1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
    kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
    result = kmodes_init.fit(SOYBEAN)
    self.assertIsInstance(result, KModes)
def test_kmodes_init_soybean(self):
    init_vals = np.array(
        [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2, 0,
          0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
         [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3, 0,
          0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0, 1,
          1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1, 0,
          1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
    kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
    result = kmodes_init.fit_predict(SOYBEAN)
    expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    assert_cluster_splits_equal(result, expected)

    # 5 initial centroids, 4 n_clusters
    init_vals = np.array(
        [[0, 1],
         [4, 0],
         [4, 0],
         [3, 0],
         [3, 0]])
    kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_init.fit(SOYBEAN)

    # wrong number of attributes
    init_vals = np.array([0, 1, 2, 3])
    kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_init.fit(SOYBEAN)
def test_kmodes_nunique_nclusters_ng(self):
    data = np.array([[0, 1],
                     [0, 1],
                     [0, 1],
                     [0, 2],
                     [0, 2],
                     [0, 2]])
    np.random.seed(42)
    kmodes_cao = KModes(n_clusters=6, init='Cao', verbose=2, cat_dissim=ng_dissim)
    result = kmodes_cao.fit_predict(data, categorical=[1])
    expected = np.array([0, 0, 0, 1, 1, 1])
    assert_cluster_splits_equal(result, expected)
    np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
                                  np.array([[0, 1], [0, 2]]))
def test_kmodes_predict_unfitted(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_cao.predict(SOYBEAN)
    with self.assertRaises(AttributeError):
        kmodes_cao.cluster_centroids_
def test_kmodes_random_soybean(self):
    kmodes_random = KModes(n_clusters=4, init='random', verbose=2)
    result = kmodes_random.fit(SOYBEAN)
    self.assertIsInstance(result, KModes)