def test_predict():
    """Check that ``predict`` agrees with the fitted centroids and labels."""
    model = KMeansConstrained(n_clusters=n_clusters, random_state=42)
    model.fit(X)

    # Each fitted centroid should be assigned to its own cluster index.
    centroid_labels = model.predict(model.cluster_centers_)
    assert_array_equal(centroid_labels, np.arange(n_clusters))

    # Predicting on the training samples should reproduce the fitted labels.
    train_labels = model.predict(X)
    assert_array_equal(train_labels, model.labels_)

    # fit_predict should return the same labels it stores on the estimator.
    refit_labels = model.fit_predict(X)
    assert_array_equal(refit_labels, model.labels_)
def __fit_clusters(self, column: np.array) -> List[float]:
    """
    Fit the clusters for a given feature.

    Cluster counts are tried from the largest candidate down to one, and
    the first clustering accepted by ``__correct_clustering`` is used.

    Arguments:
        column (np.array): All the values for a single feature.

    Returns:
        The cluster centers for this feature, as computed by
        ``__cluster_centers``. If no candidate cluster count is accepted,
        the loop ends without reaching a ``return`` statement.
    """
    min_size = self.__min_cluster_size
    ordered = np.sort(column)

    # Each distinct value contributes at most ``min_size`` samples towards
    # the upper bound on the number of clusters.
    occurrences = counter(ordered)
    usable_samples = sum(min(count, min_size) for count in occurrences.values())
    max_clusters = usable_samples // min_size

    # The estimator is given a 2-D array, so use a single-column view.
    samples = ordered[:, np.newaxis]

    for num_clusters in reversed(range(1, max_clusters + 1)):
        estimator = KMeansConstrained(
            n_clusters=num_clusters,
            size_min=min_size,
            random_state=self.__random_generator,
        )
        assignment = estimator.fit_predict(samples)
        if not self.__correct_clustering(ordered, assignment):
            continue
        return self.__cluster_centers(ordered, assignment)
def subgroup_by_cluster_constrained(object_states: List[ObjectState], n_member: int = 5) -> List[List[int]]:
    """Generate subgroups with constrained K-means clustering on object positions.

    Args:
        object_states (List[ObjectState]): objects to group; each one is read
            through its ``x`` and ``y`` attributes.
        n_member (int, optional): maximum number of members per subgroup.
            Defaults to 5.

    Returns:
        List[List[int]]: one list per subgroup, holding the indices (into
        ``object_states``) of the objects in that subgroup. Subgroups are
        listed in ascending cluster-label order. An empty input yields an
        empty list.
    """
    n_objects = len(object_states)
    if n_objects == 0:
        # Zero objects would request zero clusters, which cannot be fitted.
        return []

    n_cluster = math.ceil(n_objects / n_member)
    features = [[obj.x, obj.y] for obj in object_states]

    kmeans = KMeansConstrained(
        n_clusters=n_cluster,
        # A cluster can never hold more samples than exist.
        size_max=min(n_member, n_objects),
        random_state=42,
    )
    labels = kmeans.fit_predict(features)

    # np.unique returns the labels sorted, so the group order is deterministic.
    return [np.flatnonzero(labels == label).tolist() for label in np.unique(labels)]
# Integer codes for the timezone labels, listed in ascending GMT-offset order.
_TIMEZONE_CODES = {
    "GMT–8 (Pacific Time)": 0,
    "GMT–6 (CST)": 1,
    "GMT–5 (EST)": 2,
    "GMT–3 (South America)": 3,
    "GMT+0 (GMT)": 4,
    "GMT+1 (CET)": 5,
    "GMT+3 (Eastern Europe/Middle East)": 6,
    "GMT+5 (South Asia)": 7,
    "GMT+8 (East Asia)": 8,
    "GMT+10 (Australia)": 9,
    "GMT+12 (New Zealand)": 10,
}

# Integer codes for the four availability windows.
_AVAILABILITY_CODES = {
    "00:00 - 6:00": 0,
    "6:00 - 12:00": 1,
    "12:00 - 18:00": 2,
    "18:00 - 24:00": 3,
}

# Only skill levels 4 and 5 are remapped; every other value passes through.
# NOTE(review): the reason for folding 4 -> 2 and 5 -> 1 is not visible here.
_SKILL_CODES = {4: 2, 5: 1}


def _recode(series, codes):
    """Replace each value of *series* found in *codes*; leave the rest as is."""
    return series.map(lambda value: codes.get(value, value))


def _as_int_if_possible(series):
    """Cast *series* to int, or return it unchanged when it cannot be cast."""
    try:
        return series.astype(int)
    except (ValueError, TypeError):
        # Label strings such as "GMT+0 (GMT)" are not integers; they are
        # translated by the lookup table instead.
        return series


def _encode_student_features(frame):
    """Return a new frame with Year/Interests one-hot encoded and the
    Timezone, Availability and Skill columns replaced by integer codes."""
    encoded = pd.get_dummies(frame, columns=['Year', 'Interests'], drop_first=False)

    # Columns are addressed by name so the result does not depend on order.
    timezone = _as_int_if_possible(encoded["Timezone"]).astype(str)
    encoded["Timezone"] = _recode(timezone, _TIMEZONE_CODES)

    availability = encoded["Availability"].astype(str)
    encoded["Availability"] = _recode(availability, _AVAILABILITY_CODES)

    encoded["Skill"] = _recode(encoded["Skill"].astype(int), _SKILL_CODES)
    return encoded


def _group_size_bounds(total_students, students_per_group):
    """Return ``(n_groups, min_students, max_students)`` for the clustering."""
    n_groups, leftover = divmod(total_students, students_per_group)
    if leftover == 0:
        # Students divide evenly: every group has exactly the target size.
        return n_groups, students_per_group, students_per_group
    # One extra group absorbs the remainder, so the smallest allowed group
    # is the remainder itself.
    return n_groups + 1, leftover, students_per_group


def make_groups(df, total_students, students_per_group):
    """Assign students to size-constrained groups with constrained k-means.

    Args:
        df: Student table with "Name", "Email", "Timezone", "Availability",
            "Skill", "Year" and "Interests" columns. The caller's frame is
            not modified.
        total_students: Number of students used to derive the group count
            and size bounds. NOTE(review): presumably equal to ``len(df)``;
            confirm against the caller.
        students_per_group: Target group size, used as the upper size bound.

    Returns:
        list[int]: the cluster label assigned to each row of ``df``.
    """
    # drop() without inplace returns a copy, so the caller keeps its
    # "Name" and "Email" columns.
    features = _encode_student_features(df.drop(["Name", "Email"], axis=1))

    n_groups, min_students, max_students = _group_size_bounds(
        total_students, students_per_group
    )

    # NOTE(review): no random_state is set, so repeated runs may differ.
    groups = KMeansConstrained(
        n_clusters=n_groups, size_min=min_students, size_max=max_students
    )
    groups.fit_predict(features)
    return groups.labels_.astype(int).tolist()
scannedSidesWithLabels[sideLabels[i]] = scannedSides[i] # Map over scanned sides and get an array of all BGR values for each square allCubes = [] for face in scannedSides: for square in face: allCubes.append([ square["avgColor"][0], square["avgColor"][1], square["avgColor"][2] ]) # https://joshlk.github.io/k-means-constrained/ # Calculate Kmeans and cluster colors with min/max size of 9 kmeans = KMeansConstrained(n_clusters=6, size_min=9, size_max=9) k = kmeans.fit labels = kmeans.fit_predict(allCubes) # Object to hold all colors cube = { "front": [], "left": [], "back": [], "right": [], "up": [], "down": [] } # Loop over the cluster data and get cube map based on cluster for i in range(len(labels)): if (i >= 0 and i <= 8): cube["front"].append(int(labels[i]))
import os

import numpy as np
import pandas as pd
from k_means_constrained import KMeansConstrained
from sklearn.preprocessing import StandardScaler

import constant

# Clustering

# Load the analysis table from the data directory.
input_path = os.path.join(constant.DATA_DIR, "cluster_analysis.csv")
df = pd.read_csv(input_path)

# Every column except the ticker is a feature; scale them before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(df.drop(columns=["Ticker"]).values)

# Constrained k-means: 50 clusters, each holding between 5 and 10 rows.
clf = KMeansConstrained(n_clusters=50, size_min=5, size_max=10)
labels = clf.fit_predict(X)
# Centres in original units, if ever needed:
# cluster_centers = scaler.inverse_transform(clf.cluster_centers_)

# Attach the cluster id to each row and write the result next to the input.
df["clusterId"] = labels
output_path = os.path.join(constant.DATA_DIR, "cluster_result.csv")
df.to_csv(output_path, index=False)