示例#1
0
def test_predict():
    """Check that ``predict`` and ``fit_predict`` agree with the fitted model.

    ``X`` and ``n_clusters`` are not defined in this function; they are
    expected to be provided by the enclosing module.
    """
    model = KMeansConstrained(n_clusters=n_clusters, random_state=42)
    model.fit(X)

    # Sanity check: each fitted centroid is expected to map to its own index.
    centroid_labels = model.predict(model.cluster_centers_)
    assert_array_equal(centroid_labels, np.arange(n_clusters))

    # Sanity check: predicting the training samples reproduces the stored labels.
    train_labels = model.predict(X)
    assert_array_equal(train_labels, model.labels_)

    # Refitting through fit_predict must return the labels it stores.
    refit_labels = model.fit_predict(X)
    assert_array_equal(refit_labels, model.labels_)
示例#2
0
    def __fit_clusters(self, column: np.array) -> List[float]:
        """Cluster the values of one feature and return the cluster centers.

        Candidate cluster counts are tried from the largest downwards; the
        first clustering accepted by ``__correct_clustering`` is used.

        Arguments:
            column (np.array): All the values for a single feature.

        Returns:
            The cluster centers for this feature, as produced by
            ``__cluster_centers``. If no candidate clustering is accepted the
            loop ends and the method implicitly returns ``None``.
        """
        min_size = self.__min_cluster_size
        ordered = np.sort(column)
        occurrences = counter(ordered)

        # Upper bound on the cluster count: every distinct value contributes at
        # most ``min_size`` of its occurrences to the total.
        capped_total = sum(min(count, min_size) for count in occurrences.values())
        max_clusters = capped_total // min_size

        for num_clusters in range(max_clusters, 0, -1):
            model = KMeansConstrained(
                n_clusters=num_clusters,
                size_min=min_size,
                random_state=self.__random_generator,
            )
            # The estimator is fed a single-column 2-D view of the sorted values.
            assignment = model.fit_predict(ordered[:, np.newaxis])
            if self.__correct_clustering(ordered, assignment):
                return self.__cluster_centers(ordered, assignment)
def subgroup_by_cluster_constrained(object_states: List[ObjectState],
                                    n_member: int = 5) -> List[List[int]]:
    """Generate subgroups by constrained K-means clustering on object positions.

    Args:
        object_states (List[ObjectState]): objects to group; the ``x`` and ``y``
            attributes of each object are used as the clustering features.
        n_member (int, optional): max number of members per subgroup. Defaults to 5.

    Returns:
        List[List[int]]: one list per non-empty cluster, in ascending
        cluster-label order, holding the indices (into ``object_states``) of the
        objects assigned to that cluster. An empty input yields an empty list.
    """
    n_objects = len(object_states)
    if n_objects == 0:
        # With no objects the cluster count would be zero, which cannot be
        # fitted; there is simply nothing to group.
        return []

    n_cluster = math.ceil(n_objects / n_member)
    features = [[obj.x, obj.y] for obj in object_states]
    kmeans = KMeansConstrained(n_clusters=n_cluster,
                               size_max=min(n_member, n_objects),
                               random_state=42)
    labels = np.asarray(kmeans.fit_predict(features))

    # np.unique returns the labels sorted, so the order of the groups is
    # deterministic instead of depending on set iteration order.
    return [np.flatnonzero(labels == label).tolist()
            for label in np.unique(labels)]
示例#4
0
# Integer codes for the survey's textual answers. Values that are not listed in
# a table are left untouched by ``_recode``.
_TIMEZONE_CODES = {
    "GMT–8 (Pacific Time)": 0,
    "GMT–6 (CST)": 1,
    "GMT–5 (EST)": 2,
    "GMT–3 (South America)": 3,
    "GMT+0 (GMT)": 4,
    "GMT+1 (CET)": 5,
    "GMT+3 (Eastern Europe/Middle East)": 6,
    "GMT+5 (South Asia)": 7,
    "GMT+8 (East Asia)": 8,
    "GMT+10 (Australia)": 9,
    "GMT+12 (New Zealand)": 10,
}
_AVAILABILITY_CODES = {
    "00:00 - 6:00": 0,
    "6:00 - 12:00": 1,
    "12:00 - 18:00": 2,
    "18:00 - 24:00": 3,
}
# Skill values 4 and 5 are recoded to 2 and 1; every other value is kept.
_SKILL_CODES = {4: 2, 5: 1}


def _recode(series, codes):
    """Return ``series`` with values found in ``codes`` replaced; others kept."""
    return series.map(lambda value: codes.get(value, value))


def make_groups(df, total_students, students_per_group):
    """Assign students to size-constrained groups with ``KMeansConstrained``.

    Args:
        df: survey DataFrame containing the columns "Name", "Email",
            "Timezone", "Availability", "Skill", "Year" and "Interests".
            The caller's DataFrame is not modified.
        total_students: number of students to distribute
            (presumably ``len(df)`` -- confirm against the caller).
        students_per_group: maximum number of students per group.

    Returns:
        The fitted cluster labels as a list of ints.

    Raises:
        ValueError: if a "Timezone" value is neither a known option nor
            convertible to an int.
        ZeroDivisionError: if ``students_per_group`` is 0.
    """
    # Non-in-place drop, so the caller keeps its "Name" and "Email" columns.
    features = df.drop(["Name", "Email"], axis=1)
    features = pd.get_dummies(features, columns=['Year', 'Interests'],
                              drop_first=False)

    # Columns are addressed by name (not by position) and recoded through the
    # lookup tables in one pass each instead of cell-by-cell assignment.
    features["Timezone"] = _recode(
        features["Timezone"].astype(str), _TIMEZONE_CODES).astype(int)
    features["Availability"] = _recode(
        features["Availability"].astype(str), _AVAILABILITY_CODES)
    features["Skill"] = _recode(features["Skill"].astype(int), _SKILL_CODES)

    # When the students do not divide evenly, one extra group is added and the
    # remainder becomes the minimum allowed group size.
    n_groups, leftover = divmod(total_students, students_per_group)
    if leftover:
        n_groups += 1
        min_students = leftover
    else:
        min_students = students_per_group
    max_students = students_per_group

    groups = KMeansConstrained(n_clusters=n_groups,
                               size_min=min_students,
                               size_max=max_students)
    groups.fit_predict(features)

    return groups.labels_.astype(int).tolist()
示例#5
0
            scannedSidesWithLabels[sideLabels[i]] = scannedSides[i]

        # Map over scanned sides and get an array of all BGR values for each square
        allCubes = []
        for face in scannedSides:
            for square in face:
                allCubes.append([
                    square["avgColor"][0], square["avgColor"][1],
                    square["avgColor"][2]
                ])

        # https://joshlk.github.io/k-means-constrained/
        # Calculate Kmeans and cluster colors with min/max size of 9
        kmeans = KMeansConstrained(n_clusters=6, size_min=9, size_max=9)
        k = kmeans.fit
        labels = kmeans.fit_predict(allCubes)

        # Object to hold all colors
        cube = {
            "front": [],
            "left": [],
            "back": [],
            "right": [],
            "up": [],
            "down": []
        }

        # Loop over the cluster data and get cube map based on cluster
        for i in range(len(labels)):
            if (i >= 0 and i <= 8):
                cube["front"].append(int(labels[i]))
示例#6
0
import os 
import pandas as pd
import numpy as np
import constant 

from sklearn.preprocessing import StandardScaler
from k_means_constrained import KMeansConstrained

""" Clustering """
# Load the input table; every column except "Ticker" is used as a feature.
input_path = os.path.join(constant.DATA_DIR, "cluster_analysis.csv")
df = pd.read_csv(input_path)

# Standardise the features before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(df.drop(columns=["Ticker"]).values)

# Size-constrained k-means: 50 clusters with size bounds of 5 (min) and 10 (max).
clf = KMeansConstrained(n_clusters=50, size_min=5, size_max=10)
labels = clf.fit_predict(X)
# Cluster centers mapped back through the scaler, if ever needed:
# cluster_centers = scaler.inverse_transform(clf.cluster_centers_)

# Attach the cluster id to every row and write the result to the data directory.
output_path = os.path.join(constant.DATA_DIR, "cluster_result.csv")
df["clusterId"] = labels
df.to_csv(output_path, index=False)