def test_k_means_perfect_init():
    """Fit from the module-level ``centers`` array and validate the fitted model."""
    # Hand the estimator a copy rather than the module-level array itself.
    initial_centers = centers.copy()
    estimator = KMeansConstrained(
        init=initial_centers,
        n_clusters=n_clusters,
        random_state=42,
        n_init=1,
    )
    estimator.fit(X)
    _check_fitted_model(estimator)
Exemplo n.º 2
0
def test_k_means_n_init():
    """A non-positive ``n_init`` must raise a ValueError that mentions ``n_init``."""
    rng = np.random.RandomState(0)
    samples = rng.normal(size=(40, 2))

    # Regression tests on bad n_init values: n_init <= 0 previously threw a
    # non-informative TypeError (#3858).
    for bad_n_init in (0, -1):
        estimator = KMeansConstrained(n_init=bad_n_init)
        assert_raises_regex(ValueError, "n_init", estimator.fit, samples)
Exemplo n.º 3
0
def test_k_means_copyx():
    """With ``copy_x=False`` the fitted input must still be nearly equal to ``X``."""
    data = X.copy()
    model = KMeansConstrained(copy_x=False, n_clusters=n_clusters,
                              random_state=42)
    model.fit(data)
    _check_fitted_model(model)

    # The array handed to fit() must match the untouched original, i.e. any
    # centering applied during fitting has been undone again.
    assert_array_almost_equal(data, X)
Exemplo n.º 4
0
def test_transform():
    """transform() of the fitted centers is 0 on the diagonal and > 0 elsewhere."""
    model = KMeansConstrained(n_clusters=n_clusters)
    model.fit(X)
    transformed = model.transform(model.cluster_centers_)

    for row in range(n_clusters):
        # A center transformed against itself must give exactly zero ...
        assert_equal(transformed[row, row], 0)
        # ... and a strictly positive value against every other center.
        other_columns = (col for col in range(n_clusters) if col != row)
        for col in other_columns:
            assert_greater(transformed[row, col], 0)
Exemplo n.º 5
0
def test_k_means_fortran_aligned_data():
    """Fitting Fortran-ordered input must give the expected centers and labels."""
    expected_centers = np.array([[0, 0], [0, 1]])
    expected_labels = np.array([0, 1, 1])
    fortran_data = np.asfortranarray([[0, 0], [0, 1], [0, 1]])

    # Start from the expected centers so a correct fit has nothing to move.
    model = KMeansConstrained(
        n_init=1, init=expected_centers, random_state=42, n_clusters=2)
    model.fit(fortran_data)

    assert_array_equal(model.cluster_centers_, expected_centers)
    assert_array_equal(model.labels_, expected_labels)
Exemplo n.º 6
0
def test_k_means_init_centers():
    """The fitted centers must not share memory with the user-provided centers.

    Repeated for integer and float dtypes, so it also covers the case where the
    input data and the init centers have the same type.
    """
    small_data = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]])
    init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]])

    for dtype in (np.int32, np.int64, np.float32, np.float64):
        typed_data = dtype(small_data)
        typed_centers = dtype(init_centers)
        # The init centers are whole numbers, so the cast must not alter them.
        assert_array_equal(init_centers, typed_centers)

        model = KMeansConstrained(init=typed_centers, n_clusters=3, n_init=1)
        model.fit(typed_data)

        # NOTE(review): this compares against `init_centers`, not the cast
        # array that was actually passed as `init` -- confirm this is intended.
        shares_memory = np.may_share_memory(model.cluster_centers_,
                                            init_centers)
        assert_equal(False, shares_memory)
def test_sparse_validate_centers():
    """Init centers fitted for 4 clusters must be rejected when n_clusters is 3."""
    from sklearn.datasets import load_iris

    data = load_iris().data

    # Fit with four clusters to obtain a centers array with four rows.
    four_centers = KMeansConstrained(n_clusters=4).fit(data).cluster_centers_

    # An estimator that asks for three clusters must refuse that init array.
    estimator = KMeansConstrained(n_clusters=3, init=four_centers, n_init=1)
    assert_raises(ValueError, estimator.fit, data)
    def fit(self, X):
        """Cluster ``X`` with a ``KMeansConstrained`` model and keep the result.

        The model is configured from ``self.n_clusters``, ``self.size_min``,
        ``self.size_max`` and ``self.distance_func``. After fitting, the model
        is stored as ``self.clf`` and its ``cluster_centers_`` and ``labels_``
        are exposed on ``self`` under the same names.

        Args:
            X: Two-dimensional data exposing ``.shape`` as
                ``(n_samples, n_features)``.

        Returns:
            ``self``, so that calls can be chained (``obj.fit(X).labels_``).

        Raises:
            AssertionError: If ``size_max * n_clusters`` is smaller than the
                number of samples, i.e. the clusters cannot hold every sample.
        """
        n_samples, _ = X.shape
        assert self.size_max * self.n_clusters >= n_samples, (
            "size_max * n_clusters must be at least the number of samples")

        model = KMeansConstrained(self.n_clusters,
                                  size_min=self.size_min,
                                  size_max=self.size_max,
                                  distance_func=self.distance_func)
        model.fit(X)

        self.clf = model
        self.cluster_centers_ = model.cluster_centers_
        self.labels_ = model.labels_
        return self
Exemplo n.º 9
0
def test_float_precision():
    """float32 and float64 fits must keep the input dtype and agree to 4 decimals."""
    model = KMeansConstrained(n_init=1, random_state=30)

    inertia_by_dtype = {}
    transformed_by_dtype = {}
    centers_by_dtype = {}

    for dtype in (np.float64, np.float32):
        typed_data = X.astype(dtype)
        model.fit(typed_data)
        # The cluster centers must have the dtype of the input data.
        assert_equal(model.cluster_centers_.dtype, dtype)

        inertia_by_dtype[dtype] = model.inertia_
        transformed_by_dtype[dtype] = model.transform(typed_data)
        centers_by_dtype[dtype] = model.cluster_centers_

        # Slice instead of indexing so the single extracted row stays 2-D.
        first_row = typed_data[:1]
        assert_equal(model.predict(first_row), model.labels_[0])

        if not hasattr(model, 'partial_fit'):
            continue
        model.partial_fit(typed_data[0:3])
        # partial_fit must leave the dtype of the cluster centers unchanged.
        assert_equal(model.cluster_centers_.dtype, dtype)

    # Compare with low precision: the difference between 32 and 64 bit
    # sometimes shows up as far as the 4th decimal place.
    for results in (inertia_by_dtype, transformed_by_dtype, centers_by_dtype):
        assert_array_almost_equal(results[np.float32], results[np.float64],
                                  decimal=4)
def test_sparse_k_means_init_centers():
    """Re-fitting from already fitted centers must reproduce those centers."""
    from sklearn.datasets import load_iris

    data = load_iris().data

    # First fit: obtain the centers of a local optimum.
    first_fit = KMeansConstrained(n_clusters=3, size_min=50).fit(data)
    optimum_centers = first_fit.cluster_centers_

    # Second fit, started from that optimum: the solution should not change.
    second_fit = KMeansConstrained(n_clusters=3, size_min=50,
                                   init=optimum_centers, n_init=1).fit(data)
    np.testing.assert_allclose(optimum_centers, second_fit.cluster_centers_)
Exemplo n.º 11
0
def test_sparse_validate_centers():
    """Init centers fitted for 4 clusters must be rejected when n_clusters is 3.

    The ValueError raised by ``fit`` must report the mismatching shape of the
    initial centers.
    """
    from sklearn.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum with four clusters.
    centers = KMeansConstrained(n_clusters=4).fit(X).cluster_centers_

    # Asking for three clusters with a four-row init must fail validation.
    classifier = KMeansConstrained(n_clusters=3, init=centers, n_init=1)

    # Raw strings: `\(` and `\)` are regex escapes, not string escapes. `L?`
    # tolerates the trailing `L` that some platforms print for long integers.
    msg = (r"The shape of the initial centers \(\(4L?, 4L?\)\) "
           r"does not match the number of clusters 3")
    assert_raises_regex(ValueError, msg, classifier.fit, X)
Exemplo n.º 12
0
def test_k_means_plus_plus_init_2_jobs():
    """Fit with k-means++ init and ``n_jobs=2``; skipped on Python < 3.4."""
    running_old_python = sys.version_info[:2] < (3, 4)
    if running_old_python:
        raise SkipTest(
            "Possible multi-process bug with some BLAS under Python < 3.4")

    estimator = KMeansConstrained(
        init="k-means++", n_clusters=n_clusters, n_jobs=2, random_state=42)
    fitted = estimator.fit(X)
    _check_fitted_model(fitted)
Exemplo n.º 13
0
def test_k_means_non_collapsed():
    """A bad explicit init must still yield three used, well-separated clusters.

    Starting from bad centers that are quickly ignored should not reposition
    the centers onto the center of mass: that would collapse them and make the
    clustering depend on numerical instabilities.
    """
    points = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    bad_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    model = KMeansConstrained(init=bad_init, n_clusters=3, random_state=42,
                              n_init=1)
    model.fit(points)

    # All three labels must be in use, i.e. no cluster was left empty.
    n_used_labels = len(np.unique(model.labels_))
    assert_equal(n_used_labels, 3)

    # Every pair of centers must be at least 0.1 apart.
    fitted_centers = model.cluster_centers_
    for first in range(3):
        for second in range(first + 1, 3):
            gap = np.linalg.norm(fitted_centers[first] - fitted_centers[second])
            assert (gap >= 0.1).all()
Exemplo n.º 14
0
    def __fit_clusters(self, column: np.array) -> List[float]:
        """Cluster one feature, trying the largest cluster count first.

        The values are sorted, an upper bound on the number of clusters is
        derived from the per-value counts (each count capped at the minimum
        cluster size), and cluster counts are then tried from that bound down
        to 1 until ``__correct_clustering`` accepts a result.

        Arguments:
            column (np.array): All the values for a single feature.

        Returns:
            The cluster centers for this feature, as computed by
            ``__cluster_centers``. ``None`` when no attempted cluster count is
            accepted (including when the upper bound is 0).
        """
        sorted_values = np.sort(column)
        value_counts = counter(sorted_values)
        capped_total = sum(min(occurrences, self.__min_cluster_size)
                           for occurrences in value_counts.values())
        max_clusters = capped_total // self.__min_cluster_size

        for candidate in reversed(range(1, max_clusters + 1)):
            model = KMeansConstrained(n_clusters=candidate,
                                      size_min=self.__min_cluster_size,
                                      random_state=self.__random_generator)
            # The estimator expects 2-D input, so add a trailing axis.
            assignments = model.fit_predict(sorted_values[:, np.newaxis])
            if not self.__correct_clustering(sorted_values, assignments):
                continue
            return self.__cluster_centers(sorted_values, assignments)
        return None
    def fit(self, X):
        """Cluster ``X`` into ``self.n_clusters`` groups of (almost) equal size.

        The size bounds are ``floor(n_samples / n_clusters)`` and
        ``ceil(n_samples / n_clusters)``. When the sample count is not a
        multiple of ``n_clusters`` the two bounds differ and a warning reports
        both. After fitting, the ``KMeansConstrained`` model is stored as
        ``self.clf`` and its ``cluster_centers_`` and ``labels_`` are exposed
        on ``self`` under the same names.

        Args:
            X: Two-dimensional data exposing ``.shape`` as
                ``(n_samples, n_features)``.

        Returns:
            ``self``, so that calls can be chained (``obj.fit(X).labels_``).
        """
        n_samples, _ = X.shape
        minsize = n_samples // self.n_clusters
        # Ceiling division without going through floats.
        maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters

        model = KMeansConstrained(self.n_clusters,
                                  size_min=minsize,
                                  size_max=maxsize,
                                  distance_func=self.distance_func)

        if minsize != maxsize:
            warnings.warn(
                "Cluster minimum and maximum size are {} and {}, respectively".
                format(minsize, maxsize))

        model.fit(X)

        self.clf = model
        self.cluster_centers_ = model.cluster_centers_
        self.labels_ = model.labels_
        return self
def test_k_means_new_centers():
    """Exercise the code path where a new center is reassigned.

    The model is fitted twice from deliberately bad centers and must produce
    the expected partition both times.
    """
    data = np.array([[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 0],
                     [0, 0, 0, 0], [0, 1, 0, 0]])
    expected_labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0], [.2, 0, .2, .2], [+0, 0, 0, 0]])

    model = KMeansConstrained(n_clusters=3, init=bad_centers, n_init=1,
                              max_iter=10, random_state=1)

    for _ in range(2):
        model.fit(data)
        raw_labels = model.labels_
        # Replace every label by the index of its first occurrence, so the
        # comparison does not depend on how the clusters happen to be numbered.
        _, first_seen_at = np.unique(raw_labels, return_index=True)
        canonical_labels = first_seen_at[raw_labels]
        np.testing.assert_array_equal(canonical_labels, expected_labels)
Exemplo n.º 17
0
def test_score():
    """A fit allowed 10 iterations must score higher than one allowed only 1."""
    shared_args = dict(n_clusters=n_clusters, random_state=42, n_init=1)

    one_iteration = KMeansConstrained(max_iter=1, **shared_args)
    score_after_one = one_iteration.fit(X).score(X)

    ten_iterations = KMeansConstrained(max_iter=10, **shared_args)
    score_after_ten = ten_iterations.fit(X).score(X)

    assert_greater(score_after_ten, score_after_one)
def subgroup_by_cluster_constrained(object_states: List[ObjectState],
                                    n_member: int = 5) -> List[List[int]]:
    """Generate subgroups by constrained K-means clustering on object positions.

    The ``x`` and ``y`` attributes of every object are clustered into
    ``ceil(len(object_states) / n_member)`` clusters, none of which may hold
    more than ``n_member`` objects.

    Args:
        object_states (List[ObjectState]): array of object states
        n_member (int, optional): max number of members per subgroup.
            Defaults to 5.

    Returns:
        List[List[int]]: one row per subgroup, ordered by cluster label; each
        row holds the indices of the objects that belong to that subgroup.
        Empty when ``object_states`` is empty.
    """
    n_objects = len(object_states)
    if n_objects == 0:
        # ceil(0 / n_member) would ask for zero clusters; nothing to group.
        return []

    n_cluster = math.ceil(n_objects / n_member)
    features = [[obj.x, obj.y] for obj in object_states]
    kmeans = KMeansConstrained(n_clusters=n_cluster,
                               size_max=min(n_member, n_objects),
                               random_state=42)
    labels = kmeans.fit_predict(features)

    # Sort the labels so the order of the groups does not depend on set
    # iteration order.
    return [np.flatnonzero(labels == label).tolist()
            for label in sorted(set(labels))]
Exemplo n.º 19
0
def test_k_means_explicit_init_shape():
    """An explicit init with a wrong shape must raise a descriptive ValueError.

    Covers a wrong number of features and a wrong number of clusters, each
    with the init given first as an array and then as a callable.
    """
    rng = np.random.RandomState(0)
    data = rng.normal(size=(40, 3))

    # Wrong number of features: the init keeps only 2 of the 3 columns.
    feature_msg = "does not match the number of features of the data"
    wrong_feature_inits = (
        data[:, :2],
        lambda X_, k, random_state: X_[:, :2],
    )
    for init in wrong_feature_inits:
        estimator = KMeansConstrained(n_init=1, init=init,
                                      n_clusters=len(data))
        assert_raises_regex(ValueError, feature_msg, estimator.fit, data)

    # Wrong number of clusters: the init has 2 rows but 3 clusters are asked.
    cluster_msg = "does not match the number of clusters"
    wrong_cluster_inits = (
        data[:2, :],
        lambda X_, k, random_state: X_[:2, :],
    )
    for init in wrong_cluster_inits:
        estimator = KMeansConstrained(n_init=1, init=init, n_clusters=3)
        assert_raises_regex(ValueError, cluster_msg, estimator.fit, data)
Exemplo n.º 20
0
def test_n_init():
    """Mean inertia must not increase as ``n_init`` grows."""
    n_runs = 5
    n_init_values = [1, 5, 10]

    # One row per n_init value, one column per random seed.
    inertia = np.zeros((len(n_init_values), n_runs))
    for row, n_init in enumerate(n_init_values):
        for seed in range(n_runs):
            model = KMeansConstrained(n_clusters=n_clusters, init="random",
                                      n_init=n_init, random_state=seed)
            inertia[row, seed] = model.fit(X).inertia_

    mean_inertia = inertia.mean(axis=1)
    failure_msg = ("Inertia %r should be decreasing"
                   " when n_init is increasing.") % list(mean_inertia)
    # Compare each mean with the one obtained for the next larger n_init.
    for current, following in zip(mean_inertia[:-1], mean_inertia[1:]):
        assert (current >= following).all(), failure_msg
Exemplo n.º 21
0
def test_predict():
    """predict() must agree with the fitted centers and with ``labels_``."""
    model = KMeansConstrained(n_clusters=n_clusters, random_state=42)
    model.fit(X)

    # Sanity check: center i must be predicted as label i.
    center_labels = model.predict(model.cluster_centers_)
    assert_array_equal(center_labels, np.arange(n_clusters))

    # Sanity check: predicting the training samples reproduces labels_.
    training_labels = model.predict(X)
    assert_array_equal(training_labels, model.labels_)

    # fit_predict must return the labels_ of the fit it has just performed.
    refit_labels = model.fit_predict(X)
    assert_array_equal(refit_labels, model.labels_)
Exemplo n.º 22
0
def test_k_means_plus_plus_init():
    """Fit with ``init="k-means++"`` and validate the fitted model."""
    estimator = KMeansConstrained(
        init="k-means++", n_clusters=n_clusters, random_state=42)
    fitted = estimator.fit(X)
    _check_fitted_model(fitted)
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 17 19:13:48 2020

@author: lcota
"""

from k_means_constrained import KMeansConstrained

# Fit a two-cluster model with a minimum cluster size of 2 and a maximum of 5.
# NOTE(review): `X` is not defined anywhere in this snippet -- it has to be
# provided before these lines run.
clf = KMeansConstrained(n_clusters=2, size_min=2, size_max=5, random_state=0)
clf.fit(X)

# Bare expressions: their values are only shown in an interactive session and
# are discarded when this runs as a script.
clf.cluster_centers_
clf.predict([[0, 0], [4, 4]])
Exemplo n.º 24
0
!pip install k-means-constrained 
from k_means_constrained import KMeansConstrained


# Load the data set straight from the project's GitHub repository.
df=pd.read_csv("https://raw.githubusercontent.com/JavierLilly/Proyecto_Eco/main/BDC_DATA.csv")

# Standardize the coordinates
data= df[['lat','lon']].values.astype('float32',copy=False)
scaler = StandardScaler().fit(data)
data_scal = scaler.transform(data)
# NOTE: plain assignment, not a copy -- `df_` and `df` are the same object, so
# the next line overwrites 'lat'/'lon' in `df` as well.
df_ = df
df_[['lat','lon']]=data_scal

# Build the clustering model with minimum / maximum cluster sizes
coor = df_[['lat','lon']]
model = KMeansConstrained(n_clusters=6,size_min=600,size_max=700,random_state=5565280).fit(coor)
y = model.predict(coor) # Prediction
df_['cluster'] = y
 
# Plot: all points with Frecuencia >= 1, one colour per cluster label
cdict={0:'red',1:'black',2:'yellow',3:'green',4:'blue',5:'grey'}
plt.figure(figsize=(10,10))
sns.set()
for g in np.unique(y):
  # 'lat' goes on the horizontal axis and 'lon' on the vertical axis.
  plt.scatter(coor['lat'][y==g], coor['lon'][y==g], c = cdict[g], label = g, s = 60)
# plt.scatter(df['lat'][df['Frecuencia']==2],df['lon'][df['Frecuencia']==2],c='purple',s=80,alpha = .5)
# plt.scatter(df['lat'][df['Frecuencia']==3],df['lon'][df['Frecuencia']==3],c='brown',s=150,)

plt.legend()

# Reduce the data
Exemplo n.º 25
0
    if len(scannedSides) > 5:
        for i in range(len(scannedSides)):
            scannedSidesWithLabels[sideLabels[i]] = scannedSides[i]

        # Map over scanned sides and get an array of all BGR values for each square
        allCubes = []
        for face in scannedSides:
            for square in face:
                allCubes.append([
                    square["avgColor"][0], square["avgColor"][1],
                    square["avgColor"][2]
                ])

        # https://joshlk.github.io/k-means-constrained/
        # Calculate Kmeans and cluster colors with min/max size of 9
        kmeans = KMeansConstrained(n_clusters=6, size_min=9, size_max=9)
        k = kmeans.fit
        labels = kmeans.fit_predict(allCubes)

        # Object to hold all colors
        cube = {
            "front": [],
            "left": [],
            "back": [],
            "right": [],
            "up": [],
            "down": []
        }

        # Loop over the cluster data and get cube map based on cluster
        for i in range(len(labels)):
Exemplo n.º 26
0
def test_max_iter_error():
    """A negative ``max_iter`` must make fit() raise a descriptive ValueError."""
    estimator = KMeansConstrained(max_iter=-1)
    expected_fragment = 'Number of iterations should be'
    assert_raise_message(ValueError, expected_fragment, estimator.fit, X)
Exemplo n.º 27
0
def test_fit_transform():
    """fit(X).transform(X) and fit_transform(X) must give identical output."""
    two_step_model = KMeansConstrained(n_clusters=3, random_state=51)
    two_step_result = two_step_model.fit(X).transform(X)

    # Same parameters and seed, but fitting and transforming in a single call.
    one_step_model = KMeansConstrained(n_clusters=3, random_state=51)
    one_step_result = one_step_model.fit_transform(X)

    assert_array_equal(two_step_result, one_step_result)
Exemplo n.º 28
0
# Integer codes for the survey's "Timezone" answers.
_TIMEZONE_CODES = {
    "GMT–8 (Pacific Time)": 0,
    "GMT–6 (CST)": 1,
    "GMT–5 (EST)": 2,
    "GMT–3 (South America)": 3,
    "GMT+0 (GMT)": 4,
    "GMT+1 (CET)": 5,
    "GMT+3 (Eastern Europe/Middle East)": 6,
    "GMT+5 (South Asia)": 7,
    "GMT+8 (East Asia)": 8,
    "GMT+10 (Australia)": 9,
    "GMT+12 (New Zealand)": 10,
}

# Integer codes for the survey's "Availability" answers.
_AVAILABILITY_CODES = {
    "00:00 - 6:00": 0,
    "6:00 - 12:00": 1,
    "12:00 - 18:00": 2,
    "18:00 - 24:00": 3,
}

# Skill levels 4 and 5 are recoded to 2 and 1; other levels are kept as-is.
_SKILL_CODES = {4: 2, 5: 1}


def _recode(series, codes):
    """Replace values that are keys of ``codes``; leave all others unchanged."""
    return series.map(lambda value: codes.get(value, value))


def make_groups(df, total_students, students_per_group):
    """Assign students to size-constrained groups with constrained k-means.

    The "Name" and "Email" columns are dropped, "Year" and "Interests" are
    one-hot encoded, and "Timezone", "Availability" and "Skill" are recoded to
    integers before clustering. The caller's DataFrame is not modified.

    When ``total_students`` is a multiple of ``students_per_group`` every
    group must hold exactly ``students_per_group`` students. Otherwise one
    extra group is requested and group sizes may range from the remainder up
    to ``students_per_group``.

    Args:
        df: pandas DataFrame with the columns "Name", "Email", "Timezone",
            "Availability", "Skill", "Year" and "Interests".
        total_students: Number of students to distribute.
            NOTE(review): presumably equal to ``len(df)`` -- confirm.
        students_per_group: Maximum number of students per group.

    Returns:
        list[int]: The group label of every row of ``df``, in row order.

    Raises:
        ValueError: If a "Timezone" answer is not one of the known options
            and cannot be converted to an integer.
    """
    # drop() without inplace=True returns a new frame, so the caller's
    # DataFrame keeps its "Name" and "Email" columns.
    features = df.drop(columns=["Name", "Email"])
    features = pd.get_dummies(features, columns=['Year', 'Interests'],
                              drop_first=False)

    features["Timezone"] = _recode(features["Timezone"].astype(str),
                                   _TIMEZONE_CODES).astype(int)
    features["Availability"] = _recode(features["Availability"].astype(str),
                                       _AVAILABILITY_CODES)
    features["Skill"] = _recode(features["Skill"].astype(int), _SKILL_CODES)

    n_groups, leftover = divmod(total_students, students_per_group)
    max_students = students_per_group
    if leftover == 0:
        min_students = students_per_group
    else:
        # The leftover students form one extra, smaller group.
        n_groups += 1
        min_students = leftover

    groups = KMeansConstrained(n_clusters=n_groups,
                               size_min=min_students,
                               size_max=max_students)
    groups.fit_predict(features)

    return groups.labels_.astype(int).tolist()
Exemplo n.º 29
0
def test_k_means_random_init():
    """Fit with ``init="random"`` and validate the fitted model."""
    estimator = KMeansConstrained(
        init="random",
        n_clusters=n_clusters,
        random_state=42,
    )
    estimator.fit(X)
    _check_fitted_model(estimator)
Exemplo n.º 30
0
def test_k_means_invalid_init():
    """An unknown ``init`` string must make fit() raise a ValueError."""
    estimator = KMeansConstrained(
        init="invalid",
        n_init=1,
        n_clusters=n_clusters,
    )
    assert_raises(ValueError, estimator.fit, X)