import pandas as pd
import pulp
from k_means_constrained import KMeansConstrained
from scipy.spatial.distance import cdist
from sklearn.preprocessing import MinMaxScaler

# stops_gap, items_gap and path_to_cplex are defined elsewhere in the
# source module (see the sketch after this example for the gap helpers).


def main(input_path="ubicaciones.csv",
         balance_deviations=[0.1, 0.15, 0.2, 0.3]):
    """
    Solve the zone-assignment MILP once per balance deviation and write each
    solution to disk.
    Args:
        input_path : CSV path with information on agencies, frequency, volume and coordinates
        balance_deviations : list of deviation percentages; for the calculation refer to stops_gap and items_gap
    Returns:
        None
    """
    df = pd.read_csv(input_path)
    # Give zero-volume agencies a nominal volume of 1 so the demand
    # constraints stay feasible.
    df.loc[df["Vol_Entrega"] == 0, "Vol_Entrega"] = 1

    zones = ["D1", "D2", "D3", "D4", "D5", "D6"]
    agencies = list("A" + df["Id_Cliente"].astype(str))
    vol_delivery = list(df["Vol_Entrega"])
    vol_stores = list(df["Vol_Entrega"] * df["Frecuencia"])
    frequency = list(df["Frecuencia"])
    stores_volume = dict(zip(agencies, vol_stores))
    stores_frequency = dict(zip(agencies, frequency))
    vol_delivery = dict(zip(agencies, vol_delivery))

    # Scale coordinates to [0, 1] so both dimensions weigh equally.
    scaler = MinMaxScaler()
    fitted_scaler = scaler.fit(df[["lat", "lon"]])
    scaled_coordinates = fitted_scaler.transform(df[["lat", "lon"]])

    # Partition the 3,625 agencies into 6 clusters of 604-605 points each.
    kmeans = KMeansConstrained(n_clusters=6,
                               size_min=604,
                               size_max=605,
                               random_state=12,
                               n_init=100,
                               max_iter=200,
                               n_jobs=-1)
    kmeans.fit(scaled_coordinates)
    df["kmeans"] = list(kmeans.predict(scaled_coordinates))

    # Manhattan distances from the (unscaled) cluster centers to every agency.
    vectorized_lat_lon = df[["lat", "lon"]].to_numpy()
    cluster_centers = fitted_scaler.inverse_transform(kmeans.cluster_centers_)
    distance_matrix = cdist(cluster_centers,
                            vectorized_lat_lon,
                            metric="cityblock")

    # Decision variables: flow (continuous volume sent from zone to agency)
    # and using (binary: does the zone serve the agency at all?).
    routes = [(z, a) for z in zones for a in agencies]
    distances = pulp.makeDict([zones, agencies], distance_matrix, 0)
    flow = pulp.LpVariable.dicts("Distribution", (zones, agencies), 0, None)
    using = pulp.LpVariable.dicts("BelongstoZone", (zones, agencies), 0, 1,
                                  pulp.LpInteger)

    for percentage in balance_deviations:
        prob = pulp.LpProblem("BrewingDataCup2020_" + str(percentage),
                              pulp.LpMinimize)
        # Objective: distance-weighted delivered volume plus
        # distance-weighted stops.
        prob += pulp.lpSum([
            distances[z][a] * flow[z][a] for (z, a) in routes
        ]) + pulp.lpSum([distances[z][a] * using[z][a]
                         for (z, a) in routes]), "totalCosts"
        stops_upper, stops_lower = stops_gap(percentage)
        distr_upper, distr_lower = items_gap(percentage)
        # Per-zone balance: bound the total stops and total volume per zone.
        for z in zones:
            prob += pulp.lpSum([using[z][a] for a in agencies
                                ]) <= stops_upper, "SumStopsInZoneUpper %s" % z
            prob += pulp.lpSum([using[z][a] for a in agencies
                                ]) >= stops_lower, "SumStopsInZoneLower %s" % z
            prob += pulp.lpSum([flow[z][a] for a in agencies
                                ]) <= distr_upper, "SumDistrInZoneUpper %s" % z
            prob += pulp.lpSum([flow[z][a] for a in agencies
                                ]) >= distr_lower, "SumDistrInZoneLower %s" % z
        for z in zones:
            for a in agencies:
                # Big-M link: volume can flow only where the zone serves
                # the agency.
                prob += flow[z][a] - (100000 * using[z][a]) <= 0
                # A single visit cannot exceed the agency's delivery volume.
                prob += flow[z][a] <= vol_delivery[a]
        for a in agencies:
            # Each agency receives its full volume and exactly as many
            # visits as its frequency requires.
            prob += pulp.lpSum([flow[z][a] for z in zones
                                ]) >= stores_volume[a], "Distribution %s" % a
            prob += pulp.lpSum([
                using[z][a] for z in zones
            ]) == stores_frequency[a], "FrequencyDistribution %s" % a

        prob.writeLP("lp_files/milp_brewing_" + str(percentage) + ".lp")
        solver = pulp.CPLEX_CMD(path=path_to_cplex)
        prob.solve(solver)
        print("Estado: ", pulp.LpStatus[prob.status])
        print("Total Cost: ", pulp.value(prob.objective))

        final_df = pd.DataFrame(columns=zones, index=range(1, 3626))
        for v in prob.variables():
            # Assignment variables are named like "BelongstoZone_D1_A123".
            if v.name.startswith("BelongstoZone_") and v.varValue > 0:
                dist = v.name[len("BelongstoZone_"):]
                zone = dist[:2]
                id_cliente = int(dist[4:])  # skip the "_A" after the zone code
                final_df.loc[id_cliente, zone] = 1

        final_df.fillna(0, inplace=True)
        final_df = final_df.astype(int).reset_index().rename(
            columns={"index": "Id_Cliente"})
        final_df.to_csv("lp_solutions/cplex_opt_" + str(percentage) + "_" +
                        str(pulp.value(prob.objective)) + ".csv",
                        header=True,
                        index=False)
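
The helper functions stops_gap and items_gap that the docstring refers to are not included in this example. Below is a minimal sketch of what they might compute, assuming each bound is simply the per-zone average padded by the deviation percentage; the totals are placeholders, not values from the original source.

# Hypothetical sketch of the gap helpers referenced above; the real
# definitions are not part of this example. In the model, a zone's stop
# count is the sum of its `using` variables and its volume is the sum of
# its `flow` variables, so natural bounds are the zone average +/- deviation.
TOTAL_STOPS = 12000     # placeholder: sum of all agency frequencies
TOTAL_VOLUME = 500000   # placeholder: total delivered volume
N_ZONES = 6


def stops_gap(percentage, total=TOTAL_STOPS, n_zones=N_ZONES):
    """Return (upper, lower) bounds on stops per zone."""
    average = total / n_zones
    return average * (1 + percentage), average * (1 - percentage)


def items_gap(percentage, total=TOTAL_VOLUME, n_zones=N_ZONES):
    """Return (upper, lower) bounds on distributed volume per zone."""
    average = total / n_zones
    return average * (1 + percentage), average * (1 - percentage)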
Example #2
def test_float_precision():
    km = KMeansConstrained(n_init=1, random_state=30)

    inertia = {}
    X_new = {}
    centers = {}

    for dtype in [np.float64, np.float32]:
        X_test = X.astype(dtype)
        km.fit(X_test)
        # dtype of cluster centers has to be the dtype of the input
        # data
        assert_equal(km.cluster_centers_.dtype, dtype)
        inertia[dtype] = km.inertia_
        X_new[dtype] = km.transform(X_test)
        centers[dtype] = km.cluster_centers_
        # ensure the extracted row is a 2d array
        assert_equal(km.predict(X_test[:1]),
                     km.labels_[0])
        if hasattr(km, 'partial_fit'):
            km.partial_fit(X_test[0:3])
            # dtype of cluster centers has to stay the same after
            # partial_fit
            assert_equal(km.cluster_centers_.dtype, dtype)

    # compare arrays with low precision since the difference between
    # 32 and 64 bit sometimes makes a difference up to the 4th decimal
    # place
    assert_array_almost_equal(inertia[np.float32], inertia[np.float64],
                              decimal=4)
    assert_array_almost_equal(X_new[np.float32], X_new[np.float64],
                              decimal=4)
    assert_array_almost_equal(centers[np.float32], centers[np.float64],
                              decimal=4)
Example #3
def test_score():
    km1 = KMeansConstrained(n_clusters=n_clusters, max_iter=1,
                            random_state=42, n_init=1)
    s1 = km1.fit(X).score(X)
    km2 = KMeansConstrained(n_clusters=n_clusters, max_iter=10,
                            random_state=42, n_init=1)
    s2 = km2.fit(X).score(X)
    assert_greater(s2, s1)
Example #4
def test_k_means_perfect_init():
    km = KMeansConstrained(init=centers.copy(),
                           n_clusters=n_clusters,
                           random_state=42,
                           n_init=1)
    km.fit(X)
    _check_fitted_model(km)
Example #5
def test_k_means_copyx():
    # Check if copy_x=False returns nearly equal X after de-centering.
    my_X = X.copy()
    km = KMeansConstrained(copy_x=False, n_clusters=n_clusters, random_state=42)
    km.fit(my_X)
    _check_fitted_model(km)

    # check if my_X is centered
    assert_array_almost_equal(my_X, X)
Example #6
def test_transform():
    km = KMeansConstrained(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)

    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_greater(X_new[c, c2], 0)
Example #7
def test_k_means_fortran_aligned_data():
    # Check the KMeans will work well, even if X is a fortran-aligned data.
    X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    centers = np.array([[0, 0], [0, 1]])
    labels = np.array([0, 1, 1])
    km = KMeansConstrained(n_init=1, init=centers,
                           random_state=42, n_clusters=2)
    km.fit(X)
    assert_array_equal(km.cluster_centers_, centers)
    assert_array_equal(km.labels_, labels)
Example #8
def test_k_means_init_centers():
    # This test is used to check KMeans won't mutate the user provided input
    # array silently even if input data and init centers have the same type
    X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]])
    init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]])
    for dtype in [np.int32, np.int64, np.float32, np.float64]:
        X_test = dtype(X_small)
        init_centers_test = dtype(init_centers)
        assert_array_equal(init_centers, init_centers_test)
        km = KMeansConstrained(init=init_centers_test, n_clusters=3, n_init=1)
        km.fit(X_test)
        assert_equal(False, np.may_share_memory(km.cluster_centers_, init_centers))
Example #9
    # fit() method of a wrapper class; n_clusters, size_min, size_max and
    # distance_func are set in the class's __init__ (not shown here).
    def fit(self, X):
        n_samples, n_features = X.shape
        assert self.size_max * self.n_clusters >= n_samples

        clf = KMeansConstrained(self.n_clusters,
                                size_min=self.size_min,
                                size_max=self.size_max,
                                distance_func=self.distance_func)

        clf.fit(X)

        self.clf = clf
        self.cluster_centers_ = self.clf.cluster_centers_
        self.labels_ = self.clf.labels_
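
The original example omits the class around this fit method. A minimal sketch of what the enclosing class might look like; the class name and constructor here are assumptions, not the original author's code.

# Hypothetical enclosing class for the fit() method above; every name in
# this sketch is an assumption made for illustration.
class ConstrainedClusterer:
    def __init__(self, n_clusters, size_min=None, size_max=None,
                 distance_func=None):
        self.n_clusters = n_clusters
        self.size_min = size_min
        self.size_max = size_max
        self.distance_func = distance_func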
Example #10
def test_predict():
    km = KMeansConstrained(n_clusters=n_clusters, random_state=42)

    km.fit(X)

    # sanity check: predict centroid labels
    pred = km.predict(km.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = km.predict(X)
    assert_array_equal(pred, km.labels_)

    # re-predict labels for training set using fit_predict
    pred = km.fit_predict(X)
    assert_array_equal(pred, km.labels_)
Example #11
def test_k_means_non_collapsed():
    # Check that k_means with a bad initialization does not yield a singleton.
    # Starting with bad centers that are quickly ignored should not
    # reposition the centers onto the center of mass, which would lead to
    # collapsed centers and make the clustering dependent on numerical
    # instabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeansConstrained(init=array_init, n_clusters=3, random_state=42, n_init=1)
    km.fit(my_X)

    # centers must not have collapsed
    assert_equal(len(np.unique(km.labels_)), 3)

    centers = km.cluster_centers_
    assert (np.linalg.norm(centers[0] - centers[1]) >= 0.1).all()
    assert (np.linalg.norm(centers[0] - centers[2]) >= 0.1).all()
    assert (np.linalg.norm(centers[1] - centers[2]) >= 0.1).all()
Example #12
    # fit() method of another wrapper class; it derives balanced size
    # bounds from the sample count before fitting KMeansConstrained.
    def fit(self, X):
        n_samples, n_features = X.shape
        minsize = n_samples // self.n_clusters
        maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters

        clf = KMeansConstrained(self.n_clusters,
                                size_min=minsize,
                                size_max=maxsize,
                                distance_func=self.distance_func)

        if minsize != maxsize:
            warnings.warn(
                "Cluster minimum and maximum size are {} and {}, respectively".
                format(minsize, maxsize))

        clf.fit(X)

        self.clf = clf
        self.cluster_centers_ = self.clf.cluster_centers_
        self.labels_ = self.clf.labels_
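
For example, with 10 samples and 3 clusters the bounds above come out as minsize = 10 // 3 = 3 and maxsize = (10 + 2) // 3 = 4, so cluster sizes can differ by at most one; when the sample count divides evenly, minsize equals maxsize and the warning below is skipped.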
Example #13
def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 0],
                  [0, 0, 0, 0], [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0], [.2, 0, .2, .2], [+0, 0, 0, 0]])

    km = KMeansConstrained(n_clusters=3,
                           init=bad_centers,
                           n_init=1,
                           max_iter=10,
                           random_state=1)

    for i in range(2):
        km.fit(X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels, return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels)
Example #14
def test_k_means_random_init():
    km = KMeansConstrained(init="random", n_clusters=n_clusters, random_state=42)
    km.fit(X)
    _check_fitted_model(km)
Example #15
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 17 19:13:48 2020

@author: lcota
"""

import numpy as np
from k_means_constrained import KMeansConstrained

# X is not defined in the original script; a small placeholder dataset in
# the spirit of the library's documentation example.
X = np.array([[0, 2], [0, 4], [0, 0],
              [4, 2], [4, 4], [4, 0]])

clf = KMeansConstrained(n_clusters=2, size_min=2, size_max=5, random_state=0)
clf.fit(X)

clf.cluster_centers_
clf.predict([[0, 0], [4, 4]])
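Example #16
The final fragment below comes from a function that groups word embeddings into fixed-size clusters. It relies on an embeddings list of [word, vector] pairs built earlier in the original function; that construction is not shown, so the sketch here fakes it with random vectors (the words, vector dimension and RNG seed are all stand-ins).

import numpy as np
import pandas as pd
from k_means_constrained import KMeansConstrained

# Hypothetical stand-in for the embeddings built in the original function.
# Real vectors would come from a word-embedding model.
rng = np.random.default_rng(0)
words = ['alpha', 'beta', 'gamma', 'delta', 'epsilon',
         'zeta', 'eta', 'theta', 'iota', 'kappa',
         'lambda', 'mu', 'nu', 'xi', 'omicron']
embeddings = [[w, rng.normal(size=16)] for w in words]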
    # `embeddings` is a list of [word, vector] pairs built earlier in the
    # original function (see the stand-in sketch above).
    print('total tokens :', len(embeddings))
    X = np.array([x[1] for x in embeddings])
    clf = KMeansConstrained(n_clusters=5,
                            size_min=3,
                            size_max=9,
                            random_state=42)

    kmeans = clf.fit(X)
    clusters = kmeans.labels_.tolist()

    dff = pd.DataFrame()
    dff['word'] = [w[0] for w in embeddings]
    dff['embedding'] = [e[1] for e in embeddings]
    dff['cluster'] = clusters

    # Map each cluster id to the list of words assigned to it.
    clusters_dict = {key: [] for key in dff['cluster'].unique()}
    for _, rr in dff.iterrows():
        clusters_dict[rr['cluster']].append(rr['word'])