def main(input_path="ubicaciones.csv",
         balance_deviations=(0.1, 0.15, 0.2, 0.3)):
    """
    Build and solve one zone-assignment MILP per balance deviation.

    The agencies read from ``input_path`` are clustered into six
    size-constrained k-means groups; the cityblock distance between every
    cluster center and every agency is then used as the cost of assigning
    that agency to the corresponding zone. For each deviation a PuLP
    minimization problem is written to ``lp_files/``, solved with CPLEX and
    the resulting zone membership table is written to ``lp_solutions/``.

    Args:
        input_path: csv path with information regarding agencies,
            frequency, volume and coordinates (columns ``Id_Cliente``,
            ``Vol_Entrega``, ``Frecuencia``, ``lat`` and ``lon`` are read).
        balance_deviations: iterable with the percentage deviations to try;
            for the calculation of the bounds refer to ``stops_gap`` and
            ``items_gap``.

    Returns:
        None
    """
    df = pd.read_csv(input_path)
    # Agencies with a zero delivery volume are given a volume of 1.
    df.loc[df["Vol_Entrega"] == 0, "Vol_Entrega"] = 1

    zones = ["D1", "D2", "D3", "D4", "D5", "D6"]
    agencies = list("A" + df["Id_Cliente"].astype(str))
    vol_stores = list(df["Vol_Entrega"] * df["Frecuencia"])
    frequency = list(df["Frecuencia"])
    stores_volume = dict(zip(agencies, vol_stores))
    stores_frequency = dict(zip(agencies, frequency))
    vol_delivery = dict(zip(agencies, list(df["Vol_Entrega"])))

    # Cluster on min-max scaled coordinates, one cluster per zone.
    # NOTE(review): size_min/size_max are hard-coded; they presumably match
    # the row count of the expected input file -- confirm before reuse.
    scaler = MinMaxScaler()
    fitted_scaler = scaler.fit(df[["lat", "lon"]])
    scaled_coordinates = fitted_scaler.transform(df[["lat", "lon"]])
    kmeans = KMeansConstrained(n_clusters=len(zones),
                               size_min=604,
                               size_max=605,
                               random_state=12,
                               n_init=100,
                               max_iter=200,
                               n_jobs=-1)
    kmeans.fit(scaled_coordinates)
    df["kmeans"] = list(kmeans.predict(scaled_coordinates))

    # Distances are measured in the original (unscaled) coordinate space.
    vectorized_lat_lon = df[["lat", "lon"]].to_numpy()
    cluster_centers = fitted_scaler.inverse_transform(kmeans.cluster_centers_)
    distance_matrix = cdist(cluster_centers, vectorized_lat_lon,
                            metric="cityblock")

    routes = [(z, a) for z in zones for a in agencies]
    distances = pulp.makeDict([zones, agencies], distance_matrix, 0)
    flow = pulp.LpVariable.dicts("Distribution", (zones, agencies), 0, None)
    using = pulp.LpVariable.dicts("BelongstoZone", (zones, agencies), 0, 1,
                                  pulp.LpInteger)

    membership_prefix = "BelongstoZone_"

    for percentage in balance_deviations:
        prob = pulp.LpProblem("BrewingDataCup2020_" + str(percentage),
                              pulp.LpMinimize)

        # Objective: distance-weighted flow plus distance-weighted membership.
        prob += pulp.lpSum([
            distances[z][a] * flow[z][a] for (z, a) in routes
        ]) + pulp.lpSum([
            distances[z][a] * using[z][a] for (z, a) in routes
        ]), "totalCosts"

        stops_upper, stops_lower = stops_gap(percentage)
        distr_upper, distr_lower = items_gap(percentage)

        # Per-zone bounds on the number of stops and on the distributed flow.
        for z in zones:
            prob += pulp.lpSum([
                using[z][a] for a in agencies
            ]) <= stops_upper, "SumStopsInZoneUpper %s" % z
            prob += pulp.lpSum([
                using[z][a] for a in agencies
            ]) >= stops_lower, "SumStopsInZoneLower %s" % z
            prob += pulp.lpSum([
                flow[z][a] for a in agencies
            ]) <= distr_upper, "SumDistrInZoneUpper %s" % z
            prob += pulp.lpSum([
                flow[z][a] for a in agencies
            ]) >= distr_lower, "SumDistrInZoneLower %s" % z

        # Flow may only be positive where the membership variable is set
        # (big-M link), and is capped by the agency's delivery volume.
        for z in zones:
            for a in agencies:
                prob += flow[z][a] - (100000 * using[z][a]) <= 0
                prob += flow[z][a] <= vol_delivery[a]

        # Every agency receives at least its total volume and belongs to
        # exactly as many zones as its frequency.
        for a in agencies:
            prob += pulp.lpSum([
                flow[z][a] for z in zones
            ]) >= stores_volume[a], "Distribution %s" % a
            prob += pulp.lpSum([
                using[z][a] for z in zones
            ]) == stores_frequency[a], "FrequencyDistribution %s" % a

        prob.writeLP("lp_files/milp_brewing_" + str(percentage) + ".lp")
        solver = pulp.CPLEX_CMD(path=path_to_cplex)
        prob.solve(solver)
        print("Estado: ", pulp.LpStatus[prob.status])
        print("Total Cost: ", pulp.value(prob.objective))

        final_df = pd.DataFrame(columns=["D1", "D2", "D3", "D4", "D5", "D6"],
                                index=(range(1, 3626)))
        for v in prob.variables():
            if not v.name.startswith(membership_prefix):
                continue
            # varValue is None when the solver returned no value for the
            # variable (e.g. infeasible run); treat that as "not assigned"
            # instead of failing on the comparison.
            if v.varValue is None or not v.varValue > 0:
                continue
            # Variable names look like "BelongstoZone_<zone>_A<id>".
            dist = v.name[len(membership_prefix):]
            zone = dist[:2]
            id_cliente = int(dist[4:])
            final_df.loc[id_cliente, zone] = 1

        final_df.fillna(0, inplace=True)
        final_df = final_df.astype(int).reset_index().rename(
            columns={"index": "Id_Cliente"})
        final_df.to_csv("lp_solutions/cplex_opt_" + str(percentage) + "_" +
                        str(pulp.value(prob.objective)) + ".csv",
                        header=True,
                        index=False)
def test_float_precision():
    """Fit on float64 and float32 casts of X and compare the outcomes."""
    estimator = KMeansConstrained(n_init=1, random_state=30)

    inertia_by_dtype = {}
    transformed_by_dtype = {}
    centers_by_dtype = {}

    for dtype in (np.float64, np.float32):
        data = X.astype(dtype)
        estimator.fit(data)

        # The fitted centers must carry the dtype of the training data.
        assert_equal(estimator.cluster_centers_.dtype, dtype)

        inertia_by_dtype[dtype] = estimator.inertia_
        transformed_by_dtype[dtype] = estimator.transform(data)
        centers_by_dtype[dtype] = estimator.cluster_centers_

        # Slice (rather than index) so the single row stays a 2d array.
        first_row = data[:1]
        assert_equal(estimator.predict(first_row), estimator.labels_[0])

        if hasattr(estimator, 'partial_fit'):
            estimator.partial_fit(data[0:3])
            # partial_fit must not change the dtype of the centers.
            assert_equal(estimator.cluster_centers_.dtype, dtype)

    # 32 and 64 bit results can differ up to the 4th decimal place, so the
    # comparison is done with low precision.
    for results in (inertia_by_dtype, transformed_by_dtype, centers_by_dtype):
        assert_array_almost_equal(results[np.float32], results[np.float64],
                                  decimal=4)
def test_score():
    """A model allowed 10 iterations must score higher than one allowed 1."""
    scores = []
    for iteration_budget in (1, 10):
        model = KMeansConstrained(n_clusters=n_clusters,
                                  max_iter=iteration_budget,
                                  random_state=42,
                                  n_init=1)
        scores.append(model.fit(X).score(X))

    short_run_score, long_run_score = scores
    assert_greater(long_run_score, short_run_score)
def test_k_means_perfect_init():
    """Fit starting from a copy of the module-level ``centers`` array."""
    initial_centers = centers.copy()
    model = KMeansConstrained(init=initial_centers,
                              n_clusters=n_clusters,
                              random_state=42,
                              n_init=1)
    model.fit(X)
    _check_fitted_model(model)
def test_k_means_copyx():
    """Check that copy_x=False leaves the input nearly equal to X."""
    working_copy = X.copy()
    model = KMeansConstrained(copy_x=False,
                              n_clusters=n_clusters,
                              random_state=42)
    model.fit(working_copy)
    _check_fitted_model(model)

    # After fitting, the array handed in must still match the original data.
    assert_array_almost_equal(working_copy, X)
def test_transform():
    """Transforming the centers gives zero on the diagonal, >0 elsewhere."""
    model = KMeansConstrained(n_clusters=n_clusters)
    model.fit(X)
    center_distances = model.transform(model.cluster_centers_)

    for row in range(n_clusters):
        # A center is at distance zero from itself ...
        assert_equal(center_distances[row, row], 0)
        # ... and at a strictly positive distance from every other center.
        other_columns = [col for col in range(n_clusters) if col != row]
        for col in other_columns:
            assert_greater(center_distances[row, col], 0)
def test_k_means_fortran_aligned_data():
    """Check that fitting works when X is a Fortran-ordered array."""
    fortran_data = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    expected_centers = np.array([[0, 0], [0, 1]])
    expected_labels = np.array([0, 1, 1])

    model = KMeansConstrained(n_init=1,
                              init=expected_centers,
                              random_state=42,
                              n_clusters=2)
    model.fit(fortran_data)

    assert_array_equal(model.cluster_centers_, expected_centers)
    assert_array_equal(model.labels_, expected_labels)
def test_k_means_init_centers():
    """Check that fitting neither aliases nor mutates the user's init array.

    This test is used to check KMeans won't mutate the user provided input
    array silently even if input data and init centers have the same type.
    """
    X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]])
    init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]])
    for dtype in [np.int32, np.int64, np.float32, np.float64]:
        X_test = dtype(X_small)
        init_centers_test = dtype(init_centers)
        assert_array_equal(init_centers, init_centers_test)
        km = KMeansConstrained(init=init_centers_test, n_clusters=3, n_init=1)
        km.fit(X_test)
        assert_equal(False, np.may_share_memory(km.cluster_centers_,
                                                init_centers))
        # The estimator was given init_centers_test, so that is the array
        # whose memory must not be shared with the fitted centers.
        assert_equal(False, np.may_share_memory(km.cluster_centers_,
                                                init_centers_test))
        # The array passed as init must still hold its original values.
        assert_array_equal(init_centers, init_centers_test)
def fit(self, X):
    """Fit a KMeansConstrained model on X using this object's size bounds.

    Asserts that ``size_max * n_clusters`` is at least the number of rows of
    X, then stores the fitted estimator on ``self.clf`` and exposes its
    ``cluster_centers_`` and ``labels_`` on this object.
    """
    # Unpacking into two names also requires X.shape to have two entries.
    sample_count, _ = X.shape
    total_capacity = self.size_max * self.n_clusters
    assert total_capacity >= sample_count

    estimator = KMeansConstrained(self.n_clusters,
                                  size_min=self.size_min,
                                  size_max=self.size_max,
                                  distance_func=self.distance_func)
    estimator.fit(X)

    # Only publish the estimator once fitting has succeeded.
    self.clf = estimator
    self.cluster_centers_ = self.clf.cluster_centers_
    self.labels_ = self.clf.labels_
def test_predict():
    """Check predict on the centers, on the training set and via fit_predict."""
    model = KMeansConstrained(n_clusters=n_clusters, random_state=42)
    model.fit(X)

    # Sanity check: each centroid is predicted to belong to its own cluster.
    centroid_labels = model.predict(model.cluster_centers_)
    assert_array_equal(centroid_labels, np.arange(n_clusters))

    # Sanity check: predicting the training samples reproduces labels_.
    training_labels = model.predict(X)
    assert_array_equal(training_labels, model.labels_)

    # fit_predict on the training set must agree with labels_ as well.
    refit_labels = model.fit_predict(X)
    assert_array_equal(refit_labels, model.labels_)
def test_k_means_non_collapsed():
    """Check that a bad initialization does not yield collapsed centers.

    Starting with bad centers that are quickly ignored should not result in
    a repositioning of the centers to the center of mass, which would lead
    to collapsed centers and make the clustering dependent on numerical
    instabilities.
    """
    points = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    starting_centers = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    model = KMeansConstrained(init=starting_centers,
                              n_clusters=3,
                              random_state=42,
                              n_init=1)
    model.fit(points)

    # All three labels must be in use.
    distinct_labels = np.unique(model.labels_)
    assert_equal(len(distinct_labels), 3)

    # Every pair of fitted centers must be at least 0.1 apart.
    fitted_centers = model.cluster_centers_
    for first, second in ((0, 1), (0, 2), (1, 2)):
        gap = np.linalg.norm(fitted_centers[first] - fitted_centers[second])
        assert (gap >= 0.1).all()
def fit(self, X):
    """Fit a KMeansConstrained model with near-equal cluster sizes.

    The size bounds are the floor and the ceiling of
    ``n_samples / n_clusters``; a warning is issued when the two differ.
    The fitted estimator is stored on ``self.clf`` and its
    ``cluster_centers_`` and ``labels_`` are exposed on this object.
    """
    # Unpacking into two names also requires X.shape to have two entries.
    sample_count, _ = X.shape
    cluster_count = self.n_clusters
    smallest = sample_count // cluster_count
    # Integer ceiling division.
    largest = (sample_count + cluster_count - 1) // cluster_count

    estimator = KMeansConstrained(self.n_clusters,
                                  size_min=smallest,
                                  size_max=largest,
                                  distance_func=self.distance_func)

    if smallest != largest:
        message = "Cluster minimum and maximum size are {} and {}, respectively"
        warnings.warn(message.format(smallest, largest))

    estimator.fit(X)

    # Only publish the estimator once fitting has succeeded.
    self.clf = estimator
    self.cluster_centers_ = self.clf.cluster_centers_
    self.labels_ = self.clf.labels_
def test_k_means_new_centers():
    """Explore the part of the code where a new center is reassigned."""
    samples = np.array([[0, 0, 1, 1],
                        [0, 0, 0, 0],
                        [0, 1, 0, 0],
                        [0, 0, 0, 0],
                        [0, 0, 0, 0],
                        [0, 1, 0, 0]])
    expected_labels = [0, 1, 2, 1, 1, 2]
    poor_centers = np.array([[+0, 1, 0, 0],
                             [.2, 0, .2, .2],
                             [+0, 0, 0, 0]])

    model = KMeansConstrained(n_clusters=3,
                              init=poor_centers,
                              n_init=1,
                              max_iter=10,
                              random_state=1)

    # The same estimator is fitted twice and checked after each fit.
    for _ in range(2):
        model.fit(samples)
        raw_labels = model.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        first_positions = np.unique(raw_labels, return_index=True)[1]
        relabelled = first_positions[raw_labels]
        np.testing.assert_array_equal(relabelled, expected_labels)
def test_k_means_random_init():
    """Fit with init="random" and run the shared fitted-model checks."""
    model = KMeansConstrained(init="random",
                              n_clusters=n_clusters,
                              random_state=42)
    model.fit(X)
    _check_fitted_model(model)
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 17 19:13:48 2020

@author: lcota
"""
from k_means_constrained import KMeansConstrained
import numpy as np

# Sample data for the example: two groups of three 2-D points. Without this
# definition the script fails with a NameError on X when run on its own.
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])

# Two clusters, each holding between 2 and 5 points.
clf = KMeansConstrained(n_clusters=2, size_min=2, size_max=5, random_state=0)
clf.fit(X)

# A bare expression shows nothing when run as a script, so print the results.
print(clf.cluster_centers_)
print(clf.predict([[0, 0], [4, 4]]))
# Report how many (word, embedding) pairs are about to be clustered.
print('total tokens :', len(embeddings))

# Stack the embedding (second item of every pair) into a single array.
X = np.array([pair[1] for pair in embeddings])

# Five clusters, each constrained to hold between 3 and 9 items.
clf = KMeansConstrained(n_clusters=5,
                        size_min=3,
                        size_max=9,
                        random_state=42)
kmeans = clf.fit(X)
clusters = kmeans.labels_.tolist()

# One row per token: the word, its embedding and the assigned cluster.
dff = pd.DataFrame()
dff['word'] = [pair[0] for pair in embeddings]
dff['embedding'] = [pair[1] for pair in embeddings]
dff['cluster'] = clusters

# Group the words by cluster label.
clusters_dict = {}
for label in dff['cluster'].unique():
    clusters_dict[label] = []
for _, row in dff.iterrows():
    clusters_dict[row['cluster']].append(row['word'])