def test_dataframes():
    df = dd.from_pandas(
        pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 10]}),
        npartitions=2,
    )
    kmeans = DKKMeans()
    kmeans.fit(df)
def test_basic(self, Xl_blobs_easy):
    X, _ = Xl_blobs_easy

    # make it super easy to cluster
    a = DKKMeans(n_clusters=3, random_state=0)
    b = SKKMeans(n_clusters=3, random_state=0)
    a.fit(X)
    b.fit(X)
    assert_estimator_equal(
        a, b, exclude=['n_iter_', 'inertia_', 'cluster_centers_', 'labels_'])
    assert abs(a.inertia_ - b.inertia_) < 0.01

    # order is arbitrary, so align first
    a_order = np.argsort(a.cluster_centers_, 0)[:, 0]
    b_order = np.argsort(b.cluster_centers_, 0)[:, 0]
    a_centers = a.cluster_centers_[a_order]
    b_centers = b.cluster_centers_[b_order]
    np.testing.assert_allclose(a_centers, b_centers, rtol=1e-3)
    b_labels = replace(b.labels_, [0, 1, 2], a_order[b_order])
    assert_eq(a.labels_.compute(), b_labels)
    assert a.n_iter_

    # this is hacky
    b.cluster_centers_ = b_centers
    a.cluster_centers_ = a_centers
    assert_eq(a.transform(X), b.transform(X), rtol=1e-3)
def test_fit_raises():
    km = DKKMeans()
    with pytest.raises(ValueError):
        km.fit(np.array([]).reshape(0, 1))
    with pytest.raises(ValueError):
        km.fit(np.array([]).reshape(1, 0))
def test_fit_given_init(self, X_blobs):
    X_ = X_blobs.compute()
    x_squared_norms = k_means_.row_norms(X_, squared=True)
    rs = np.random.RandomState(0)
    init = k_means_._k_init(X_, 3, x_squared_norms, rs)
    dkkm = DKKMeans(3, init=init, random_state=rs)
    skkm = SKKMeans(3, init=init, random_state=rs, n_init=1)
    dkkm.fit(X_blobs)
    skkm.fit(X_)
    assert_eq(dkkm.inertia_, skkm.inertia_)
def test_kmeanspp_init(self, Xl_blobs_easy):
    X, y = Xl_blobs_easy
    X_ = X.compute()
    rs = np.random.RandomState(0)
    dkkm = DKKMeans(3, init="k-means++", random_state=rs)
    skkm = SKKMeans(3, init="k-means++", random_state=rs)
    dkkm.fit(X)
    skkm.fit(X_)
    assert abs(dkkm.inertia_ - skkm.inertia_) < 1e-4
    assert dkkm.init == "k-means++"
def test_random_init(self, Xl_blobs_easy):
    X, y = Xl_blobs_easy
    X_ = X.compute()
    rs = 0
    dkkm = DKKMeans(3, init="random", random_state=rs)
    skkm = SKKMeans(3, init="random", random_state=rs, n_init=1)
    dkkm.fit(X)
    skkm.fit(X_)
    assert abs(dkkm.inertia_ - skkm.inertia_) < 1e-4
    assert dkkm.init == "random"
def run(self):
    if self.word_vectors not in {"fasttext", "word2vec"}:
        raise ValueError(
            f'Expected fasttext or word2vec; got {self.word_vectors}')

    print(f'Initializing dask dataframe of word embeddings at {datetime.now()}')
    ddf = dask.dataframe.read_csv(
        config.ARTICLE_EMBEDDINGS_DIR / f'{self.word_vectors}_to_csv' / "*.part")

    print(f'Dropping columns and converting to design matrix (dask array) at {datetime.now()}')
    X = ddf.drop(['Unnamed: 0', "id", "url", "title"], axis=1)
    X = X.to_dask_array(lengths=True)

    # Perform k-means clustering
    print(f'Starting K-Means clustering at {datetime.now()}')
    k_means_clustering_model = KMeans(n_clusters=self.num_clusters,
                                      n_jobs=-1,
                                      max_iter=config.K_MEANS_MAX_ITER)
    k_means_clustering_model.fit(X)
    # fit() returns the estimator itself; the per-row assignments live on labels_.
    # Convert the labels array to a named series so it can be joined to the dataframe.
    k_means_cluster_labels = k_means_clustering_model.labels_.to_dask_dataframe(
        columns='k_means_cluster')

    # Write k-means results to disk
    print(f'Joining K-means results and writing to disk at {datetime.now()}')
    k_means_results_ddf = ddf.join(k_means_cluster_labels)
    k_means_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_k_means'
    k_means_ddf_output_path.mkdir(parents=True, exist_ok=True)
    dask.dataframe.to_csv(k_means_results_ddf, k_means_ddf_output_path)

    # Perform spectral clustering
    print(f'Starting Spectral clustering at {datetime.now()}')
    spectral_clustering_model = SpectralClustering(
        n_clusters=self.num_clusters,
        n_jobs=-1,
        persist_embedding=True,
        kmeans_params={"max_iter": config.K_MEANS_MAX_ITER})
    spectral_clustering_model.fit(X)
    spectral_cluster_labels = spectral_clustering_model.labels_.to_dask_dataframe(
        columns='spectral_cluster')

    # Write spectral results to disk
    print(f'Joining Spectral results and writing to disk at {datetime.now()}')
    spectral_results_ddf = ddf.join(spectral_cluster_labels)
    spectral_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_spectral'
    spectral_ddf_output_path.mkdir(parents=True, exist_ok=True)
    dask.dataframe.to_csv(spectral_results_ddf, spectral_ddf_output_path)

    # And save the success flag
    with self.output().open("w") as f:
        # f.write(f'Clustering {self.word_vectors} k={self.num_clusters}: {silhouette_score_result}' + "\n")
        # f.write(spectral_clustering_model.get_params(deep=True))
        f.write(f'{self.word_vectors}: Success!')
def test_fit_given_init(self):
    X, y = sklearn.datasets.make_blobs(n_samples=1000, n_features=4,
                                       random_state=1)
    X = da.from_array(X, chunks=500)
    X_ = X.compute()
    x_squared_norms = sklearn.utils.extmath.row_norms(X_, squared=True)
    rs = np.random.RandomState(0)
    init = _k_init(X_, 3, x_squared_norms, rs)
    dkkm = DKKMeans(3, init=init, random_state=0)
    skkm = SKKMeans(3, init=init, random_state=0, n_init=1)
    dkkm.fit(X)
    skkm.fit(X_)
    assert_eq(dkkm.inertia_, skkm.inertia_)
def test_dtypes(self):
    X = da.random.uniform(size=(100, 2), chunks=(50, 2))
    X2 = X.astype("f4")
    pairs = [(X, X), (X2, X2), (X, X2), (X2, X)]
    for xx, yy in pairs:
        a = DKKMeans()
        b = SKKMeans()
        a.fit(xx)
        b.fit(xx)
        assert a.cluster_centers_.dtype == b.cluster_centers_.dtype
        assert a.labels_.dtype == b.labels_.dtype
        assert a.transform(xx).dtype == b.transform(xx).dtype
        assert a.transform(yy).dtype == b.transform(yy).dtype
def genmask(self, ddf: dask.dataframe.DataFrame):
    center = None
    if self.lat_lon:
        center = self.lat_lon
    else:
        # If lat_lon is empty, estimate the center with a single-cluster KMeans
        model = KMeans(n_clusters=1, init_max_iter=self.max_iter)
        model.fit(ddf[[self.lat_col, self.lon_col]].to_dask_array(lengths=True))
        center = tuple(model.cluster_centers_[0])
    return ddf.map_partitions(
        lambda df: df.apply(self.applyfunc, axis=1, center=center).rename(self.name),
        meta=(self.name, 'bool'),
    )
def main():
    cfg = Path(__file__).parent.joinpath("kmeans_config.yaml")
    cfg = load_config(str(cfg))
    kmeans = KMeans(n_clusters=3, random_state=0)
    X = read(cfg)
    fit(cfg, kmeans, X)
    print(timings)
def fit(data, use_scikit_learn=False):
    logger.info("Starting to cluster")
    # Cluster
    n_clusters = 8
    oversampling_factor = 2
    if use_scikit_learn:
        km = sk.KMeans(n_clusters=n_clusters, random_state=0)
    else:
        km = KMeans(n_clusters=n_clusters,
                    oversampling_factor=oversampling_factor,
                    random_state=0)
    t0 = tic()
    logger.info("Starting n_clusters=%2d, oversampling_factor=%2d",
                n_clusters, oversampling_factor)
    km.fit(data)
    t1 = tic()
    logger.info("Finished in %.2f", t1 - t0)
def fit(data, use_scikit_learn=False):
    logger.info("Starting to cluster")
    # Cluster
    n_clusters = 8
    oversampling_factor = 2
    if use_scikit_learn:
        km = sk.KMeans(n_clusters=n_clusters, random_state=0)
    else:
        km = KMeans(
            n_clusters=n_clusters,
            oversampling_factor=oversampling_factor,
            random_state=0,
        )
    logger.info(
        "Starting n_clusters=%2d, oversampling_factor=%2d",
        n_clusters,
        oversampling_factor,
    )
    with _timer("km.fit", _logger=logger):
        km.fit(data)
def test_kmeanspp_init_random_state(self, Xl_blobs_easy):
    X, y = Xl_blobs_easy
    a = DKKMeans(3, init="k-means++")
    a.fit(X)
    b = DKKMeans(3, init="k-means++", random_state=0)
    b.fit(X)
def cluster_variable(data):
    """
    Creates a column that gives a cluster id based on KMeans clustering of all features

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding cluster variable...")
    data = data.copy()
    to_cluster = dd.get_dummies(data)
    train = get_train(to_cluster)
    holdout = get_holdout(to_cluster)
    # magic numbers, blech
    kmeans = KMeans(n_clusters=5, random_state=SEED).fit(
        train.drop("cnt", axis=1))
    data["cluster"] = da.append(kmeans.labels_,
                                kmeans.predict(holdout.drop("cnt", axis=1)))
    data["cluster"] = data["cluster"].astype("category")
    return data
def weather_cluster(data):
    """
    Creates a column that gives a cluster id based on KMeans clustering of only weather-related features

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding clustering variable based on weather-related features...")
    df = data.copy()[["weathersit", "temp", "atemp", "hum", "windspeed"]]
    to_cluster = dd.get_dummies(df)
    train = get_train(to_cluster)
    holdout = get_holdout(to_cluster)
    # magic numbers, blech
    kmeans = KMeans(n_clusters=5, random_state=SEED).fit(train)
    data["weather_cluster"] = da.append(kmeans.labels_, kmeans.predict(holdout))
    data["weather_cluster"] = data["weather_cluster"].astype("category")
    return data
def test_inputs(self, X):
    km = DKKMeans(n_clusters=3)
    km.fit(X)
    km.transform(X)
def learn_clusters(n_clust):
    client = Client(n_workers=4, processes=True)

    # 1. Learn clusters
    # Full set
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)
    array = da.from_npy_stack(npy_stack_path)
    kmeans = KMeans(n_clusters=n_clust)
    # Learn on a part of the set
    # array = np.load('Clustering/npy_post_vecs_part/0.npy')
    # kmeans = SKMeans(n_clusters=n_clust)
    print('Fitting')
    kmeans.fit(array)
    del array

    # Dump centroids to the disk.
    # Dump as a sklearn object, for (maybe) faster prediction and fewer problems.
    skmeans = SKMeans(n_clusters=n_clust)
    skmeans.cluster_centers_ = kmeans.cluster_centers_
    skmeans._n_threads = _openmp_effective_n_threads()
    dump(skmeans, kmeans_path)
    del kmeans, skmeans
    # dump(kmeans, kmeans_path)  # For learning on a part of the set
    # del kmeans
    print('Fitted')

    # 3. Turn posts into clusters
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)
    df = dd.read_parquet('preprocessed.parquet')
    df = df.map_partitions(df_to_vector_predict, kmeans_path, meta={
        'user_id': int,
        'post_id': int,
        'text': object,
        'type': str,
        'date': str,
        'cluster': int
    })
    df.to_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Clustered')

    # 2.5. Filter outdated posts out. (Next time, write the date of parsing to user_info.)
    # For each user, find his last like and filter out likes older than the last plus half a year.
    df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Original df len: {}'.format(len(df)))
    year = 31536000  # One year as a timestamp delta
    kyear = 20
    break_time = kyear * year  # 0.75*year - a quarter to a year
    last_like = df['date'].max().compute()  # The set was fully collected on 8 June 2020
    df = df[df['date'] > last_like - break_time]  # Pass only the recent likes
    print('max date: {}'.format(df['date'].max().compute()))
    print('min date: {}'.format(df['date'].min().compute()))
    print('Filtered df len: {}'.format(len(df)))
    print('Likes have been filtered out by date')

    # 3. Group clusters by user_id and turn them into a single vector for each user
    # df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))  # INSTEAD OF FILTER!
    # - Count text_likes number for each user (and later merge with user_info)
    count = df.drop(columns=['post_id', 'type', 'date', 'cluster']).groupby(
        'user_id')['text'].count().compute()
    count.rename('text_likes', inplace=True)
    # Generate meta
    meta = {'user_id': int}
    for i in range(n_clust):
        meta[i] = float
    df = df.map_partitions(
        lambda df_part: kt.clusters_to_vector(df_part, n_clust), meta=meta)
    df.to_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    # 5. Merge clusters and user_info dataframes. (Working with pandas frames)
    df_info = pd.read_csv('users_info.csv')
    df_info = df_info.merge(count, on='user_id', how='inner')
    del count
    df = pd.read_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))
    # Merging user's info and clusters. Maybe the mistake is here.
    df = df_info.merge(df, on='user_id', how='inner')
    df.to_csv('Clustering/KMeans/n{}-final_dataset-{}year.csv'.format(
        n_clust, kyear))
    print('Final dataset has been saved')
    del df_info

    # Filter some users out
    # df = pd.read_csv('Clustering/KMeans/n{}-final_dataset.csv'.format(n_clust)).drop(columns=['Unnamed: 0'])  # TESTING
    df = df.loc[(df['text_likes'] > 100) & (df['text_likes'] < 1000)]
    df['bdate'] = df['bdate'].apply(
        lambda bd: time.mktime(datetime.strptime(bd, "%d.%m.%Y").timetuple()))
    # Clean up the dataset
    df = df.drop(columns=[
        'posts_n', 'text_likes', 'status', 'sex', 'smoking', 'alcohol',
        'parth_id', 'country', 'city', 'user_id'
    ]).dropna().reset_index(drop=True)

    # 6. Train a Linear Regression model
    regr = LinearRegression()
    R2 = train(df, regr)

    client.close()
    return R2
def do(X, n_clusters, factor):
    km = KMeans(n_clusters=n_clusters, oversampling_factor=factor)
    km.fit(X)
    return km
def test_dask_dataframe_raises(self):
    km = DKKMeans(n_clusters=3)
    X = dd.from_pandas(pd.DataFrame({"A": range(50)}), npartitions=2)
    with pytest.raises(TypeError):
        km.fit(X)
n_centers = 12
n_features = 20

X_small, y_small = make_blobs(n_samples=1000,
                              centers=n_centers,
                              n_features=n_features,
                              random_state=0)

centers = np.zeros((n_centers, n_features))
for i in range(n_centers):
    centers[i] = X_small[y_small == i].mean(0)
print(centers)

n_samples_per_block = 20000  # 0
n_blocks = 500

delayeds = [
    dask.delayed(make_blobs)(n_samples=n_samples_per_block,
                             centers=centers,
                             n_features=n_features,
                             random_state=i)[0]
    for i in range(n_blocks)
]
arrays = [
    da.from_delayed(obj,
                    shape=(n_samples_per_block, n_features),
                    dtype=X_small.dtype)
    for obj in delayeds
]
X = da.concatenate(arrays)
print(X.nbytes / 1e9)
X = X.persist()  # actually run the stuff

clf = KMeans(init_max_iter=3, oversampling_factor=10)
clf.fit(X)
print(clf.labels_[:10].compute())  # actually run the stuff
def test_too_small():
    km = DKKMeans()
    X = da.random.uniform(size=(20, 2), chunks=(10, 2))
    km.fit(X)
from dask.distributed import Client
import time
import sys

from dask_ml.cluster import KMeans
import dask.dataframe as dd

client = Client(n_workers=4)

t0 = time.time()

dataset = dd.read_csv(sys.argv[1], header=None)
dataset = dataset[[
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21
]]

clf = KMeans(n_clusters=5, tol=0.0001)
clf.fit(dataset)
a = clf.transform(dataset)
a.compute()

print(clf.cluster_centers_)
print('Elapsed time:', time.time() - t0)

client.close()
    'Stay_In_Current_City_Years', 'Marital_Status'
]]
target = df['Purchase']

# creating dummies for the categorical variables
data = dd.get_dummies(categorical_variables.categorize()).compute()

# converting dataframe to array
datanew = data.values

# fit the model
from dask_ml.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(datanew, target)

# preparing the test data
test_categorical = test[[
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status'
]]
test_dummy = dd.get_dummies(test_categorical.categorize()).compute()
testnew = test_dummy.values

# predict on test and upload
pred = lr.predict(testnew)

# Clustering/K-Means
from dask_ml.cluster import KMeans
model = KMeans()
model.fit(datanew, target)