def main(readcsv=read_csv, method='randomDense'):
    infile = "./data/batch/kmeans_dense.csv"
    nClusters = 20
    maxIter = 5

    # configure the kmeans-init object (method selects the initialization
    # scheme, e.g. 'randomDense' or 'plusPlusDense')
    initrain_algo = d4p.kmeans_init(nClusters, method=method)
    # Load the data
    data = readcsv(infile, range(20))
    # compute initial centroids
    initrain_result = initrain_algo.compute(data)
    # The result provides the initial centroids
    assert initrain_result.centroids.shape[0] == nClusters

    # configure kmeans main object: we also request the cluster assignments
    algo = d4p.kmeans(nClusters, maxIter, assignFlag=True)
    # compute the clusters/centroids
    result = algo.compute(data, initrain_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True).compute(
    #     data,
    #     d4p.kmeans_init(nClusters, method="plusPlusDense").compute(data).centroids
    # )

    # Kmeans result objects provide assignments (if requested),
    # centroids, goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.assignments.shape == (data.shape[0], 1)
    assert result.nIterations <= maxIter
    return result
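# A minimal driver for the example above -- a sketch, assuming a
# pandas-based read_csv helper of the kind the daal4py examples use
# (reads the listed columns of a headerless CSV) and that the data file
# hard-coded in main() exists.
import numpy as np
import pandas as pd
import daal4py as d4p

def read_csv(f, c, t=np.float64):
    return pd.read_csv(f, usecols=c, delimiter=',', header=None, dtype=t)

if __name__ == "__main__":
    result = main(readcsv=read_csv)
    print("First 3 centroids:\n", result.centroids[0:3])
    print("First 10 cluster assignments:\n", result.assignments[0:10])
    print("Objective function value:", result.objectiveFunction[0, 0])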
import numpy as np
import daal4py

def kmeans(N, D, nClusters, maxit):
    a = np.random.ranf((N, D))  # random data; doesn't make much sense, but ok for now
    kmi = daal4py.kmeans_init(nClusters, method='plusPlusDense')
    km = daal4py.kmeans(nClusters, maxit)
    kmr = km.compute(a, kmi.compute(a).centroids)
    return (kmr.centroids, kmr.assignments, kmr.objectiveFunction,
            kmr.goalFunction, kmr.nIterations)
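# Usage sketch for the helper above: cluster 10000 random 20-dimensional
# points into 10 clusters with at most 5 iterations. Caveat: `assignments`
# may be None unless the kmeans object is created with assignFlag=True,
# so unpack it with that in mind.
centroids, assignments, objective, goal, n_iter = kmeans(10000, 20, 10, 5)
print("centroids shape:", centroids.shape)        # (10, 20)
print("iterations used:", int(n_iter[0, 0]))
print("objective function:", float(objective[0, 0]))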
def run_inference(num_observations: int = 1000):
    """Run daal4py kmeans for the specified number of observations"""
    # Load data
    test_df = common.get_test_data_df(X=common.X_dfc, size=num_observations)
    num_rows = len(test_df)
    print("_______________________________________")
    print("Total Number of Rows", num_rows)

    run_times = []
    inference_times = []
    for _ in range(NUM_LOOPS):
        start_time = timer()

        init_alg = d4p.kmeans_init(nClusters=5, fptype="float",
                                   method="randomDense")
        centroids = init_alg.compute(test_df).centroids
        alg = d4p.kmeans(nClusters=5, maxIterations=100, fptype="float",
                         accuracyThreshold=0, assignFlag=False)
        result = alg.compute(test_df, centroids)

        end_time = timer()
        total_time = end_time - start_time
        run_times.append(total_time * 1e3)            # total time in ms
        inference_time = total_time * 1e6 / num_rows  # time per row in us
        inference_times.append(inference_time)

    return_elem = common.calculate_stats(inference_times)
    print(num_observations, ", ", return_elem)
    return return_elem
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters,
                                        cluster_centers_0, verbose,
                                        random_state):

    def is_string(s, target_str):
        return isinstance(s, str) and s == target_str

    is_sparse = sp.isspmatrix(X)

    deterministic = False
    if is_string(cluster_centers_0, 'k-means++'):
        _seed = random_state.randint(np.iinfo('i').max)
        plus_plus_method = "plusPlusCSR" if is_sparse else "plusPlusDense"
        daal_engine = daal4py.engines_mt19937(
            fptype=X_fptype, method="defaultDense", seed=_seed)
        _n_local_trials = 2 + int(np.log(nClusters))
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, nTrials=_n_local_trials,
            method=plus_plus_method, engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        random_method = "randomCSR" if is_sparse else "randomDense"
        daal_engine = daal4py.engines_mt19937(
            seed=_seed, fptype=X_fptype, method="defaultDense")
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method=random_method,
            engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        deterministic = True
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        deterministic = True
        default_method = "lloydCSR" if is_sparse else "defaultDense"
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method=default_method)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError(
            f"init should be either 'k-means++', 'random', an ndarray or a "
            f"callable, got '{cluster_centers_0}' instead.")
    if verbose:
        print("Initialization complete")
    return deterministic, centroids_
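# A sketch of calling the internal helper above directly. Names like
# _validate_center_shape come from the surrounding daal4py sklearn patch
# module, so this only runs in that context; the data and parameters here
# are illustrative. Dense float64 input, fptype "double", k-means++ init:
import numpy as np

X = np.random.rand(1000, 20)
rs = np.random.RandomState(42)
deterministic, centroids = _daal4py_compute_starting_centroids(
    X, "double", 10, 'k-means++', verbose=False, random_state=rs)
print(deterministic, centroids.shape)  # False (10, 20)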
def compute(data, nClusters, maxIter, method):
    # configure kmeans init object
    initrain_algo = d4p.kmeans_init(nClusters, method=method, fptype='float')
    # compute initial centroids
    initrain_result = initrain_algo.compute(data)

    # configure kmeans main object: we also request the cluster assignments
    algo = d4p.kmeans(nClusters, maxIter, assignFlag=True, fptype='float')
    # compute the clusters/centroids
    return algo.compute(data, initrain_result.centroids)
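# Usage sketch: compare the dense initialization schemes on the same
# synthetic data by final objective function value (lower is better).
# The data is float32 to match fptype='float' above; the sizes are
# illustrative.
import numpy as np
import daal4py as d4p

data = np.random.rand(1000, 20).astype(np.float32)
for method in ('randomDense', 'plusPlusDense',
               'parallelPlusDense', 'deterministicDense'):
    result = compute(data, nClusters=10, maxIter=50, method=method)
    print(method, result.objectiveFunction[0, 0])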
def test_kmeans_spmd(self):
    nClusters = 10
    maxIter = 25

    data = np.loadtxt("./data/distributed/kmeans_dense.csv", delimiter=',')
    rpp = int(data.shape[0] / d4p.num_procs())
    spmd_data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    for init_method in ['plusPlusDense', 'parallelPlusDense',
                        'deterministicDense']:
        batch_init_res = d4p.kmeans_init(
            nClusters=nClusters, method=init_method).compute(data)
        spmd_init_res = d4p.kmeans_init(
            nClusters=nClusters, method=init_method,
            distributed=True).compute(spmd_data)

        if init_method in ['parallelPlusDense']:
            print("Warning: It is well known that the results of "
                  "parallelPlusDense init do not match the batch algorithm")
        else:
            self.assertTrue(
                np.allclose(batch_init_res.centroids,
                            spmd_init_res.centroids),
                "Initial centroids with " + init_method +
                " do not match the batch algorithm")

        batch_res = d4p.kmeans(nClusters=nClusters,
                               maxIterations=maxIter).compute(
                                   data, batch_init_res.centroids)
        spmd_res = d4p.kmeans(nClusters=nClusters,
                              maxIterations=maxIter,
                              distributed=True).compute(
                                  spmd_data, spmd_init_res.centroids)

        self.assertTrue(
            np.allclose(batch_res.centroids, spmd_res.centroids),
            "Final centroids with " + init_method +
            " do not match the batch algorithm")
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters,
                                        cluster_centers_0, random_state):

    def is_string(s, target_str):
        return isinstance(s, string_types) and s == target_str

    deterministic = False
    if is_string(cluster_centers_0, 'k-means++'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(
            fptype=X_fptype, method='defaultDense', seed=_seed)
        _n_local_trials = 2 + int(np.log(nClusters))
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, nTrials=_n_local_trials,
            method='plusPlusDense', engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(
            seed=_seed, fptype=X_fptype, method='defaultDense')
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method='randomDense',
            engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        deterministic = True
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        deterministic = True
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method='defaultDense')
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError("Cluster centers should either be 'k-means++', "
                         "'random', 'deterministic' or an array")
    return deterministic, centroids_
def main(method='plusPlusDense'):
    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    init_algo = d4p.kmeans_init(nClusters, method=method, distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data,
    # it would have been better to read only what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The result provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(
    #     data,
    #     d4p.kmeans_init(
    #         nClusters,
    #         method="plusPlusDense",
    #         distributed=True
    #     ).compute(data).centroids
    # )

    # Kmeans result objects provide centroids, goalFunction,
    # nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.nIterations <= maxIter

    # we need an extra call to kmeans to get the assignments
    # (not directly supported through parameter assignFlag yet in SPMD mode)
    # maxIter=0; not distributed, we compute on local data only!
    algo = d4p.kmeans(nClusters, 0, assignFlag=True)
    assignments = algo.compute(data, result.centroids).assignments

    return (assignments, result)
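# A sketch of an SPMD driver for main() above, mirroring the other
# distributed examples in this collection: initialize the distribution
# engine, compute, print on the root process only, then shut down.
# SPMD scripts like this are launched with MPI, e.g.:
#   mpirun -n 4 python kmeans_spmd.py
import daal4py as d4p

if __name__ == "__main__":
    d4p.daalinit()
    (assignments, result) = main()
    if d4p.my_procid() == 0:
        print("First 3 centroids:\n", result.centroids[0:3])
        print("First 10 local assignments:\n", assignments[0:10])
    d4p.daalfini()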
def kMeans(self, Data_Path, n):
    '''
    daal4py KMeans Clustering SPMD Mode
    '''
    nClusters = 4
    maxIter = 25  # fixed maximum number of iterations

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training setup: each process reads its own slice of the data
    file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(file_path)

    init_algo = d4p.kmeans_init(nClusters=nClusters,
                                distributed=True,
                                method="plusPlusDense")
    self.logger.info('Training the KMeans in pydaal SPMD Mode')

    # compute initial centroids (a single compute call suffices)
    init_result = init_algo.compute(data)

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    kmeans_start_time = time.time()
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)
    self.latency["Parallel_KMeans_SPMD_Time"] = time.time() - kmeans_start_time

    # result is available on all processes - but we print only on root
    if d4p.my_procid() == 0:
        print("KMeans completed", result)

    self.logger.info('Completed KMeans in pydaal SPMD Mode')
    d4p.daalfini()
    return
def kMeans(self, data, target):
    '''
    Method for serial running of Kmeans
    '''
    nClusters = 4
    maxIter = 25  # fixed maximum number of iterations

    data = data.drop(target, axis=1)

    init_algo = d4p.kmeans_init(nClusters=nClusters, method="plusPlusDense")
    self.logger.info('Training the KMeans in pydaal Batch/Serial Mode')
    train_result = init_algo.compute(data)

    # The result provides the initial centroids
    assert train_result.centroids.shape[0] == nClusters

    # configure kmeans main object: we also request the cluster assignments
    algo = d4p.kmeans(nClusters, maxIter, assignFlag=True)
    # compute the clusters/centroids
    kmeans_start_time = time.time()
    result = algo.compute(data, train_result.centroids)
    self.latency["Serial_KMeans_Batch_Time"] = time.time() - kmeans_start_time

    # Kmeans result objects provide assignments (if requested), centroids,
    # goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.assignments.shape == (data.shape[0], 1)
    assert result.nIterations <= maxIter

    self.logger.info('Completed KMeans in pydaal Batch/Serial Mode')
    return
# organizing variables used in the model for prediction
# each process gets its own data
infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str(
    d4p.my_procid() + 1) + ".csv"

# read data
X = pd.read_csv(infile)

# ## Computing and Saving Initial Centroids

# Time to **initialize our centroids!**

# computing initial centroids
init_result = d4p.kmeans_init(nClusters=3, method="plusPlusDense").compute(X)

# To **get initial centroid information and save it** to a file:

# retrieving and printing initial centroids
centroids = init_result.centroids
print("Here are our centroids:\n\n\n", centroids, "\n")

centroids_filename = './models/kmeans_clustering_initcentroids_' + str(
    d4p.my_procid() + 1) + '.csv'

# saving centroids to a file
pickle.dump(centroids, open(centroids_filename, "wb"))
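# A complementary sketch: load the pickled centroids back and feed them
# to the main kmeans step. Note the file is a pickle despite its .csv
# extension; the nClusters/maxIterations values here are illustrative.
import pickle
import daal4py as d4p

with open(centroids_filename, "rb") as f:
    loaded_centroids = pickle.load(f)

result = d4p.kmeans(nClusters=3, maxIterations=25,
                    assignFlag=True).compute(X, loaded_centroids)
print(result.centroids)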
def kmeans_lightcones(self, past_params, future_params,
                      past_decay=0, future_decay=0,
                      past_init_params=None, future_init_params=None):
    '''
    Performs clustering on the master arrays of both past and future
    lightcones.

    Expects the clustering algorithm to give integer cluster labels
    starting at 0, with the "noise cluster" having label -1.

    Diagnostics of this clustering (what the unique clusters are and how
    many lightcones were assigned to each cluster) are accessed through
    the namedtuple Reconstructor.lc_cluster_diagnostic.

    *** Actually make revert back to original Reconstructor format; don't
        require sklearn objects for clustering -- but do save centroids ***
    *** How is the call to distributed DAAL4PY clustering objects going
        to work with this? ***

    Parameters
    ----------
    past_params: dict,
        Dictionary of keyword arguments for the past lightcone clustering
        algorithm. If past_cluster == 'kmeans': past_params must include
        values for 'nClusters' and 'maxIterations'

    future_params: dict,
        Dictionary of keyword arguments for the future lightcone
        clustering algorithm. If future_cluster == 'kmeans':
        future_params must include values for 'nClusters' and
        'maxIterations'

    past_decay: int, optional (default=0)
        Exponential decay rate for lightcone distance used for past
        lightcone clustering.

    future_decay: int, optional (default=0)
        Exponential decay rate for lightcone distance used for future
        lightcone clustering.
    '''
    # OPT: comment out for performance runs
    if self.plcs is None:
        raise RuntimeError(
            "Must call .extract() on a training field(s) before calling "
            ".cluster_lightcones().")

    past_decays = lightcone_decay_2D(self.past_depth, self.c,
                                     past_decay, False)
    self.plcs *= np.sqrt(past_decays)
    future_decays = lightcone_decay_2D(self.future_depth, self.c,
                                       future_decay, True)
    self.flcs *= np.sqrt(future_decays)

    # Need these for dbscan version (after clustering)
    self._N_pasts = past_params['nClusters']
    self._N_futures = future_params['nClusters']

    if past_init_params is None:
        #method = 'randomDense'
        #method = 'parallelPlusDense'
        #method = 'plusPlusDense'
        method = 'defaultDense'
        past_init_params = {'nClusters': self._N_pasts,
                            'method': method,
                            'distributed': True}
    initial = d4p.kmeans_init(**past_init_params)
    # print('past initialization method: ', method, flush=True)
    centroids = initial.compute(self.plcs).centroids
    # print('done: past centroid calc', flush=True)
    past_cluster = d4p.kmeans(distributed=True,
                              **past_params).compute(self.plcs, centroids)
    # print('done: first pass past kmeans', flush=True)
    past_local = d4p.kmeans(nClusters=self._N_pasts, distributed=False,
                            assignFlag=True, maxIterations=0).compute(
                                self.plcs, past_cluster.centroids)
    # print('done: past cluster assignments', flush=True)
    self.pasts = past_local.assignments.flatten()
    # print('done: flatten the past assignments', flush=True)
    del past_cluster
    del self.plcs

    if future_init_params is None:
        #method = 'randomDense'
        #method = 'parallelPlusDense'
        #method = 'plusPlusDense'
        method = 'defaultDense'
        future_init_params = {'nClusters': self._N_futures,
                              'method': method,
                              'distributed': True}
    initial = d4p.kmeans_init(**future_init_params)
    # print('future initialization method: ', method, flush=True)
    centroids = initial.compute(self.flcs).centroids
    # print('done: future centroid calc', flush=True)
    future_cluster = d4p.kmeans(distributed=True,
                                **future_params).compute(self.flcs, centroids)
    # print('done: first pass future kmeans', flush=True)
    future_local = d4p.kmeans(nClusters=self._N_futures, distributed=False,
                              assignFlag=True, maxIterations=0).compute(
                                  self.flcs, future_cluster.centroids)
    # print('done: future cluster assignments', flush=True)
    self.futures = future_local.assignments.flatten()
    # print('done: flatten the future assignments', flush=True)
    del future_cluster
    del self.flcs
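# The two-pass pattern above is worth isolating: as the SPMD example
# earlier notes, distributed kmeans does not expose per-row assignments
# through assignFlag, so a second, local kmeans with maxIterations=0 and
# assignFlag=True simply labels each local row with its nearest of the
# already-computed centroids. A minimal standalone sketch on random data:
import numpy as np
import daal4py as d4p

X = np.random.rand(500, 8)
centroids = d4p.kmeans_init(nClusters=4, method='plusPlusDense').compute(X).centroids
centroids = d4p.kmeans(nClusters=4, maxIterations=25).compute(X, centroids).centroids

# zero-iteration pass: assigns points without moving the centroids
labels = d4p.kmeans(nClusters=4, maxIterations=0,
                    assignFlag=True).compute(X, centroids).assignments.flatten()
print(labels[:10])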
# each process gets its own data
infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str(
    d4p.my_procid() + 1) + ".csv"

# read data
X = pd.read_csv(infile)

# ## Computing and Saving Initial Centroids

# Time to **initialize our centroids!**

# computing initial centroids across all processes
init_result = d4p.kmeans_init(nClusters=3, method="plusPlusDense",
                              distributed=True).compute(X)

# To **get initial centroid information and save it** to a file:

# retrieving and printing initial centroids
centroids = init_result.centroids
print("Here are our centroids:\n\n\n", centroids, "\n")

centroids_filename = './models/kmeans_clustering_initcentroids_' + str(
    d4p.my_procid() + 1) + '.csv'

# saving centroids to a file
pickle.dump(centroids, open(centroids_filename, "wb"))
import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()

    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    init_algo = d4p.kmeans_init(nClusters, method="plusPlusDense",
                                distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data, it would have been better to read only
    # what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The result provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)

    # print on the root process only, then shut down SPMD mode
    if d4p.my_procid() == 0:
        print("First 3 centroids:\n", result.centroids[0:3])
    d4p.daalfini()
def kmeans_lightcones(self, past_params, future_params,
                      past_decay=0, future_decay=0,
                      past_init_params=None, future_init_params=None):
    '''
    Performs clustering on the global arrays of both past and future
    lightcones.

    Parameters
    ----------
    past_params: dict,
        Dictionary of keyword arguments for the past lightcone clustering
        algorithm. If past_cluster == 'kmeans': past_params must include
        values for 'nClusters' and 'maxIterations'

    future_params: dict,
        Dictionary of keyword arguments for the future lightcone
        clustering algorithm. If future_cluster == 'kmeans':
        future_params must include values for 'nClusters' and
        'maxIterations'

    past_decay: int, optional (default=0)
        Exponential decay rate for lightcone distance used for past
        lightcone clustering.

    future_decay: int, optional (default=0)
        Exponential decay rate for lightcone distance used for future
        lightcone clustering.
    '''
    if self.plcs is None:
        raise RuntimeError(
            "Must call .extract() on a training field(s) before calling "
            ".cluster_lightcones().")

    if len(self._adjusted_shape) == 2:
        past_decays = lightcone_decay(self.past_depth, self.c,
                                      past_decay, False)
        future_decays = lightcone_decay(self.future_depth, self.c,
                                        future_decay, True)
    elif len(self._adjusted_shape) == 3:
        past_decays = lightcone_decay_2D(self.past_depth, self.c,
                                         past_decay, False)
        future_decays = lightcone_decay_2D(self.future_depth, self.c,
                                           future_decay, True)
    self.plcs *= np.sqrt(past_decays)
    self.flcs *= np.sqrt(future_decays)

    # Primarily used for global joint dist in distributed mode
    self._N_pasts = past_params['nClusters']
    self._N_futures = future_params['nClusters']

    if past_init_params is None:
        #method = 'randomDense'
        #method = 'parallelPlusDense'
        method = 'plusPlusDense'
        #method = 'defaultDense'
        past_init_params = {'nClusters': self._N_pasts,
                            'method': method,
                            'distributed': self._distributed}
    initial = d4p.kmeans_init(**past_init_params)
    centroids = initial.compute(self.plcs).centroids
    past_cluster = d4p.kmeans(distributed=self._distributed,
                              **past_params).compute(self.plcs, centroids)
    past_local = d4p.kmeans(nClusters=self._N_pasts,
                            distributed=self._distributed,
                            assignFlag=True,
                            maxIterations=0).compute(self.plcs,
                                                     past_cluster.centroids)
    self.pasts = past_local.assignments.flatten()
    del past_cluster
    del self.plcs

    if future_init_params is None:
        #method = 'randomDense'
        #method = 'parallelPlusDense'
        method = 'plusPlusDense'
        #method = 'defaultDense'
        future_init_params = {'nClusters': self._N_futures,
                              'method': method,
                              'distributed': self._distributed}
    initial = d4p.kmeans_init(**future_init_params)
    centroids = initial.compute(self.flcs).centroids
    future_cluster = d4p.kmeans(distributed=self._distributed,
                                **future_params).compute(self.flcs, centroids)
    self._future_centroids = future_cluster.centroids  # save for field reconstruction
    future_local = d4p.kmeans(nClusters=self._N_futures,
                              distributed=self._distributed,
                              assignFlag=True,
                              maxIterations=0).compute(self.flcs,
                                                       self._future_centroids)
    self.futures = future_local.assignments.flatten()
    del future_cluster
    del self.flcs