def _cluster(self):
    """Group the clustering attributions and store the results on ``self``.

    If ``self.cluster_method`` is set, it is called with this instance and
    nothing else happens here. Otherwise the local ``KMedoids`` class is
    fitted on ``self.clustering_attributions`` and its output is written to
    ``self.subpopulations``, ``self.subpopulation_sizes`` and
    ``self.explanations``.

    When the first value found in the explanations is a dask array, every
    explanation value is replaced by the result of its ``.compute()`` so
    that callers receive concrete numbers.
    """
    if self.cluster_method is not None:
        # A caller-supplied clustering routine replaces the default path.
        self.cluster_method(self)
        return

    kmedoids = KMedoids(
        self.k,
        dist_func=self.distance_function,
        max_iter=self.max_iter,
        tol=self.tol,
        init_medoids=self.init_medoids,
        swap_medoids=self.swap_medoids,
    )
    kmedoids.fit(self.clustering_attributions, verbose=self.verbose)

    labels = kmedoids.members
    self.subpopulations = labels
    self.subpopulation_sizes = GAM.get_subpopulation_sizes(labels)
    self.explanations = self._get_explanations(kmedoids.centers)

    # Only the first value is inspected to decide whether the explanations
    # hold lazy dask arrays that still have to be evaluated.
    if isinstance(self.explanations[0][0][1], da.Array):
        self.explanations = [
            [(pair[0], pair[1].compute()) for pair in explanation]
            for explanation in self.explanations
        ]
def test_banditPAM():
    """Run kmedoids on sample attributions.

    Fits ``KMedoids`` with the "bandit" init and swap settings on the CSV
    fixture, prints the elapsed time and the centers, and checks the
    centers against the expected list.
    """
    # Load the data.
    attributions = pd.read_csv("tests/banditPAM_data.csv").values

    kmed2 = KMedoids(
        4,
        dist_func="euclidean",
        max_iter=20,
        tol=0.001,
        init_medoids="bandit",
        swap_medoids="bandit",
        verbose=False,
    )

    started = time.time()
    kmed2.fit(attributions, verbose=False)
    elapsed_time = time.time() - started
    print(f"Finished test in {elapsed_time:.2f}")
    print(kmed2.centers)

    # Expected centers when testing with the "euclidean" distance.
    assert kmed2.centers == [256, 209, 470, 304]
def test_banditPAM_dask():
    """Run kmedoids on sample attributions loaded through dask.

    Reads the CSV fixture as a dask dataframe split into four partitions,
    converts it to a dask array, fits ``KMedoids`` with the "bandit" init
    and swap settings, and checks that every resulting center is one of
    the expected values.
    """
    # Load the data.
    frame = dd.read_csv("tests/banditPAM_data.csv", dtype={'ARTICLE_ID': 'object'})
    attributions = frame.repartition(npartitions=4).to_dask_array(lengths=True)

    kmed2 = KMedoids(
        n_clusters=4,
        dist_func="euclidean",
        batchsize=200,
        max_iter=20,
        tol=0.001,
        init_medoids="bandit",
        swap_medoids="bandit",
        verbose=False,
    )

    started = time.time()
    kmed2.fit(attributions, verbose=False)
    elapsed_time = time.time() - started
    print(f"Finished test in {elapsed_time:.2f}")
    print(kmed2.centers)

    # Expected centers when testing with the "euclidean" distance; the
    # check is a membership test, so the order of the centers is ignored.
    expected_centers = [256, 209, 470, 304]
    assert np.isin(kmed2.centers, expected_centers).all()
def _cluster(self, distance_function=spearman_squared_distance, max_iter=1000, tol=0.0001):
    """Group the normalized attributions with the kmedoids module.

    Args:
        distance_function: passed to ``KMedoids`` as ``dist_func``.
        max_iter: passed to ``KMedoids`` as ``max_iter``.
        tol: passed to ``KMedoids`` as ``tol``.

    The fitted result is stored on ``self`` as ``subpopulations``,
    ``subpopulation_sizes`` and ``explanations``; nothing is returned.
    """
    kmedoids = KMedoids(
        self.k,
        dist_func=distance_function,
        max_iter=max_iter,
        tol=tol,
    )
    kmedoids.fit(self.normalized_attributions, verbose=False)

    labels = kmedoids.members
    self.subpopulations = labels
    self.subpopulation_sizes = GAM.get_subpopulation_sizes(labels)
    self.explanations = self._get_explanations(kmedoids.centers)
def test_kmedoids():
    """Run kmedoids on sample attributions."""
    attributions = np.array([(0.2, 0.8), (0.1, 0.9), (0.91, 0.09), (0.88, 0.12)])

    kmedoids_2 = KMedoids(
        2,
        dist_func=spearman_squared_distance,
        max_iter=1000,
        tol=0.0001,
    )
    kmedoids_2.fit(attributions, verbose=False)

    # Intent: 2 attributions end up in each cluster.
    # NOTE(review): this relies on ``members`` holding 0/1 labels, so that a
    # sum of 2 over four samples means an even split -- confirm.
    member_total = sum(kmedoids_2.members)
    assert member_total == 2
def _cluster(self):
    """Group the clustering attributions and store the results on ``self``.

    If ``self.cluster_method`` is set, it is called with this instance and
    nothing else happens here. Otherwise the local ``KMedoids`` class is
    fitted on ``self.clustering_attributions`` and its output is written to
    ``self.subpopulations``, ``self.subpopulation_sizes`` and
    ``self.explanations``.
    """
    if self.cluster_method is not None:
        # A caller-supplied clustering routine replaces the default path.
        self.cluster_method(self)
        return

    kmedoids = KMedoids(
        self.k,
        dist_func=self.distance_function,
        max_iter=self.max_iter,
        tol=self.tol,
    )
    kmedoids.fit(self.clustering_attributions, verbose=False)

    labels = kmedoids.members
    self.subpopulations = labels
    self.subpopulation_sizes = GAM.get_subpopulation_sizes(labels)
    self.explanations = self._get_explanations(kmedoids.centers)
from gam.clustering import KMedoids
from gam.spearman_distance import spearman_squared_distance

# Fix the NumPy seed so repeated runs draw the same random numbers.
np.random.seed(42)

# Load the data.
df = pd.read_csv("samples_3500.csv")
attributions = df.values
print(df.shape)

# Run kmedoids on sample attributions.
kmed2 = KMedoids(
    5,
    dist_func=spearman_squared_distance,
    max_iter=10,
    tol=0.01,
    init_medoids='bandit',
    swap_medoids="bandit",
    verbose=True,
)

# Time the fit on its own.
start_time = time.time()
kmed2.fit(attributions, verbose=True)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Finished test in {elapsed_time:.2f}")
print(kmed2.centers)

# np.unique(..., return_counts=True) returns (values, counts); index 1
# selects the count of each distinct value in ``members``.
cluster_sizes = np.unique(kmed2.members, return_counts=True)[1]
print(f'cluster sizes - {cluster_sizes}')