def test_deterministic_codebook(self):
    """Check that training from a fixed initial codebook is reproducible.

    A 2x2 map is trained on two 2-dimensional samples starting from an
    all-zero codebook, with the data handed to the ``Somoclu`` constructor.
    The trained codebook is compared element-wise with reference values.
    """
    n_rows, n_columns = 2, 2
    codebook = np.zeros((n_rows * n_columns, 2), dtype=np.float32)
    data = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
    som = Somoclu(n_columns, n_rows, data=data, initialcodebook=codebook,
                  compactsupport=False)
    som.train()
    correct_codebook = np.array(
        [[[0.2, 0.30000001], [0.10359724, 0.20359723]],
         [[0.29640275, 0.39640275], [0.2, 0.30000001]]],
        dtype=np.float32)
    # Compare the trained map's codebook rather than the input array, so the
    # check does not rely on the initial codebook being updated in place.
    trained = np.asarray(som.codebook).reshape(-1)
    expected = correct_codebook.reshape(-1)
    # Element-wise comparison: a signed sum of differences lets positive and
    # negative errors cancel, and is trivially "small" whenever the trained
    # values are below the reference (e.g. an untouched all-zero codebook).
    # The tolerance leaves room for float32 rounding of the 8-digit literals.
    self.assertTrue(np.allclose(trained, expected, rtol=0.0, atol=1e-6))
def test_deterministic_codebook(self):
    """Check that training from a fixed initial codebook is reproducible.

    A 2x2 map is trained on two 2-dimensional samples starting from an
    all-zero codebook, with the data handed to ``Somoclu.train``. The
    trained codebook is compared element-wise with reference values.
    """
    n_rows, n_columns = 2, 2
    codebook = np.zeros((n_rows * n_columns, 2), dtype=np.float32)
    data = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
    som = Somoclu(n_columns, n_rows, initialcodebook=codebook,
                  compactsupport=False)
    som.train(data)
    correct_codebook = np.array(
        [[[0.2, 0.30000001], [0.10359724, 0.20359723]],
         [[0.29640275, 0.39640275], [0.2, 0.30000001]]],
        dtype=np.float32)
    # Compare the trained map's codebook rather than the input array, so the
    # check does not rely on the initial codebook being updated in place.
    trained = np.asarray(som.codebook).reshape(-1)
    expected = correct_codebook.reshape(-1)
    # Element-wise comparison: a signed sum of differences lets positive and
    # negative errors cancel, and is trivially "small" whenever the trained
    # values are below the reference (e.g. an untouched all-zero codebook).
    # The tolerance leaves room for float32 rounding of the 8-digit literals.
    self.assertTrue(np.allclose(trained, expected, rtol=0.0, atol=1e-6))
class SOM(BaseEstimator, ClusterMixin):
    """Class for training and visualizing a self-organizing map.

    Parameters
    ----------
    n_columns : int, default: 5
        The number of columns in the map.

    n_rows : int, default: 5
        The number of rows in the map.

    n_clusters : float, default: None
        The proportion of clusters relative to the number of samples of
        the input space. If this is not None then `n_columns` and
        `n_rows` are ignored.

    initialcodebook : 2D numpy.array of float32 or None, default: None
        Define the codebook to start the training.

    kerneltype : int, default: 0
        Specify which kernel to use.
        0 for dense CPU kernel.
        1 for dense GPU kernel if compiled with it.

    maptype : str, default: "planar"
        Specify the map topology.
        "planar" for planar map.
        "toroid" for toroid map.

    gridtype : str, default: "rectangular"
        Specify the grid form of the nodes.
        "rectangular" for rectangular neurons.
        "hexagonal" for hexagonal neurons.

    compactsupport : bool, default: True
        Cut off map updates beyond the training radius with the Gaussian
        neighborhood.

    neighborhood : str, default: "gaussian"
        Specify the neighborhood.
        "gaussian" for Gaussian neighborhood.
        "bubble" for bubble neighborhood function.

    std_coeff : float, default: 0.5
        Set the coefficient in the Gaussian neighborhood function
        exp(-||x-y||^2/(2*(coeff*radius)^2)).

    initialization : str or None, default: None
        Specify the codebook initialization.
        "random" for random weights in the codebook.
        "pca": codebook is initialized from the first subspace spanned by
        the first two eigenvectors of the correlation matrix.

    verbose : int, default: 0
        Specify verbosity level (0, 1, or 2).

    Attributes
    ----------
    algorithm_ : Somoclu
        The underlying Somoclu object, created on the first call to `fit`.

    n_columns_, n_rows_ : int
        The map dimensions actually used, set on the first call to `fit`.

    labels_ : numpy.array, shape (n_samples,)
        Cluster label of each training sample.

    neighbors_ : list of tuple
        Unordered pairs of cluster labels whose neurons are topological
        neighbors on the grid.
    """

    _attributes = ['train', 'codebook', 'bmus']

    def __init__(self,
                 n_columns=5,
                 n_rows=5,
                 n_clusters=None,
                 initialcodebook=None,
                 kerneltype=0,
                 maptype="planar",
                 gridtype="rectangular",
                 compactsupport=True,
                 neighborhood="gaussian",
                 std_coeff=0.5,
                 initialization=None,
                 verbose=0):
        self.n_columns = n_columns
        self.n_rows = n_rows
        self.n_clusters = n_clusters
        self.initialcodebook = initialcodebook
        self.kerneltype = kerneltype
        self.maptype = maptype
        self.gridtype = gridtype
        self.compactsupport = compactsupport
        self.neighborhood = neighborhood
        self.std_coeff = std_coeff
        self.initialization = initialization
        self.verbose = verbose

    @staticmethod
    def _generate_labels_mapping(grid_labels):
        """Generate a mapping between grid labels and cluster labels.

        Each distinct grid label (in the sorted order produced by
        `np.unique`) is assigned a consecutive integer starting at 0.
        """
        # Identify unique grid labels
        unique_labels = [
            tuple(grid_label) for grid_label in np.unique(grid_labels, axis=0)
        ]

        # Generate mapping
        return {
            grid_label: cluster_label
            for cluster_label, grid_label in enumerate(unique_labels)
        }

    def _return_topological_neighbors(self, col, row):
        """Return the topological neighbors of a neuron.

        Only neighbors that lie inside the map and that are the best
        matching unit of at least one sample are returned.
        """
        # Common topological neighbors for the two grid types
        candidates = [(col - 1, row), (col + 1, row),
                      (col, row - 1), (col, row + 1)]

        # Extra topological neighbors for hexagonal grid type
        if self.gridtype == 'hexagonal':
            offset = (-1)**row
            candidates += [(col - offset, row - offset),
                           (col - offset, row + offset)]

        # Build the set of occupied neurons once instead of converting the
        # BMU array to a list for every candidate.
        occupied = {tuple(bmu) for bmu in self.algorithm_.bmus.tolist()}

        # Apply constraints
        return [
            (n_col, n_row) for n_col, n_row in candidates
            if 0 <= n_col < self.n_columns_ and 0 <= n_row < self.n_rows_
            and (n_col, n_row) in occupied
        ]

    def _generate_neighbors(self, grid_labels, labels_mapping):
        """Generate pairs of neighboring labels.

        Returns a list of cluster label pairs; each unordered pair appears
        once.
        """
        # Duplicated grid labels produce duplicated pairs that are discarded
        # below, so each occupied neuron is visited only once.
        unique_grid_labels = {tuple(grid_label) for grid_label in grid_labels}

        # Generate cluster neighbors
        all_neighbors = [
            (labels_mapping[grid_label], labels_mapping[neighbor])
            for grid_label in unique_grid_labels
            for neighbor in self._return_topological_neighbors(*grid_label)
        ]
        all_neighbors = [
            tuple(pair) for pair in np.unique(all_neighbors, axis=0)
        ]

        # Keep unique unordered pairs; the first orientation encountered in
        # the sorted sequence is the one that is kept.
        neighbors = []
        seen = set()
        for pair in all_neighbors:
            key = frozenset(pair)
            if key not in seen:
                seen.add(key)
                neighbors.append(pair)

        return neighbors

    def fit(self, X, y=None, **fit_params):
        """Train the self-organizing map.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training instances to cluster. The data are validated with
            `check_array` and rescaled with `minmax_scale` before training.

        y : Ignored

        **fit_params : dict
            Additional keyword arguments passed to `Somoclu.train`.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        # Check and normalize input data
        X = minmax_scale(check_array(X, dtype=np.float32))

        # Initialize Somoclu object; an existing one is reused, together
        # with the map dimensions computed on the first call.
        if not hasattr(self, 'algorithm_'):

            # Set number of columns and rows from number of clusters: the
            # side of the square map goes from 2 (n_clusters=0) to
            # sqrt(n_samples) (n_clusters=1).
            if self.n_clusters is not None:
                self.n_columns_ = self.n_rows_ = int(
                    self.n_clusters * (np.sqrt(len(X)) - 2) + 2)
            else:
                self.n_columns_, self.n_rows_ = self.n_columns, self.n_rows

            # Create object
            self.algorithm_ = Somoclu(n_columns=self.n_columns_,
                                      n_rows=self.n_rows_,
                                      initialcodebook=self.initialcodebook,
                                      kerneltype=self.kerneltype,
                                      maptype=self.maptype,
                                      gridtype=self.gridtype,
                                      compactsupport=self.compactsupport,
                                      neighborhood=self.neighborhood,
                                      std_coeff=self.std_coeff,
                                      initialization=self.initialization,
                                      data=None,
                                      verbose=self.verbose)

        # Fit Somoclu
        self.algorithm_.train(data=X, **fit_params)

        # Grid labels
        grid_labels = [
            tuple(grid_label) for grid_label in self.algorithm_.bmus
        ]

        # Generate labels mapping
        labels_mapping = self._generate_labels_mapping(grid_labels)

        # Generate cluster labels
        self.labels_ = np.array(
            [labels_mapping[grid_label] for grid_label in grid_labels])

        # Generate labels neighbors
        self.neighbors_ = self._generate_neighbors(grid_labels,
                                                   labels_mapping)

        return self

    def fit_predict(self, X, y=None, **fit_params):
        """Train the self-organizing map and assign a cluster label to
        each sample.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        **fit_params : dict
            Additional keyword arguments passed to `fit`.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        return self.fit(X, **fit_params).labels_
class SOM(BaseEstimator, ClusterMixin):
    """Class to fit and visualize a Self-Organizing Map (SOM).

    The implementation uses SOM from Somoclu.

    Read more in the :ref:`User Guide <user_guide>`.

    Parameters
    ----------
    n_columns : int, optional (default=5)
        The number of columns in the map.

    n_rows : int, optional (default=5)
        The number of rows in the map.

    initialcodebook : 2D numpy.array of float32, str or None, optional (default=None)
        Define the codebook to start the training. If
        ``initialcodebook='pca'`` then the codebook is initialized from the
        first subspace spanned by the first two eigenvectors of the
        correlation matrix.

    kerneltype : int, optional (default=0)
        Specify which kernel to use. If ``kerneltype=0`` use dense CPU
        kernel. Else if ``kerneltype=1`` use dense GPU kernel if compiled
        with it.

    maptype : str, optional (default='planar')
        Specify the map topology. If ``maptype='planar'`` use planar map.
        Else if ``maptype='toroid'`` use toroid map.

    gridtype : str, optional (default='rectangular')
        Specify the grid form of the nodes. If ``gridtype='rectangular'``
        use rectangular neurons. Else if ``gridtype='hexagonal'`` use
        hexagonal neurons.

    compactsupport : bool, optional (default=True)
        Cut off map updates beyond the training radius with the Gaussian
        neighborhood.

    neighborhood : str, optional (default='gaussian')
        Specify the neighborhood. If ``neighborhood='gaussian'`` use
        Gaussian neighborhood. Else if ``neighborhood='bubble'`` use bubble
        neighborhood function.

    std_coeff : float, optional (default=0.5)
        Set the coefficient in the Gaussian neighborhood
        :math:`exp(-||x-y||^2/(2*(coeff*radius)^2))`.

    random_state : int, RandomState instance or None, optional (default=None)
        Control the randomization of the algorithm by specifying the
        codebook initialization. It is ignored when ``initialcodebook`` is
        not ``None``.

        - If int, ``random_state`` is the seed used by the random number
          generator.
        - If ``RandomState`` instance, random_state is the random number
          generator.
        - If ``None``, the random number generator is the ``RandomState``
          instance used by ``np.random``.

    verbose : int, optional (default=0)
        Specify verbosity level (0, 1, or 2).

    Attributes
    ----------
    algorithm_ : Somoclu
        The underlying Somoclu object, recreated on every call to `fit`.

    random_state_ : RandomState
        The random number generator returned by `check_random_state`.

    labels_mapping_ : dict
        Mapping from grid labels (tuples) to cluster labels.

    labels_ : numpy.array, shape (n_samples,)
        Cluster label of each training sample.

    neighbors_ : numpy.array
        Unordered pairs of cluster labels whose neurons are topological
        neighbors on the grid.
    """

    _attributes = ['train', 'codebook', 'bmus']

    def __init__(
        self,
        n_columns=5,
        n_rows=5,
        initialcodebook=None,
        kerneltype=0,
        maptype="planar",
        gridtype="rectangular",
        compactsupport=True,
        neighborhood="gaussian",
        std_coeff=0.5,
        random_state=None,
        verbose=0,
    ):
        self.n_columns = n_columns
        self.n_rows = n_rows
        self.initialcodebook = initialcodebook
        self.kerneltype = kerneltype
        self.maptype = maptype
        self.gridtype = gridtype
        self.compactsupport = compactsupport
        self.neighborhood = neighborhood
        self.std_coeff = std_coeff
        self.random_state = random_state
        self.verbose = verbose

    @staticmethod
    def _generate_labels_mapping(grid_labels):
        """Generate a mapping between grid labels and cluster labels.

        Each distinct grid label (in the sorted order produced by
        `np.unique`) is assigned a consecutive integer starting at 0.
        """
        # Identify unique grid labels
        unique_labels = [
            tuple(grid_label) for grid_label in np.unique(grid_labels, axis=0)
        ]

        # Generate mapping
        return {
            grid_label: cluster_label
            for cluster_label, grid_label in enumerate(unique_labels)
        }

    def _return_topological_neighbors(self, col, row):
        """Return the topological neighbors of a neuron.

        Only neighbors that lie inside the map and that are the best
        matching unit of at least one sample are returned.
        """
        # Common topological neighbors for the two grid types
        candidates = [
            (col - 1, row),
            (col + 1, row),
            (col, row - 1),
            (col, row + 1),
        ]

        # Extra topological neighbors for hexagonal grid type
        if self.gridtype == 'hexagonal':
            offset = (-1)**row
            candidates += [
                (col - offset, row - offset),
                (col - offset, row + offset),
            ]

        # Build the set of occupied neurons once instead of converting the
        # BMU array to a list for every candidate.
        occupied = {tuple(bmu) for bmu in self.algorithm_.bmus.tolist()}

        # Apply constraints
        return [
            (n_col, n_row) for n_col, n_row in candidates
            if 0 <= n_col < self.n_columns and 0 <= n_row < self.n_rows
            and (n_col, n_row) in occupied
        ]

    def _generate_neighbors(self, grid_labels, labels_mapping):
        """Generate pairs of neighboring labels.

        Returns an array of cluster label pairs; each unordered pair
        appears once.
        """
        # Generate grid topological neighbors
        grid_topological_neighbors = [
            product([tuple(grid_label)],
                    self._return_topological_neighbors(*grid_label))
            for grid_label in grid_labels
        ]

        # Flatten grid topological neighbors and map them to cluster labels
        all_neighbors = [
            (labels_mapping[pair[0]], labels_mapping[pair[1]])
            for pairs in grid_topological_neighbors for pair in pairs
        ]
        all_neighbors = [
            tuple(pair) for pair in np.unique(all_neighbors, axis=0)
        ]

        # Keep unique unordered pairs; the first orientation encountered in
        # the sorted sequence is the one that is kept.
        neighbors = []
        seen = set()
        for pair in all_neighbors:
            key = frozenset(pair)
            if key not in seen:
                seen.add(key)
                neighbors.append(pair)

        return np.array(neighbors)

    def fit(self, X, y=None, **fit_params):
        """Train the self-organizing map.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training instances to cluster. The data are validated with
            `check_array` and rescaled with `minmax_scale` before training.

        y : Ignored

        **fit_params : dict
            Additional keyword arguments passed to `Somoclu.train`.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        # Check and normalize input data
        X = minmax_scale(check_array(X, dtype=np.float32))

        # Check random_state
        self.random_state_ = check_random_state(self.random_state)

        # Initialize codebook
        if self.initialcodebook is None:
            if self.random_state is None:
                # Let Somoclu draw the random codebook itself
                initialcodebook = None
                initialization = 'random'
            else:
                # Draw the codebook here so that it depends on random_state
                codebook_size = self.n_columns * self.n_rows * X.shape[1]
                initialcodebook = self.random_state_.random_sample(
                    codebook_size).astype(np.float32)
                initialization = None
        elif (isinstance(self.initialcodebook, str)
              and self.initialcodebook == 'pca'):
            # The isinstance guard keeps array codebooks away from the
            # string comparison, which can be element-wise for NumPy arrays.
            initialcodebook = None
            initialization = 'pca'
        else:
            initialcodebook = self.initialcodebook
            initialization = None

        # Create Somoclu object
        self.algorithm_ = Somoclu(
            n_columns=self.n_columns,
            n_rows=self.n_rows,
            initialcodebook=initialcodebook,
            kerneltype=self.kerneltype,
            maptype=self.maptype,
            gridtype=self.gridtype,
            compactsupport=self.compactsupport,
            neighborhood=self.neighborhood,
            std_coeff=self.std_coeff,
            initialization=initialization,
            data=None,
            verbose=self.verbose,
        )

        # Fit Somoclu
        self.algorithm_.train(data=X, **fit_params)

        # Grid labels
        grid_labels = [
            tuple(grid_label) for grid_label in self.algorithm_.bmus
        ]

        # Generate labels mapping
        self.labels_mapping_ = self._generate_labels_mapping(grid_labels)

        # Generate cluster labels
        self.labels_ = np.array(
            [self.labels_mapping_[grid_label] for grid_label in grid_labels])

        # Generate labels neighbors
        self.neighbors_ = self._generate_neighbors(
            np.unique(grid_labels, axis=0), self.labels_mapping_)

        return self

    def fit_predict(self, X, y=None, **fit_params):
        """Train the self-organizing map and assign a cluster label to
        each sample.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        **fit_params : dict
            Additional keyword arguments passed to `fit`.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        return self.fit(X, **fit_params).labels_