def _get_model_features(self):
    """
    Compute and get the most important features of the current model.

    The reusable ``self._features`` buffer is handed to the C wrapper's
    ``get_features`` call; the value returned is that buffer followed by
    the current generation counter as an extra last element.
    """
    feature_buffer = self._features
    SubCMediansWrapper_c.get_features(feature_buffer, self._p_subcmedians_c)
    generation_suffix = [self.generation]
    return self._features + generation_suffix
def fit(self, X, y=None, verbose=1):
    """
    sklearn-like fit function: receives a dataset and builds the subspace
    clustering that models the data.

    A sample of ``self.N`` objects is first drawn from ``X`` (see
    ``_set_data_sample``). Then, for ``self.NbIter`` iterations, one
    randomly chosen out-of-sample object replaces the oldest in-sample
    object, is sent to the C library and used to train the model.

    Parameters
    ----------
    X : dataset, one data object per row. It is passed through
        ``_check_X_matrix_validity`` before use.
    y : optional sequence of class labels aligned with the rows of ``X``.
    verbose : when truthy, a ``iteration/NbIter`` progress counter is
        written to stdout.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the dataset holds fewer objects than the sample size ``self.N``.
    """
    # sys.stdout.write is used instead of the Python-2-only print statement
    # so the progress output behaves the same on Python 2 and 3.
    if verbose:
        sys.stdout.write("\n")
    if X is None:
        return None
    X_ = self._check_X_matrix_validity(X)
    # Compare the number of data objects (rows), not the number of matrix
    # elements: the sample is drawn row by row in _set_data_sample.
    if len(X_) < self.N:
        raise RuntimeError(
            'The dataset provided is smaller than the sample size, use instead the fit_online function'
        )
    self._set_data_sample(X_, y)
    in_sample = self.data_objects_index_in_sample
    out_of_sample = self.data_objects_index_not_in_sample
    # NOTE(review): when len(X_) == self.N the out-of-sample pool is empty
    # and the random draw below cannot succeed -- confirm the intended
    # behaviour for that case.
    for iteration in xrange(self.NbIter):
        # Swap one random out-of-sample object with the oldest sampled one.
        random_element = np.random.randint(0, len(out_of_sample))
        random_index = out_of_sample.pop(random_element)
        evicted_index = in_sample.pop(0)
        in_sample.append(random_index)
        out_of_sample.append(evicted_index)
        # "is not None" rather than truthiness: a multi-element numpy label
        # array has no unambiguous truth value.
        if y is not None:
            self._send_array(X_[random_index, :], y[random_index])
        else:
            self._send_array(X_[random_index, :])
        SubCMediansWrapper_c.train_model_with_SubCMedianspoint(
            self._p_subcmedians_c, self._data_object)
        self.generation += 1
        if verbose:
            sys.stdout.write("\r" + str(iteration) + "/" + str(self.NbIter))
            sys.stdout.flush()
    if verbose:
        sys.stdout.write("\n")
def __init__(self,
             SDmax=STD_SDmax,
             D=STD_D,
             N=STD_N,
             NbIter=STD_NbIter,
             threshold_cluster_validity=STD_THRESHOLD_CLUSTER_VALIDITY,
             seed=STD_SEED,
             option_deletion=STD_OPT_DEL,
             option_insertion=STD_OPT_INS,
             option_FIFO=STD_FIFO,
             option_train_with_latest=STD_TRAIN_WITH_LATEST,
             option_lazy_hill_climbing=STD_LAZY_HILL_CLIMBING,
             population_size=STD_LAMBDA,
             nb_generations_generation_update=STD_ETA):
    """
    Creates a SubCMedians customizable object.

    This version has more options than the one presented in the paper,
    we suggest to use the SubCMedians object instead.

    Every argument is stored under an attribute of the same name. All of
    them except ``NbIter`` are also forwarded to the C library when the
    underlying clustering object is created. ``N`` is the size of the data
    sample and ``NbIter`` the number of training iterations run by ``fit``.
    """
    wrapper = SubCMediansWrapper_c

    # --- user-facing parameters, mirrored as attributes ---
    self.SDmax = SDmax
    self.D = D
    self.N = N
    self.NbIter = NbIter
    self.seed = seed
    self.threshold_cluster_validity = threshold_cluster_validity
    self.population_size = population_size
    self.nb_generations_generation_update = nb_generations_generation_update
    self.option_deletion = option_deletion
    self.option_insertion = option_insertion
    self.option_FIFO = option_FIFO
    self.option_train_with_latest = option_train_with_latest
    self.option_lazy_hill_climbing = option_lazy_hill_climbing

    # --- Python-side buffers handed to the C wrapper by the getters ---
    self._model_getter = []
    self._lengths = []
    self._features = []
    self._cluster_getter = []
    self._object_class_cluster = []
    self._distances_to_cluster_getter = []
    self._aggregatedstats = {}

    # --- C-side objects (created in the same order as before) ---
    self._p_subcmedians_c = wrapper.generate_SubCMediansclust(
        SDmax, D, N, threshold_cluster_validity, seed,
        option_deletion, option_insertion, option_FIFO,
        option_train_with_latest, option_lazy_hill_climbing,
        population_size, nb_generations_generation_update)
    self._prng = wrapper.generate_prng(seed)
    self._stream = wrapper.generate_array_SubCMedians_point(self._prng, N, D)
    self._data_object = wrapper.generate_SubCMedians_point(self._prng, D)
    self._cluster_object = wrapper.generate_SubCMedians_point(
        self._prng, SDmax)

    self.time_start = timer()

    # Attribute names compared against the C-side parameters by
    # _check_consistency_C_params_Py_params.
    # NOTE(review): "M" is not assigned anywhere in this constructor --
    # confirm it is defined elsewhere before that check is run.
    self._parameters = [
        "SDmax",
        "D",
        "N",
        "M",
        "option_deletion",
        "option_insertion",
        "option_FIFO",
        "option_train_with_latest",
        "seed",
        "option_lazy_hill_climbing",
        "population_size",
        "nb_generations_generation_update",
    ]
    self.generation = 0
def _get_subcmedians_model(self):
    """
    Get SubCMedians current model.

    The ``_model_getter`` and ``_lengths`` buffers are passed to the C
    wrapper, then read back: ``_lengths[0]`` is used as the number of
    entries to return and ``_lengths[k + 1]`` as the number of leading
    items kept from ``_model_getter[k]``.

    Returns a new list holding one sliced copy per entry.
    """
    SubCMediansWrapper_c.get_SubCMediansclust_model(
        self._model_getter, self._lengths, self._p_subcmedians_c)
    nb_entries = self._lengths[0]
    trimmed_model = []
    for entry_id in xrange(nb_entries):
        entry_length = self._lengths[entry_id + 1]
        trimmed_model.append(self._model_getter[entry_id][0:entry_length])
    return trimmed_model
def _transform_array(self, x):
    """
    Apply the transform function to object x in order to compute the
    distance to each candidate center in the model.

    The object is sent to the C library and clustered with the current
    model; the resulting cluster is then used to fill the
    ``_distances_to_cluster_getter`` buffer, which is returned as an array.
    """
    self._send_array(x)
    assignment = SubCMediansWrapper_c.clusterize_SubCMedianspoint_with_model(
        self._p_subcmedians_c, self._data_object)
    # Only the cluster is needed here; the distance is discarded.
    cluster, _unused_distance = assignment
    SubCMediansWrapper_c.get_distances_to_core_point(
        cluster,
        self._p_subcmedians_c,
        self._data_object,
        self._distances_to_cluster_getter)
    distances = array(self._distances_to_cluster_getter)
    return distances
def _get_class_clusters_current_data_sample(self):
    """
    Get the class / cluster membership of the current data sample.

    Returns a DataFrame with columns ``class`` and ``cluster`` and one
    row per position of the C-side data window.
    """
    handle = self._p_subcmedians_c
    window_size = SubCMediansWrapper_c.get_data_window_size(handle)
    memberships = DataFrame(columns=["class", "cluster"])
    for position in xrange(window_size):
        # The shared buffer is handed to the C call for each position and
        # then stored as the row for that position.
        SubCMediansWrapper_c.get_D_point_class_cluster(
            position, handle, self._object_class_cluster)
        memberships.loc[position] = self._object_class_cluster
    return memberships
def _send_array(self, x, y=None):
    """
    Send a data object represented as a numpy array or a list to the C
    library.

    The object is encoded as a list made of ``POINTDESCRIPTORS`` header
    slots followed by one ``[dimension, 1, value]`` triple per coordinate
    of ``x`` that is not NaN. When ``y`` is given, ``int(y)`` is written
    in the ``POINTCLASSID`` slot. The ``POINTWEIGHT`` slot receives the
    number of triples. The encoded list is converted into
    ``self._data_object``.
    """
    header = [0] * POINTDESCRIPTORS
    coordinates = [[dim, 1, float(value)]
                   for dim, value in enumerate(x)
                   if not isnan(value)]
    encoded = header + coordinates
    if y is not None:
        encoded[POINTCLASSID] = int(y)
    encoded[POINTWEIGHT] = len(coordinates)
    SubCMediansWrapper_c.py2C_convert_SubCMedianspoint(
        encoded, self._data_object)
def _cluster_data_object(self, x, y=None):
    """
    Send a data object encoded as a numpy array or a list and cluster it.

    Parameters
    ----------
    x : coordinates of the data object (numpy array or list).
    y : optional class label of the object.

    Returns
    -------
    ([label, cluster], distance) where ``label`` is ``int(y)``, or None
    when no label was provided, and ``cluster`` / ``distance`` are the
    values returned by the C clustering call.
    """
    self._send_array(x, y)
    cluster, distance = SubCMediansWrapper_c.clusterize_SubCMedianspoint_with_model(
        self._p_subcmedians_c, self._data_object)
    # y is optional in the signature; int(None) would raise a TypeError,
    # so unlabeled objects report None as their class.
    label = int(y) if y is not None else None
    return [label, cluster], distance
def _set_data_sample(self, X, y=None):
    """
    Set the data sample by randomly drawing ``self.N`` objects, without
    replacement, from the dataset X.

    Each drawn object is sent to the C library and inserted in its data
    sample. The indexes of the drawn and remaining objects are recorded in
    ``self.data_objects_index_in_sample`` and
    ``self.data_objects_index_not_in_sample``.

    Parameters
    ----------
    X : dataset supporting ``len(X)`` and ``X[i, :]`` row access.
    y : optional sequence of class labels aligned with the rows of X.

    Raises
    ------
    ValueError
        If X holds fewer than ``self.N`` objects.
    """
    nb_objects = len(X)
    # Fail early with an explicit message: drawing from an exhausted pool
    # would otherwise surface as an obscure ValueError from randint.
    if self.N > nb_objects:
        raise ValueError(
            'Cannot draw a sample of %s objects from a dataset of %s objects'
            % (self.N, nb_objects))
    self.data_objects_index_in_sample = []
    # list(...) keeps .pop() available when range() is not a list.
    self.data_objects_index_not_in_sample = list(range(nb_objects))
    for _ in xrange(self.N):
        random_element = np.random.randint(
            0, len(self.data_objects_index_not_in_sample))
        random_index = self.data_objects_index_not_in_sample.pop(
            random_element)
        self.data_objects_index_in_sample.append(random_index)
        # "is not None" rather than truthiness: a multi-element numpy label
        # array has no unambiguous truth value.
        if y is not None:
            self._send_array(X[random_index, :], y[random_index])
        else:
            self._send_array(X[random_index, :])
        SubCMediansWrapper_c.insert_SubCMedians_point_in_D(
            self._p_subcmedians_c, self._data_object)
def _check_consistency_C_params_Py_params(self):
    """
    Check the consistency of the C parameters with respect to the Python
    object parameters.

    The k-th name of ``self._parameters`` is read as an attribute of this
    object and compared with the k-th value reported by the C library.

    Raises a RuntimeError on the first mismatch found.
    """
    c_values = SubCMediansWrapper_c.get_parameters(self._p_subcmedians_c)
    for position, name in enumerate(self._parameters):
        c_value = c_values[position]
        if getattr(self, name) == c_value:
            continue
        details = (name, str(getattr(self, name)), c_value)
        raise RuntimeError(
            'C capsule parameters and Python parameters are different '
            '%s %s != %s' % details)
def score(self, X):
    """
    Compute the mean intra-cluster distance.

    Every object of X (after ``_check_X_matrix_validity``) is sent to the
    C library and clustered with the current model; the mean of the
    distances returned by those clustering calls is the score.
    """
    def distance_for(data_object):
        # Cluster one object and keep only the distance it was given.
        self._send_array(data_object)
        _cluster, distance = SubCMediansWrapper_c.clusterize_SubCMedianspoint_with_model(
            self._p_subcmedians_c, self._data_object)
        return distance

    X_ = self._check_X_matrix_validity(X)
    distances = [distance_for(x) for x in X_]
    return np.asarray(distances).mean()
def predict(self, X):
    """
    sklearn-like predict function: receives a dataset and computes the
    cluster membership of its data objects.

    Parameters
    ----------
    X : dataset, one data object per row. It is passed through
        ``_check_X_matrix_validity`` before use.

    Returns
    -------
    A float numpy array with one cluster identifier per object of X
    (empty when X holds no object).
    """
    X_ = self._check_X_matrix_validity(X)
    # Collect the memberships in a list and convert once at the end:
    # growing a numpy array with append() copies it at every iteration.
    memberships = []
    for x in X_:
        self._send_array(x)
        cluster, _distance = SubCMediansWrapper_c.clusterize_SubCMedianspoint_with_model(
            self._p_subcmedians_c, self._data_object)
        memberships.append(cluster)
    # dtype=float matches the array that was previously grown from the
    # empty (float) array([]).
    return array(memberships, dtype=float)
def set_subspace_model(self, model, base_weight=1):
    """
    Replace the current model with the centers provided.

    Parameters
    ----------
    model : iterable of centers, each center being an iterable of
        coordinates indexed by dimension. NaN coordinates are skipped.
    base_weight : value stored as the second field of every
        ``[dimension, base_weight, value]`` triple sent to the C library.

    Raises
    ------
    ValueError
        If the total number of non-NaN coordinates over all the centers
        exceeds ``self.SDmax``. The C-side model is left untouched then.
    """
    model_translation = []
    total_size = 0
    for center_id, center in enumerate(model):
        coordinates = [[dim, base_weight, float(dim_pos)]
                       for dim, dim_pos in enumerate(center)
                       if not isnan(dim_pos)]
        # Header slots first, then one triple per defined coordinate.
        scm_py_list = [0] * POINTDESCRIPTORS + coordinates
        scm_py_list[POINTINDEX] = center_id
        scm_py_list[POINTWEIGHT] = len(coordinates)
        total_size += len(coordinates)
        model_translation.append(scm_py_list)
    if total_size > self.SDmax:
        # The message fragments used to be joined without separating
        # spaces ("...%s.Check ... modelwith ...").
        raise ValueError(
            'Invalid new model size %s for estimator %s. '
            'Check the size of your model and provide a smaller or equal size model '
            'with `SubCMedians.SDmax`.' % (total_size, self))
    SubCMediansWrapper_c.clone_SubCMedians_point_from_list(
        model_translation, self._p_subcmedians_c)
def fit_online_mode(self, X, y=None):
    """
    Sklearn-like fit function: receives a dataset and builds the subspace
    clustering that models the data.

    This function has been created to deal with streams of data: the
    dataset provided as an input will never appear again, so it does not
    make sense to keep record of the sample used or not.

    Parameters
    ----------
    X : a single data object (1-dimensional) or a dataset with one data
        object per row. It is passed through ``_check_X_matrix_validity``
        before use.
    y : optional class label (single object) or sequence of class labels
        aligned with the rows of X.

    Returns
    -------
    None
    """
    if X is None:
        return None
    X_ = self._check_X_matrix_validity(X)
    if len(X_.shape) == 1:
        # A single object: y, when given, is its label.
        self._send_array(X_, y)
        SubCMediansWrapper_c.train_model_with_SubCMedianspoint(
            self._p_subcmedians_c, self._data_object)
        self.generation += 1
        return None
    for i, x in enumerate(X_):
        # "is not None" rather than truthiness: a multi-element numpy label
        # array has no unambiguous truth value.
        if y is not None:
            self._send_array(x, y[i])
        else:
            self._send_array(x)
        SubCMediansWrapper_c.train_model_with_SubCMedianspoint(
            self._p_subcmedians_c, self._data_object)
        self.generation += 1
def set_params(self, **params):
    """
    Set the parameters provided to the constructor.

    All the names are validated before any of them is applied, so an
    invalid name leaves the object untouched. The updated values are then
    pushed to the C library and the memory is reallocated.

    Parameters
    ----------
    **params : attribute names of this object mapped to their new values.
        With no argument, only the memory reallocation is performed.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If a name is not an existing attribute of this object.
    """
    if not params:
        self._reallocate_memory()
        return self
    # Validate first: applying some values and then failing would leave
    # the Python attributes out of sync with the C-side parameters.
    for name in params:
        if not hasattr(self, name):
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `SubCMedians.get_params().keys()`.' %
                             (name, self))
    for name in params:
        setattr(self, name, params[name])
    SubCMediansWrapper_c.set_parameters(
        self._p_subcmedians_c, self.SDmax, self.D, self.N,
        self.threshold_cluster_validity, self.seed, self.option_deletion,
        self.option_insertion, self.option_FIFO,
        self.option_train_with_latest, self.option_lazy_hill_climbing,
        self.population_size, self.nb_generations_generation_update)
    self._reallocate_memory()
    return self
def _train_on_current_training_set(self, iterations):
    """
    Train the SubCMedians algorithm without updating the dataset sample.

    The C training routine is invoked ``iterations`` times on the data
    sample currently held by the C library.
    """
    for _ in xrange(iterations):
        handle = self._p_subcmedians_c
        SubCMediansWrapper_c.train_on_current_D(handle)
def _print_me(self):
    """
    Print a description of the current SubCMedians model.

    The printing itself is delegated to the C wrapper.
    """
    handle = self._p_subcmedians_c
    SubCMediansWrapper_c.print_SubCMediansClust(handle)