def preprocess_neighbors(self, rebuild=False, save=True):
    neighbors_model_path = os.path.join(self.selected_dir, "neighbors_model" + ".pkl")
    neighbors_path = os.path.join(self.selected_dir, "neighbors" + ".npy")
    neighbors_weight_path = os.path.join(self.selected_dir, "neighbors_weight" + ".npy")
    test_neighbors_path = os.path.join(self.selected_dir, "test_neighbors" + ".npy")
    test_neighbors_weight_path = os.path.join(
        self.selected_dir, "test_neighbors_weight" + ".npy")
    # reuse cached neighbor data when all required files exist
    if os.path.exists(neighbors_model_path) and \
            os.path.exists(neighbors_path) and \
            os.path.exists(neighbors_weight_path) and \
            os.path.exists(test_neighbors_path) and not rebuild:
        print("neighbors and neighbor_weight exist!!!")
        neighbors = np.load(neighbors_path)
        neighbors_weight = np.load(neighbors_weight_path)
        test_neighbors = np.load(test_neighbors_path)
        self.test_neighbors = test_neighbors
        return neighbors, neighbors_weight, test_neighbors
    print("neighbors and neighbor_weight do not exist, preprocessing!")
    train_num = self.train_X.shape[0]
    train_y = np.array(self.train_y)
    test_num = self.test_X.shape[0]
    max_neighbors = min(len(train_y), 200)
    print("data shape: {}, labeled_num: {}".format(
        str(self.train_X.shape), sum(train_y != -1)))
    nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(self.train_X)
    print("nn construction finished!")
    neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                              max_neighbors,
                                              mode="distance")
    test_neighbors_result = nn_fit.kneighbors_graph(self.test_X,
                                                    max_neighbors,
                                                    mode="distance")
    print("neighbor_result got!")
    neighbors, neighbors_weight = csr_to_impact_matrix(
        neighbor_result, train_num, max_neighbors)
    test_neighbors, test_neighbors_weight = csr_to_impact_matrix(
        test_neighbors_result, test_num, max_neighbors)
    self.test_neighbors = test_neighbors
    print("preprocessed neighbors got!")

    # save neighbors information
    if save:
        pickle_save_data(neighbors_model_path, nn_fit)
        np.save(neighbors_path, neighbors)
        np.save(neighbors_weight_path, neighbors_weight)
        np.save(test_neighbors_path, test_neighbors)
        np.save(test_neighbors_weight_path, test_neighbors_weight)
    return neighbors, neighbors_weight, test_neighbors
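
# `csr_to_impact_matrix` is referenced above but not defined in this section.
# The sketch below is an assumption about its behavior: it converts the CSR
# distance graph returned by kneighbors_graph(mode="distance") into two dense
# (num, max_neighbors) arrays -- neighbor indices sorted by distance and the
# corresponding distances ("weights"). Treat it as illustrative, not as the
# project's actual implementation.
def csr_to_impact_matrix(neighbor_result, num, max_neighbors):
    neighbors = np.zeros((num, max_neighbors), dtype=int)
    neighbors_weight = np.zeros((num, max_neighbors))
    for i in range(num):
        # slice out row i of the CSR matrix
        start = neighbor_result.indptr[i]
        end = neighbor_result.indptr[i + 1]
        j_in_this_row = neighbor_result.indices[start:end]
        data_in_this_row = neighbor_result.data[start:end]
        # order neighbors from nearest to farthest
        sorted_idx = data_in_this_row.argsort()
        neighbors[i, :] = j_in_this_row[sorted_idx][:max_neighbors]
        neighbors_weight[i, :] = data_in_this_row[sorted_idx][:max_neighbors]
    return neighbors, neighbors_weight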
def adaptive_evaluation_bkp(self):
    train_X = self.data.get_train_X()
    affinity_matrix = self.data.get_graph()
    affinity_matrix.setdiag(0)
    pred = self.pred_dist
    test_X = self.data.get_test_X()
    test_y = self.data.get_test_ground_truth()
    # nn_fit = self.data.get_neighbors_model()
    nn_fit = NearestNeighbors(n_jobs=-4).fit(train_X)
    logger.info("nn construction finished!")
    neighbor_result = nn_fit.kneighbors_graph(test_X, 100, mode="distance")
    logger.info("neighbor_result got!")
    estimate_k = 5
    s = 0
    rest_idxs = self.data.get_rest_idxs()
    # removed_idxs = self.remv
    labels = []
    for i in tqdm(range(test_X.shape[0])):
        start = neighbor_result.indptr[i]
        end = neighbor_result.indptr[i + 1]
        j_in_this_row = neighbor_result.indices[start:end]
        data_in_this_row = neighbor_result.data[start:end]
        sorted_idx = data_in_this_row.argsort()
        assert len(sorted_idx) == 100
        j_in_this_row = j_in_this_row[sorted_idx]
        estimated_idxs = j_in_this_row[:estimate_k]
        estimated_idxs = np.array([idx for idx in estimated_idxs
                                   if idx in rest_idxs])
        # adaptive k: average affinity-graph degree of the test point's
        # estimate_k nearest (remaining) training neighbors
        adaptive_k = affinity_matrix[estimated_idxs, :].sum() / estimate_k
        selected_idxs = j_in_this_row[:int(adaptive_k)]
        p = pred[selected_idxs].sum(axis=0)
        labels.append(p.argmax())
        s += adaptive_k
        # print(adaptive_k)
    acc = accuracy_score(test_y, labels)
    logger.info("exp accuracy: {}".format(acc))
    print(s / test_X.shape[0])
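
# The adaptive-k rule above can be hard to read inline: for each test point it
# averages the affinity-graph degree of its `estimate_k` nearest remaining
# training neighbors and uses that average as the neighborhood size for the
# prediction vote. A small standalone illustration (toy numbers only, not
# project data):
import numpy as np
from scipy import sparse

affinity_demo = sparse.csr_matrix(np.array([
    [0, 1, 1, 0],   # node 0 has degree 2
    [1, 0, 1, 1],   # node 1 has degree 3
    [1, 1, 0, 0],   # node 2 has degree 2
    [0, 1, 0, 0],   # node 3 has degree 1
]))
estimated_idxs_demo = np.array([0, 1])   # the estimate_k nearest neighbors
estimate_k_demo = 2
adaptive_k_demo = affinity_demo[estimated_idxs_demo, :].sum() / estimate_k_demo
print(adaptive_k_demo)  # 2.5 -> int(adaptive_k) = 2 neighbors used for voting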
class BaseLabelPropagation(BaseEstimator, ClassifierMixin, metaclass=ABCMeta):
    """Base class for label propagation module.

    Parameters
    ----------
    kernel : {'knn', 'rbf', callable}
        String identifier for kernel function to use or the kernel function
        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
        passed should take two inputs, each of shape [n_samples, n_features],
        and return a [n_samples, n_samples] shaped weight matrix

    gamma : float
        Parameter for rbf kernel

    n_neighbors : integer > 0
        Parameter for knn kernel

    alpha : float
        Clamping factor

    max_iter : integer
        Change maximum number of iterations allowed

    tol : float
        Convergence tolerance: threshold to consider the system at steady
        state

    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    """

    def __init__(self, kernel='rbf', gamma=20, n_neighbors=7,
                 alpha=1, max_iter=30, tol=1e-3, n_jobs=None):
        self.max_iter = max_iter
        self.tol = tol

        # kernel parameters
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors

        # clamping factor
        self.alpha = alpha

        self.n_jobs = n_jobs
        self.graph_matrix = None

    def _get_kernel(self, X, y=None):
        if self.kernel == "rbf":
            if y is None:
                return rbf_kernel(X, X, gamma=self.gamma)
            else:
                return rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == "knn":
            if self.nn_fit is None:
                t0 = time()
                self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors,
                                               n_jobs=self.n_jobs).fit(X)
                print("NearestNeighbors fit time cost:", time() - t0)
            if y is None:
                t0 = time()
                result = self.nn_fit.kneighbors_graph(self.nn_fit._fit_X,
                                                      self.n_neighbors,
                                                      mode='connectivity')
                print("construct kNN graph time cost:", time() - t0)
                return result
            else:
                return self.nn_fit.kneighbors(y, return_distance=False)
        elif callable(self.kernel):
            if y is None:
                return self.kernel(X, X)
            else:
                return self.kernel(X, y)
        else:
            raise ValueError("%s is not a valid kernel. Only rbf and knn"
                             " or an explicit function "
                             " are supported at this time." % self.kernel)

    @abstractmethod
    def _build_graph(self):
        raise NotImplementedError("Graph construction must be implemented"
                                  " to fit a label propagation model.")

    def predict(self, X):
        """Performs inductive inference across the model.

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        y : array_like, shape = [n_samples]
            Predictions for input data
        """
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)].ravel()

    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Compute the probability estimates for each single sample in X
        and each possible outcome seen during training (categorical
        distribution).

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        probabilities : array, shape = [n_samples, n_classes]
            Normalized probability distributions across class labels
        """
        check_is_fitted(self, 'X_')

        X_2d = check_array(
            X, accept_sparse=['csc', 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'])
        weight_matrices = self._get_kernel(self.X_, X_2d)
        if self.kernel == 'knn':
            probabilities = np.array([
                np.sum(self.label_distributions_[weight_matrix], axis=0)
                for weight_matrix in weight_matrices
            ])
        else:
            weight_matrices = weight_matrices.T
            probabilities = np.dot(weight_matrices, self.label_distributions_)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        probabilities /= normalizer
        return probabilities

    def get_graph(self, X, y):
        X, y = check_X_y(X, y)
        self.X_ = X
        check_classification_targets(y)
        graph_matrix = self._build_graph()
        return graph_matrix

    def fit(self, X, y):
        """Fit a semi-supervised label propagation model to X.

        All the input data is provided as matrix X (labeled and unlabeled)
        and the corresponding label matrix y with a dedicated marker value
        for unlabeled samples.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            A {n_samples by n_samples} size matrix will be created from this

        y : array_like, shape = [n_samples]
            n_labeled_samples (unlabeled points are marked as -1)
            All unlabeled samples will be transductively assigned labels

        Returns
        -------
        self : returns an instance of self.
        """
        t0 = time()
        X, y = check_X_y(X, y)
        self.X_ = X
        check_classification_targets(y)

        # actual graph construction (implementations should override this)
        graph_matrix = self._build_graph()
        t1 = time()

        # label construction
        # construct a categorical distribution for classification only
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes_ = classes

        n_samples, n_classes = len(y), len(classes)

        alpha = self.alpha
        if self._variant == 'spreading' and \
                (alpha is None or alpha <= 0.0 or alpha >= 1.0):
            raise ValueError('alpha=%s is invalid: it must be inside '
                             'the open interval (0, 1)' % alpha)
        y = np.asarray(y)
        unlabeled = y == -1

        # initialize distributions
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self._variant == 'propagation':
            # LabelPropagation
            y_static[unlabeled] = 0
        else:
            # LabelSpreading
            y_static *= 1 - alpha

        l_previous = np.zeros((self.X_.shape[0], n_classes))

        unlabeled = unlabeled[:, np.newaxis]
        if sparse.isspmatrix(graph_matrix):
            graph_matrix = graph_matrix.tocsr()

        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:
                break

            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_)

            if self._variant == 'propagation':
                normalizer = np.sum(
                    self.label_distributions_, axis=1)[:, np.newaxis]
                self.label_distributions_ /= normalizer
                self.label_distributions_ = np.where(unlabeled,
                                                     self.label_distributions_,
                                                     y_static)
            else:
                # clamp
                self.label_distributions_ = np.multiply(
                    alpha, self.label_distributions_) + y_static
        else:
            warnings.warn('max_iter=%d was reached without convergence.'
                          % self.max_iter, category=ConvergenceWarning)
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        self.label_distributions_ /= normalizer

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()
        t2 = time()
        print("building graph time cost: {}, spreading time cost: {}".format(
            t1 - t0, t2 - t1))
        return self
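
# A minimal usage sketch (not part of the original module): subclasses of the
# base class above are expected to define `_variant` and implement
# `_build_graph`. The row-normalized rbf graph below is only illustrative;
# the class name and normalization choice are assumptions for the example.
class _ToyLabelPropagation(BaseLabelPropagation):
    _variant = 'propagation'

    def _build_graph(self):
        if self.kernel == 'knn':
            self.nn_fit = None
        affinity = self._get_kernel(self.X_)
        # row-normalize so each row sums to 1 (a simple transition matrix)
        affinity /= affinity.sum(axis=1)[:, np.newaxis]
        return affinity

# Example (unlabeled samples marked with -1):
# X_demo = np.array([[0.0], [0.1], [0.9], [1.0]])
# y_demo = np.array([0, -1, -1, 1])
# print(_ToyLabelPropagation(kernel='rbf').fit(X_demo, y_demo).transduction_)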
class BaseLabelPropagation(six.with_metaclass(ABCMeta, BaseEstimator,
                                              ClassifierMixin)):
    """Base class for label propagation module.

    Parameters
    ----------
    kernel : {'knn', 'rbf'}
        String identifier for kernel function to use.
        Only 'rbf' and 'knn' kernels are currently supported.

    gamma : float
        Parameter for rbf kernel

    alpha : float
        Clamping factor

    max_iter : integer
        Change maximum number of iterations allowed

    tol : float
        Convergence tolerance: threshold to consider the system at steady
        state

    n_neighbors : integer > 0
        Parameter for knn kernel
    """

    def __init__(self, kernel='rbf', gamma=20, n_neighbors=7,
                 alpha=1, max_iter=30, tol=1e-3):
        self.max_iter = max_iter
        self.tol = tol

        # kernel parameters
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors

        # clamping factor
        self.alpha = alpha

    def _get_kernel(self, X, y=None):
        if self.kernel == "rbf":
            if y is None:
                return rbf_kernel(X, X, gamma=self.gamma)
            else:
                return rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == "knn":
            if self.nn_fit is None:
                self.nn_fit = NearestNeighbors(self.n_neighbors).fit(X)
            if y is None:
                # Nearest neighbors returns a directed matrix.
                dir_graph = self.nn_fit.kneighbors_graph(self.nn_fit._fit_X,
                                                         self.n_neighbors,
                                                         mode='connectivity')
                # Making the matrix symmetric
                un_graph = dir_graph + dir_graph.T
                # Since it is a connectivity matrix, all values should be
                # either 0 or 1
                un_graph[un_graph > 1.0] = 1.0
                return un_graph
            else:
                return self.nn_fit.kneighbors(y, return_distance=False)
        else:
            raise ValueError("%s is not a valid kernel. Only rbf and knn"
                             " are supported at this time" % self.kernel)

    @abstractmethod
    def _build_graph(self):
        raise NotImplementedError("Graph construction must be implemented"
                                  " to fit a label propagation model.")

    def predict(self, X):
        """Performs inductive inference across the model.

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        y : array_like, shape = [n_samples]
            Predictions for input data
        """
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)].ravel()

    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Compute the probability estimates for each single sample in X
        and each possible outcome seen during training (categorical
        distribution).

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        probabilities : array, shape = [n_samples, n_classes]
            Normalized probability distributions across class labels
        """
        check_is_fitted(self, 'X_')

        X_2d = check_array(
            X, accept_sparse=['csc', 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'])
        weight_matrices = self._get_kernel(self.X_, X_2d)
        if self.kernel == 'knn':
            probabilities = []
            for weight_matrix in weight_matrices:
                ine = np.sum(self.label_distributions_[weight_matrix], axis=0)
                probabilities.append(ine)
            probabilities = np.array(probabilities)
        else:
            weight_matrices = weight_matrices.T
            probabilities = np.dot(weight_matrices, self.label_distributions_)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        probabilities /= normalizer
        return probabilities

    def fit(self, X, y):
        """Fit a semi-supervised label propagation model to X.

        All the input data is provided as matrix X (labeled and unlabeled)
        and the corresponding label matrix y with a dedicated marker value
        for unlabeled samples.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            A {n_samples by n_samples} size matrix will be created from this

        y : array_like, shape = [n_samples]
            n_labeled_samples (unlabeled points are marked as -1)
            All unlabeled samples will be transductively assigned labels

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y)
        self.X_ = X
        check_classification_targets(y)

        # actual graph construction (implementations should override this)
        graph_matrix = self._build_graph()

        # label construction
        # construct a categorical distribution for classification only
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes_ = classes

        n_samples, n_classes = len(y), len(classes)

        y = np.asarray(y)
        unlabeled = y == -1
        clamp_weights = np.ones((n_samples, 1))
        clamp_weights[~unlabeled, 0] = 1 - self.alpha

        # initialize distributions
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self.alpha > 0.:
            y_static *= self.alpha
        y_static[unlabeled] = 0

        l_previous = np.zeros((self.X_.shape[0], n_classes))

        remaining_iter = self.max_iter
        if sparse.isspmatrix(graph_matrix):
            graph_matrix = graph_matrix.tocsr()
        while (_not_converged(self.label_distributions_, l_previous, self.tol)
                and remaining_iter > 1):
            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_)
            # clamp
            self.label_distributions_ = np.multiply(
                clamp_weights, self.label_distributions_) + y_static
            remaining_iter -= 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        self.label_distributions_ /= normalizer

        if remaining_iter <= 1:
            warnings.warn('max_iter was reached without convergence.',
                          category=ConvergenceWarning)

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()
        self.n_iter_ = self.max_iter - remaining_iter
        return self
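
# `_not_converged` is called in fit() above but not defined in this section.
# Older scikit-learn versions of label_propagation shipped a simple L1-norm
# check under this name; the helper below follows that convention and is
# offered as an assumed stand-in rather than the project's exact code.
def _not_converged(y_truth, y_prediction, tol=1e-3):
    """Basic convergence check: has the label distribution stopped moving?"""
    return np.abs(y_truth - y_prediction).sum() > tol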
def _preprocess_neighbors(self, rebuild=False, save=True):
    neighbors_model_path = os.path.join(
        self.selected_dir,
        "neighbors_model-step" + str(self.model.step) + ".pkl")
    neighbors_path = os.path.join(
        self.selected_dir, "neighbors-step" + str(self.model.step) + ".npy")
    neighbors_weight_path = os.path.join(
        self.selected_dir,
        "neighbors_weight-step" + str(self.model.step) + ".npy")
    test_neighbors_path = os.path.join(
        self.selected_dir,
        "test_neighbors-step" + str(self.model.step) + ".npy")
    test_neighbors_weight_path = os.path.join(
        self.selected_dir,
        "test_neighbors_weight-step" + str(self.model.step) + ".npy")
    # reuse cached neighbor data when all required files exist
    if os.path.exists(neighbors_model_path) and \
            os.path.exists(neighbors_path) and \
            os.path.exists(neighbors_weight_path) and \
            os.path.exists(test_neighbors_path) and \
            not rebuild and not DEBUG:
        logger.info("neighbors and neighbor_weight exist!!!")
        self.neighbors = np.load(neighbors_path)
        self.neighbors_weight = np.load(neighbors_weight_path)
        self.test_neighbors = np.load(test_neighbors_path)
        return
    logger.info("neighbors and neighbor_weight "
                "do not exist, preprocessing!")
    train_X = self.get_full_train_X()
    train_num = train_X.shape[0]
    train_y = self.get_full_train_label()
    train_y = np.array(train_y)
    test_X = self.get_test_X()
    test_num = test_X.shape[0]
    self.max_neighbors = min(len(train_y), self.max_neighbors)
    logger.info("data shape: {}, labeled_num: {}".format(
        str(train_X.shape), sum(train_y != -1)))
    nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(train_X)
    logger.info("nn construction finished!")
    neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                              self.max_neighbors,
                                              mode="distance")
    test_neighbors_result = nn_fit.kneighbors_graph(test_X,
                                                    self.max_neighbors,
                                                    mode="distance")
    logger.info("neighbor_result got!")
    self.neighbors, self.neighbors_weight = self.csr_to_impact_matrix(
        neighbor_result, train_num, self.max_neighbors)
    self.test_neighbors, test_neighbors_weight = self.csr_to_impact_matrix(
        test_neighbors_result, test_num, self.max_neighbors)
    logger.info("preprocessed neighbors got!")

    # save neighbors information
    if save:
        pickle_save_data(neighbors_model_path, nn_fit)
        np.save(neighbors_path, self.neighbors)
        np.save(neighbors_weight_path, self.neighbors_weight)
        np.save(test_neighbors_path, self.test_neighbors)
        np.save(test_neighbors_weight_path, test_neighbors_weight)
    return self.neighbors, self.test_neighbors
class BaseLabelPropagation(BaseEstimator, ClassifierMixin, metaclass=ABCMeta):
    def __init__(self, kernel='rbf', gamma=20, n_neighbors=7,
                 alpha=1, max_iter=30, tol=1e-3, n_jobs=None):
        self.max_iter = max_iter
        self.tol = tol

        # kernel parameters
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors

        # clamping factor
        self.alpha = alpha

        self.n_jobs = n_jobs

    def _get_kernel(self, X, y=None):
        if self.kernel == "rbf":
            if y is None:
                return rbf_kernel(X, X, gamma=self.gamma)
            else:
                return rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == "knn":
            if self.nn_fit is None:
                self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors,
                                               n_jobs=self.n_jobs).fit(X)
            if y is None:
                return self.nn_fit.kneighbors_graph(self.nn_fit._fit_X,
                                                    self.n_neighbors,
                                                    mode='connectivity')
            else:
                return self.nn_fit.kneighbors(y, return_distance=False)
        elif callable(self.kernel):
            if y is None:
                return self.kernel(X, X)
            else:
                return self.kernel(X, y)
        else:
            raise ValueError("%s is not a valid kernel. Only rbf and knn"
                             " or an explicit function "
                             " are supported at this time." % self.kernel)

    def fit(self, X, y):
        """
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            input data matrix

        y : array-like, shape = [n_samples]
            n_labeled_samples (unlabeled = -1)

        Returns
        -------
        self : returns an instance of self.
        """
        # initialize X_
        self.X_ = X

        # actual graph construction
        graph_matrix = self._build_graph()

        # initialize classes
        classes = np.unique(y)
        classes = (classes[classes != -1])  # self indexing array
        self.classes_ = classes

        # set n size
        n_samples, n_classes = len(y), len(classes)

        # set unlabeled to -1
        y = np.asarray(y)
        unlabeled = y == -1

        # initialize distributions
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self._variant == 'propagation':
            y_static[unlabeled] = 0

        # initialize l_previous
        l_previous = np.zeros((self.X_.shape[0], n_classes))

        # add a dimension to unlabeled
        unlabeled = unlabeled[:, np.newaxis]
        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:
                break

            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_)  # BLAS dot

            if self._variant == 'propagation':
                normalizer = np.sum(
                    self.label_distributions_, axis=1)[:, np.newaxis]
                self.label_distributions_ /= normalizer
                self.label_distributions_ = np.where(unlabeled,
                                                     self.label_distributions_,
                                                     y_static)
        else:
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        self.label_distributions_ /= normalizer

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()
        return self

    def _build_graph(self):
        """Matrix representing a fully connected graph between each sample

        This basic implementation creates a non-stochastic affinity matrix, so
        class distributions will exceed 1 (normalization may be desired).
        """
        if self.kernel == 'knn':
            self.nn_fit = None
        affinity_matrix = self._get_kernel(self.X_)
        normalizer = affinity_matrix.sum(axis=0)
        if sparse.isspmatrix(affinity_matrix):
            affinity_matrix.data /= np.diag(np.array(normalizer))
        else:
            affinity_matrix /= normalizer[:, np.newaxis]
        return affinity_matrix