示例#1
0
    def preprocess_neighbors(self, rebuild=False, save=True):
        """Load cached nearest-neighbor tables or compute them from scratch.

        Cache files live in ``self.selected_dir``. They are reused only when
        ``rebuild`` is false and every file that has to be read back is
        present; otherwise a ``NearestNeighbors`` index is fitted on
        ``self.train_X`` and queried for both the training and test samples.

        Parameters
        ----------
        rebuild : bool
            When true, ignore any cached files and recompute everything.
        save : bool
            When true, write the fitted index and the computed arrays to
            ``self.selected_dir`` after a recomputation.

        Returns
        -------
        tuple
            ``(neighbors, neighbors_weight, test_neighbors)``.
            ``test_neighbors`` is also stored on ``self.test_neighbors``.
        """
        def cache_file(name):
            return os.path.join(self.selected_dir, name)

        neighbors_model_path = cache_file("neighbors_model.pkl")
        neighbors_path = cache_file("neighbors.npy")
        neighbors_weight_path = cache_file("neighbors_weight.npy")
        test_neighbors_path = cache_file("test_neighbors.npy")
        test_neighbors_weight_path = cache_file("test_neighbors_weight.npy")

        # The weight file is loaded below as well, so it has to be part of
        # the check; a partial cache falls through to a full recomputation
        # instead of failing inside np.load.
        required = (neighbors_model_path, neighbors_path,
                    neighbors_weight_path, test_neighbors_path)
        if not rebuild and all(os.path.exists(path) for path in required):
            print("neighbors and neighbor_weight exist!!!")
            neighbors = np.load(neighbors_path)
            neighbors_weight = np.load(neighbors_weight_path)
            test_neighbors = np.load(test_neighbors_path)
            self.test_neighbors = test_neighbors
            return neighbors, neighbors_weight, test_neighbors

        print("neighbors and neighbor_weight  do not exist, preprocessing!")
        train_num = self.train_X.shape[0]
        train_y = np.array(self.train_y)
        test_num = self.test_X.shape[0]
        # Never ask for more neighbors than there are training samples.
        max_neighbors = min(len(train_y), 200)
        print("data shape: {}, labeled_num: {}".format(
            str(self.train_X.shape), np.count_nonzero(train_y != -1)))

        # n_neighbors is passed by keyword: recent scikit-learn releases
        # reject it as a positional argument.
        nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(self.train_X)
        print("nn construction finished!")
        neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                                  max_neighbors,
                                                  mode="distance")
        test_neighbors_result = nn_fit.kneighbors_graph(self.test_X,
                                                        max_neighbors,
                                                        mode="distance")
        print("neighbor_result got!")

        # csr_to_impact_matrix is defined elsewhere; it presumably unpacks
        # the CSR distance graph into per-sample index/weight arrays --
        # TODO confirm against its definition.
        neighbors, neighbors_weight = csr_to_impact_matrix(
            neighbor_result, train_num, max_neighbors)
        test_neighbors, test_neighbors_weight = csr_to_impact_matrix(
            test_neighbors_result, test_num, max_neighbors)
        self.test_neighbors = test_neighbors

        print("preprocessed neighbors got!")

        # save neighbors information
        if save:
            pickle_save_data(neighbors_model_path, nn_fit)
            np.save(neighbors_path, neighbors)
            np.save(neighbors_weight_path, neighbors_weight)
            np.save(test_neighbors_path, test_neighbors)
            np.save(test_neighbors_weight_path, test_neighbors_weight)
        return neighbors, neighbors_weight, test_neighbors
示例#2
0
 def adaptive_evaluation_bkp(self):
     """Score test accuracy using a per-sample adaptive neighbor count.

     For every test sample the nearest training samples are fetched and
     sorted by distance. The closest ``estimate_k`` of them that are also in
     ``self.data.get_rest_idxs()`` select rows of the affinity graph; the sum
     of those rows divided by ``estimate_k`` becomes the sample's
     ``adaptive_k``. The predicted label is the argmax of the summed
     ``self.pred_dist`` rows of the first ``int(adaptive_k)`` neighbors.

     The accuracy is logged and the mean ``adaptive_k`` is printed; nothing
     is returned.
     """
     n_candidates = 100  # neighbors fetched per test sample
     estimate_k = 5      # closest neighbors used to estimate adaptive_k

     train_X = self.data.get_train_X()
     affinity_matrix = self.data.get_graph()
     # NOTE(review): setdiag modifies the object returned by get_graph() in
     # place; if that object is shared with other code the change is visible
     # there too -- confirm this is intended.
     affinity_matrix.setdiag(0)
     pred = self.pred_dist
     test_X = self.data.get_test_X()
     test_y = self.data.get_test_ground_truth()
     nn_fit = NearestNeighbors(n_jobs=-4).fit(train_X)
     logger.info("nn construction finished!")
     neighbor_result = nn_fit.kneighbors_graph(test_X,
                                               n_candidates,
                                               mode="distance")
     logger.info("neighbor_result got!")

     # Build the lookup once so the membership test inside the loop does not
     # rescan the whole index container for every candidate.
     rest_idxs = set(self.data.get_rest_idxs())

     total_k = 0
     labels = []
     for row in tqdm(range(test_X.shape[0])):
         start = neighbor_result.indptr[row]
         end = neighbor_result.indptr[row + 1]
         candidates = neighbor_result.indices[start:end]
         distances = neighbor_result.data[start:end]
         order = distances.argsort()
         assert len(order) == n_candidates
         candidates = candidates[order]
         # dtype=int keeps the array usable as an index even when no
         # candidate survives the filter (an empty default array is float).
         estimated_idxs = np.array(
             [j for j in candidates[:estimate_k] if j in rest_idxs],
             dtype=int)
         adaptive_k = affinity_matrix[estimated_idxs, :].sum() / estimate_k
         selected_idxs = candidates[:int(adaptive_k)]
         votes = pred[selected_idxs].sum(axis=0)
         labels.append(votes.argmax())
         total_k += adaptive_k
     acc = accuracy_score(test_y, labels)
     logger.info("exp accuracy: {}".format(acc))
     print(total_k / test_X.shape[0])
示例#3
0
class BaseLabelPropagation(BaseEstimator, ClassifierMixin, metaclass=ABCMeta):
    """Base class for label propagation module.

    Subclasses must implement ``_build_graph`` and provide a ``_variant``
    attribute (``'propagation'`` or ``'spreading'``), which ``fit`` reads to
    choose the update rule.

    Parameters
    ----------
    kernel : {'knn', 'rbf', callable}
        String identifier for kernel function to use or the kernel function
        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
        passed should take two inputs, each of shape [n_samples, n_features],
        and return a [n_samples, n_samples] shaped weight matrix
    gamma : float
        Parameter for rbf kernel
    n_neighbors : integer > 0
        Parameter for knn kernel
    alpha : float
        Clamping factor
    max_iter : integer
        Change maximum number of iterations allowed
    tol : float
        Convergence tolerance: threshold to consider the system at steady
        state
    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    """
    def __init__(self,
                 kernel='rbf',
                 gamma=20,
                 n_neighbors=7,
                 alpha=1,
                 max_iter=30,
                 tol=1e-3,
                 n_jobs=None):

        self.max_iter = max_iter
        self.tol = tol

        # kernel parameters
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors

        # clamping factor
        self.alpha = alpha

        self.n_jobs = n_jobs

        self.graph_matrix = None

    def _get_kernel(self, X, y=None):
        """Return the kernel between ``X`` and ``y`` (or ``X`` with itself).

        For the 'knn' kernel with ``y`` given, the result is the array of
        neighbor indices from ``kneighbors`` rather than a weight matrix.

        Raises
        ------
        ValueError
            If ``self.kernel`` is neither 'rbf', 'knn' nor a callable.
        """
        if self.kernel == "rbf":
            if y is None:
                return rbf_kernel(X, X, gamma=self.gamma)
            else:
                return rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == "knn":
            # nn_fit is not created in __init__, so a missing attribute is
            # treated the same as an explicit None: build the index now.
            if getattr(self, "nn_fit", None) is None:
                t0 = time()
                # n_neighbors is passed by keyword: recent scikit-learn
                # releases reject it as a positional argument.
                self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors,
                                               n_jobs=self.n_jobs).fit(X)
                print("NearestNeighbors fit time cost:", time() - t0)
            if y is None:
                t0 = time()
                result = self.nn_fit.kneighbors_graph(self.nn_fit._fit_X,
                                                      self.n_neighbors,
                                                      mode='connectivity')
                print("construct kNN graph time cost:", time() - t0)
                return result
            else:
                return self.nn_fit.kneighbors(y, return_distance=False)
        elif callable(self.kernel):
            if y is None:
                return self.kernel(X, X)
            else:
                return self.kernel(X, y)
        else:
            raise ValueError("%s is not a valid kernel. Only rbf and knn"
                             " or an explicit function "
                             " are supported at this time." % self.kernel)

    @abstractmethod
    def _build_graph(self):
        """Return the graph matrix used by ``fit``; subclasses override."""
        raise NotImplementedError("Graph construction must be implemented"
                                  " to fit a label propagation model.")

    def predict(self, X):
        """Performs inductive inference across the model.

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        y : array_like, shape = [n_samples]
            Predictions for input data
        """
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)].ravel()

    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Compute the probability estimates for each single sample in X
        and each possible outcome seen during training (categorical
        distribution).

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        probabilities : array, shape = [n_samples, n_classes]
            Normalized probability distributions across
            class labels
        """
        check_is_fitted(self, 'X_')

        X_2d = check_array(
            X, accept_sparse=['csc', 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'])
        weight_matrices = self._get_kernel(self.X_, X_2d)
        if self.kernel == 'knn':
            # Each entry is a row of neighbor indices; sum the label
            # distributions of those neighbors.
            probabilities = np.array([
                np.sum(self.label_distributions_[weight_matrix], axis=0)
                for weight_matrix in weight_matrices
            ])
        else:
            weight_matrices = weight_matrices.T
            probabilities = np.dot(weight_matrices, self.label_distributions_)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        probabilities /= normalizer
        return probabilities

    def get_graph(self, X, y):
        """Validate ``X``/``y``, store ``X`` as ``self.X_`` and build the graph.

        Returns
        -------
        graph_matrix
            Whatever ``self._build_graph()`` returns.
        """
        X, y = check_X_y(X, y)
        self.X_ = X
        check_classification_targets(y)
        graph_matrix = self._build_graph()
        return graph_matrix

    def fit(self, X, y):
        """Fit a semi-supervised label propagation model.

        All the input data is provided matrix X (labeled and unlabeled)
        and corresponding label matrix y with a dedicated marker value for
        unlabeled samples.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            A {n_samples by n_samples} size matrix will be created from this
        y : array_like, shape = [n_samples]
            n_labeled_samples (unlabeled points are marked as -1)
            All unlabeled samples will be transductively assigned labels

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If the variant is 'spreading' and ``alpha`` is ``None`` or lies
            outside the open interval (0, 1).
        """
        t0 = time()
        X, y = check_X_y(X, y)
        self.X_ = X
        check_classification_targets(y)

        # actual graph construction (implementations should override this)
        graph_matrix = self._build_graph()
        t1 = time()

        # label construction
        # construct a categorical distribution for classification only
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes_ = classes

        n_samples, n_classes = len(y), len(classes)

        alpha = self.alpha
        if self._variant == 'spreading' and \
                (alpha is None or alpha <= 0.0 or alpha >= 1.0):
            raise ValueError('alpha=%s is invalid: it must be inside '
                             'the open interval (0, 1)' % alpha)
        y = np.asarray(y)
        unlabeled = y == -1

        # initialize distributions: one-hot rows for labeled samples
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self._variant == 'propagation':
            # LabelPropagation
            y_static[unlabeled] = 0
        else:
            # LabelSpreading
            y_static *= 1 - alpha

        l_previous = np.zeros((self.X_.shape[0], n_classes))

        unlabeled = unlabeled[:, np.newaxis]
        if sparse.isspmatrix(graph_matrix):
            graph_matrix = graph_matrix.tocsr()

        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:
                break

            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_)

            if self._variant == 'propagation':
                normalizer = np.sum(self.label_distributions_,
                                    axis=1)[:, np.newaxis]
                # Rows that received no mass would divide by zero and turn
                # into NaN; leave them as all-zero rows instead.
                normalizer[normalizer == 0] = 1
                self.label_distributions_ /= normalizer
                # Labeled rows are reset to their known distribution.
                self.label_distributions_ = np.where(unlabeled,
                                                     self.label_distributions_,
                                                     y_static)
            else:
                # clamp
                self.label_distributions_ = np.multiply(
                    alpha, self.label_distributions_) + y_static
        else:
            # Loop ran to exhaustion without hitting the tolerance check.
            warnings.warn('max_iter=%d was reached without convergence.' %
                          self.max_iter,
                          category=ConvergenceWarning)
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        # Same zero-row guard as inside the loop.
        normalizer[normalizer == 0] = 1
        self.label_distributions_ /= normalizer

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()

        t2 = time()
        print("building graph time cost: {}, spreading time cost: {}".format(
            t1 - t0, t2 - t1))

        return self
class BaseLabelPropagation(six.with_metaclass(ABCMeta, BaseEstimator,
                                              ClassifierMixin)):
    """Base class for label propagation module.

    Subclasses must implement ``_build_graph``.

    Parameters
    ----------
    kernel : {'knn', 'rbf'}
        String identifier for kernel function to use.
        Only 'rbf' and 'knn' kernels are currently supported.

    gamma : float
        Parameter for rbf kernel

    alpha : float
        Clamping factor

    max_iter : integer
        Change maximum number of iterations allowed

    tol : float
        Convergence tolerance: threshold to consider the system at steady
        state

    n_neighbors : integer > 0
        Parameter for knn kernel

    """

    def __init__(self, kernel='rbf', gamma=20, n_neighbors=7,
                 alpha=1, max_iter=30, tol=1e-3):

        self.max_iter = max_iter
        self.tol = tol

        # kernel parameters
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors

        # clamping factor
        self.alpha = alpha

    def _get_kernel(self, X, y=None):
        """Return the kernel between ``X`` and ``y`` (or ``X`` with itself).

        For the 'knn' kernel with ``y`` given, the result is the array of
        neighbor indices from ``kneighbors`` rather than a weight matrix.

        Raises
        ------
        ValueError
            If ``self.kernel`` is neither 'rbf' nor 'knn'.
        """
        if self.kernel == "rbf":
            if y is None:
                return rbf_kernel(X, X, gamma=self.gamma)
            else:
                return rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == "knn":
            # nn_fit is not created in __init__, so a missing attribute is
            # treated the same as an explicit None: build the index now.
            if getattr(self, "nn_fit", None) is None:
                # n_neighbors is passed by keyword: recent scikit-learn
                # releases reject it as a positional argument.
                self.nn_fit = NearestNeighbors(
                    n_neighbors=self.n_neighbors).fit(X)
            if y is None:
                # Nearest neighbors returns a directed matrix.
                dir_graph = self.nn_fit.kneighbors_graph(self.nn_fit._fit_X,
                                                         self.n_neighbors,
                                                         mode='connectivity')
                # Making the matrix symmetric
                un_graph = dir_graph + dir_graph.T
                # Since it is a connectivity matrix, all values should be
                # either 0 or 1
                un_graph[un_graph > 1.0] = 1.0
                return un_graph
            else:
                return self.nn_fit.kneighbors(y, return_distance=False)
        else:
            raise ValueError("%s is not a valid kernel. Only rbf and knn"
                             " are supported at this time" % self.kernel)

    @abstractmethod
    def _build_graph(self):
        """Return the graph matrix used by ``fit``; subclasses override."""
        raise NotImplementedError("Graph construction must be implemented"
                                  " to fit a label propagation model.")

    def predict(self, X):
        """Performs inductive inference across the model.

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        y : array_like, shape = [n_samples]
            Predictions for input data
        """
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)].ravel()

    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Compute the probability estimates for each single sample in X
        and each possible outcome seen during training (categorical
        distribution).

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        probabilities : array, shape = [n_samples, n_classes]
            Normalized probability distributions across
            class labels
        """
        check_is_fitted(self, 'X_')

        X_2d = check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
                                             'bsr', 'lil', 'dia'])
        weight_matrices = self._get_kernel(self.X_, X_2d)
        if self.kernel == 'knn':
            # Each entry is a row of neighbor indices; sum the label
            # distributions of those neighbors.
            probabilities = np.array([
                np.sum(self.label_distributions_[weight_matrix], axis=0)
                for weight_matrix in weight_matrices
            ])
        else:
            weight_matrices = weight_matrices.T
            probabilities = np.dot(weight_matrices, self.label_distributions_)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        probabilities /= normalizer
        return probabilities

    def fit(self, X, y):
        """Fit a semi-supervised label propagation model.

        All the input data is provided matrix X (labeled and unlabeled)
        and corresponding label matrix y with a dedicated marker value for
        unlabeled samples.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            A {n_samples by n_samples} size matrix will be created from this

        y : array_like, shape = [n_samples]
            n_labeled_samples (unlabeled points are marked as -1)
            All unlabeled samples will be transductively assigned labels

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y)
        self.X_ = X
        check_classification_targets(y)

        # actual graph construction (implementations should override this)
        graph_matrix = self._build_graph()

        # label construction
        # construct a categorical distribution for classification only
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes_ = classes

        n_samples, n_classes = len(y), len(classes)

        y = np.asarray(y)
        unlabeled = y == -1
        # Per-row multiplier applied after each propagation step:
        # 1 for unlabeled rows, (1 - alpha) for labeled rows.
        clamp_weights = np.ones((n_samples, 1))
        clamp_weights[~unlabeled, 0] = 1 - self.alpha

        # initialize distributions: one-hot rows for labeled samples
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self.alpha > 0.:
            y_static *= self.alpha
        y_static[unlabeled] = 0

        l_previous = np.zeros((self.X_.shape[0], n_classes))

        remaining_iter = self.max_iter
        if sparse.isspmatrix(graph_matrix):
            graph_matrix = graph_matrix.tocsr()
        # NOTE(review): the guard is ``remaining_iter > 1``, so at most
        # max_iter - 1 updates run; kept as-is because n_iter_ and the
        # warning below depend on it.
        while (_not_converged(self.label_distributions_, l_previous, self.tol)
                and remaining_iter > 1):
            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_)
            # clamp
            self.label_distributions_ = np.multiply(
                clamp_weights, self.label_distributions_) + y_static

            remaining_iter -= 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        # Rows that received no mass would divide by zero and turn into
        # NaN; leave them as all-zero rows instead.
        normalizer[normalizer == 0] = 1
        self.label_distributions_ /= normalizer

        if remaining_iter <= 1:
            warnings.warn('max_iter was reached without convergence.',
                          category=ConvergenceWarning)

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()
        self.n_iter_ = self.max_iter - remaining_iter
        return self
示例#5
0
    def _preprocess_neighbors(self, rebuild=False, save=True):
        """Load cached nearest-neighbor tables for the current step or build them.

        Cache file names include ``self.model.step`` and live in
        ``self.selected_dir``. The cache is used only when ``rebuild`` and the
        module-level ``DEBUG`` flag are both false and every file that has to
        be read back exists. Results are stored on ``self.neighbors``,
        ``self.neighbors_weight`` and ``self.test_neighbors``;
        ``self.max_neighbors`` is capped at the number of training samples
        when a rebuild happens.

        Parameters
        ----------
        rebuild : bool
            When true, ignore any cached files and recompute everything.
        save : bool
            When true, write the fitted index and the computed arrays to
            ``self.selected_dir`` after a recomputation.

        Returns
        -------
        tuple
            ``(self.neighbors, self.test_neighbors)`` on both the cached and
            the rebuild path.
        """
        step_tag = "-step" + str(self.model.step)

        def cache_file(stem, ext):
            return os.path.join(self.selected_dir, stem + step_tag + ext)

        neighbors_model_path = cache_file("neighbors_model", ".pkl")
        neighbors_path = cache_file("neighbors", ".npy")
        neighbors_weight_path = cache_file("neighbors_weight", ".npy")
        test_neighbors_path = cache_file("test_neighbors", ".npy")
        test_neighbors_weight_path = cache_file("test_neighbors_weight",
                                                ".npy")

        # The weight file is loaded below as well, so it has to be part of
        # the check; a partial cache falls through to a full recomputation
        # instead of failing inside np.load.
        required = (neighbors_model_path, neighbors_path,
                    neighbors_weight_path, test_neighbors_path)
        if all(os.path.exists(path) for path in required) \
                and not rebuild and not DEBUG:
            logger.info("neighbors and neighbor_weight exist!!!")
            self.neighbors = np.load(neighbors_path)
            self.neighbors_weight = np.load(neighbors_weight_path)
            self.test_neighbors = np.load(test_neighbors_path)
            # Same return shape as the rebuild path so callers can unpack
            # the result regardless of which branch ran.
            return self.neighbors, self.test_neighbors

        logger.info("neighbors and neighbor_weight "
                    "do not exist, preprocessing!")
        train_X = self.get_full_train_X()
        train_num = train_X.shape[0]
        train_y = np.array(self.get_full_train_label())
        test_X = self.get_test_X()
        test_num = test_X.shape[0]
        # Never ask for more neighbors than there are training samples.
        self.max_neighbors = min(len(train_y), self.max_neighbors)
        logger.info("data shape: {}, labeled_num: {}".format(
            str(train_X.shape), sum(train_y != -1)))

        # n_neighbors is passed by keyword: recent scikit-learn releases
        # reject it as a positional argument.
        nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(train_X)
        logger.info("nn construction finished!")
        neighbor_result = nn_fit.kneighbors_graph(nn_fit._fit_X,
                                                  self.max_neighbors,
                                                  mode="distance")
        test_neighbors_result = nn_fit.kneighbors_graph(test_X,
                                                        self.max_neighbors,
                                                        mode="distance")
        logger.info("neighbor_result got!")

        # csr_to_impact_matrix is defined outside this method; it presumably
        # unpacks the CSR distance graph into per-sample index/weight
        # arrays -- TODO confirm against its definition.
        self.neighbors, self.neighbors_weight = self.csr_to_impact_matrix(
            neighbor_result, train_num, self.max_neighbors)
        self.test_neighbors, test_neighbors_weight = self.csr_to_impact_matrix(
            test_neighbors_result, test_num, self.max_neighbors)

        logger.info("preprocessed neighbors got!")

        # save neighbors information
        if save:
            pickle_save_data(neighbors_model_path, nn_fit)
            np.save(neighbors_path, self.neighbors)
            np.save(neighbors_weight_path, self.neighbors_weight)
            np.save(test_neighbors_path, self.test_neighbors)
            np.save(test_neighbors_weight_path, test_neighbors_weight)
        return self.neighbors, self.test_neighbors
示例#6
0
class BaseLabelPropagation(BaseEstimator, ClassifierMixin, metaclass=ABCMeta):
    """Label propagation estimator with a kernel-based affinity graph.

    ``fit`` reads ``self._variant``, which is not set anywhere in this class
    and therefore has to be supplied by a subclass or the caller.

    Parameters
    ----------
    kernel : {'knn', 'rbf', callable}
        Kernel identifier, or a function taking two sample matrices and
        returning a weight matrix.
    gamma : float
        Parameter for rbf kernel
    n_neighbors : integer > 0
        Parameter for knn kernel
    alpha : float
        Clamping factor (stored but not read by the methods of this class)
    max_iter : integer
        Maximum number of iterations allowed
    tol : float
        Convergence tolerance on the summed absolute change between two
        consecutive iterations
    n_jobs : int or None
        Forwarded to ``NearestNeighbors`` for the knn kernel
    """

    def __init__(self,
                 kernel='rbf',
                 gamma=20,
                 n_neighbors=7,
                 alpha=1,
                 max_iter=30,
                 tol=1e-3,
                 n_jobs=None):

        self.max_iter = max_iter
        self.tol = tol

        # kernel parameters
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors

        # clamping factor
        self.alpha = alpha

        self.n_jobs = n_jobs

    def _get_kernel(self, X, y=None):
        """Return the kernel between ``X`` and ``y`` (or ``X`` with itself).

        For the 'knn' kernel with ``y`` given, the result is the array of
        neighbor indices from ``kneighbors`` rather than a weight matrix.

        Raises
        ------
        ValueError
            If ``self.kernel`` is neither 'rbf', 'knn' nor a callable.
        """
        if self.kernel == "rbf":
            if y is None:
                return rbf_kernel(X, X, gamma=self.gamma)
            else:
                return rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == "knn":
            # nn_fit is only assigned in _build_graph, so a missing
            # attribute is treated the same as an explicit None.
            if getattr(self, "nn_fit", None) is None:
                # n_neighbors is passed by keyword: recent scikit-learn
                # releases reject it as a positional argument.
                self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors,
                                               n_jobs=self.n_jobs).fit(X)
            if y is None:
                return self.nn_fit.kneighbors_graph(self.nn_fit._fit_X,
                                                    self.n_neighbors,
                                                    mode='connectivity')
            else:
                return self.nn_fit.kneighbors(y, return_distance=False)
        elif callable(self.kernel):
            if y is None:
                return self.kernel(X, X)
            else:
                return self.kernel(X, y)
        else:
            raise ValueError("%s is not a valid kernel. Only rbf and knn"
                             " or an explicit function "
                             " are supported at this time." % self.kernel)

    def fit(self, X, y):
        """Propagate the known labels in ``y`` over the graph built from ``X``.

        Parameters
        ----------
        X : array-like ,shape = [n_samples, n_features]
            input data matrix

        y : array-like, shape = [n_samples]
            n_labeled_samples (unlabeled = -1)

        Returns
        ----------
        self : returns an instance of self.
        """
        # initialize X_
        self.X_ = X

        # actual graph construction
        graph_matrix = self._build_graph()

        # initialize classes: every distinct value in y except the -1 marker
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes_ = classes

        # set n size
        n_samples, n_classes = len(y), len(classes)

        # boolean mask of the unlabeled samples
        y = np.asarray(y)
        unlabeled = y == -1

        # initialize distributions: one-hot rows for labeled samples
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self._variant == 'propagation':
            y_static[unlabeled] = 0

        # initialize l_previous
        l_previous = np.zeros((self.X_.shape[0], n_classes))

        # add a dimension to unlabeled so it broadcasts across classes
        unlabeled = unlabeled[:, np.newaxis]

        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:
                break

            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_)

            if self._variant == 'propagation':
                normalizer = np.sum(self.label_distributions_,
                                    axis=1)[:, np.newaxis]
                # Rows that received no mass would divide by zero and turn
                # into NaN; leave them as all-zero rows instead.
                normalizer[normalizer == 0] = 1
                self.label_distributions_ /= normalizer
                # Labeled rows are reset to their known distribution.
                self.label_distributions_ = np.where(unlabeled,
                                                     self.label_distributions_,
                                                     y_static)
        else:
            # Loop ran to exhaustion without hitting the tolerance check.
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        # Same zero-row guard as inside the loop.
        normalizer[normalizer == 0] = 1
        self.label_distributions_ /= normalizer

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()
        return self

    def _build_graph(self):
        """Matrix representing a fully connected graph between each sample

        This basic implementation creates a non-stochastic affinity matrix, so
        class distributions will exceed 1 (normalization may be desired).
        """
        if self.kernel == 'knn':
            # Force _get_kernel to fit a fresh neighbor index on self.X_.
            self.nn_fit = None
        affinity_matrix = self._get_kernel(self.X_)
        normalizer = affinity_matrix.sum(axis=0)
        if sparse.isspmatrix(affinity_matrix):
            # NOTE(review): for a scipy sparse matrix sum(axis=0) is
            # presumably 2-D (1 x n), in which case np.diag(...) yields a
            # single value and every stored entry is divided by it --
            # confirm this is the intended normalization.
            affinity_matrix.data /= np.diag(np.array(normalizer))
        else:
            affinity_matrix /= normalizer[:, np.newaxis]
        return affinity_matrix