def get_dist_matrix(similarity_graph, base_path, image_count,
                    similar_count):
    """Return the all-pairs shortest-path matrix, cached on disk.

    The result is stored as
    ``<base_path>/dist_matrix<image_count><similar_count>.npy``. If that
    file already exists it is loaded and returned and ``similarity_graph``
    is not looked at; otherwise the matrix is computed, saved there and
    returned.

    Parameters
    ----------
    similarity_graph : array with a ``.T`` attribute
        Edge weights. It is symmetrised with an element-wise maximum of
        the matrix and its transpose before path search.
    base_path : str
        Directory holding the cache file (joined with ``"/"``).
    image_count, similar_count
        Values whose ``str()`` forms are appended to the cache file name.

    Returns
    -------
    dist_matrix
        Output of ``shortest_path(..., method="J", directed=False)``, or
        the array previously saved for the same file name.
    """
    # NOTE(review): the two counts are concatenated without a separator,
    # so e.g. (1, 23) and (12, 3) map to the same cache file.
    cache_path = (base_path + "/" + "dist_matrix" + str(image_count) +
                  str(similar_count) + ".npy")

    if Path(cache_path).is_file():
        return np.load(cache_path)

    # Make the graph symmetric, go through a CSR representation, and let
    # validate_graph normalise it to float64 as an undirected graph.
    symmetric = np.maximum(similarity_graph, similarity_graph.T)
    graph = validate_graph(csr_matrix(symmetric), False, np.float64)

    dist_matrix = shortest_path(graph, method="J", directed=False)
    np.save(cache_path, dist_matrix)
    return dist_matrix
Exemplo n.º 2
0
def minimum_spanning_tree_K(csgraph, k=1, overwrite=False):
    """Build a CSR tree from ``csgraph`` via ``min_spanning_tree_K``.

    Parameters
    ----------
    csgraph : graph accepted by ``validate_graph``
        Validated as a directed float64 graph with sparse output.
    k : int, optional
        Forwarded unchanged to ``min_spanning_tree_K`` (default 1).
    overwrite : bool, optional
        When False (default) a sparse input is copied during validation;
        when True no copy is requested.

    Returns
    -------
    csr_matrix
        An ``(N, N)`` matrix rebuilt from the validated graph's
        ``data``/``indices``/``indptr`` after the helper has run, with
        zero entries eliminated.
    """
    graph = validate_graph(csgraph, True, np.float64,
                           dense_output=False,
                           copy_if_sparse=not overwrite)
    n_nodes = graph.shape[0]
    weights, col_idx, row_ptr = graph.data, graph.indices, graph.indptr

    # Work arrays handed to the helper: edge order by ascending weight,
    # a per-edge row buffer, and per-node predecessor / rank arrays.
    edge_order = np.argsort(weights).astype(np.int32)
    edge_rows = np.zeros(len(weights), dtype=np.int32)
    parents = np.arange(n_nodes, dtype=np.int32)
    ranks = np.zeros(n_nodes, dtype=np.int32)

    # NOTE(review): min_spanning_tree_K is defined outside this block; it
    # presumably zeroes rejected edges in ``weights`` in place, since the
    # result is rebuilt from the same arrays -- confirm against the helper.
    min_spanning_tree_K(weights, col_idx, row_ptr, edge_order,
                        edge_rows, parents, ranks, k)

    tree = csr_matrix((weights, col_idx, row_ptr), (n_nodes, n_nodes))
    tree.eliminate_zeros()
    return tree
Exemplo n.º 3
0
    def fit(self, X, y=None):
        """Fit the clustering model.

        Builds a distance graph from ``X``, computes its minimum spanning
        tree, removes tree edges according to ``self.cutoff`` and/or
        ``self.cutoff_scale``, and labels the connected components of what
        remains. Components with fewer than ``self.min_cluster_size``
        members are labelled ``-1`` and their edges are dropped.

        Parameters
        ----------
        X : array_like
            the data to be clustered: shape = [n_samples, n_features].
            When ``self.metric == 'precomputed'`` it is instead handed to
            ``validate_graph`` as the graph itself.
        y : ignored
            Not used by this method.

        Returns
        -------
        self
            With ``X_fit_``, ``full_tree_``, ``labels_`` and
            ``cluster_graph_`` set.

        Raises
        ------
        ValueError
            If both ``cutoff`` and ``cutoff_scale`` are None, or if
            ``cutoff`` is not a non-negative number.
        """
        if self.cutoff is None and self.cutoff_scale is None:
            raise ValueError("Must specify either cutoff or cutoff_scale")

        # Compute the distance-based graph G from the points in X
        if self.metric == 'precomputed':
            # Input is already a graph. Copy if sparse
            # so we can overwrite for efficiency below.
            self.X_fit_ = None
            G = validate_graph(X,
                               directed=True,
                               csr_output=True,
                               dense_output=False,
                               copy_if_sparse=True,
                               null_value_in=np.inf)
        elif not self.approximate:
            X = check_array(X)
            self.X_fit_ = X
            kwds = self.metric_params or {}
            G = pairwise_distances(X, metric=self.metric, **kwds)
            G = validate_graph(G,
                               directed=True,
                               csr_output=True,
                               dense_output=False,
                               copy_if_sparse=True,
                               null_value_in=np.inf)
        else:
            # generate a sparse graph using n_neighbors of each point
            X = check_array(X)
            self.X_fit_ = X
            n_neighbors = min(self.n_neighbors, X.shape[0] - 1)
            G = kneighbors_graph(X,
                                 n_neighbors=n_neighbors,
                                 mode='distance',
                                 metric=self.metric,
                                 metric_params=self.metric_params)

        # HACK to keep explicit zeros (minimum spanning tree removes them):
        # temporarily replace them with a value far below the smallest
        # positive weight. If the graph has no positive weight at all
        # (single sample, all-identical points) any positive stand-in
        # works, and taking ``min()`` of an empty array would raise.
        positive = G.data[G.data > 0]
        zero_fillin = positive.min() * 1E-8 if positive.size else 1.0
        G.data[G.data == 0] = zero_fillin

        # Compute the minimum spanning tree of this graph
        self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

        # undo the hack to bring back explicit zeros; operating on the
        # stored values directly keeps the sparsity structure untouched
        tree_data = self.full_tree_.data
        tree_data[tree_data == zero_fillin] = 0

        # Partition the data by the cutoff: ``i_cut`` is the number of
        # shortest tree edges to keep.
        max_edges = G.shape[0] - 1
        if self.cutoff is None:
            i_cut = max_edges
        elif 0 <= self.cutoff < 1:
            # fractional cutoff: fraction of edges to remove
            i_cut = int((1 - self.cutoff) * max_edges)
        elif self.cutoff >= 1:
            # absolute cutoff: number of edges to remove
            i_cut = int(max_edges - self.cutoff)
        else:
            raise ValueError('self.cutoff must be positive, not {0}'
                             ''.format(self.cutoff))

        # create the mask; we zero-out values where the mask is True
        n_tree_edges = len(self.full_tree_.data)
        if i_cut < 0:
            mask = np.ones(n_tree_edges, dtype=bool)
        elif i_cut >= n_tree_edges:
            mask = np.zeros(n_tree_edges, dtype=bool)
        else:
            mask = np.ones(n_tree_edges, dtype=bool)
            part = np.argpartition(self.full_tree_.data, i_cut)
            mask[part[:i_cut]] = False

        # additionally cut values above the ``cutoff_scale``
        if self.cutoff_scale is not None:
            mask |= (self.full_tree_.data > self.cutoff_scale)

        # Trim the tree
        cluster_graph = self.full_tree_.copy()

        # Eliminate zeros from cluster_graph for efficiency.
        # We want to do this:
        #    cluster_graph.data[mask] = 0
        #    cluster_graph.eliminate_zeros()
        # but there could be explicit zeros in our data!
        # So we call eliminate_zeros() with a stand-in data array,
        # then replace the data when we're finished.
        original_data = cluster_graph.data
        cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
        cluster_graph.data[mask] = 0
        cluster_graph.eliminate_zeros()
        cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

        # find connected components
        n_components, labels = connected_components(cluster_graph,
                                                    directed=False)

        # remove clusters with fewer than min_cluster_size
        too_small = np.bincount(labels) < self.min_cluster_size
        if too_small.any():
            labels[too_small[labels]] = -1
            # re-number the surviving clusters consecutively from 0
            _, labels = np.unique(labels, return_inverse=True)
            labels -= 1  # keep -1 labels the same

        # update cluster_graph by eliminating non-clusters
        # operationally, this means zeroing-out rows & columns where
        # the label is negative.
        keep = sparse.eye(len(labels))
        keep.data[0, labels < 0] = 0

        # we could just do this:
        #   cluster_graph = keep * cluster_graph * keep
        # but we want to be able to eliminate the zeros, so we use
        # the same indexing trick as above
        original_data = cluster_graph.data
        cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
        cluster_graph = keep * cluster_graph * keep
        cluster_graph.eliminate_zeros()
        cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

        self.labels_ = labels
        self.cluster_graph_ = cluster_graph
        return self
Exemplo n.º 4
0
    def fit(self, X, y=None):
        """Fit the clustering model.

        Builds a distance graph from ``X``, computes its minimum spanning
        tree, removes every tree edge longer than a threshold, and labels
        the connected components of what remains.

        The threshold comes from ``self._hermite_threshold`` when
        ``self.threshold == 'hermite'`` and from
        ``self._histogram_threshold`` otherwise; ``threshold`` is read
        from the estimator, it is not an argument of this method.

        Parameters
        ----------
        X : array_like
            the data to be clustered: shape = [n_samples, n_features].
            When ``self.metric == 'precomputed'`` it is instead handed to
            ``validate_graph`` as the graph itself.
        y : ignored
            Not used by this method.

        Returns
        -------
        self
            With ``X_fit_``, ``full_tree_``, ``cluster_graph_``,
            ``n_components_`` and ``labels_`` set.
        """
        # Compute the distance-based graph G from the points in X
        if self.metric == 'precomputed':
            # Input is already a graph. Copy if sparse
            # so we can overwrite for efficiency below.
            self.X_fit_ = None
            G = validate_graph(X,
                               directed=True,
                               csr_output=True,
                               dense_output=False,
                               copy_if_sparse=True,
                               null_value_in=np.inf)
        elif not self.approximate:
            X = check_array(X)
            self.X_fit_ = X
            kwds = self.metric_params or {}
            G = pairwise_distances(X, metric=self.metric, **kwds)
            G = validate_graph(G,
                               directed=True,
                               csr_output=True,
                               dense_output=False,
                               copy_if_sparse=True,
                               null_value_in=np.inf)
        else:
            # generate a sparse graph using n_neighbors of each point
            X = check_array(X)
            self.X_fit_ = X
            n_neighbors = min(self.n_neighbors, X.shape[0] - 1)
            G = kneighbors_graph(X,
                                 n_neighbors=n_neighbors,
                                 mode='distance',
                                 metric=self.metric,
                                 metric_params=self.metric_params)

        # HACK to keep explicit zeros (minimum spanning tree removes them):
        # temporarily replace them with a value far below the smallest
        # positive weight. If the graph has no positive weight at all
        # (single sample, all-identical points) any positive stand-in
        # works, and taking ``min()`` of an empty array would raise.
        positive = G.data[G.data > 0]
        zero_fillin = positive.min() * 1E-8 if positive.size else 1.0
        G.data[G.data == 0] = zero_fillin

        # Compute the minimum spanning tree of this graph
        self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

        # undo the hack to bring back explicit zeros; operating on the
        # stored values directly keeps the sparsity structure untouched
        tree_data = self.full_tree_.data
        tree_data[tree_data == zero_fillin] = 0

        # Pick the longest edge length that is allowed to survive
        if self.threshold == 'hermite':
            max_edge = self._hermite_threshold(self.full_tree_)
        else:
            max_edge = self._histogram_threshold(self.full_tree_)

        # True where an edge must be removed
        mask = self.full_tree_.data > max_edge

        # Drop the masked edges without losing explicit zeros: run
        # eliminate_zeros() on stand-in 1-based indices, then map the
        # surviving indices back to the real weights.
        cluster_graph = self.full_tree_.copy()
        original_data = cluster_graph.data
        cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
        cluster_graph.data[mask] = 0
        cluster_graph.eliminate_zeros()
        cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

        self.cluster_graph_ = cluster_graph
        self.n_components_, self.labels_ = connected_components(cluster_graph,
                                                                directed=False)
        return self
Exemplo n.º 5
0
    def fit(self, X, y=None):
        """Fit the clustering model.

        Builds a distance graph from ``X``, computes its minimum spanning
        tree, removes tree edges according to ``self.cutoff`` and/or
        ``self.cutoff_scale``, and labels the connected components of what
        remains. Components with fewer than ``self.min_cluster_size``
        members are labelled ``-1`` and their edges are dropped.

        Parameters
        ----------
        X : array_like
            the data to be clustered: shape = [n_samples, n_features].
            When ``self.metric == 'precomputed'`` it is instead handed to
            ``validate_graph`` as the graph itself.
        y : ignored
            Not used by this method.

        Returns
        -------
        self
            With ``X_fit_``, ``full_tree_``, ``labels_`` and
            ``cluster_graph_`` set.

        Raises
        ------
        ValueError
            If both ``cutoff`` and ``cutoff_scale`` are None, or if
            ``cutoff`` is not a non-negative number.
        """
        if self.cutoff is None and self.cutoff_scale is None:
            raise ValueError("Must specify either cutoff or cutoff_scale")

        # Compute the distance-based graph G from the points in X
        if self.metric == 'precomputed':
            # Input is already a graph. Copy if sparse
            # so we can overwrite for efficiency below.
            self.X_fit_ = None
            G = validate_graph(X, directed=True,
                               csr_output=True, dense_output=False,
                               copy_if_sparse=True, null_value_in=np.inf)
        elif not self.approximate:
            X = check_array(X)
            self.X_fit_ = X
            kwds = self.metric_params or {}
            G = pairwise_distances(X, metric=self.metric, **kwds)
            G = validate_graph(G, directed=True,
                               csr_output=True, dense_output=False,
                               copy_if_sparse=True, null_value_in=np.inf)
        else:
            # generate a sparse graph using n_neighbors of each point
            X = check_array(X)
            self.X_fit_ = X
            n_neighbors = min(self.n_neighbors, X.shape[0] - 1)
            G = kneighbors_graph(X, n_neighbors=n_neighbors,
                                 mode='distance',
                                 metric=self.metric,
                                 metric_params=self.metric_params)

        # HACK to keep explicit zeros (minimum spanning tree removes them):
        # temporarily replace them with a value far below the smallest
        # positive weight. If the graph has no positive weight at all
        # (single sample, all-identical points) any positive stand-in
        # works, and taking ``min()`` of an empty array would raise.
        positive = G.data[G.data > 0]
        zero_fillin = positive.min() * 1E-8 if positive.size else 1.0
        G.data[G.data == 0] = zero_fillin

        # Compute the minimum spanning tree of this graph
        self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

        # undo the hack to bring back explicit zeros; operating on the
        # stored values directly keeps the sparsity structure untouched
        tree_data = self.full_tree_.data
        tree_data[tree_data == zero_fillin] = 0

        # Partition the data by the cutoff: ``i_cut`` is the number of
        # shortest tree edges to keep.
        max_edges = G.shape[0] - 1
        if self.cutoff is None:
            i_cut = max_edges
        elif 0 <= self.cutoff < 1:
            # fractional cutoff: fraction of edges to remove
            i_cut = int((1 - self.cutoff) * max_edges)
        elif self.cutoff >= 1:
            # absolute cutoff: number of edges to remove
            i_cut = int(max_edges - self.cutoff)
        else:
            raise ValueError('self.cutoff must be positive, not {0}'
                             ''.format(self.cutoff))

        # create the mask; we zero-out values where the mask is True
        n_tree_edges = len(self.full_tree_.data)
        if i_cut < 0:
            mask = np.ones(n_tree_edges, dtype=bool)
        elif i_cut >= n_tree_edges:
            mask = np.zeros(n_tree_edges, dtype=bool)
        else:
            mask = np.ones(n_tree_edges, dtype=bool)
            part = np.argpartition(self.full_tree_.data, i_cut)
            mask[part[:i_cut]] = False

        # additionally cut values above the ``cutoff_scale``
        if self.cutoff_scale is not None:
            mask |= (self.full_tree_.data > self.cutoff_scale)

        # Trim the tree
        cluster_graph = self.full_tree_.copy()

        # Eliminate zeros from cluster_graph for efficiency.
        # We want to do this:
        #    cluster_graph.data[mask] = 0
        #    cluster_graph.eliminate_zeros()
        # but there could be explicit zeros in our data!
        # So we call eliminate_zeros() with a stand-in data array,
        # then replace the data when we're finished.
        original_data = cluster_graph.data
        cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
        cluster_graph.data[mask] = 0
        cluster_graph.eliminate_zeros()
        cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

        # find connected components
        n_components, labels = connected_components(cluster_graph,
                                                    directed=False)

        # remove clusters with fewer than min_cluster_size
        too_small = np.bincount(labels) < self.min_cluster_size
        if too_small.any():
            labels[too_small[labels]] = -1
            # re-number the surviving clusters consecutively from 0
            _, labels = np.unique(labels, return_inverse=True)
            labels -= 1  # keep -1 labels the same

        # update cluster_graph by eliminating non-clusters
        # operationally, this means zeroing-out rows & columns where
        # the label is negative.
        keep = sparse.eye(len(labels))
        keep.data[0, labels < 0] = 0

        # we could just do this:
        #   cluster_graph = keep * cluster_graph * keep
        # but we want to be able to eliminate the zeros, so we use
        # the same indexing trick as above
        original_data = cluster_graph.data
        cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
        cluster_graph = keep * cluster_graph * keep
        cluster_graph.eliminate_zeros()
        cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

        self.labels_ = labels
        self.cluster_graph_ = cluster_graph
        return self