Example #1
import numpy as np

# Private scikit-learn internals exercised by this test (import paths as of
# recent scikit-learn releases).
from sklearn.cluster._hierarchical_fast import average_merge, max_merge
from sklearn.utils._fast_dict import IntFloatDict


def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value

    other_keys = np.arange(50, dtype=np.intp)[::2]
    other_values = np.full(50, 0.5)[::2]
    other = IntFloatDict(other_keys, other_values)
    # Complete smoke test
    max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
    average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
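max_merge and average_merge combine two fast dicts key-wise (element-wise max, or a count-weighted average) over keys whose mask entry is nonzero. A minimal sketch of that expected behaviour, assuming the same private sklearn imports as above:

import numpy as np
from sklearn.cluster._hierarchical_fast import average_merge, max_merge
from sklearn.utils._fast_dict import IntFloatDict

a = IntFloatDict(np.array([0, 1], dtype=np.intp), np.array([1.0, 4.0]))
b = IntFloatDict(np.array([1, 2], dtype=np.intp), np.array([2.0, 5.0]))
mask = np.ones(3, dtype=np.intp)  # keys with mask == 0 are dropped

merged = max_merge(a, b, mask=mask, n_a=1, n_b=1)
# The shared key 1 keeps the larger value; keys 0 and 2 pass through.
assert merged[1] == 4.0 and merged[0] == 1.0 and merged[2] == 5.0

avg = average_merge(a, b, mask=mask, n_a=3, n_b=1)
# The shared key 1 gets the count-weighted mean: (3 * 4.0 + 1 * 2.0) / 4.
assert avg[1] == 3.5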
    def _init_inertia_from_scratch(self, global_bbox_id_list):
        pdist = self._computer_pdist(global_bbox_id_list, with_st_const=False)

        # np.empty returns a new array without initializing entries.
        n_samples = len(global_bbox_id_list)
        n_nodes = 2 * n_samples - 1  # total nodes in a full binary merge tree
        A = np.empty(n_nodes, dtype=object)
        inertia = list()

        print('Constructing cannot-link constraints for detections '
              'in the same image ...')
        mask = self.estimate_same_image_mask(global_bbox_id_list)

        start_time = time.time()
        for i in range(n_samples):
            row = np.where(mask[i, :] != 0)[0]  # np.where returns a tuple
            data = pdist[i, row]
            A[i] = IntFloatDict(np.array(row, dtype=np.intp),
                                np.array(data, dtype=np.float64))  # fast dict
            # We keep only the upper triangular for the heap.
            # Generator expressions are faster than arrays on the following.
            inertia.extend(
                _hierarchical.WeightedEdge(d, i, r) for r, d in zip(row, data)
                if r > i)
        # heapify converts an arbitrary list into a binary min-heap in place
        # (a heap is only partially ordered, not a sorted list)
        heapify(inertia)
        print("Initializing A and inertia done in  %s seconds ..." %
              (time.time() - start_time))
        print(f'len(inertia) = {len(inertia)} ... ')

        return inertia, A, pdist
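The inertia list built above is an ordinary binary heap of WeightedEdge objects, ordered by their weight attribute. A minimal sketch of that ordering, assuming the module path used by recent scikit-learn:

from heapq import heapify, heappop

from sklearn.cluster import _hierarchical_fast as _hierarchical

edges = [_hierarchical.WeightedEdge(w, a, b)
         for w, a, b in [(0.9, 0, 1), (0.2, 1, 2), (0.5, 0, 2)]]
heapify(edges)  # min-heap keyed on edge weight

best = heappop(edges)  # the lightest edge comes out first
assert (best.weight, best.a, best.b) == (0.2, 1, 2)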
Example #3
import numpy as np

from sklearn.utils._fast_dict import IntFloatDict


def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value
    assert len(d) == len(keys)

    d.append(120, 3.)
    assert d[120] == 3.0
    assert len(d) == len(keys) + 1
    for i in range(2000):
        d.append(i + 1000, 4.0)
    assert d[1100] == 4.0
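IntFloatDict also supports iteration, yielding (key, value) pairs the way linkage_tree consumes coord_col in Example #6 below. A small sketch of that behaviour, under the same import assumption:

import numpy as np

from sklearn.utils._fast_dict import IntFloatDict

d = IntFloatDict(np.array([3, 1], dtype=np.intp), np.array([0.3, 0.1]))
d.append(2, 0.2)  # append grows the dict beyond its initial keys
assert len(d) == 3
# Iterating yields (key, value) pairs.
assert sorted(d) == [(1, 0.1), (2, 0.2), (3, 0.3)]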
Example #4
import numpy as np

from sklearn.utils._fast_dict import IntFloatDict, argmin


def test_int_float_dict_argmin():
    # Test the argmin implementation on the IntFloatDict
    keys = np.arange(100, dtype=np.intp)
    values = np.arange(100, dtype=np.float64)
    d = IntFloatDict(keys, values)
    assert argmin(d) == (0, 0)
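argmin scans the dict and returns the (key, value) pair with the smallest value, which is why the test above gets (0, 0). A sketch with a less degenerate input, same import assumption as above:

import numpy as np

from sklearn.utils._fast_dict import IntFloatDict, argmin

d = IntFloatDict(np.array([5, 7, 9], dtype=np.intp),
                 np.array([0.4, 0.1, 0.8]))
# The smallest value is 0.1, stored under key 7.
assert argmin(d) == (7, 0.1)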
    def _init_inertia_from_clusters(self, parent, children, used_node, pdist,
                                    clusters, linkage):
        # np.empty returns a new array without initializing entries.
        n_samples = pdist.shape[0]
        n_nodes = 2 * n_samples - 1  # total nodes in a full binary merge tree
        A = np.empty(n_nodes, dtype=object)
        inertia = list()

        start_time = time.time()
        labels = self.get_complete_node_list(parent=parent,
                                             n_samples=n_samples,
                                             used_node=used_node)
        num_clusters = len(labels)  # for this batch
        cluster_pdist_dict = {
            x: []
            for x in labels
        }  # maps each node label to a list of [neighbor_label, dist] pairs

        for k in range(num_clusters - 1):
            n_k = labels[k]
            rows = self.get_leaves(children=children,
                                   n_leaves=n_samples,
                                   node_id=n_k)

            for l in range(k + 1, num_clusters):
                n_l = labels[l]
                combined_cluster = clusters[n_k] + clusters[n_l]

                # TODO: do we need to check image ids here?
                image_ids = [x[0] for x in combined_cluster]
                # Detections from the same image must not be merged
                # (cannot-link), so skip this pair if the two clusters
                # overlap in image ids.
                if not self.is_unique_list(image_ids):
                    continue
                cols = self.get_leaves(children=children,
                                       n_leaves=n_samples,
                                       node_id=n_l)
                assert len(clusters[n_k]) > 0 and len(clusters[n_l]) > 0
                dist = AhcMetric.cluster_dist_from_pdist_rows_cols(
                    pdist=pdist, rows=rows, cols=cols, linkage=linkage)
                # Cosine similarity lies in [-1, 1], so the distance
                # 1 - similarity lies in [0, 2].
                assert dist <= 2.0

                cluster_pdist_dict[n_k].append([n_l, dist])
                cluster_pdist_dict[n_l].append([n_k, dist])
                # We keep only the upper triangular for the heap
                # Generator expressions are faster than arrays on the following
                if n_l < n_k:
                    inertia.append(_hierarchical.WeightedEdge(dist, n_l, n_k))
                else:
                    inertia.append(_hierarchical.WeightedEdge(dist, n_k, n_l))

        for i in labels:
            if len(cluster_pdist_dict[i]) == 0:
                A[i] = 0
            else:
                # index 0 is the neighbor label, index 1 is the distance
                A[i] = IntFloatDict(
                    np.array([x[0] for x in cluster_pdist_dict[i]],
                             dtype=np.intp),
                    np.array([x[1] for x in cluster_pdist_dict[i]],
                             dtype=np.float64))  # fast dict

        heapify(inertia)
        del cluster_pdist_dict, labels
        print("Initializing A and inertia done in %s ..." %
              (time.time() - start_time))
        return inertia, A
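AhcMetric.cluster_dist_from_pdist_rows_cols is not shown in these examples; from its call site it presumably reduces the block of pairwise distances between the two clusters according to the linkage. A hedged numpy sketch of that presumed behaviour (the function body below is an assumption, not the author's code):

import numpy as np

def cluster_dist_from_pdist_rows_cols(pdist, rows, cols, linkage):
    # Presumed behaviour: reduce the rows-by-cols block of the pairwise
    # distance matrix with the chosen linkage criterion.
    block = pdist[np.ix_(rows, cols)]
    if linkage == 'complete':
        return block.max()
    if linkage == 'average':
        return block.mean()
    if linkage == 'single':
        return block.min()
    raise ValueError('unknown linkage: %s' % linkage)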
Example #6
import warnings
from heapq import heapify, heappop, heappush

import numpy as np

# Private scikit-learn internals this function relies on. Import paths are
# as of recent scikit-learn releases; _fix_connectivity and
# _single_linkage_tree live next to the original linkage_tree in
# sklearn.cluster._agglomerative.
from sklearn.cluster import _hierarchical_fast as _hierarchical
from sklearn.cluster._agglomerative import (_fix_connectivity,
                                            _single_linkage_tree)
from sklearn.metrics.pairwise import paired_distances
from sklearn.utils._fast_dict import IntFloatDict


def linkage_tree(X,
                 connectivity=None,
                 n_clusters=None,
                 linkage='complete',
                 affinity="euclidean",
                 return_distance=False):
    """Linkage agglomerative clustering based on a Feature matrix.
    The inertia matrix uses a Heapq-based representation.
    This is the structured version, that takes into account some topological
    structure between samples.
    Read more in the :ref:`User Guide <hierarchical_clustering>`.
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Feature matrix representing n_samples samples to be clustered.
    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e., the algorithm is unstructured.
    n_clusters : int (optional)
        Stop the construction of the tree early at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.
    linkage : {"average", "complete", "single"}, optional, default: "complete"
        Which linkage criterion to use. The linkage criterion determines
        which distance to use between sets of observations.
            - average uses the average of the distances of each observation of
              the two sets
            - complete or maximum linkage uses the maximum distances between
              all observations of the two sets.
            - single uses the minimum of the distances between all observations
              of the two sets.
    affinity : string or callable, optional, default: "euclidean"
        Which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired_distances (see sklearn.metrics.pairwise).
    return_distance : bool, default False
        Whether or not to return the distances between the clusters.
    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`.
    n_connected_components : int
        The number of connected components in the graph.
    n_leaves : int
        The number of leaves in the tree.
    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified; otherwise 'None' is returned.
    distances : ndarray, shape (n_nodes-1,)
        Returned when return_distance is set to True.
        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.
    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {
        'complete': _hierarchical.max_merge,
        'average': _hierarchical.average_merge,
        'single': None
    }  # Single linkage is handled differently
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError('Unknown linkage option, linkage should be one '
                         'of %s, but %s was given' %
                         (linkage_choices.keys(), linkage))

    if affinity == 'cosine' and np.any(~np.any(X, axis=1)):
        raise ValueError(
            'Cosine affinity cannot be used when X contains zero vectors')

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn(
                'Partial build of the tree is implemented '
                'only for structured clustering (i.e. with '
                'explicit connectivity). The algorithm '
                'will build the full tree and only '
                'retain the lower branches required '
                'for the specified number of clusters',
                stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by pdist: it is a flat array containing the upper triangular of
            # the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.intp, copy=False)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    connectivity, n_connected_components = _fix_connectivity(X,
                                                             connectivity,
                                                             affinity=affinity)
    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    if affinity == 'precomputed':
        # The sklearn-internal _astype_copy_false helper only avoided an
        # unnecessary copy; copy=False does the same here.
        distances = X[connectivity.row,
                      connectivity.col].astype('float64', copy=False)
    else:
        # FIXME We compute all the distances, while we could have only computed
        # the "interesting" distances
        distances = paired_distances(X[connectivity.row],
                                     X[connectivity.col],
                                     metric=affinity)
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if linkage == 'single':
        return _single_linkage_tree(connectivity, n_samples, n_nodes,
                                    n_clusters, n_connected_components,
                                    return_distance)

    if return_distance:
        distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(
            _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data)
            if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # Return a numpy array for efficient caching.
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_connected_components, n_leaves, parent, distances
    return children, n_connected_components, n_leaves, parent
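A minimal usage sketch for the function above, assuming the sklearn imports listed at the top of this example are available (kneighbors_graph is used only to build the sparse connectivity input):

import numpy as np
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).rand(20, 3)
connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
children, n_cc, n_leaves, parents, distances = linkage_tree(
    X, connectivity=connectivity, linkage='average', return_distance=True)
# Merge k joins children[k][0] and children[k][1] into node n_leaves + k.
assert children.shape == (n_leaves - 1, 2)
assert distances.shape == (n_leaves - 1,)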