def __init__(self, N, vectors, coverage_ratio=0.2):
        """
        Performs exact nearest neighbour search on the data set.

        vectors can either be a numpy matrix with all the vectors
        as columns OR a python array containing the individual
        numpy vectors.
        """
        # We need a dict from vector string representation to index
        self.vector_dict = {}
        self.N = N
        self.coverage_ratio = coverage_ratio

        # Get numpy array representation of input
        self.vectors = numpy_array_from_list_or_numpy_array(vectors)

        # Build map from vector string representation to vector
        for index in range(self.vectors.shape[1]):
            self.vector_dict[self.__vector_to_string(
                self.vectors[:, index])] = index

        # Get transposed version of vector matrix, so that the rows
        # are the vectors (needed by cdist)
        vectors_t = numpy.transpose(self.vectors)

        # Determine the indices of query vectors used for comparance
        # with approximated search.
        query_count = numpy.floor(self.coverage_ratio *
                                  self.vectors.shape[1])
        self.query_indices = []
        for k in range(int(query_count)):
            index = numpy.floor(k * (self.vectors.shape[1] / query_count))
            index = min(index, self.vectors.shape[1] - 1)
            self.query_indices.append(int(index))

        print('\nStarting exact search (query set size=%d)...\n' % query_count)

        # For each query vector get radius of closest N neighbours
        self.nearest_radius = {}
        self.exact_search_time_per_vector = 0.0

        for index in self.query_indices:

            v = vectors_t[index, :].reshape(1, self.vectors.shape[0])
            exact_search_start_time = time.time()
            D = cdist(v, vectors_t, 'euclidean')

            # Get radius of closest N neighbours
            self.nearest_radius[index] = scipy.sort(D)[0, N]

            # Save time needed for exact search
            exact_search_time = time.time() - exact_search_start_time
            self.exact_search_time_per_vector += exact_search_time

        print('\Done with exact search...\n')

        # Normalize search time
        self.exact_search_time_per_vector /= float(len(self.query_indices))
    def __init__(self, N, vectors, coverage_ratio=0.2):
        """
        Performs exact nearest neighbour search on the data set.

        vectors can either be a numpy matrix with all the vectors
        as columns OR a python array containing the individual
        numpy vectors.
        """
        # We need a dict from vector string representation to index
        self.vector_dict = {}
        self.N = N
        self.coverage_ratio = coverage_ratio

        # Get numpy array representation of input
        self.vectors = numpy_array_from_list_or_numpy_array(vectors)

        # Build map from vector string representation to vector
        for index in range(self.vectors.shape[1]):
            self.vector_dict[self.__vector_to_string(
                self.vectors[:, index])] = index

        # Get transposed version of vector matrix, so that the rows
        # are the vectors (needed by cdist)
        vectors_t = numpy.transpose(self.vectors)

        # Determine the indices of query vectors used for comparance
        # with approximated search.
        query_count = numpy.floor(self.coverage_ratio *
                                  self.vectors.shape[1])
        self.query_indices = []
        for k in range(int(query_count)):
            index = numpy.floor(k*(self.vectors.shape[1]/query_count))
            index = min(index, self.vectors.shape[1]-1)
            self.query_indices.append(int(index))

        print '\nStarting exact search (query set size=%d)...\n' % query_count

        # For each query vector get radius of closest N neighbours
        self.nearest_radius = {}
        self.exact_search_time_per_vector = 0.0

        for index in self.query_indices:

            v = vectors_t[index, :].reshape(1, self.vectors.shape[0])
            exact_search_start_time = time.time()
            D = cdist(v, vectors_t, 'euclidean')

            # Get radius of closest N neighbours
            self.nearest_radius[index] = scipy.sort(D)[0, N]

            # Save time needed for exact search
            exact_search_time = time.time() - exact_search_start_time
            self.exact_search_time_per_vector += exact_search_time

        print '\Done with exact search...\n'

        # Normalize search time
        self.exact_search_time_per_vector /= float(len(self.query_indices))
示例#3
0
    def __init__(self, N, vectors, coverage_ratio=0.2):
        """
        Performs exact nearest neighbour search on the data set.

        vectors can either be a numpy matrix with all the vectors
        as columns OR a python array containing the individual
        numpy vectors.
        """
        # We need a dict from vector string representation to index
        self.vector_dict = {}
        self.N = N
        self.coverage_ratio = coverage_ratio
        numpy_vectors = numpy_array_from_list_or_numpy_array(vectors)

        # Get numpy array representation of input
        self.vectors = numpy.vstack([unitvec(v) for v in numpy_vectors.T])

        # Build map from vector string representation to vector
        for index, v in enumerate(self.vectors):
            self.vector_dict[self.__vector_to_string(v)] = index

        # Determine the indices of query vectors used for comparance
        # with approximated search.
        query_count = numpy.floor(self.coverage_ratio *
                                  len(self.vectors))
        self.query_indices = []
        for k in range(int(query_count)):
            index = numpy.floor(k * (float(len(self.vectors)) / query_count))
            index = min(index, len(self.vectors) - 1)
            self.query_indices.append(int(index))

        print('\nStarting exact search (query set size=%d)...\n' % query_count)

        # For each query vector get the closest N neighbours
        self.closest = {}
        self.exact_search_time_per_vector = 0.0

        for index in self.query_indices:
            v = self.vectors[index, numpy.newaxis]
            exact_search_start_time = time.time()
            D = cdist(v, self.vectors, 'euclidean')
            self.closest[index] = scipy.argsort(D)[0, 1:N+1]

            # Save time needed for exact search
            exact_search_time = time.time() - exact_search_start_time
            self.exact_search_time_per_vector += exact_search_time

        print('Done with exact search...\n')

        # Normalize search time
        self.exact_search_time_per_vector /= float(len(self.query_indices))
示例#4
0
    def __init__(self, hash_name, projection_count, training_set):
        """
        Computes principal components for training vector set. Uses
        first projection_count principal components for projections.

        Training set must be either a numpy matrix or a list of
        numpy vectors.
        """
        super(PCABinaryProjections, self).__init__(hash_name)
        self.projection_count = projection_count

        # Only do training if training set was specified
        if not training_set is None:
            # Get numpy array representation of input
            training_set = numpy_array_from_list_or_numpy_array(training_set)

            # Get subspace size from training matrix
            self.dim = training_set.shape[0]

            # Get transposed training set matrix for PCA
            training_set_t = numpy.transpose(training_set)

            # Compute principal components
            (eigenvalues, eigenvectors) = perform_pca(training_set_t)

            # Get largest N eigenvalue/eigenvector indices
            largest_eigenvalue_indices = numpy.flipud(
                scipy.argsort(eigenvalues))[:projection_count]

            # Create matrix for first N principal components
            self.components = numpy.zeros((self.dim,
                                           len(largest_eigenvalue_indices)))

            # Put first N principal components into matrix
            for index in range(len(largest_eigenvalue_indices)):
                self.components[:, index] = \
                    eigenvectors[:, largest_eigenvalue_indices[index]]

            # We need the component vectors to be in the rows
            self.components = numpy.transpose(self.components)
        else:
            self.dim = None
            self.components = None

        # This is only used in case we need to process sparse vectors
        self.components_csr = None
示例#5
0
    def __init__(self, hash_name, projection_count, training_set):
        """
        Computes principal components for training vector set. Uses
        first projection_count principal components for projections.

        Training set must be either a numpy matrix or a list of
        numpy vectors.
        """
        super(PCABinaryProjections, self).__init__(hash_name)
        self.projection_count = projection_count

        # Only do training if training set was specified
        if not training_set is None:
            # Get numpy array representation of input
            training_set = numpy_array_from_list_or_numpy_array(training_set)

            # Get subspace size from training matrix
            self.dim = training_set.shape[0]

            # Get transposed training set matrix for PCA
            training_set_t = numpy.transpose(training_set)

            # Compute principal components
            (eigenvalues, eigenvectors) = perform_pca(training_set_t)

            # Get largest N eigenvalue/eigenvector indices
            largest_eigenvalue_indices = numpy.flipud(
                scipy.argsort(eigenvalues))[:projection_count]

            # Create matrix for first N principal components
            self.components = numpy.zeros(
                (self.dim, len(largest_eigenvalue_indices)))

            # Put first N principal components into matrix
            for index in range(len(largest_eigenvalue_indices)):
                self.components[:, index] = \
                    eigenvectors[:, largest_eigenvalue_indices[index]]

            # We need the component vectors to be in the rows
            self.components = numpy.transpose(self.components)
        else:
            self.dim = None
            self.components = None

        # This is only used in case we need to process sparse vectors
        self.components_csr = None
    def __init__(self, hash_name, projection_count, training_set, bin_width):
        """
        Computes principal components for training vector set. Uses
        first projection_count principal components for projections.

        Training set must be either a numpy matrix or a list of
        numpy vectors.
        """
        super(PCADiscretizedProjections, self).__init__(hash_name)
        self.projection_count = projection_count
        self.bin_width = bin_width

        # Get numpy array representation of input
        training_set = numpy_array_from_list_or_numpy_array(training_set)

        # Get subspace size from training matrix
        self.dim = training_set.shape[0]

        # Get transposed training set matrix for PCA
        training_set_t = numpy.transpose(training_set)

        # Compute principal components
        (eigenvalues, eigenvectors) = perform_pca(training_set_t)

        # Get largest N eigenvalue/eigenvector indices
        largest_eigenvalue_indices = numpy.flipud(
            scipy.argsort(eigenvalues))[:projection_count]

        # Create matrix for first N principal components
        self.components = numpy.zeros((self.dim,
                                       len(largest_eigenvalue_indices)))

        # Put first N principal components into matrix
        for index in range(len(largest_eigenvalue_indices)):
            self.components[:, index] = \
                eigenvectors[:, largest_eigenvalue_indices[index]]

        # We need the component vectors to be in the rows
        self.components = numpy.transpose(self.components)
    def __init__(self, hash_name, projection_count, training_set, bin_width):
        """
        Computes principal components for training vector set. Uses
        first projection_count principal components for projections.

        Training set must be either a numpy matrix or a list of
        numpy vectors.
        """
        super(PCADiscretizedProjections, self).__init__(hash_name)
        self.projection_count = projection_count
        self.bin_width = bin_width

        # Get numpy array representation of input
        training_set = numpy_array_from_list_or_numpy_array(training_set)

        # Get subspace size from training matrix
        self.dim = training_set.shape[0]

        # Get transposed training set matrix for PCA
        training_set_t = numpy.transpose(training_set)

        # Compute principal components
        (eigenvalues, eigenvectors) = perform_pca(training_set_t)

        # Get largest N eigenvalue/eigenvector indices
        largest_eigenvalue_indices = numpy.flipud(
            scipy.argsort(eigenvalues))[:projection_count]

        # Create matrix for first N principal components
        self.components = numpy.zeros(
            (self.dim, len(largest_eigenvalue_indices)))

        # Put first N principal components into matrix
        for index in range(len(largest_eigenvalue_indices)):
            self.components[:, index] = \
                eigenvectors[:, largest_eigenvalue_indices[index]]

        # We need the component vectors to be in the rows
        self.components = numpy.transpose(self.components)