def __init__(self, N, vectors, coverage_ratio=0.2):
    """
    Performs exact nearest neighbour search on the data set.

    vectors can either be a numpy matrix with all the vectors
    as columns OR a python array containing the individual
    numpy vectors.
    """
    # We need a dict from vector string representation to index
    self.vector_dict = {}
    self.N = N
    self.coverage_ratio = coverage_ratio

    # Get numpy array representation of input
    self.vectors = numpy_array_from_list_or_numpy_array(vectors)

    # Build map from vector string representation to vector
    for index in range(self.vectors.shape[1]):
        self.vector_dict[self.__vector_to_string(
            self.vectors[:, index])] = index

    # Get transposed version of vector matrix, so that the rows
    # are the vectors (needed by cdist)
    vectors_t = numpy.transpose(self.vectors)

    # Determine the indices of query vectors used for comparison
    # with approximated search.
    query_count = numpy.floor(self.coverage_ratio *
                              self.vectors.shape[1])
    self.query_indices = []
    for k in range(int(query_count)):
        index = numpy.floor(k * (self.vectors.shape[1] / query_count))
        index = min(index, self.vectors.shape[1] - 1)
        self.query_indices.append(int(index))

    print('\nStarting exact search (query set size=%d)...\n'
          % query_count)

    # For each query vector get radius of closest N neighbours
    self.nearest_radius = {}
    self.exact_search_time_per_vector = 0.0

    for index in self.query_indices:
        v = vectors_t[index, :].reshape(1, self.vectors.shape[0])
        exact_search_start_time = time.time()
        D = cdist(v, vectors_t, 'euclidean')

        # Get radius of closest N neighbours
        self.nearest_radius[index] = numpy.sort(D)[0, N]

        # Save time needed for exact search
        exact_search_time = time.time() - exact_search_start_time
        self.exact_search_time_per_vector += exact_search_time

    print('\nDone with exact search...\n')

    # Normalize search time
    self.exact_search_time_per_vector /= float(len(self.query_indices))
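# A minimal standalone sketch of the core measurement performed above: for one
# query vector, compute Euclidean distances to all vectors with cdist and take
# the distance of the N-th nearest neighbour as the "radius". The sample data
# and parameter values below are illustrative only and not part of the class.
import time
import numpy
from scipy.spatial.distance import cdist

vectors_t = numpy.random.randn(1000, 32)     # rows are vectors
query = vectors_t[0].reshape(1, -1)          # query is itself part of the set
N = 10
start = time.time()
D = cdist(query, vectors_t, 'euclidean')     # shape (1, 1000)
radius = numpy.sort(D)[0, N]                 # N-th nearest (index 0 is the query itself)
print('radius=%f, took %.6fs' % (radius, time.time() - start))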
def __init__(self, N, vectors, coverage_ratio=0.2):
    """
    Performs exact nearest neighbour search on the data set.

    vectors can either be a numpy matrix with all the vectors
    as columns OR a python array containing the individual
    numpy vectors.
    """
    # We need a dict from vector string representation to index
    self.vector_dict = {}
    self.N = N
    self.coverage_ratio = coverage_ratio
    numpy_vectors = numpy_array_from_list_or_numpy_array(vectors)

    # Get numpy array representation of input
    self.vectors = numpy.vstack([unitvec(v) for v in numpy_vectors.T])

    # Build map from vector string representation to vector
    for index, v in enumerate(self.vectors):
        self.vector_dict[self.__vector_to_string(v)] = index

    # Determine the indices of query vectors used for comparison
    # with approximated search.
    query_count = numpy.floor(self.coverage_ratio * len(self.vectors))
    self.query_indices = []
    for k in range(int(query_count)):
        index = numpy.floor(k * (float(len(self.vectors)) / query_count))
        index = min(index, len(self.vectors) - 1)
        self.query_indices.append(int(index))

    print('\nStarting exact search (query set size=%d)...\n'
          % query_count)

    # For each query vector get the closest N neighbours
    self.closest = {}
    self.exact_search_time_per_vector = 0.0

    for index in self.query_indices:
        v = self.vectors[index, numpy.newaxis]
        exact_search_start_time = time.time()
        D = cdist(v, self.vectors, 'euclidean')

        # Get indices of the N closest neighbours
        # (skip position 0, which is the query vector itself)
        self.closest[index] = numpy.argsort(D)[0, 1:N + 1]

        # Save time needed for exact search
        exact_search_time = time.time() - exact_search_start_time
        self.exact_search_time_per_vector += exact_search_time

    print('Done with exact search...\n')

    # Normalize search time
    self.exact_search_time_per_vector /= float(len(self.query_indices))
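# A minimal standalone sketch of the variant above: vectors are normalized to
# unit length (here with an inline expression standing in for unitvec, which
# is assumed to divide by the Euclidean norm), and for each query the indices
# of the N closest neighbours are read off argsort, skipping position 0 (the
# query itself). All names below are illustrative.
import numpy
from scipy.spatial.distance import cdist

data = numpy.random.randn(500, 16)
data = data / numpy.linalg.norm(data, axis=1, keepdims=True)  # rows as unit vectors
query_index = 42
N = 5
v = data[query_index, numpy.newaxis]       # shape (1, 16)
D = cdist(v, data, 'euclidean')            # distances to all rows, shape (1, 500)
closest = numpy.argsort(D)[0, 1:N + 1]     # N nearest, excluding the query itself
print(closest)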
def __init__(self, hash_name, projection_count, training_set):
    """
    Computes principal components for the training vector set. Uses the
    first projection_count principal components for projections.

    The training set must be either a numpy matrix or a list of
    numpy vectors.
    """
    super(PCABinaryProjections, self).__init__(hash_name)
    self.projection_count = projection_count

    # Only do training if a training set was specified
    if training_set is not None:
        # Get numpy array representation of input
        training_set = numpy_array_from_list_or_numpy_array(training_set)

        # Get subspace size from training matrix
        self.dim = training_set.shape[0]

        # Get transposed training set matrix for PCA
        training_set_t = numpy.transpose(training_set)

        # Compute principal components
        (eigenvalues, eigenvectors) = perform_pca(training_set_t)

        # Get indices of the largest N eigenvalues/eigenvectors
        largest_eigenvalue_indices = numpy.flipud(
            numpy.argsort(eigenvalues))[:projection_count]

        # Create matrix for the first N principal components
        self.components = numpy.zeros((self.dim,
                                       len(largest_eigenvalue_indices)))

        # Put the first N principal components into the matrix
        for index in range(len(largest_eigenvalue_indices)):
            self.components[:, index] = \
                eigenvectors[:, largest_eigenvalue_indices[index]]

        # We need the component vectors to be in the rows
        self.components = numpy.transpose(self.components)
    else:
        self.dim = None
        self.components = None

    # This is only used in case we need to process sparse vectors
    self.components_csr = None
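# The hashing step itself is not shown in this snippet. Given that the
# component vectors end up stored as rows, a binary projection hash would
# typically project a vector onto the components and keep one sign bit per
# component. The sketch below is a hedged illustration of that idea, not the
# class's actual method; all names and values are made up for the example.
import numpy

def binary_projection_key(components, v):
    # components: (projection_count, dim), v: (dim,)
    projection = numpy.dot(components, v)
    return ''.join('1' if x > 0.0 else '0' for x in projection)

# Example: 4 projection directions in a 16-dimensional space
rng = numpy.random.default_rng(0)
components = rng.standard_normal((4, 16))
vector = rng.standard_normal(16)
print(binary_projection_key(components, vector))   # e.g. '1010'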
def __init__(self, hash_name, projection_count, training_set, bin_width):
    """
    Computes principal components for the training vector set. Uses the
    first projection_count principal components for projections.

    The training set must be either a numpy matrix or a list of
    numpy vectors.
    """
    super(PCADiscretizedProjections, self).__init__(hash_name)
    self.projection_count = projection_count
    self.bin_width = bin_width

    # Get numpy array representation of input
    training_set = numpy_array_from_list_or_numpy_array(training_set)

    # Get subspace size from training matrix
    self.dim = training_set.shape[0]

    # Get transposed training set matrix for PCA
    training_set_t = numpy.transpose(training_set)

    # Compute principal components
    (eigenvalues, eigenvectors) = perform_pca(training_set_t)

    # Get indices of the largest N eigenvalues/eigenvectors
    largest_eigenvalue_indices = numpy.flipud(
        numpy.argsort(eigenvalues))[:projection_count]

    # Create matrix for the first N principal components
    self.components = numpy.zeros((self.dim,
                                   len(largest_eigenvalue_indices)))

    # Put the first N principal components into the matrix
    for index in range(len(largest_eigenvalue_indices)):
        self.components[:, index] = \
            eigenvectors[:, largest_eigenvalue_indices[index]]

    # We need the component vectors to be in the rows
    self.components = numpy.transpose(self.components)
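# As with the binary variant, the hashing step is not part of this snippet.
# With a bin_width parameter, a discretized projection hash would typically
# project onto the components and floor each coordinate into a bin index.
# This is an assumed sketch for illustration only, not the class's actual
# method; all names and values are made up for the example.
import numpy

def discretized_projection_key(components, v, bin_width):
    # components: (projection_count, dim), v: (dim,)
    projection = numpy.dot(components, v)
    bins = numpy.floor(projection / bin_width).astype(int)
    return '_'.join(str(b) for b in bins)

# Example: 3 projection directions in a 16-dimensional space, bin width 0.5
rng = numpy.random.default_rng(1)
components = rng.standard_normal((3, 16))
vector = rng.standard_normal(16)
print(discretized_projection_key(components, vector, 0.5))   # e.g. '2_-1_0'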