def test_summary_size(data_to_test):
    """Tests that the summary returned has number of rows equal to the
    required projection dimension."""
    print('\n')
    sketch_dim = 500
    n, d = data_to_test.shape
    true_norm = np.linalg.norm(data_to_test, ord='fro')**2
    sketches = {
        'gaussian': GaussianSketch(sketch_dim, n, d),
        'srht_HAD': SRHTSketch(sketch_dim, n, d, 'HAD'),
        'srht_DCT': SRHTSketch(sketch_dim, n, d),
        'countsketch': CountSketch(sketch_dim, n, d),
        'sjltsketch': SparseJLT(sketch_dim, n, d, col_sparsity=5)
    }
    for sk_name, sk_method in sketches.items():
        g = sk_method
        g.sketch(data_to_test)
        summary = g.get()
        summary_u, summary_sig, summary_vt = g.get(in_svd=True)
        assert summary.shape == (sketch_dim, d)
        assert summary_u.shape == (sketch_dim, d)
        assert summary_sig.shape == (d, )
        assert summary_vt.shape == (d, d)
        # Sanity check: the squared Frobenius norm of the summary should
        # approximate that of the input; report the relative error.
        sk_norm = np.linalg.norm(summary, ord='fro')**2
        err = np.abs(sk_norm - true_norm) / true_norm
        print(f'{sk_name}:\t{err:.5f}')
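# A minimal fixture sketch for the test above. The real conftest.py that
# provides `data_to_test` is not shown in this excerpt, so this fixture is
# an assumption: any dense (n, d) array with n >= sketch_dim = 500
# exercises every sketch (a power-of-two row count is chosen since
# Hadamard-based transforms typically require it).
import numpy as np
import pytest


@pytest.fixture
def data_to_test():
    rng = np.random.default_rng(42)
    return rng.normal(size=(2048, 10))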
def __init__(self, n_data_rows: int, n_data_cols: int,
             sk_dim: int, sk_mode='Gaussian', sparse_data=None,
             ihs_mode='single', sjlt_sparsity=5, gamma=1.0, batch_size=None):
    """
    Approximate ridge regression using a sketch of the data.

    Parameters:
    - sk_dim (int): the number of rows retained in the sketch.
    - sk_mode (str): the sketch to use ('Gaussian', 'SRHT', 'SJLT',
      'CountSketch', or 'FD'/'RFD' for Fast/Robust Frequent Directions).
    - gamma (float): the regularisation parameter for ridge regression.
    """
    self.gamma = gamma
    self.sk_mode = sk_mode
    self.n_data_rows = n_data_rows
    self.n_data_cols = n_data_cols
    self.sk_dim = min(sk_dim, n_data_cols)
    self.ihs_mode = ihs_mode
    if self.sk_mode == 'Gaussian':
        self.sketcher = GaussianSketch(self.sk_dim, self.n_data_rows,
                                       self.n_data_cols)
    elif self.sk_mode == 'SRHT':
        # Prefer the Hadamard transform; fall back to the DCT if the
        # Hadamard construction fails for these dimensions.
        try:
            self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                       self.n_data_cols, 'HAD')
        except Exception:
            self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                       self.n_data_cols, 'DCT')
    elif self.sk_mode == 'SJLT':
        self.sketcher = SparseJLT(self.sk_dim, self.n_data_rows,
                                  self.n_data_cols,
                                  col_sparsity=sjlt_sparsity)
    elif self.sk_mode == 'CountSketch':
        self.sketcher = CountSketch(self.sk_dim, self.n_data_rows,
                                    self.n_data_cols)
    # Force single-sketch mode for the deterministic FD sketches.
    elif self.sk_mode == 'FD':
        self.ihs_mode = 'single'
        self.sketcher = FastFrequentDirections(self.sk_dim,
                                               self.n_data_rows,
                                               self.n_data_cols)
    elif self.sk_mode == 'RFD':
        self.ihs_mode = 'single'
        self.sketcher = RobustFrequentDirections(self.sk_dim,
                                                 self.n_data_rows,
                                                 self.n_data_cols)
    if batch_size is None:
        self.batch_size = self.sk_dim
    else:
        self.batch_size = batch_size
    self.sparse_data_is_set = False  # Initialised to False for dense sketches
    if (sparse_data is not None) and (self.sk_mode in ('SJLT', 'CountSketch')):
        self.sketcher.set_sparse_data(sparse_data)
        self.sparse_data_is_set = True
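# A minimal instantiation sketch for the constructor above. The enclosing
# class name is not shown in this excerpt, so `SketchedRidgeRegression` is
# purely a hypothetical placeholder. For the deterministic 'FD'/'RFD' modes
# the constructor forces ihs_mode to 'single', and batch_size defaults to
# sk_dim when not given.
model = SketchedRidgeRegression(n_data_rows=10_000, n_data_cols=50,
                                sk_dim=20, sk_mode='RFD', gamma=1.0)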
def graph2map(Vs, Es, nr_graphs, h, k, table_size, random_files,
              nr_tables=1, max_p=2, dirac=False):
    """
    For a collection of graphs, generate feature maps by traversing local
    neighborhoods, generating strings, and sketching the k-gram frequency
    distribution.

    h: the depth at which neighborhood strings are generated
    k: the k in k-grams
    table_size: the count-sketch hashtable size
    random_files: needed for count-sketch initialization
    nr_tables: count-sketch parameter
    max_p: the maximum polynomial degree for the poly-kernel
    """
    print('Count sketch data structures initialization')
    cs = CountSketch(table_size, nr_tables * max_p, random_files)
    cs_cosine = CountSketch(table_size, nr_tables * max_p, random_files)
    print('Process graphs')
    vectors = [[] for _ in range(2 * h)]
    vectors_cosine = [[] for _ in range(2 * h)]
    dirac_vectors = []
    labels_maps = [{} for _ in range(h)]
    dirac_map = {}
    print(len(Vs), len(Es), nr_graphs)
    for i in range(nr_graphs):
        if i % 500 == 0:
            print('i = ', i)
            print('number of features', len(labels_maps[0]))
        V, E = Vs[i], Es[i]
        feature_maps = generate_feature_maps(V, E, h)
        if dirac:
            dirac_vector = feature_map_to_vector(feature_maps[1], dirac_map)
            dirac_vectors.append(dirac_vector)
        for j in range(1, h + 1):
            label_vector = feature_map_to_k_gram_vector(
                feature_maps[j], labels_maps[j - 1], k)
            vectors[2 * (j - 1)].append(label_vector)
            vectors_cosine[2 * (j - 1)].append(normalize_vector(label_vector))
            cs.clear()
            cs_cosine.clear()
            sketch_polynomial_feature_map(label_vector, cs, False)
            sketch_polynomial_feature_map(label_vector, cs_cosine, True)
            vectors[2 * (j - 1) + 1].append(
                tensorsketch.compute_tensorsketch_from_cs(
                    cs, max_p, nr_tables))
            vectors_cosine[2 * (j - 1) + 1].append(
                tensorsketch.compute_tensorsketch_from_cs(
                    cs_cosine, max_p, nr_tables))
    # Zero-pad the explicit k-gram vectors: the shared label maps grow as
    # new features appear, so earlier graphs produced shorter vectors than
    # the last one.
    for j in range(h):
        maxlen_j = len(vectors[2 * j][nr_graphs - 1])
        print('total number of features', maxlen_j)
        for l in range(len(vectors[2 * j])):
            vectors[2 * j][l] += [0] * (maxlen_j - len(vectors[2 * j][l]))
            vectors_cosine[2 * j][l] += [0] * (maxlen_j - len(vectors_cosine[2 * j][l]))
    if dirac:
        maxlen_dirac = len(dirac_vectors[nr_graphs - 1])
        for l in range(len(dirac_vectors)):
            dirac_vectors[l] += [0] * (maxlen_dirac - len(dirac_vectors[l]))
    return vectors, vectors_cosine, dirac_vectors
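# Layout of the values returned by graph2map (a descriptive note, not part
# of the original file): for each depth j in 1..h and each graph g,
#   vectors[2*(j-1)][g]     -> explicit k-gram count vector of graph g,
#   vectors[2*(j-1) + 1][g] -> its TensorSketched polynomial feature map
#                              (degree up to max_p),
# with vectors_cosine holding the same pair built from normalised vectors,
# and dirac_vectors the per-graph depth-1 feature vectors when dirac=True.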
os_idx = 1
if platform.system() == 'Windows':
    os_idx = 0
filename = filenames[1 + os_idx]
h = 1
k = 2
max_p = 2
table_size = 500
max_value = 200000
nr_tables = 1
nr_graphs_per_class = 100
dirac = False
cs = CountSketch(table_size, nr_tables * max_p, random_files[os_idx], max_value)
cs_cosine = CountSketch(table_size, nr_tables * max_p, random_files[os_idx], max_value)
print('Count sketch data structures initialized')
print(dirname, filename)
ratio = 0.5
Vs, Es, classes = read_write_utilities.read_my_format(filename, 3400, ratio)
start = time.time()
# Pass random_files explicitly: graph2map expects it between table_size and
# nr_tables, so omitting it would shift the remaining arguments.
vectors, vectors_cosine, dirac_vectors = graph2map(Vs, Es, len(classes), h, k,
                                                   table_size, random_files[os_idx],
                                                   nr_tables, max_p, dirac)
print('elapsed time ', time.time() - start)
def compute_tensorsketch(count_sketch, v, p, k):
    # NOTE: the original excerpt began mid-function; this header, the outer
    # loop over the k table groups, and the initial sketching call are
    # reconstructed from the call sites below and are assumptions.
    count_sketch.sketch(v)  # assumed API for populating the k * p tables with v
    tensorsketches = []
    for i in range(k):
        tables_fft_i = [None] * p
        for j in range(p):
            table_j = count_sketch.get_table(i * p + j)
            tables_fft_i[j] = np.fft.fft(table_j)
        # Componentwise product of the p FFTs corresponds to convolving the
        # p count-sketch tables (the TensorSketch trick).
        sketch_i = componentwise_multiplication(tables_fft_i)
        sketch_i = [val / math.sqrt(k) for val in sketch_i]
        tensorsketches.append(sketch_i)
    count_sketch.clear()
    return [np.real(val) for sketch in tensorsketches for val in sketch]


if __name__ == "__main__":
    print('Tensorsketch')
    random_files = ['<path to random files>', '']
    table_size = 300
    v1 = np.array(
        [10, 2, 3, 1100, 3, 28, 300, 12, 3, 21, 11, 20, 18, 16, 31, 300],
        dtype=np.uint64)
    v2 = np.array(
        [10, 29, 3, 1001, 3, 28, 109, 12, 13, 21, 110, 20, 108, 16, 301, 30],
        dtype=np.uint64)
    print(len(v1))
    print(len(v2))
    p = 2
    k = 1
    cs = CountSketch(table_size, k * p, random_files[0], 1000)
    tensorsketches1 = compute_tensorsketch(cs, v1, p, k)
    tensorsketches2 = compute_tensorsketch(cs, v2, p, k)
    # The inner product of two TensorSketches approximates <v1, v2>**p.
    print(np.dot(v1, v2)**p)
    print(np.real(np.dot(tensorsketches1, tensorsketches2)))
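# A self-contained numpy illustration of the TensorSketch identity used
# above, independent of the CountSketch class (which needs on-disk random
# files): the TensorSketch of x is the inverse FFT of the componentwise
# product of the FFTs of p independent count sketches of x, and inner
# products of TensorSketches then approximate the polynomial kernel
# <u, v>**p. All names below are local to this illustration.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    d, m, p = 16, 2048, 2                      # input dim, table size, degree
    h = rng.integers(0, m, size=(p, d))        # bucket hashes, one row per table
    s = rng.choice([-1.0, 1.0], size=(p, d))   # sign hashes

    def count_sketch_table(x, hj, sj):
        table = np.zeros(m)
        np.add.at(table, hj, sj * x)           # scatter-add signed coordinates
        return table

    def tensorsketch(x):
        ffts = [np.fft.fft(count_sketch_table(x, h[j], s[j])) for j in range(p)]
        return np.real(np.fft.ifft(np.prod(ffts, axis=0)))

    u, v = rng.normal(size=d), rng.normal(size=d)
    print(np.dot(u, v) ** p)                          # exact polynomial kernel
    print(np.dot(tensorsketch(u), tensorsketch(v)))   # sketched approximation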
class ClassicalSketch:
    """
    Sketch-and-solve OLS regression: sketch the concatenation [X, y] once,
    then solve the reduced least-squares problem.
    """

    def __init__(self, n_data_rows: int, n_data_cols: int,
                 sk_dim: int, sk_mode='Gaussian', sparse_data=None,
                 sjlt_sparsity=5):
        """
        Approximate OLS regression using random projections.

        Parameters:
        - sk_dim (int): the number of rows retained in the random projection.
        - sk_mode (str): sketch mode used to decide on the sketch.
        - sparse_data: pre-set sparse data, used only by the sparse
          sketches ('SJLT', 'CountSketch').
        """
        self.sk_mode = sk_mode
        self.n_data_rows = n_data_rows
        self.n_data_cols = n_data_cols
        self.sk_dim = min(sk_dim, n_data_rows)
        if self.sk_mode == 'Gaussian':
            self.sketcher = GaussianSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols)
        elif self.sk_mode == 'SRHT':
            # Add 1 to the number of data columns as we append a column for
            # y later on. Prefer the Hadamard transform; fall back to DCT.
            try:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols + 1, 'HAD')
            except Exception:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols + 1, 'DCT')
        elif self.sk_mode == 'SJLT':
            self.sketcher = SparseJLT(self.sk_dim, self.n_data_rows,
                                      self.n_data_cols + 1,
                                      col_sparsity=sjlt_sparsity)
        elif self.sk_mode == 'CountSketch':
            self.sketcher = CountSketch(self.sk_dim, self.n_data_rows,
                                        self.n_data_cols + 1)
        self.sparse_data_is_set = False  # Initialised to False for dense sketches
        if (sparse_data is not None) and (self.sk_mode in ('SJLT', 'CountSketch')):
            self.sketcher.set_sparse_data(sparse_data)
            self.sparse_data_is_set = True

    def _sketch_data_targets(self, X, y, seed=100):
        """
        Sketches the [data, target] concatenation for sketch-and-solve.
        Slightly different functionality for dense or sparse sketches, as
        sparse data is set in the __init__ function.
        """
        if self.sparse_data_is_set:
            self.sketcher.sketch(seed)
        else:
            Xy = np.c_[X, y]
            self.sketcher.sketch(Xy, seed)

    def _solve(self):
        """
        Gets the sketch matrix, splits it into the [SX, Sy] parts, and
        solves the sketched regression instance with a least-squares solver.
        """
        _sketch = self.sketcher.get(in_svd=False)
        SX, Sy = _sketch[:, :-1], _sketch[:, -1].reshape(-1, 1)
        # rcond=None silences the FutureWarning raised by the old default.
        weights = np.linalg.lstsq(SX, Sy, rcond=None)[0]
        return weights

    def _time_sketch_solve(self, X, y, seed=100):
        """
        Times each individual step of the sketch-and-solve method. The
        SVD-based solution is computed so it can be timed, but the returned
        weights come from the lstsq solver, which overwrites it.
        """
        times = {'Total': 0., 'Sketch': 0., 'SVD': 0., 'Solve': 0.}
        TIMER_START = timer()

        SKETCH_TIMER = timer()
        self._sketch_data_targets(X, y, seed)
        times['Sketch'] = timer() - SKETCH_TIMER

        _sketch = self.sketcher.get(in_svd=False)
        SX, Sy = _sketch[:, :-1], _sketch[:, -1].reshape(-1, 1)

        SVD_TIMER = timer()
        u, sig, vt = np.linalg.svd(SX, full_matrices=False)
        times['SVD'] = timer() - SVD_TIMER

        SOLVE_TIMER = timer()
        sig = sig[:, np.newaxis]
        sig_inv = 1. / sig
        weights = (vt.T @ (sig_inv * u.T)) @ Sy
        weights = np.linalg.lstsq(SX, Sy, rcond=None)[0]
        times['Solve'] = timer() - SOLVE_TIMER

        times['Total'] = timer() - TIMER_START
        return weights, times

    def fit(self, X, y, seed=100, timing=False):
        """
        Fits the sketched regression model with a classical sketch to data
        X and y. First, the data is sketched with the sketch chosen at
        init; then the reduced regression instance is solved with lstsq.
        """
        if timing:
            return self._time_sketch_solve(X, y, seed)
        self._sketch_data_targets(X, y, seed)
        weights = self._solve()
        return weights
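# A minimal usage sketch for ClassicalSketch, guarded so it only runs when
# this file is executed directly (an illustration, not part of the original
# module). It compares the sketched weights against the exact least-squares
# solution; with sk_dim = 500 and n_data_cols = 20 the relative error
# should be small.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.default_rng(0)
    n, d, m = 10_000, 20, 500
    X = rng.normal(size=(n, d))
    y = X @ rng.normal(size=(d, 1)) + 0.1 * rng.normal(size=(n, 1))

    model = ClassicalSketch(n_data_rows=n, n_data_cols=d, sk_dim=m,
                            sk_mode='Gaussian')
    w_sketch = model.fit(X, y)
    w_exact = np.linalg.lstsq(X, y, rcond=None)[0]
    print(np.linalg.norm(w_sketch - w_exact) / np.linalg.norm(w_exact))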
class IterativeHessianOLS:
    """
    An iterative solver for the optimisation problem
        f(x) = 1/2 * ||Ax - y||_2^2
    using a randomised Newton method.
    """

    def __init__(self, n_data_rows: int, n_data_cols: int,
                 sk_dim: int, sk_mode='Gaussian', sparse_data=None,
                 ihs_mode='multi', sjlt_sparsity=5):
        """
        Instantiates the IHS sketching construction.
        """
        self.sk_mode = sk_mode
        self.n_data_rows = n_data_rows
        self.n_data_cols = n_data_cols
        self.sk_dim = min(sk_dim, n_data_rows)
        self.ihs_mode = ihs_mode
        if self.sk_mode == 'Gaussian':
            self.sketcher = GaussianSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols)
        elif self.sk_mode == 'SRHT':
            # Prefer the Hadamard transform; fall back to the DCT if the
            # Hadamard construction fails for these dimensions.
            try:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols, 'HAD')
            except Exception:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols, 'DCT')
        elif self.sk_mode == 'SJLT':
            self.sketcher = SparseJLT(self.sk_dim, self.n_data_rows,
                                      self.n_data_cols,
                                      col_sparsity=sjlt_sparsity)
        elif self.sk_mode == 'CountSketch':
            self.sketcher = CountSketch(self.sk_dim, self.n_data_rows,
                                        self.n_data_cols)
        self.sparse_data_is_set = False  # Initialised to False for dense sketches
        if (sparse_data is not None) and (self.sk_mode in ('SJLT', 'CountSketch')):
            self.sketcher.set_sparse_data(sparse_data)
            self.sparse_data_is_set = True

    def _init_iterations(self, X, y, iterations):
        """
        Initialises the arrays used during the iterations:
        - current_weights: the vector updated by the iterative scheme
        - weights_hist: an array containing all weight updates, used for
          the error history
        - XTy: the projection of the targets onto the column space of the
          data X
        """
        current_weights = np.zeros((self.n_data_cols, 1), dtype=float)
        weights_hist = np.zeros((self.n_data_cols, iterations), dtype=float)
        XTy = (X.T @ y).reshape(-1, 1)
        return current_weights, weights_hist, XTy

    def _grad(self, X, vec, XTy):
        """
        Returns the gradient nabla f(x) = X.T @ (X @ x - y).
        XTy is passed in so it need not be recomputed.
        """
        return X.T @ (X @ vec) - XTy

    def _iterate_multiple(self, X, y, iterations=10, timing=False):
        if timing:
            return self._iterate_multiple_timing(X, y, iterations)
        current_x, all_x, XTy = self._init_iterations(X, y, iterations)
        for it in range(iterations):
            # 1. Generate a sketch and obtain the SVD factors for
            #    efficient solving.
            self.sketcher.sketch(X, seed=1000 * it)
            u, sig, vt = self.sketcher.get(in_svd=True)
            sig = sig[:, np.newaxis]
            sig_inv = 1. / sig
            # 2. Solve the linear system H @ update = -gradient via the
            #    SVD factors of the sketched Hessian H = (SX).T @ (SX).
            gradient = self._grad(X, current_x, XTy)
            update = -vt.T @ (sig_inv**2 * (vt @ gradient))
            current_x += update
            all_x[:, it] = current_x[:, 0]
        return current_x, all_x

    def _iterate_multiple_timing(self, X, y, iterations=10):
        """
        Performs the iterations and also records the timing of each
        individual part.
        """
        times = {
            'Total': 0.,
            'Sketch': np.zeros(iterations, dtype=float),
            'SVD': np.zeros(iterations, dtype=float),
            'Solve': np.zeros(iterations, dtype=float)
        }
        TIMER_START = timer()
        current_x, all_x, XTy = self._init_iterations(X, y, iterations)
        for it in range(iterations):
            # 1. Generate a sketch and obtain the SVD factors for
            #    efficient solving.
            SKETCH_TIMER = timer()
            self.sketcher.sketch(X, seed=1000 * it)
            times['Sketch'][it] = timer() - SKETCH_TIMER

            SVD_TIMER = timer()
            u, sig, vt = self.sketcher.get(in_svd=True)
            times['SVD'][it] = timer() - SVD_TIMER

            sig = sig[:, np.newaxis]
            sig_inv = 1. / sig

            SOLVE_TIMER = timer()
            # 2. Solve the linear system H @ update = -gradient via the
            #    SVD factors of the sketched Hessian.
            gradient = self._grad(X, current_x, XTy)
            update = -vt.T @ (sig_inv**2 * (vt @ gradient))
            current_x += update
            times['Solve'][it] = timer() - SOLVE_TIMER
            all_x[:, it] = current_x[:, 0]
        times['Total'] = timer() - TIMER_START
        return current_x, all_x, times

    def fit(self, X, y, iterations=10, timing=False):
        """
        Fits the model on data X and targets y, optionally timing each
        step. Only ihs_mode == 'multi' is handled here.
        """
        if self.ihs_mode == 'multi':
            if timing:
                return self._iterate_multiple(X, y, iterations, timing=True)
            x, all_x = self._iterate_multiple(X, y, iterations)
            return x, all_x
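# A minimal usage sketch for IterativeHessianOLS, guarded so it only runs
# when this file is executed directly (an illustration, not part of the
# original module). The returned history all_x lets the per-iteration
# distance to the exact least-squares solution be tracked; under the IHS
# analysis this error should shrink with each sketched Newton step.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.default_rng(1)
    n, d, m = 10_000, 20, 500
    X = rng.normal(size=(n, d))
    y = X @ rng.normal(size=(d, 1)) + 0.1 * rng.normal(size=(n, 1))

    ihs = IterativeHessianOLS(n, d, m, sk_mode='Gaussian')
    x_hat, x_hist = ihs.fit(X, y, iterations=10)
    x_exact = np.linalg.lstsq(X, y, rcond=None)[0]
    print(np.linalg.norm(x_hist - x_exact, axis=0))  # error after each step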