def test_raises_exception_proj_dim_larger_than_n(data_to_test, all_sketch_methods):
    '''
    Checks that asking for more sketch rows than data rows is rejected.

    For every sketch method, building ``rp`` with a projection dimension
    of ``n + 1`` (one more than the number of rows of the data) is
    expected to raise an exception.
    '''
    num_rows, _ = data_to_test.shape
    oversized_dim = num_rows + 1
    for sketch_method in all_sketch_methods:
        with pytest.raises(Exception):
            print('Testing ', sketch_method)
            rp(data_to_test, oversized_dim, sketch_method)
def test_embedding_improves_with_proj_dim(data_to_test, all_sketch_methods):
    '''
    Test that error decreases as we increase projection dimension.

    For each sketch method the relative Frobenius error between the
    sketched covariance ``(SA).T @ SA`` and the true covariance
    ``A.T @ A`` is averaged over a few trials at sketch dimensions
    ``d``, ``10*d`` and ``20*d``; the averaged errors must be
    non-increasing in the sketch dimension.

    nb. This test should be used as a calibration tool as *not all*
    tests preserve the ordering on the error as the sketch dimension
    increases.  As a result "failing" the test isn't necessarily bad
    provided it doesn't happen too regularly.  Note that the errors are
    generally relatively quite similar.
    '''
    _, d = data_to_test.shape
    sketch_dims = [d, 10 * d, 20 * d]
    errors = [0, 0, 0]
    trials = 5
    covariance = data_to_test.T @ data_to_test
    # The normaliser does not depend on the sketch, so compute it once.
    covariance_norm = np.linalg.norm(covariance, ord='fro')
    for sketch_method in all_sketch_methods:
        for idx, sketch_dim in enumerate(sketch_dims):
            print(idx)
            error = 0
            for _ in range(trials):
                summary = rp(data_to_test, sketch_dim, sketch_method)
                SA = summary.sketch()
                sketch_covariance = SA.T @ SA
                error += np.linalg.norm(sketch_covariance - covariance,
                                        ord='fro') / covariance_norm
            errors[idx] = error / trials
        print('Errors for {}\n'.format(sketch_method))
        print(errors)
        # Larger sketches should not be worse on average.
        assert errors[2] <= errors[1]
        assert errors[1] <= errors[0]
def test_summary_size(data_to_test, all_sketch_methods):
    '''
    Tests that the summary returned has number of rows equal to the
    required projection dimension.

    The 'sjlt' method is run with column sparsity 2; every other method
    is run with column sparsity 1.
    '''
    sketch_dim = 100
    for sketch_method in all_sketch_methods:
        col_sparsity = 2 if sketch_method == 'sjlt' else 1
        summary = rp(data_to_test, sketch_dim, sketch_method, col_sparsity)
        _sketch = summary.sketch()
        print('Sketch size is ', _sketch.shape)
        assert _sketch.shape[0] == sketch_dim


def test_accept_dense_data(data_to_test, all_sketch_methods):
    '''
    Tests that
    (i) a dense numpy ndarray can be accepted and
    (ii) sparse matrices within the method can be accessed.
    (iii). All sketch methods can act upon dense input data.

    The data is requested through the ``data_to_test`` fixture argument
    rather than by calling ``data_to_test()``: pytest does not allow
    fixture functions to be called directly.
    '''
    dense_data = data_to_test
    sparse_data = sparse.coo_matrix(dense_data)
    _, d = dense_data.shape
    for sketch_method in all_sketch_methods:
        summary = rp(dense_data, 5 * d, sketch_method)
        # could just check the coo_data arrays but then run into sparsity
        # implementation issues so do array-wise instead.
        assert np.array_equal(sparse_data.row, summary.rows)
        assert np.array_equal(sparse_data.col, summary.cols)
        assert np.array_equal(sparse_data.data, summary.vals)
        # Only checks that sketching the dense input runs without error.
        summary.sketch()
def __init__(self, data, targets, sketch_method, sketch_dimension,
             col_sparsity=1):
    '''
    Store the problem data and set up the sketching object.

    data             -- design matrix; a numpy ndarray or (otherwise)
                        something handled through scipy's sparse dot
    targets          -- target vector, stored as ``self.b``
    sketch_method    -- name of the sketch, passed through to ``rp``
    sketch_dimension -- projection dimension, passed through to ``rp``
    col_sparsity     -- column sparsity, passed through to ``rp``
    '''
    # optimisation setup
    self.A, self.b = data, targets

    # A.T @ b is computed once; dense and sparse inputs need different
    # multiplication routines.
    self.ATb = (self.A.T @ self.b
                if isinstance(self.A, np.ndarray)
                else sparse.csr_matrix.dot(self.A.T, self.b))

    self.n, self.d = self.A.shape
    # initialise the starting point at the origin.
    self.x = np.zeros((self.d, ))

    self.sketch_method = sketch_method
    self.sketch_dimension = sketch_dimension
    self.col_sparsity = col_sparsity

    # initialise the sketch to avoid the repeated costs
    self.sketcher = rp(self.A, self.sketch_dimension, self.sketch_method,
                       self.col_sparsity)

    # Keep the COO triplets (row index, column index, value) of the data.
    coo = coo_matrix(data)
    self.coo_data = coo
    self.rows, self.cols, self.vals = coo.row, coo.col, coo.data
def test_summary_method(data_to_test, all_sketch_methods):
    '''
    Tests that the ``sketch_type`` recorded on the summary object is the
    sketch method that was requested.
    '''
    projection_dim = 100
    for requested in all_sketch_methods:
        built = rp(data_to_test, projection_dim, requested)
        assert built.sketch_type == requested
def test_accepts_non_power2(data_to_test, all_sketch_methods):
    '''
    Tests that every sketch method handles data after 10 extra random
    rows have been appended, and returns a sketch of shape ``(5*d, d)``.

    NOTE(review): the test name suggests the fixture has a power-of-two
    number of rows, so the padded data does not -- confirm against the
    fixture definition.
    '''
    _, d = data_to_test.shape
    extra_rows = np.random.randn(10, d)
    padded = np.concatenate((data_to_test, extra_rows), axis=0)
    target_dim = 5 * d
    for sketch_method in all_sketch_methods:
        sketched = rp(padded, target_dim, sketch_method).sketch()
        assert sketched.shape == (target_dim, d)
def error_vs_dimensionality():
    '''
    Measure the solution error as the column dimension ``d`` grows.

    For d in {16, 32, 64, 128, 256} and n = 100 * d, fresh data is drawn
    for every (method, trial) pair and the square root of the prediction
    error between the ground truth ``x_star`` and the estimate is
    accumulated for:
        * every sketch in ``sketches`` (iterative Hessian sketch),
        * 'Exact' (plain least squares),
        * 'Sketch & Solve' (countSketch followed by least squares).
    The errors are averaged over ``NTRIALS``, pretty-printed and saved
    with ``np.save`` under '../../output/ihs_baselines/error_vs_dims'.
    '''
    dimension = [2**i for i in range(4, 9)]
    METHODS = sketches + ['Exact', 'Sketch & Solve']

    # Output dictionary: error_to_truth[method][d] -> accumulated error
    error_to_truth = {method: {d: 0 for d in dimension} for method in METHODS}
    print(error_to_truth)

    sampling_rate = 10
    num_iterations = 5
    for d in dimension:
        n = 100 * d
        print(f'TESTING {n},{d}')
        for method in METHODS:
            # Only the sjlt uses more than one nonzero per column.
            col_sparsity = 4 if method == 'sjlt' else 1
            for _ in range(NTRIALS):
                # Generate the data
                X, y, x_star = gaussian_design_unconstrained(n, d, 1.0)
                # Compare strings by value: ``is`` only works by accident
                # of constant sharing and warns on Python >= 3.8.
                if method == "Exact":
                    print('Exact method.')
                    x_hat = np.linalg.lstsq(X, y)[0]
                elif method == "Sketch & Solve":
                    # Give sketch-and-solve the total budget the IHS
                    # spends over all of its iterations.
                    sketch_size = sampling_rate * num_iterations * d
                    print(f"S&S with {sketch_size} sketch size")
                    _sketch = rp(X, sketch_size, 'countSketch', col_sparsity)
                    SA, Sb = _sketch.sketch_data_targets(y)
                    x_hat = np.linalg.lstsq(SA, Sb)[0]
                else:
                    sketch_size = sampling_rate * d
                    print(
                        f"Using {num_iterations} iterations, sketch_size {sketch_size} and {method}"
                    )
                    my_ihs = ihs(X, y, method, sketch_size, col_sparsity)
                    x_hat = my_ihs.ols_fit_new_sketch(num_iterations)
                error = (prediction_error(X, x_star, x_hat))**(0.5)
                error_to_truth[method][d] += error

    # Turn the accumulated sums into per-trial averages.
    for method in METHODS:
        for d in dimension:
            error_to_truth[method][d] /= NTRIALS
    error_to_truth['Dimensions'] = dimension

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(error_to_truth)
    save_dir = '../../output/ihs_baselines/'
    np.save(save_dir + 'error_vs_dims', error_to_truth)
def test_sketch_data_targets(data_to_test, all_sketch_methods):
    '''
    Test that sketching the data-target pair (A,b) yields outputs of the
    right dimensionality: ``(sketch_dim, d)`` for the data and
    ``(sketch_dim,)`` for the targets.

    Note that this test will fail when the input has been extended for
    the SRHT and the same extension has not been applied to y.
    '''
    n, d = data_to_test.shape
    targets = np.random.randn(n)
    sketch_dim = 5 * d
    expected_data_shape = (sketch_dim, d)
    expected_target_shape = (sketch_dim,)
    for sketch_method in all_sketch_methods:
        sketcher = rp(data_to_test, sketch_dim, sketch_method)
        sketched_data, sketched_targets = sketcher.sketch_data_targets(targets)
        assert sketched_data.shape == expected_data_shape
        assert sketched_targets.shape == expected_target_shape
def test_accept_dense_data(data_to_test, all_sketch_methods):
    '''
    Tests that
    (i) a dense numpy ndarray can be accepted and
    (ii) sparse matrices within the method can be accessed.
    (iii). All sketch methods can act upon dense input data.

    The data is requested through the ``data_to_test`` fixture argument
    rather than by calling ``data_to_test()``: pytest does not allow
    fixture functions to be called directly.
    '''
    dense_data = data_to_test
    sparse_data = sparse.coo_matrix(dense_data)
    _, d = dense_data.shape
    for sketch_method in all_sketch_methods:
        summary = rp(dense_data, 5 * d, sketch_method)
        # could just check the coo_data arrays but then run into sparsity
        # implementation issues so do array-wise instead.
        assert np.array_equal(sparse_data.row, summary.rows)
        assert np.array_equal(sparse_data.col, summary.cols)
        assert np.array_equal(sparse_data.data, summary.vals)
        # Only checks that sketching the dense input runs without error.
        summary.sketch()
def summary_time_quality():
    '''
    Generate summaries, time, and measure error.

    For each (projection factor gamma, sketch type) pair the data is
    sketched ``NTRIALS`` times with projection dimension
    ``int(gamma * d)``.  The mean sketch time, the mean time to form
    ``sX.T @ sX``, and the relative Frobenius / spectral error of the
    trial-averaged covariance estimate against ``X.T @ X`` are recorded
    in the ``summary_time_quality_results`` dict, which is pretty-printed
    and written to
    '../../output/baselines/summary_time_quality_results.json'.

    Only the 'specular' dataset is currently processed; the data is
    truncated to the largest power-of-two number of rows.
    '''
    # results[dataset][sketch_type][gamma] -> dict of measurements
    summary_time_quality_results = {
        data: {
            sketch_type: {gamma: {} for gamma in projection_dimensions}
            for sketch_type in sketches
        }
        for data in datasets.keys()
    }
    pretty = PrettyPrinter(indent=4)
    pretty.pprint(summary_time_quality_results)

    for data in datasets.keys():
        if data != 'specular':
            continue
        print("-" * 80)
        print("Dataset: {}".format(data))
        input_file = datasets[data]["filepath"]
        # Keep the original exact comparison against True.
        is_sparse = datasets[data]['sparse_format'] == True  # noqa: E712

        if is_sparse:
            df = load_npz('../../' + input_file)
            df = df.tocsr()
        else:
            df = np.load('../../' + input_file)

        # The last column is dropped from the feature matrix.
        X = df[:, :-1]
        nn, d = X.shape
        # Largest power of two not exceeding the row count.  Builtin
        # ``int`` replaces ``np.int``, which was removed in NumPy 1.24.
        n = 2**int(np.floor(np.log2(nn)))
        X = X[:n, :]
        print(n, d)

        cov_mat = X.T @ X
        print(cov_mat.shape, type(cov_mat))
        if is_sparse:
            # Convert the d x d (small) covariance matrix to a ndarray
            # for compatibility
            cov_mat = cov_mat.toarray()
            print(cov_mat.shape, type(cov_mat))
        frob_norm_cov_mat = np.linalg.norm(cov_mat, 'fro')
        spec_norm_cov_mat = np.linalg.norm(cov_mat, 2)

        for gamma, sketch_type in itertools.product(projection_dimensions,
                                                    sketches):
            # NOTE: a disabled shortcut used to skip 'specular' with the
            # 'gaussian' sketch for gamma > 2 ("Timeout so autoset"),
            # recording sketch times of 336.0 (gamma == 4) and 1823.0
            # (gamma == 8) with all other measurements set to 0.0.
            print(gamma, sketch_type)
            print('*' * 80)
            sketch_time = 0
            product_time = 0
            approx_cov_mat = np.zeros((d, d))
            proj_dim = int(gamma * d)

            # We use s = 4 globally for the sjlt.
            col_sparsity = 4 if sketch_type == 'sjlt' else 1

            # The sketch object is built once, outside the timed region.
            _sketch = rp(X, proj_dim, sketch_type, col_sparsity)

            for _ in range(NTRIALS):
                sketch_start = default_timer()
                sX = _sketch.sketch()
                sketch_time += default_timer() - sketch_start

                product_start = default_timer()
                estimate = sX.T @ sX
                product_time += default_timer() - product_start
                approx_cov_mat += estimate

            sketch_time /= NTRIALS
            product_time /= NTRIALS
            approx_cov_mat /= NTRIALS
            frob_error = np.linalg.norm(approx_cov_mat - cov_mat,
                                        ord='fro') / frob_norm_cov_mat
            spec_error = np.linalg.norm(approx_cov_mat - cov_mat,
                                        ord=2) / spec_norm_cov_mat
            print(f'Testing {data},{gamma},{sketch_type}:')
            print(f'Sketch time: {sketch_time}, Product Time: {product_time}')
            print(f'Frob error: {frob_error}, Spectral error: {spec_error}')

            cell = summary_time_quality_results[data][sketch_type][gamma]
            cell['sketch_time'] = sketch_time
            cell['product_time'] = product_time
            cell['frob_error'] = frob_error
            cell['spec_error'] = spec_error

    pretty.pprint(summary_time_quality_results)
    with open('../../output/baselines/summary_time_quality_results.json',
              'w') as outfile:
        json.dump(summary_time_quality_results, outfile)
def solution_error_vs_row_dim():
    '''
    Increase `n` the input dimension of the problem and measure the
    solution error in both:
    (i) Euclidean norm (`mean_square_error`)
    (ii) Prediction norm (`prediction_error`).

    Error measurements are taken with respect to:
    (i) the optimal solution x_opt
    (ii) the ground truth

    For every row count in ``ROWDIMS`` and every trial, fresh data is
    drawn and each entry of ``METHODS`` is run: the sketches in
    ``sketches`` via the iterative Hessian sketch, 'Sketch & Solve' via a
    single countSketch, and anything else as the exact least-squares
    solution.  Results are averaged over ``NTRIALS``, pretty-printed and
    saved with ``np.save`` under '../../output/baselines/'.
    '''
    print('Experimental setup:')
    print(f'IHS sketch size {SKETCH_SIZE}')
    print(f'Sketch and solve sketch size {CLASSICAL_SKETCH_SIZE}')
    print(f'Number of rounds {ROUNDS}')

    num_settings = len(ROWDIMS)

    def _zeros_per_sketch():
        # One zero vector (an entry per row-dimension setting) per sketch.
        return {name: np.zeros(num_settings) for name in sketches}

    # Output dictionaries
    MSE_OPT = _zeros_per_sketch()
    PRED_ERROR_OPT = _zeros_per_sketch()
    MSE_TRUTH = _zeros_per_sketch()
    PRED_ERROR_TRUTH = _zeros_per_sketch()

    for _dict in [MSE_OPT, PRED_ERROR_OPT, MSE_TRUTH, PRED_ERROR_TRUTH]:
        _dict['Sketch & Solve'] = np.zeros(num_settings)

    # The exact solution is only compared against the ground truth.
    MSE_TRUTH['Exact'] = np.zeros(num_settings)
    PRED_ERROR_TRUTH['Exact'] = np.zeros(num_settings)

    def _record(method, index, X, x_opt, x_true, x_hat):
        # Accumulate all four error measures of the estimate x_hat.
        MSE_OPT[method][index] += mean_square_error(x_opt, x_hat)
        PRED_ERROR_OPT[method][index] += prediction_error(X, x_opt, x_hat)
        MSE_TRUTH[method][index] += mean_square_error(x_true, x_hat)
        PRED_ERROR_TRUTH[method][index] += prediction_error(X, x_true, x_hat)

    ## Experiment
    # enumerate gives the slot directly; ROWDIMS.index(n) would return
    # the first match only if a row count were ever repeated.
    for experiment_index, n in enumerate(ROWDIMS):
        print(f'Testing {n} rows')
        _iters = ROUNDS[experiment_index]
        ihs_sketch_size = SKETCH_SIZE
        classic_sketch_size = CLASSICAL_SKETCH_SIZE[experiment_index]

        for trial in range(NTRIALS):
            print("TRIAL {}".format(trial))
            X, y, x_true = gaussian_design_unconstrained(n, D, variance=1.0)
            x_opt = np.linalg.lstsq(X, y)[0]

            for sketch_method in METHODS:
                print('*' * 80)
                if sketch_method == 'Sketch & Solve':
                    # Classical sketch: one countSketch, then solve.
                    _sketch = rp(X, classic_sketch_size, 'countSketch', 1)
                    SA, Sb = _sketch.sketch_data_targets(y)
                    x_ss = np.linalg.lstsq(SA, Sb)[0]
                    _record(sketch_method, experiment_index, X, x_opt,
                            x_true, x_ss)
                elif sketch_method in sketches:
                    # Only the sjlt uses more than one nonzero per column.
                    col_sparsity = 4 if sketch_method == 'sjlt' else 1
                    print(f'{sketch_method} IHS')
                    my_ihs = ihs(X, y, sketch_method, ihs_sketch_size,
                                 col_sparsity)
                    x_ihs, x_iters = my_ihs.ols_fit_new_sketch_track_errors(
                        _iters)
                    x_errors = x_opt[:, None] - x_iters
                    print(x_errors.shape)
                    _record(sketch_method, experiment_index, X, x_opt,
                            x_true, x_ihs)
                else:
                    # solve exactly (x_opt was already computed above)
                    MSE_TRUTH["Exact"][experiment_index] += mean_square_error(
                        x_opt, x_true)
                    PRED_ERROR_TRUTH["Exact"][
                        experiment_index] += prediction_error(
                            X, x_opt, x_true)

    # Turn the accumulated sums into per-trial averages (in place).
    for _dict in [MSE_OPT, PRED_ERROR_OPT, MSE_TRUTH, PRED_ERROR_TRUTH]:
        for _key in _dict.keys():
            _dict[_key] /= NTRIALS

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(MSE_OPT)
    pretty.pprint(PRED_ERROR_OPT)
    pretty.pprint(MSE_TRUTH)
    pretty.pprint(PRED_ERROR_TRUTH)

    save_dir = '../../output/baselines/'
    np.save(save_dir + 'ihs_ols_mse_OPT', MSE_OPT)
    np.save(save_dir + 'ihs_ols_pred_error_OPT', PRED_ERROR_OPT)
    np.save(save_dir + 'ihs_ols_mse_TRUTH', MSE_TRUTH)
    np.save(save_dir + 'ihs_ols_pred_error_TRUTH', PRED_ERROR_TRUTH)