def test_raises_exception_proj_dim_larger_than_n(data_to_test, all_sketch_methods):
    '''
    Check that an over-sized projection dimension is rejected.

    For every sketch method, building ``rp`` with a projection dimension of
    ``n + 1`` (one more than the number of rows of ``data_to_test``) must
    raise an exception.
    '''
    num_rows, _ = data_to_test.shape
    oversized_dim = num_rows + 1
    for method in all_sketch_methods:
        with pytest.raises(Exception):
            print('Testing ', method)
            # Only the constructor call matters; the object is never used.
            rp(data_to_test, oversized_dim, method)
def test_embedding_improves_with_proj_dim(data_to_test, all_sketch_methods):
    '''
    Test that error decreases as we increase projection dimension.

    For each sketch method, the relative Frobenius error between the sketched
    covariance ``SA.T @ SA`` and the exact covariance ``A.T @ A`` is averaged
    over a few independent sketches at projection dimensions ``d``, ``10*d``
    and ``20*d``. The averaged errors must be non-increasing in the
    projection dimension.

    nb. This test should be used as a calibration tool as *not all* tests
    preserve the ordering on the error as the sketch dimension increases.
    As a result "failing" the test isn't necessarily bad provided it doesn't
    happen too regularly.
    Note that the errors are generally quite similar to one another.
    '''
    _, d = data_to_test.shape
    sketch_dims = [d, 10 * d, 20 * d]
    trials = 5
    covariance = data_to_test.T @ data_to_test
    # The normaliser does not depend on the sketch, so compute it only once.
    covariance_norm = np.linalg.norm(covariance, ord='fro')

    for sketch_method in all_sketch_methods:
        # Fresh list per method so no stale value can leak between methods.
        errors = [0] * len(sketch_dims)
        for idx, sketch_dim in enumerate(sketch_dims):
            print(idx)
            total_error = 0
            for _ in range(trials):
                summary = rp(data_to_test, sketch_dim, sketch_method)
                SA = summary.sketch()
                sketch_covariance = SA.T @ SA
                total_error += np.linalg.norm(sketch_covariance - covariance,
                                              ord='fro') / covariance_norm
            errors[idx] = total_error / trials

        print('Errors for {}\n'.format(sketch_method))
        print(errors)
        assert errors[2] <= errors[1]
        assert errors[1] <= errors[0]
def test_summary_size(data_to_test, all_sketch_methods):
    '''
    Check that each sketch has exactly as many rows as the
    requested projection dimension.
    '''
    sketch_dim = 100

    for method in all_sketch_methods:
        # 'sjlt' is built with col_sparsity=2; every other method with 1.
        col_sparsity = 2 if method == 'sjlt' else 1
        sketched = rp(data_to_test, sketch_dim, method, col_sparsity).sketch()
        print('Sketch size is ', sketched.shape)
        assert sketched.shape[0] == sketch_dim

    def test_accept_dense_data(all_sketch_methods):
        '''
        Check that dense input is handled by every sketch method.

        Verifies that
        (i) a dense numpy ndarray is accepted by ``rp``,
        (ii) the ``rows``/``cols``/``vals`` arrays stored on the summary
             agree with a COO conversion of the same data, and
        (iii) ``sketch()`` can be called on the dense input.
        '''
        # NOTE(review): ``data_to_test`` is *called* here, whereas the sibling
        # tests receive it as an argument -- confirm it is callable in this
        # scope.
        dense = data_to_test()
        reference = sparse.coo_matrix(dense)
        _, d = dense.shape
        # (attribute on the COO matrix, attribute on the summary)
        attribute_pairs = (('row', 'rows'), ('col', 'cols'), ('data', 'vals'))
        for method in all_sketch_methods:
            summary = rp(dense, 5 * d, method)

            # Compare array by array rather than the COO objects themselves,
            # which avoids depending on the sparse implementation details.
            for coo_name, summary_name in attribute_pairs:
                assert np.array_equal(getattr(reference, coo_name),
                                      getattr(summary, summary_name))
            summary.sketch()
    def __init__(self,
                 data,
                 targets,
                 sketch_method,
                 sketch_dimension,
                 col_sparsity=1):
        '''
        Store the problem data and pre-build the sketching object.

        data: design matrix, either a numpy ndarray or a sparse matrix.
        targets: target vector paired with ``data``.
        sketch_method: name of the sketch, passed through to ``rp``.
        sketch_dimension: projection dimension, passed through to ``rp``.
        col_sparsity: column sparsity passed through to ``rp`` (default 1).
        '''
        # Problem data.
        self.A = data
        self.b = targets

        # A.T @ b is formed differently for dense and sparse inputs; the
        # sparse product is stored as returned (no densify/squeeze).
        if not isinstance(self.A, np.ndarray):
            self.ATb = sparse.csr_matrix.dot(self.A.T, self.b)
        else:
            self.ATb = self.A.T @ self.b

        num_rows, num_cols = self.A.shape
        self.n, self.d = num_rows, num_cols
        # Iterate starts at the origin.
        self.x = np.zeros((num_cols, ))

        self.sketch_method = sketch_method
        self.sketch_dimension = sketch_dimension
        self.col_sparsity = col_sparsity
        # Build the sketcher once up front to avoid paying that cost
        # repeatedly.
        self.sketcher = rp(self.A, self.sketch_dimension, self.sketch_method,
                           self.col_sparsity)

        # Keep the COO view of the input and its three component arrays.
        coo = coo_matrix(data)
        self.coo_data = coo
        self.rows = coo.row
        self.cols = coo.col
        self.vals = coo.data
def test_summary_method(data_to_test, all_sketch_methods):
    '''
    Check that the summary records the sketch method it was
    constructed with (``sketch_type`` equals the requested name).
    '''
    sketch_dim = 100
    for method in all_sketch_methods:
        built = rp(data_to_test, sketch_dim, method)
        assert built.sketch_type == method
def test_accepts_non_power2(data_to_test, all_sketch_methods):
    '''
    Check that sketching still works after 10 extra rows are appended.

    Ten rows of Gaussian noise are stacked under ``data_to_test``
    (presumably so the row count is no longer a power of two -- the fixture
    is not visible here) and every sketch method must return a sketch of
    shape ``(5*d, d)``.
    '''
    _, d = data_to_test.shape
    proj_dim = 5 * d
    extra_rows = np.random.randn(10, d)
    padded = np.concatenate([data_to_test, extra_rows], axis=0)
    for method in all_sketch_methods:
        sketched = rp(padded, proj_dim, method).sketch()
        assert sketched.shape == (proj_dim, d)
def error_vs_dimensionality():
    '''
    Measure error to the ground truth as the column dimension ``d`` grows.

    For every ``d`` in ``[16, 32, 64, 128, 256]`` a problem with ``n = 100*d``
    rows is drawn ``NTRIALS`` times from ``gaussian_design_unconstrained``
    and solved by each of:

    * ``'Exact'``: ``np.linalg.lstsq`` on the full data,
    * ``'Sketch & Solve'``: one countSketch of size
      ``sampling_rate * num_iterations * d`` followed by ``lstsq``,
    * every name in ``sketches``: ``ihs`` with sketch size
      ``sampling_rate * d`` run for ``num_iterations`` iterations.

    The square root of ``prediction_error(X, x_star, x_hat)`` is averaged
    over the trials, pretty-printed, and saved with ``np.save`` to
    ``'../../output/ihs_baselines/error_vs_dims'``.
    '''
    dimension = [2**i for i in range(4, 9)]
    METHODS = sketches + ['Exact', 'Sketch & Solve']

    # Output dictionary: error_to_truth[method][d] accumulates the errors.
    error_to_truth = {method: {d: 0 for d in dimension} for method in METHODS}
    print(error_to_truth)

    sampling_rate = 10
    num_iterations = 5
    for d in dimension:
        n = 100 * d
        print(f'TESTING {n},{d}')
        for method in METHODS:
            col_sparsity = 4 if method == 'sjlt' else 1
            for _ in range(NTRIALS):
                # Generate the data
                X, y, x_star = gaussian_design_unconstrained(n, d, 1.0)
                # Strings must be compared with ==; `is` tests identity and
                # only worked through interpreter constant sharing.
                if method == "Exact":
                    print('Exact method.')
                    x_hat = np.linalg.lstsq(X, y)[0]

                elif method == "Sketch & Solve":
                    # One-shot sketch given the total budget the iterative
                    # method uses across all of its iterations.
                    sketch_size = sampling_rate * num_iterations * d
                    print(f"S&S with {sketch_size} sketch size")
                    _sketch = rp(X, sketch_size, 'countSketch', col_sparsity)
                    SA, Sb = _sketch.sketch_data_targets(y)
                    x_hat = np.linalg.lstsq(SA, Sb)[0]
                else:
                    sketch_size = sampling_rate * d
                    print(
                        f"Using {num_iterations} iterations, sketch_size {sketch_size} and {method}"
                    )
                    my_ihs = ihs(X, y, method, sketch_size, col_sparsity)
                    x_hat = my_ihs.ols_fit_new_sketch(num_iterations)

                error = (prediction_error(X, x_star, x_hat))**(0.5)
                error_to_truth[method][d] += error

    # Turn the accumulated sums into per-trial averages.
    for method in METHODS:
        for d in dimension:
            error_to_truth[method][d] /= NTRIALS
    error_to_truth['Dimensions'] = dimension
    pretty = PrettyPrinter(indent=4)
    pretty.pprint(error_to_truth)
    save_dir = '../../output/ihs_baselines/'
    np.save(save_dir + 'error_vs_dims', error_to_truth)
def test_sketch_data_targets(data_to_test, all_sketch_methods):
    '''
    Check the shapes returned when sketching a data-target pair (A, b).

    The sketched data must be ``(sketch_dim, d)`` and the sketched targets
    ``(sketch_dim,)`` for every sketch method.

    Note that this test will fail when the input has been extended
    for the SRHT and the same extension has not been applied to y.
    '''
    num_rows, num_cols = data_to_test.shape
    targets = np.random.randn(num_rows)
    sketch_dim = 5 * num_cols
    for method in all_sketch_methods:
        sketcher = rp(data_to_test, sketch_dim, method)
        sketched_data, sketched_targets = sketcher.sketch_data_targets(targets)
        assert sketched_data.shape == (sketch_dim, num_cols)
        assert sketched_targets.shape == (sketch_dim,)
    def test_accept_dense_data(all_sketch_methods):
        '''
        Check that dense input is handled by every sketch method.

        Verifies that
        (i) a dense numpy ndarray is accepted by ``rp``,
        (ii) the ``rows``/``cols``/``vals`` arrays stored on the summary
             agree with a COO conversion of the same data, and
        (iii) ``sketch()`` can be called on the dense input.
        '''
        # NOTE(review): ``data_to_test`` is *called* here, whereas the sibling
        # tests receive it as an argument -- confirm it is callable in this
        # scope.
        dense = data_to_test()
        as_coo = sparse.coo_matrix(dense)
        _, num_cols = dense.shape
        proj_dim = 5 * num_cols
        for method in all_sketch_methods:
            summary = rp(dense, proj_dim, method)

            # Compare the component arrays one at a time instead of the COO
            # objects, side-stepping sparse-implementation differences.
            assert np.array_equal(as_coo.row, summary.rows)
            assert np.array_equal(as_coo.col, summary.cols)
            assert np.array_equal(as_coo.data, summary.vals)
            summary.sketch()
# Example #10
# 0
def summary_time_quality():
    '''Generate summaries, time them, and measure their error.

    For the 'specular' dataset (every other entry of ``datasets`` is
    skipped), the data matrix is truncated to the largest power-of-two
    number of rows. Then, for every ``(gamma, sketch_type)`` pair from
    ``projection_dimensions`` x ``sketches``, a sketch with ``int(gamma * d)``
    rows is drawn ``NTRIALS`` times. The mean sketch time, the mean time to
    form ``sX.T @ sX``, and the relative Frobenius / spectral error of the
    trial-averaged covariance estimate are recorded.

    The results are pretty-printed and written as JSON to
    ``'../../output/baselines/summary_time_quality_results.json'``.
    '''
    # results[data][sketch_type][gamma] -> dict of measurements
    summary_time_quality_results = {
        data: {
            sketch_type: {gamma: {} for gamma in projection_dimensions}
            for sketch_type in sketches
        }
        for data in datasets
    }
    pretty = PrettyPrinter(indent=4)
    pretty.pprint(summary_time_quality_results)

    for data in datasets:
        # Only the 'specular' dataset is run by this experiment.
        if data != 'specular':
            continue
        print("-" * 80)
        print("Dataset: {}".format(data))
        input_file = datasets[data]["filepath"]
        # The flag's type is not visible here, so keep the exact comparison.
        is_sparse = datasets[data]['sparse_format'] == True  # noqa: E712
        if is_sparse:
            df = load_npz('../../' + input_file).tocsr()
        else:
            df = np.load('../../' + input_file)

        # The last column is dropped; the rest is the data matrix.
        X = df[:, :-1]
        nn, d = X.shape
        # Keep the largest power-of-two number of rows. Builtin int() is
        # used because the np.int alias has been removed from NumPy.
        n = 2**int(np.floor(np.log2(nn)))
        X = X[:n, :]
        print(n, d)
        cov_mat = X.T @ X
        print(cov_mat.shape, type(cov_mat))

        if is_sparse:
            # Convert the d x d (small) covariance matrix to a ndarray for compatibility
            cov_mat = cov_mat.toarray()
            print(cov_mat.shape, type(cov_mat))

        frob_norm_cov_mat = np.linalg.norm(cov_mat, 'fro')
        spec_norm_cov_mat = np.linalg.norm(cov_mat, 2)

        # NOTE: a disabled shortcut used to hard-code results for
        # ('specular', 'gaussian', gamma > 2) because those runs timed out;
        # every combination is now actually executed.
        for gamma, sketch_type in itertools.product(projection_dimensions,
                                                    sketches):
            print(gamma, sketch_type)
            print('*' * 80)
            sketch_time = 0
            product_time = 0
            approx_cov_mat = np.zeros((d, d))
            proj_dim = int(gamma * d)

            # We use s = 4 globally for the sjlt.
            col_sparsity = 4 if sketch_type == 'sjlt' else 1

            _sketch = rp(X, proj_dim, sketch_type, col_sparsity)

            for _ in range(NTRIALS):
                sketch_start = default_timer()
                sX = _sketch.sketch()
                sketch_time += default_timer() - sketch_start

                product_start = default_timer()
                estimate = sX.T @ sX
                product_time += default_timer() - product_start
                approx_cov_mat += estimate

            # Average timings and the covariance estimate over the trials.
            sketch_time /= NTRIALS
            product_time /= NTRIALS
            approx_cov_mat /= NTRIALS
            residual = approx_cov_mat - cov_mat
            frob_error = np.linalg.norm(residual, ord='fro') / frob_norm_cov_mat
            spec_error = np.linalg.norm(residual, ord=2) / spec_norm_cov_mat

            print(f'Testing {data},{gamma},{sketch_type}:')
            print(f'Sketch time: {sketch_time}, Product Time: {product_time}')
            print(f'Frob error: {frob_error}, Spectral error: {spec_error}')

            summary_time_quality_results[data][sketch_type][gamma].update(
                sketch_time=sketch_time,
                product_time=product_time,
                frob_error=frob_error,
                spec_error=spec_error,
            )

    pretty.pprint(summary_time_quality_results)
    with open('../../output/baselines/summary_time_quality_results.json',
              'w') as outfile:
        json.dump(summary_time_quality_results, outfile)
# Example #11
# 0
def solution_error_vs_row_dim():
    '''
    Increase `n` the input dimension of the problem and
    measure the solution error in both:
    (i) Euclidean norm (`mean_square_error`)
    (ii) Prediction norm (`prediction_error`).

    Error measurements are taken with respect to:
    (i) the optimal solution x_opt
    (ii) the ground truth

    For every row count in ``ROWDIMS`` and every trial, a fresh problem is
    drawn from ``gaussian_design_unconstrained`` and solved exactly with
    ``np.linalg.lstsq``. Each entry of ``METHODS`` is then either
    'Sketch & Solve' (one countSketch followed by ``lstsq``), an IHS run
    with one of ``sketches``, or anything else, which is scored as the exact
    solution against the ground truth. The per-trial averages are
    pretty-printed and saved with ``np.save`` under
    ``'../../output/baselines/'``.
    '''
    print('Experimental setup:')
    print(f'IHS sketch size {SKETCH_SIZE}')
    print(f'Sketch and solve sketch size {CLASSICAL_SKETCH_SIZE}')
    print(f'Number of rounds {ROUNDS}')

    # Output dictionaries: one accumulator slot per entry of ROWDIMS.
    num_settings = len(ROWDIMS)
    approx_keys = list(sketches) + ['Sketch & Solve']
    MSE_OPT = {key: np.zeros(num_settings) for key in approx_keys}
    PRED_ERROR_OPT = {key: np.zeros(num_settings) for key in approx_keys}
    # Only the "truth" tables carry an entry for the exact solver.
    MSE_TRUTH = {key: np.zeros(num_settings) for key in approx_keys + ['Exact']}
    PRED_ERROR_TRUTH = {
        key: np.zeros(num_settings)
        for key in approx_keys + ['Exact']
    }

    def _record(method, index, X, x_opt, x_true, x_hat):
        '''Add the four error measures of estimate x_hat to the tables.'''
        MSE_OPT[method][index] += mean_square_error(x_opt, x_hat)
        PRED_ERROR_OPT[method][index] += prediction_error(X, x_opt, x_hat)
        MSE_TRUTH[method][index] += mean_square_error(x_true, x_hat)
        PRED_ERROR_TRUTH[method][index] += prediction_error(X, x_true, x_hat)

    ## Experiment
    # enumerate() gives each position its own slot; ROWDIMS.index(n) would
    # return the first match and mis-file results if a row count repeated.
    for experiment_index, n in enumerate(ROWDIMS):
        print(f'Testing {n} rows')
        _iters = ROUNDS[experiment_index]
        classic_sketch_size = CLASSICAL_SKETCH_SIZE[experiment_index]

        for trial in range(NTRIALS):
            print("TRIAL {}".format(trial))
            X, y, x_true = gaussian_design_unconstrained(n, D, variance=1.0)
            x_opt = np.linalg.lstsq(X, y)[0]

            for sketch_method in METHODS:
                print('*' * 80)
                if sketch_method == 'Sketch & Solve':
                    # Classical sketch: a single countSketch (column
                    # sparsity 1), then least squares on the sketched pair.
                    _sketch = rp(X, classic_sketch_size, 'countSketch', 1)
                    SA, Sb = _sketch.sketch_data_targets(y)
                    x_ss = np.linalg.lstsq(SA, Sb)[0]
                    _record(sketch_method, experiment_index, X, x_opt, x_true,
                            x_ss)
                elif sketch_method in sketches:
                    # We use s = 4 for the sjlt and 1 otherwise.
                    col_sparsity = 4 if sketch_method == 'sjlt' else 1
                    print(f'{sketch_method} IHS')
                    my_ihs = ihs(X, y, sketch_method, SKETCH_SIZE,
                                 col_sparsity)
                    x_ihs, x_iters = my_ihs.ols_fit_new_sketch_track_errors(
                        _iters)
                    # Per-iteration errors are only inspected for shape.
                    x_errors = x_opt[:, None] - x_iters
                    print(x_errors.shape)
                    _record(sketch_method, experiment_index, X, x_opt, x_true,
                            x_ihs)
                else:
                    # Exact solve: x_opt is already available, so only its
                    # distance to the ground truth is recorded.
                    MSE_TRUTH["Exact"][experiment_index] += mean_square_error(
                        x_opt, x_true)
                    PRED_ERROR_TRUTH["Exact"][
                        experiment_index] += prediction_error(
                            X, x_opt, x_true)

    # Turn the accumulated sums into per-trial averages (in place).
    for _dict in [MSE_OPT, PRED_ERROR_OPT, MSE_TRUTH, PRED_ERROR_TRUTH]:
        for _key in _dict:
            _dict[_key] /= NTRIALS

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(MSE_OPT)
    pretty.pprint(PRED_ERROR_OPT)
    pretty.pprint(MSE_TRUTH)
    pretty.pprint(PRED_ERROR_TRUTH)

    save_dir = '../../output/baselines/'
    np.save(save_dir + 'ihs_ols_mse_OPT', MSE_OPT)
    np.save(save_dir + 'ihs_ols_pred_error_OPT', PRED_ERROR_OPT)
    np.save(save_dir + 'ihs_ols_mse_TRUTH', MSE_TRUTH)
    np.save(save_dir + 'ihs_ols_pred_error_TRUTH', PRED_ERROR_TRUTH)