Пример #1
0
def test_summary_size(data_to_test):
    """
    Check that each sketch returns a summary with ``sketch_dim`` rows.

    Every sketch type is applied to ``data_to_test``; the shapes of the
    summary and of its SVD factors are asserted. The relative error of the
    squared Frobenius norm is printed for information only (not asserted).
    """
    print('\n')
    sketch_dim = 500
    n, d = data_to_test.shape
    true_norm = np.linalg.norm(data_to_test, ord='fro')**2
    # (label, sketcher) pairs, visited in the order they are listed.
    candidates = [
        ('gaussian', GaussianSketch(sketch_dim, n, d)),
        ('srht_HAD', SRHTSketch(sketch_dim, n, d, 'HAD')),
        ('srht_DCT', SRHTSketch(sketch_dim, n, d)),
        ('countsketch', CountSketch(sketch_dim, n, d)),
        ('sjltsketch', SparseJLT(sketch_dim, n, d, col_sparsity=5)),
    ]
    for label, sketcher in candidates:
        sketcher.sketch(data_to_test)
        summary = sketcher.get()
        left, spectrum, right_t = sketcher.get(in_svd=True)

        # Shape checks: plain summary first, then the three SVD factors.
        assert summary.shape == (sketch_dim, d)
        assert left.shape == (sketch_dim, d)
        assert spectrum.shape == (d, )
        assert right_t.shape == (d, d)

        sketched_norm = np.linalg.norm(summary, ord='fro')**2
        rel_err = np.abs(sketched_norm - true_norm) / true_norm
        print(f'{label}:\t{rel_err:.5f}')
    def __init__(self, n_data_rows: int, n_data_cols: int,
                 sk_dim: int, sk_mode='Gaussian', sparse_data=None,
                 ihs_mode='single', sjlt_sparsity=5, gamma=1.0,
                 batch_size=None):
        """
        Store the problem dimensions and build the requested sketcher.

        Parameters:
        - n_data_rows (int)   : number of rows of the data to be sketched.
        - n_data_cols (int)   : number of columns of the data to be sketched.
        - sk_dim (int)        : requested number of sketch rows; it is capped
                                at n_data_cols.
        - sk_mode (str)       : 'Gaussian', 'SRHT', 'SJLT', 'CountSketch',
                                'FD' or 'RFD'. Any other value leaves
                                self.sketcher unset.
        - sparse_data         : optional data passed to the sketcher through
                                set_sparse_data; only used for the 'SJLT' and
                                'CountSketch' modes.
        - ihs_mode (str)      : stored on the instance; forced to 'single'
                                for the deterministic 'FD' / 'RFD' sketches.
        - sjlt_sparsity (int) : col_sparsity handed to SparseJLT.
        - gamma (float)       : stored on the instance (presumably the ridge
                                regularisation weight -- confirm against the
                                solver methods).
        - batch_size          : stored on the instance; defaults to the
                                (capped) sketch dimension when None.
        """
        self.gamma = gamma
        self.sk_mode = sk_mode
        self.n_data_rows = n_data_rows
        self.n_data_cols = n_data_cols
        self.sk_dim = min(sk_dim, n_data_cols)
        self.ihs_mode = ihs_mode

        if self.sk_mode == 'Gaussian':
            self.sketcher = GaussianSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols)
        elif self.sk_mode == 'SRHT':
            # Prefer the Hadamard transform and fall back to the DCT variant
            # if it cannot be constructed. Catch Exception rather than using
            # a bare except so KeyboardInterrupt / SystemExit still propagate.
            try:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols, 'HAD')
            except Exception:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols, 'DCT')
        elif self.sk_mode == 'SJLT':
            self.sketcher = SparseJLT(self.sk_dim,
                                      self.n_data_rows,
                                      self.n_data_cols,
                                      col_sparsity=sjlt_sparsity)
        elif self.sk_mode == 'CountSketch':
            self.sketcher = CountSketch(self.sk_dim, self.n_data_rows,
                                        self.n_data_cols)
        # Set ihs mode to be single for deterministic sketches
        elif self.sk_mode == 'FD':
            self.ihs_mode = 'single'
            self.sketcher = FastFrequentDirections(self.sk_dim,
                                                   self.n_data_rows,
                                                   self.n_data_cols)
        elif self.sk_mode == 'RFD':
            self.ihs_mode = 'single'
            self.sketcher = RobustFrequentDirections(self.sk_dim,
                                                     self.n_data_rows,
                                                     self.n_data_cols)

        # Identity test: None is a singleton and `==` may be overloaded.
        self.batch_size = self.sk_dim if batch_size is None else batch_size

        self.sparse_data_is_set = False  # Init for dense sketches to False
        if sparse_data is not None and self.sk_mode in ('SJLT',
                                                        'CountSketch'):
            self.sketcher.set_sparse_data(sparse_data)
            self.sparse_data_is_set = True
Пример #3
0
    def __init__(self, n_data_rows: int, n_data_cols: int,
                 sk_dim: int, sk_mode='Gaussian', sparse_data=None,
                 sjlt_sparsity=5):
        """
        Approximate OLS regression using random projections.

        Parameters:
        - n_data_rows (int)   : number of rows of the data to be sketched.
        - n_data_cols (int)   : number of columns of the data, excluding the
                                target column.
        - sk_dim (int)        : requested number of sketch rows; it is capped
                                at n_data_rows.
        - sk_mode (str)       : 'Gaussian', 'SRHT', 'SJLT' or 'CountSketch'.
                                Any other value leaves self.sketcher unset.
        - sparse_data         : optional data passed to the sketcher through
                                set_sparse_data; only used for the 'SJLT' and
                                'CountSketch' modes.
        - sjlt_sparsity (int) : col_sparsity handed to SparseJLT.
        """
        self.sk_mode = sk_mode
        self.n_data_rows = n_data_rows
        self.n_data_cols = n_data_cols
        self.sk_dim = min(sk_dim, n_data_rows)

        # The non-Gaussian sketchers are sized with one extra column.
        # NOTE(review): the Gaussian sketcher is built without the extra
        # column -- confirm that GaussianSketch does not depend on the
        # column count.
        if self.sk_mode == 'Gaussian':
            self.sketcher = GaussianSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols)
        elif self.sk_mode == 'SRHT':
            # Add 1 to the number of data columns as we append a column for y later on.
            # Prefer the Hadamard transform and fall back to the DCT variant
            # if it cannot be constructed. Catch Exception rather than using
            # a bare except so KeyboardInterrupt / SystemExit still propagate.
            try:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols + 1, 'HAD')
            except Exception:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols + 1, 'DCT')
        elif self.sk_mode == 'SJLT':
            self.sketcher = SparseJLT(self.sk_dim,
                                      self.n_data_rows,
                                      self.n_data_cols + 1,
                                      col_sparsity=sjlt_sparsity)
        elif self.sk_mode == 'CountSketch':
            self.sketcher = CountSketch(self.sk_dim, self.n_data_rows,
                                        self.n_data_cols + 1)

        self.sparse_data_is_set = False  # Init for dense sketches to False
        if sparse_data is not None and self.sk_mode in ('SJLT',
                                                        'CountSketch'):
            self.sketcher.set_sparse_data(sparse_data)
            self.sparse_data_is_set = True
Пример #4
0
    def __init__(self, n_data_rows: int, n_data_cols: int,
                 sk_dim: int, sk_mode='Gaussian', sparse_data=None,
                 ihs_mode='multi', sjlt_sparsity=5):
        """
        Instantiates the IHS sketching construction.

        Parameters:
        - n_data_rows (int)   : number of rows of the data to be sketched.
        - n_data_cols (int)   : number of columns of the data to be sketched.
        - sk_dim (int)        : requested number of sketch rows; it is capped
                                at n_data_rows.
        - sk_mode (str)       : 'Gaussian', 'SRHT', 'SJLT' or 'CountSketch'.
                                Any other value leaves self.sketcher unset.
        - sparse_data         : optional data passed to the sketcher through
                                set_sparse_data; only used for the 'SJLT' and
                                'CountSketch' modes.
        - ihs_mode (str)      : stored on the instance as given.
        - sjlt_sparsity (int) : col_sparsity handed to SparseJLT.
        """
        self.sk_mode = sk_mode
        self.n_data_rows = n_data_rows
        self.n_data_cols = n_data_cols
        self.sk_dim = min(sk_dim, n_data_rows)
        self.ihs_mode = ihs_mode

        if self.sk_mode == 'Gaussian':
            self.sketcher = GaussianSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols)
        elif self.sk_mode == 'SRHT':
            # Prefer the Hadamard transform and fall back to the DCT variant
            # if it cannot be constructed. Catch Exception rather than using
            # a bare except so KeyboardInterrupt / SystemExit still propagate.
            try:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols, 'HAD')
            except Exception:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols, 'DCT')
        elif self.sk_mode == 'SJLT':
            self.sketcher = SparseJLT(self.sk_dim,
                                      self.n_data_rows,
                                      self.n_data_cols,
                                      col_sparsity=sjlt_sparsity)
        elif self.sk_mode == 'CountSketch':
            self.sketcher = CountSketch(self.sk_dim, self.n_data_rows,
                                        self.n_data_cols)

        self.sparse_data_is_set = False  # Init for dense sketches to False
        if sparse_data is not None and self.sk_mode in ('SJLT',
                                                        'CountSketch'):
            self.sketcher.set_sparse_data(sparse_data)
            self.sparse_data_is_set = True
Пример #5
0
def graph2map(Vs,
              Es,
              nr_graphs,
              h,
              k,
              table_size,
              random_files,
              nr_tables=1,
              max_p=2,
              dirac=False):
    """
    Generate feature maps for a collection of graphs by traversing local
    neighborhoods, generating strings and sketching the k-gram frequency
    distribution of those strings.

    Vs, Es: per-graph inputs; Vs[i] and Es[i] are passed to
        generate_feature_maps for graph i
    nr_graphs: number of graphs to process (the first nr_graphs entries)
    h: the depth at which neighborhood strings are generated
    k: the k in k-grams
    table_size: the count-sketch hashtable size
    random_files: needed for count-sketch initialization
    nr_tables: count-sketch parameter
    max_p: the maximum polynomial degree for the poly-kernel
    dirac: if True, also collect a vector built from feature_maps[1]

    Returns (vectors, vectors_cosine, dirac_vectors). For depth j = 1..h,
    vectors[2 * (j - 1)] holds one explicit k-gram vector per graph (padded
    with zeros to a common length) and vectors[2 * (j - 1) + 1] holds the
    corresponding tensorsketch. vectors_cosine has the same layout, built
    from the normalized vector and the cosine sketch. dirac_vectors is empty
    unless dirac is True.
    """
    print('Count sketch data structures initialization')
    cs = CountSketch(table_size, nr_tables * max_p, random_files)
    cs_cosine = CountSketch(table_size, nr_tables * max_p, random_files)

    print('Process graphs')
    vectors = [[] for _ in range(2 * h)]
    vectors_cosine = [[] for _ in range(2 * h)]
    dirac_vectors = []
    labels_maps = [{} for _ in range(h)]
    dirac_map = {}
    print(len(Vs), len(Es), nr_graphs)
    for i in range(nr_graphs):
        if i % 500 == 0:
            print('i = ', i)
            print('number of features', len(labels_maps[0]))
        V, E = Vs[i], Es[i]
        feature_maps = generate_feature_maps(V, E, h)
        if dirac:
            dirac_vector = feature_map_to_vector(feature_maps[1], dirac_map)
            dirac_vectors.append(dirac_vector)
        for j in range(1, h + 1):
            slot = 2 * (j - 1)
            label_vector = feature_map_to_k_gram_vector(
                feature_maps[j], labels_maps[j - 1], k)
            vectors[slot].append(label_vector)
            vectors_cosine[slot].append(normalize_vector(label_vector))

            # The two sketches are reused across graphs and depths, so they
            # are reset before each new vector is sketched.
            cs.clear()
            cs_cosine.clear()
            sketch_polynomial_feature_map(label_vector, cs, False)
            sketch_polynomial_feature_map(label_vector, cs_cosine, True)
            vectors[slot + 1].append(
                tensorsketch.compute_tensorsketch_from_cs(
                    cs, max_p, nr_tables))
            vectors_cosine[slot + 1].append(
                tensorsketch.compute_tensorsketch_from_cs(
                    cs_cosine, max_p, nr_tables))

    # Pad the explicit vectors of each depth to a common width. The width is
    # the longest vector seen rather than the length of the last one: this
    # also works when no graph was processed, and never leaves a row longer
    # than the others.
    for j in range(h):
        explicit = vectors[2 * j]
        explicit_cosine = vectors_cosine[2 * j]
        maxlen_j = max((len(vec) for vec in explicit), default=0)
        print('total number of features', maxlen_j)
        for l in range(len(explicit)):
            explicit[l] += [0] * (maxlen_j - len(explicit[l]))
            explicit_cosine[l] += [0] * (maxlen_j - len(explicit_cosine[l]))
    if dirac:
        maxlen_dirac = max((len(vec) for vec in dirac_vectors), default=0)
        for l in range(len(dirac_vectors)):
            dirac_vectors[l] += [0] * (maxlen_dirac - len(dirac_vectors[l]))
    return vectors, vectors_cosine, dirac_vectors
Пример #6
0
    os_idx = 1
    if platform.system() == 'Windows':
        os_idx = 0

    filename = filenames[1 + os_idx]
    h = 1
    k = 2
    max_p = 2
    table_size = 500
    max_value = 200000
    nr_tables = 1
    nr_graphs_per_class = 100
    dirac = False

    cs = CountSketch(table_size, nr_tables * max_p, random_files[os_idx],
                     max_value)
    cs_cosine = CountSketch(table_size, nr_tables * max_p,
                            random_files[os_idx], max_value)

    print('Count sketch data structures initialized')

    print(dirname, filename)
    ratio = 0.5
    Vs, Es, classes = read_write_utilities.read_my_format(
        filename, 3400, ratio)

    start = time.time()
    vectors, vectors_cosine, dirac_vectors = graph2map(Vs, Es, len(classes), h,
                                                       k, table_size,
                                                       nr_tables, max_p, dirac)
    print('elapsed time ', time.time() - start)
Пример #7
0
        for j in range(p):
            table_j = count_sketch.get_table(i * p + j)
            tables_fft_i[j] = np.fft.fft(table_j)
        sketch_i = componentwise_multiplication(tables_fft_i)
        sketch_i = [val / math.sqrt(k) for val in sketch_i]
        tensorsketches.append(sketch_i)
    count_sketch.clear()
    return [np.real(val) for sketch in tensorsketches for val in sketch]


if __name__ == "__main__":
    # Small demo: compare the exact polynomial kernel (v1 . v2) ** p with the
    # inner product of the two tensorsketches.
    print('Tensorsketch')
    random_files = ['<path to random files>', '']
    table_size = 300
    p, k = 2, 1

    v1 = np.array(
        [10, 2, 3, 1100, 3, 28, 300, 12, 3, 21, 11, 20, 18, 16, 31, 300],
        dtype=np.uint64)
    v2 = np.array(
        [10, 29, 3, 1001, 3, 28, 109, 12, 13, 21, 110, 20, 108, 16, 301, 30],
        dtype=np.uint64)
    for vec in (v1, v2):
        print(len(vec))

    # One count sketch object is shared by both calls.
    cs = CountSketch(table_size, k * p, random_files[0], 1000)
    tensorsketches1 = compute_tensorsketch(cs, v1, p, k)
    tensorsketches2 = compute_tensorsketch(cs, v2, p, k)

    exact_kernel = np.dot(v1, v2)**p
    print(exact_kernel)
    sketched_kernel = np.dot(tensorsketches1, tensorsketches2)
    print(np.real(sketched_kernel))
Пример #8
0
class ClassicalSketch:
    """
    Sketch-and-solve least squares regression.

    The data and targets are stacked as [X, y], sketched once with the
    sketcher chosen at construction time, and the regression is then solved
    on the sketched matrix.
    """

    def __init__(self, n_data_rows: int, n_data_cols: int,
                 sk_dim: int, sk_mode='Gaussian', sparse_data=None,
                 sjlt_sparsity=5):
        """
        Approximate OLS regression using random projections.

        Parameters:
        - n_data_rows (int)   : number of rows of the data to be sketched.
        - n_data_cols (int)   : number of columns of the data, excluding the
                                target column.
        - sk_dim (int)        : requested number of sketch rows; it is capped
                                at n_data_rows.
        - sk_mode (str)       : 'Gaussian', 'SRHT', 'SJLT' or 'CountSketch'.
                                Any other value leaves self.sketcher unset.
        - sparse_data         : optional data passed to the sketcher through
                                set_sparse_data; only used for the 'SJLT' and
                                'CountSketch' modes.
        - sjlt_sparsity (int) : col_sparsity handed to SparseJLT.
        """
        self.sk_mode = sk_mode
        self.n_data_rows = n_data_rows
        self.n_data_cols = n_data_cols
        self.sk_dim = min(sk_dim, n_data_rows)

        # The non-Gaussian sketchers are sized with one extra column because
        # the target y is appended to X before sketching.
        # NOTE(review): the Gaussian sketcher is built without the extra
        # column -- confirm that GaussianSketch does not depend on the
        # column count.
        if self.sk_mode == 'Gaussian':
            self.sketcher = GaussianSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols)
        elif self.sk_mode == 'SRHT':
            # Prefer the Hadamard transform and fall back to the DCT variant
            # if it cannot be constructed. Catch Exception rather than using
            # a bare except so KeyboardInterrupt / SystemExit still propagate.
            try:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols + 1, 'HAD')
            except Exception:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols + 1, 'DCT')
        elif self.sk_mode == 'SJLT':
            self.sketcher = SparseJLT(self.sk_dim,
                                      self.n_data_rows,
                                      self.n_data_cols + 1,
                                      col_sparsity=sjlt_sparsity)
        elif self.sk_mode == 'CountSketch':
            self.sketcher = CountSketch(self.sk_dim, self.n_data_rows,
                                        self.n_data_cols + 1)

        self.sparse_data_is_set = False  # Init for dense sketches to False
        if sparse_data is not None and self.sk_mode in ('SJLT',
                                                        'CountSketch'):
            self.sketcher.set_sparse_data(sparse_data)
            self.sparse_data_is_set = True

    def _sketch_data_targets(self, X, y, seed=100):
        """
        Sketches the [data,target] concatenated data for sketch-and-solve.
        Slightly different functionality for dense or sparse sketches as sparse
        data is set in the __init__ function: in that case X and y are not
        used here and only the seed is passed to the sketcher.
        """
        if self.sparse_data_is_set:
            self.sketcher.sketch(seed)
        else:
            Xy = np.c_[X, y]
            self.sketcher.sketch(Xy, seed)

    def _split_sketch(self):
        """
        Fetches the sketch and splits it into (SX, Sy): every column but the
        last, and the last column reshaped to a column vector.
        """
        _sketch = self.sketcher.get(in_svd=False)
        return _sketch[:, :-1], _sketch[:, -1].reshape(-1, 1)

    @staticmethod
    def _svd_solve(u, sig, vt, Sy):
        """
        Minimum-norm least squares solution from thin SVD factors of SX.

        Singular values at or below eps * max(SX.shape) * max(sig) are
        treated as zero (the same relative cutoff numpy's lstsq uses with
        rcond=None), so a rank-deficient sketch does not divide by zero.
        """
        sig = np.asarray(sig)
        largest = sig.max() if sig.size else 0.
        cutoff = np.finfo(sig.dtype).eps * max(u.shape[0],
                                               vt.shape[1]) * largest
        keep = sig > cutoff
        sig_inv = np.zeros_like(sig)
        sig_inv[keep] = 1. / sig[keep]
        # Multiply u.T by Sy first so no (d x sk_dim) matrix is formed.
        return vt.T @ (sig_inv[:, np.newaxis] * (u.T @ Sy))

    def _solve(self, in_svd=False):
        """
        Gets the sketch matrix and splits into the [SX, Sy] parts, then
        solves the regression instance.

        in_svd=False (default) uses np.linalg.lstsq; in_svd=True solves
        through an explicit thin SVD of SX. Both give the minimum-norm
        least squares solution as a column vector.
        """
        SX, Sy = self._split_sketch()
        if in_svd:
            u, sig, vt = np.linalg.svd(SX, full_matrices=False)
            return self._svd_solve(u, sig, vt, Sy)
        # rcond = None is to ignore a warning flag.
        return np.linalg.lstsq(SX, Sy, rcond=None)[0]

    def _time_sketch_solve(self, X, y, seed=100):
        """
        Times each of the individual process for sketch and solve methods.

        Returns (weights, times) where times has the keys 'Total', 'Sketch',
        'SVD' and 'Solve'. The weights are computed from the timed SVD
        factors, so 'SVD' + 'Solve' is the cost of the returned solution.
        """
        times = {'Total': 0., 'Sketch': 0., 'SVD': 0., 'Solve': 0.}
        TIMER_START = timer()

        SKETCH_TIMER = timer()
        self._sketch_data_targets(X, y, seed)
        times['Sketch'] = timer() - SKETCH_TIMER

        SX, Sy = self._split_sketch()

        SVD_TIMER = timer()
        u, sig, vt = np.linalg.svd(SX, full_matrices=False)
        times['SVD'] = timer() - SVD_TIMER

        SOLVE_TIME = timer()
        weights = self._svd_solve(u, sig, vt, Sy)
        times['Solve'] = timer() - SOLVE_TIME
        times['Total'] = timer() - TIMER_START
        return weights, times

    def fit(self, X, y, seed=100, in_svd=False, timing=False):
        """
        Fits the sketched regression model with a classical sketch to
        data X and y.
        First step is to sketch the data using the given sketch from init.
        Second step is to solve the regression instance, using either the lstsq
        solver (in_svd=False), or in SVD format (in_svd=True).

        Returns the weights as a column vector, or (weights, times) when
        timing is True; the timed path always solves in SVD format.
        """
        if timing:
            return self._time_sketch_solve(X, y, seed)
        self._sketch_data_targets(X, y, seed)
        return self._solve(in_svd=in_svd)
Пример #9
0
class IterativeHessianOLS:
    """
    An iterative solver for the optimisation problem
    f(x) = 1/2 ||Ax - y||_2^2
    by randomised newton method.
    """

    def __init__(self, n_data_rows: int, n_data_cols: int,
                 sk_dim: int, sk_mode='Gaussian', sparse_data=None,
                 ihs_mode='multi', sjlt_sparsity=5):
        """
        Instantiates the IHS sketching construction.

        Parameters:
        - n_data_rows (int)   : number of rows of the data to be sketched.
        - n_data_cols (int)   : number of columns of the data to be sketched.
        - sk_dim (int)        : requested number of sketch rows; it is capped
                                at n_data_rows.
        - sk_mode (str)       : 'Gaussian', 'SRHT', 'SJLT' or 'CountSketch'.
                                Any other value leaves self.sketcher unset.
        - sparse_data         : optional data passed to the sketcher through
                                set_sparse_data; only used for the 'SJLT' and
                                'CountSketch' modes.
        - ihs_mode (str)      : iteration scheme; fit only implements 'multi'.
        - sjlt_sparsity (int) : col_sparsity handed to SparseJLT.
        """
        self.sk_mode = sk_mode
        self.n_data_rows = n_data_rows
        self.n_data_cols = n_data_cols
        self.sk_dim = min(sk_dim, n_data_rows)
        self.ihs_mode = ihs_mode

        if self.sk_mode == 'Gaussian':
            self.sketcher = GaussianSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols)
        elif self.sk_mode == 'SRHT':
            # Prefer the Hadamard transform and fall back to the DCT variant
            # if it cannot be constructed. Catch Exception rather than using
            # a bare except so KeyboardInterrupt / SystemExit still propagate.
            try:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols, 'HAD')
            except Exception:
                self.sketcher = SRHTSketch(self.sk_dim, self.n_data_rows,
                                           self.n_data_cols, 'DCT')
        elif self.sk_mode == 'SJLT':
            self.sketcher = SparseJLT(self.sk_dim,
                                      self.n_data_rows,
                                      self.n_data_cols,
                                      col_sparsity=sjlt_sparsity)
        elif self.sk_mode == 'CountSketch':
            self.sketcher = CountSketch(self.sk_dim, self.n_data_rows,
                                        self.n_data_cols)

        self.sparse_data_is_set = False  # Init for dense sketches to False
        if sparse_data is not None and self.sk_mode in ('SJLT',
                                                        'CountSketch'):
            self.sketcher.set_sparse_data(sparse_data)
            self.sparse_data_is_set = True

    def _init_iterations(self, X, y, iterations):
        """
        Initialises the arrays we use for iterations
        - current_weights the vector we will update under iterative scheme
        - weights_hist is an array which contains all of the updated weights used for error history
        - XTy is X.T @ y as a column vector, computed once and reused by the gradient
        """
        current_weights = np.zeros((self.n_data_cols, 1), dtype=float)
        weights_hist = np.zeros((self.n_data_cols, iterations), dtype=float)
        XTy = (X.T @ y).reshape(-1, 1)
        return current_weights, weights_hist, XTy

    def _grad(self, X, vec, XTy):
        """
        Returns the gradient function
        nabla f(x) = A.T ( Ax - y )

        We use as input the vector XTy so no need to recompute
        """
        return X.T @ (X @ vec) - XTy

    @staticmethod
    def _newton_update(sig, vt, gradient):
        """
        Solves the linear system H update = -gradient, where H is expressed
        through the singular values `sig` and right singular vectors `vt`
        returned by the sketcher.
        """
        sig_inv = 1. / sig[:, np.newaxis]
        return -vt.T @ (sig_inv**2 * (vt @ gradient))

    def _iterate_multiple(self, X, y, iterations=10, timing=False):
        """
        Runs `iterations` steps, drawing a fresh sketch (seed = 1000 * it)
        at every step.

        Returns (current_x, all_x): the final weights as a column vector and
        an array whose column `it` holds the weights after step `it`. With
        timing=True the call is delegated to _iterate_multiple_timing, which
        additionally returns the timing dictionary.
        """
        if timing:
            return self._iterate_multiple_timing(X, y, iterations)
        current_x, all_x, XTy = self._init_iterations(X, y, iterations)
        for it in range(iterations):
            # 1. Generate a sketch and obtain the svd factors for efficient solving.
            self.sketcher.sketch(X, seed=1000 * it)
            _, sig, vt = self.sketcher.get(in_svd=True)
            # 2. Take the step using the gradient on the full data.
            gradient = self._grad(X, current_x, XTy)
            current_x += self._newton_update(sig, vt, gradient)
            all_x[:, it] = current_x[:, 0]
        return current_x, all_x

    def _iterate_multiple_timing(self, X, y, iterations=10):
        """
        Performs the iterations but also records the timing of each individual part.

        Returns (current_x, all_x, times); times holds the overall 'Total'
        and per-iteration arrays under 'Sketch', 'SVD' and 'Solve'.
        """
        times = {
            'Total': 0.,
            'Sketch': np.zeros(iterations, dtype=float),
            'SVD': np.zeros(iterations, dtype=float),
            'Solve': np.zeros(iterations, dtype=float)
        }
        TIMER_START = timer()
        current_x, all_x, XTy = self._init_iterations(X, y, iterations)
        for it in range(iterations):
            # 1. Generate a sketch and obtain the svd factors for efficient solving.
            SKETCH_TIMER = timer()
            self.sketcher.sketch(X, seed=1000 * it)
            times['Sketch'][it] = timer() - SKETCH_TIMER

            SVD_TIMER = timer()
            _, sig, vt = self.sketcher.get(in_svd=True)
            times['SVD'][it] = timer() - SVD_TIMER

            # 2. Take the step using the gradient on the full data.
            SOLVE_TIME = timer()
            gradient = self._grad(X, current_x, XTy)
            current_x += self._newton_update(sig, vt, gradient)
            times['Solve'][it] = timer() - SOLVE_TIME
            all_x[:, it] = current_x[:, 0]
        times['Total'] = timer() - TIMER_START
        return current_x, all_x, times

    def fit(self, X, y, iterations=10, timing=False):
        """
        Fits the model on data X and targets y.

        Returns (x, all_x), or (x, all_x, times) when timing is True.
        Raises NotImplementedError when self.ihs_mode is not 'multi', the
        only scheme implemented here.
        """
        if self.ihs_mode != 'multi':
            raise NotImplementedError(
                f"ihs_mode {self.ihs_mode!r} is not supported; "
                "only 'multi' is implemented.")
        return self._iterate_multiple(X, y, iterations, timing=timing)