예제 #1
0
    def select(self):
        """Build the second-level grid query matrix for this sub-domain.

        A ``(1, 1)`` domain short-circuits to a single 1x1 sparse query.
        Otherwise a cell width is derived from the noisy total
        ``self.x_hat.sum()``, ``self.eps_par`` and ``self.c2``; the domain is
        cut into square cells of that width by ``GenerateCells`` and the
        cells are converted to a matrix by ``cells_to_query``.

        Returns:
            The matrix produced by ``sparse.csr_matrix`` (1x1 case) or by
            ``cells_to_query``.
        """
        domain = self.domain_shape

        # Nothing to subdivide: skip the grid computation entirely.
        if domain == (1, 1):
            return sparse.csr_matrix(([1], ([0], [0])), shape=(1, 1))

        budget = self.eps_par
        noisy_total = self.x_hat.sum()

        # Number of cells per side at the second level.
        if noisy_total <= 0:
            cells_per_side = 1
        else:
            cells_per_side = int(
                math.sqrt(noisy_total * budget / self.c2) - 1) + 1
        target_cells = cells_per_side**2

        rows, cols = domain
        cell_width = int(math.sqrt(rows * cols * 1.0 / target_cells) - 1) + 1
        if cell_width <= 0:
            cell_width = 1

        # Cell counts along each axis (rounded up so the grid covers the
        # whole domain).
        row_cells = int(util.old_div((rows - 1), cell_width) + 1)
        col_cells = int(util.old_div((cols - 1), cell_width) + 1)

        grid_cells = GenerateCells(rows, cols, row_cells, col_cells,
                                   cell_width)
        return cells_to_query(grid_cells, (rows, cols))
예제 #2
0
    def mapping(self):
        """Partition the 2D domain into a uniform grid and return its mapping.

        The grid cell width is derived from ``self.data_sum``,
        ``self.eps_par`` and ``self.c``. When ``self.ag_flag`` is set, the
        number of cells per side is divided by four and floored at 10;
        otherwise a non-zero ``self.gz`` overrides the computed width.

        Returns:
            The result of ``cells_to_mapping`` applied to the generated cells
            and the domain shape.
        """
        rows, cols = self.domain_shape
        total = self.data_sum
        budget = self.eps_par

        if self.ag_flag:
            per_side = int(math.sqrt((total * budget) / self.c) / 4 - 1) + 1
            per_side = max(per_side, 10)
            target_cells = per_side**2

            width = int(math.sqrt(rows * cols * 1.0 / target_cells) - 1) + 1
            if width <= 0:
                width = 1
        else:
            target_cells = util.old_div((total * budget), self.c)
            if self.gz == 0:
                width = int(math.sqrt(rows * cols / target_cells) - 1) + 1
            else:
                # An explicit grid size takes precedence over the formula.
                width = int(self.gz)
            if width < 1:
                width = 1

        # Cell counts along each axis, rounded up to cover the whole domain.
        row_cells = int(util.old_div((rows - 1), width) + 1)
        col_cells = int(util.old_div((cols - 1), width) + 1)

        # TODO: potential optimization if width == 1 (identity workload)
        grid_cells = UGridPartition.GenerateCells(rows, cols, row_cells,
                                                  col_cells, width)
        return cells_to_mapping(grid_cells, (rows, cols))
예제 #3
0
    def select(self):
        """Build the second-level grid as a ``workload.RangeQueries`` object.

        A ``(1, 1)`` domain yields a single range query covering its only
        cell. Otherwise the cell width is derived from the noisy total
        ``self.x_hat.sum()``, ``self.eps_par`` and ``self.c2``, and the
        domain is split by ``AdaptiveGrid.grid_split_range`` using the
        resulting per-axis cell counts as the branching list.

        Returns:
            ``workload.RangeQueries`` over ``self.domain_shape``.
        """
        domain = self.domain_shape

        # Nothing to subdivide: skip the grid computation entirely.
        if domain == (1, 1):
            return workload.RangeQueries((1, 1),
                                         lower=np.array([[0, 0]]),
                                         higher=np.array([[0, 0]]))

        budget = self.eps_par
        noisy_total = self.x_hat.sum()

        # Number of cells per side at the second level.
        if noisy_total <= 0:
            cells_per_side = 1
        else:
            cells_per_side = int(
                math.sqrt(noisy_total * budget / self.c2) - 1) + 1
        target_cells = cells_per_side**2

        rows, cols = domain
        cell_width = int(math.sqrt(rows * cols * 1.0 / target_cells) - 1) + 1
        if cell_width <= 0:
            cell_width = 1

        row_cells = int(util.old_div((rows - 1), cell_width) + 1)
        col_cells = int(util.old_div((cols - 1), cell_width) + 1)

        # Split the full domain into row_cells x col_cells ranges.
        corners_lo, corners_hi = AdaptiveGrid.grid_split_range(
            (0, 0), (rows - 1, cols - 1), branching_list=[row_cells, col_cells])

        return workload.RangeQueries(self.domain_shape, np.array(corners_lo),
                                     np.array(corners_hi))
예제 #4
0
def rect_to_quads(x):
    '''
    Split a 2D numpy array at its midpoints.

    x must be two-dimensional (``x.shape[1]`` is read unconditionally).

    Returns a list of sub-arrays:
      * two pieces when x has a single row (split across columns) or a
        single column (split across rows);
      * otherwise four quadrants ``[x1, x2, x3, x4]`` = top-left, top-right,
        bottom-left, bottom-right.

    When the number of columns is odd, the bottom half is split one column
    further to the right than the top half, so the extra column goes to the
    top-right quadrant in the upper half and to the bottom-left quadrant in
    the lower half.
    '''
    n_rows, n_cols = x.shape[0], x.shape[1]
    # shape entries are plain ints, so floor division gives the midpoints
    row_midpoint = n_rows // 2
    col_midpoint = n_cols // 2

    if n_rows == 1:
        # only one row: do only the vertical split
        return np.split(x, [col_midpoint], axis=1)

    if n_cols == 1:
        # only one column: do only the horizontal split
        return np.split(x, [row_midpoint], axis=0)

    # otherwise do both splits; shift the lower split for odd column counts
    col_parity = n_cols % 2
    top, bottom = np.split(x, [row_midpoint], axis=0)
    x1, x2 = np.split(top, [col_midpoint], axis=1)
    x3, x4 = np.split(bottom, [col_midpoint + col_parity], axis=1)

    return [x1, x2, x3, x4]
예제 #5
0
    def hilbert(N):
        """
        Produce coordinates of an NxN Hilbert curve.

        @param N:
             the length of a side, assumed to be a power of 2 ( >= 2)

        @returns:
              x and y, each an array of integers holding the coordinates of
              successive points along the Hilbert curve. Calling plot(x, y)
              will plot the curve.

        The curve of side N is assembled recursively from four transformed
        copies of the curve of side N/2 (construction from Wikipedia).
        """
        assert 2**int(math.ceil(math.log(
            N, 2))) == N, "N={0} is not a power of 2!".format(N)

        # Base case: the 2x2 curve visits (0,0), (0,1), (1,1), (1,0).
        if N == 2:
            return np.array((0, 0, 1, 1)), np.array((0, 1, 1, 0))

        half = util.old_div(N, 2)
        sub_x, sub_y = HilbertTransform.hilbert(half)

        # Four copies of the half-size curve: the first and last have their
        # axes swapped (the last is also mirrored), the middle two are
        # translated.
        xs = np.r_[sub_y, sub_x, half + sub_x, N - 1 - sub_y]
        ys = np.r_[sub_x, half + sub_y, half + sub_y, half - 1 - sub_x]
        return xs, ys
예제 #6
0
    def select(self):
        """Build a uniform-grid workload over the 2D domain.

        The grid cell width is derived from ``self.data_sum``,
        ``self.eps_par`` and ``self.c``. When ``self.ag_flag`` is set, the
        number of cells per side is divided by four and floored at 10;
        otherwise a non-zero ``self.gz`` overrides the computed width.

        Returns:
            ``workload.RangeQueries`` whose ranges are the grid cells.
        """
        rows, cols = self.domain_shape
        total = self.data_sum
        budget = self.eps_par

        if self.ag_flag:
            per_side = int(math.sqrt((total * budget) / self.c) / 4 - 1) + 1
            per_side = max(per_side, 10)
            target_cells = per_side**2

            width = int(math.sqrt(rows * cols * 1.0 / target_cells) - 1) + 1
            if width <= 0:
                width = 1
        else:
            target_cells = util.old_div((total * budget), self.c)
            if self.gz == 0:
                width = int(math.sqrt(rows * cols / target_cells) - 1) + 1
            else:
                # An explicit grid size takes precedence over the formula.
                width = int(self.gz)
            if width < 1:
                width = 1

        # Cell counts along each axis, rounded up to cover the whole domain.
        row_cells = int(util.old_div((rows - 1), width) + 1)
        col_cells = int(util.old_div((cols - 1), width) + 1)

        corners_lo, corners_hi = GenerateCells(rows, cols, row_cells,
                                               col_cells, width)
        return workload.RangeQueries((rows, cols), np.array(corners_lo),
                                     np.array(corners_hi))
예제 #7
0
파일: greedyH.py 프로젝트: zshwuhan/ektelo
    def Run(self, QtQ, x, epsilon, seed):
        """Answer the greedy hierarchical strategy with Laplace noise.

        QtQ - given the workload Q in matrix form, QtQ is the
              multiplication between the transpose of Q and Q.
        x - 1D data vector (anything numpy.array accepts).
        epsilon - the Laplace noise added to every answer has scale
                  1/epsilon.
        seed - seed for the random generator; must not be None.

        Returns the product inv . (qmat^T . noisy_answers), where inv and
        the weighted queries come from self._GreedyHierByLv.
        """
        assert seed is not None, 'seed must be set'

        prng = numpy.random.RandomState(seed)

        x = numpy.array(x)
        assert len(
            x.shape
        ) == 1, '%s is defined for 1D data only' % self.__class__.__name__

        n = len(x)
        err, inv, dist, query = self._GreedyHierByLv(QtQ, n, 0, withRoot=False)

        # Keep only the queries that received a positive weight.
        strategy_rows = []
        weighted_answers = []
        for idx, weight in enumerate(dist):
            if not weight > 0:
                continue
            lb, rb = query[idx]
            row = numpy.zeros(n)
            row[lb:rb + 1] = weight
            strategy_rows.append(row)
            weighted_answers.append(sum(x[lb:(rb + 1)]) * weight)

        qmat = numpy.array(strategy_rows)
        noise = prng.laplace(0.0, util.old_div(1.0, epsilon),
                             len(weighted_answers))
        # Elementwise sum of answers and noise (one draw per query).
        noisy_answers = numpy.asarray(weighted_answers) + noise

        return numpy.dot(inv, numpy.dot(qmat.T, noisy_answers))
예제 #8
0
 def setUp(self):
     """Create the shared fixtures: a histogram dataset and a sampled one.

     ``self.hist`` is the vector 0..1023; ``self.dist`` is a random
     exponential vector normalised to sum to one, from which
     ``self.ds`` is built with scale 1e5 and the fixed argument 1001.
     """
     size = 1024
     sample_scale = 1E5

     self.hist = numpy.array(list(range(size)))
     self.d = dataset.Dataset(self.hist, None)

     raw_weights = numpy.random.exponential(1, size)
     self.dist = util.old_div(raw_weights, float(raw_weights.sum()))
     self.ds = dataset.DatasetSampled(self.dist, sample_scale, None, 1001)
예제 #9
0
def GenerateCells(n, m, num1, num2, grid):
    """Generate all the cells of a uniform grid over an n x m domain.

    Args:
        n, m: domain size along the two axes.
        num1, num2: number of cells along the two axes; must equal
            ceil(n / grid) and ceil(m / grid) respectively.
        grid: side length of a cell.

    Returns:
        A list of ``[lb, rb]`` pairs in row-major order, where ``lb`` and
        ``rb`` are the inclusive ``[row, col]`` corners of a cell. Cells on
        the far edges are clipped to the domain.

    Raises:
        AssertionError: if num1 / num2 do not match the grid size.
    """
    assert math.ceil(n / float(grid)) == num1 and math.ceil(
        m / float(grid)
    ) == num2, "Unable to generate cells for Ugrid: check grid number and grid size"

    cells = []
    for i in range(num1):
        for j in range(num2):
            lb = [int(i * grid), int(j * grid)]
            rb = [int((i + 1) * grid - 1), int((j + 1) * grid - 1)]
            # clip the last row / column of cells to the domain boundary
            if rb[0] >= n:
                rb[0] = int(n - 1)
            if rb[1] >= m:
                rb[1] = int(m - 1)

            # append in place instead of rebuilding the list every time
            cells.append([lb, rb])

    return cells
예제 #10
0
    def _rebuild(partition, counts, n):
        """Expand per-bucket counts back to a length-n estimate.

        Each bucket ``partition[i] = (lb, rb)`` (inclusive bounds) has its
        count ``counts[i]`` spread evenly over the cells lb..rb.
        """
        estimate = numpy.zeros(n)
        for idx, bucket_count in enumerate(counts):
            lo, hi = partition[idx]
            width = float(hi - lo + 1)
            estimate[lo:(hi + 1)] = util.old_div(bucket_count, width)

        return estimate
예제 #11
0
def get_A(M, noise_scales):
    """
        Calculate matrix 'A' of measurements, scaled appropriately for inference.

        Row i of M is multiplied by 1 / noise_scales[i]; the scaling is
        applied as a product with a sparse diagonal matrix.
    """
    inverse_scales = util.old_div(1.0, np.array(noise_scales))
    count = inverse_scales.size
    row_scaler = sparse.spdiags(inverse_scales, 0, count, count)
    return row_scaler * M
예제 #12
0
def get_y(ans, noise_scales):
    """
        Calculate 'y' of answers, scaled appropriately for inference.

        Each answer is multiplied by the reciprocal of its noise scale and
        the result is returned as a column vector (shape (k, 1)).
    """
    inverse_scales = util.old_div(1.0, np.array(noise_scales))
    scaled = ans * inverse_scales  # element-wise
    return scaled[:, np.newaxis]
예제 #13
0
def L1partition_approx(x, epsilon, ratio=0.5, gethist=False,seed =None):
    """Compute the noisy L1 histogram using interval buckets of size 2^k

    Args:
        x - numpy integer array (dtype int or int32). The input data vector
        epsilon - double. Total private budget
        ratio - double in (0, 1) the use ratio*epsilon for partition computation and (1-ratio)*epsilon for querying
                the count in each partition
        gethist - boolean. If set to truth, return the partition directly (the privacy budget used is still ratio*epsilon)
        seed - seed for the numpy random generator. Despite the None default it is
               mandatory: an assertion fails when it is left unset.

    Return:
        if gethist == False, return an estimated data vector. Otherwise, return the partition
        as a list of [lb, rb] buckets with inclusive bounds, in increasing order of position.
    """
    assert seed is not None, "seed must be set"
    prng = numpy.random.RandomState(seed)

    n = len(x)
    # check that the input vector x is of appropriate type
    assert (x.dtype == numpy.dtype(int) or x.dtype == numpy.dtype("int32")), "Input vector must be int! %s given" %x.dtype
    y=x.astype('int32') # work on an int32 copy (original note: numpy type int32 is not JSON serializable)
    # the cast must not change any value
    check = (x ==y)
    assert check.sum() == len(check), "Casting error from int to int32"
    x=y

    # The partition itself is computed by the compiled helper, seeded with a draw from prng.
    # NOTE(review): hist[1:] is consumed below as bucket left boundaries, walked from the
    # right end of the domain towards 0 - confirm against the cutil implementation.
    hist = cutil.L1partition_approx(n+1, x, epsilon, ratio, prng.randint(500000))
    hatx = numpy.zeros(n)
    # rb is the exclusive right end of the bucket currently being processed
    rb = n
    if gethist:
        bucks = []
        for lb in hist[1:]:
            # prepend so that the buckets end up ordered left to right
            bucks.insert(0, [lb, rb-1])
            rb = lb
            if lb == 0:
                break
              
        return bucks
    else:
        for lb in hist[1:]:
            # noisy bucket total (Laplace scale 1/(epsilon*(1-ratio))), clamped at 0,
            # then spread uniformly over the bucket's cells
            hatx[lb:rb] = util.old_div(max(0, sum(x[lb:rb]) + prng.laplace(0, util.old_div(1.0,(epsilon*(1-ratio))), 1)), float(rb - lb))
            rb = lb
            if lb == 0:
                break

        return hatx
예제 #14
0
 def g(*idx):
     """
     Map one index tuple (as supplied by numpy.fromfunction) to a block id.

     The index is divided by grid_shape in each dimension, which identifies
     the block the index falls in; general_pairing then turns that block
     identifier into a single unique integer.
     """
     position = numpy.array(idx)
     block_size = numpy.array(grid_shape)
     # broadcasting integer division
     block_id = util.old_div(position, block_size)
     return general_pairing(block_id)
예제 #15
0
    def Run(self, W, x, eps, seed):
        """Measure every one-dimensional marginal and infer the full vector.

        The budget ``eps`` is split evenly across the dimensions of
        ``self.domain_shape``. For each dimension the data is reduced to its
        marginal; marginals shorter than 50 cells are measured with the
        identity strategy, longer ones go through a DAWA partition followed
        by a GreedyH strategy. All measurements are combined with
        least-squares inference (method 'lsmr').

        Returns:
            The estimate produced by ``inference.LeastSquares.infer``.
        """
        num_dims = len(self.domain_shape)
        eps_per_dim = util.old_div(float(eps), num_dims)

        x = x.flatten()
        prng = np.random.RandomState(seed)

        strategies = []
        answers = []
        noise_scales = []
        for axis in range(num_dims):
            # Reduce the domain to the marginal along this axis
            marginal_map = mapper.MarginalPartition(
                domain_shape=self.domain_shape, proj_dim=axis).mapping()
            x_marg = transformation.ReduceByPartition(
                marginal_map).transform(x)

            if self.domain_shape[axis] < 50:
                # identity subplan
                M_axis = selection.Identity(x_marg.shape).select()
                y_axis = measurement.Laplace(
                    M_axis, eps_per_dim).measure(x_marg, prng)
                scale = laplace_scale_factor(M_axis, eps_per_dim)
            else:
                # dawa subplan
                W = get_matrix(W)
                W_marg = W * support.expansion_matrix(marginal_map)

                dawa_map = pmapper.Dawa(
                    eps_per_dim, self.ratio, self.approx).mapping(x_marg, prng)

                x_red = transformation.ReduceByPartition(
                    dawa_map).transform(x_marg)
                W_red = W_marg * support.expansion_matrix(dawa_map)

                M_red = selection.GreedyH(x_red.shape, W_red).select()
                # budget given to the measurement step
                measure_eps = eps_per_dim * (1 - self.ratio)
                y_axis = measurement.Laplace(
                    M_red, measure_eps).measure(x_red, prng)
                scale = laplace_scale_factor(M_red, measure_eps)

                # expand the dawa reduction
                M_axis = M_red * support.reduction_matrix(dawa_map)

            # express the measurement over the full (flattened) domain
            strategies.append(M_axis * support.reduction_matrix(marginal_map))
            answers.append(y_axis)
            noise_scales.append(scale)

        return inference.LeastSquares(method='lsmr').infer(
            strategies, answers, noise_scales)
예제 #16
0
파일: data.py 프로젝트: zshwuhan/ektelo
    def statistics(self):
        """Return summary statistics of the histogram as a dict.

        Keys: 'nz_perc' (fraction of non-zero bins), 'max_bin_val'
        (largest bin value) and 'total_records' (sum of all bins).
        Requires both self.hist and self.edges to be set.
        """
        assert self.hist is not None
        assert self.edges is not None

        hist = self.hist
        return {
            'nz_perc': util.old_div(np.count_nonzero(hist), float(hist.size)),
            'max_bin_val': hist.max(),
            'total_records': hist.sum(),
        }
예제 #17
0
    def quad_split_range(cur_range_l, cur_range_u, **kwarg):
        '''
        Split a rectangular range at its midpoints.

        cur_range_l is the (row, col) of the upper-left corner and
        cur_range_u the (row, col) of the lower-right corner, both inclusive.
        Extra keyword arguments are accepted and ignored.

        Returns two parallel lists (lower corners, upper corners): two
        ranges when the rectangle has a single row or a single column, four
        quadrants otherwise. With an odd number of columns the bottom
        quadrants are split one column further right than the top ones.
        '''
        top, left = cur_range_l
        bottom, right = cur_range_u

        height = bottom - top + 1
        width = right - left + 1

        mid_col = left + util.old_div(width, 2)
        mid_row = top + util.old_div(height, 2)

        if height == 1:
            # single row: vertical split only
            row = cur_range_u[0]
            return ([cur_range_l, (row, mid_col)],
                    [(row, mid_col - 1), cur_range_u])

        if width == 1:
            # single column: horizontal split only
            col = cur_range_u[1]
            return ([cur_range_l, (mid_row, col)],
                    [(mid_row - 1, col), cur_range_u])

        # both splits; odd widths shift the split of the bottom half
        shift = 1 if width % 2 else 0
        quadrants = [
            (cur_range_l, (mid_row - 1, mid_col - 1)),
            ((top, mid_col), (mid_row - 1, right)),
            ((mid_row, left), (bottom, mid_col - 1 + shift)),
            ((mid_row, mid_col + shift), cur_range_u),
        ]

        lower_corners = [quad[0] for quad in quadrants]
        upper_corners = [quad[1] for quad in quadrants]
        return lower_corners, upper_corners
예제 #18
0
def GenerateCells(n, m, num1, num2, grid):
    '''
    Generate grid-shaped cells for UniformGrid and AdaptiveGrid.

    n, m: 2D domain shape
    num1, num2: number of cells along the two dimensions; must equal
                ceil(n / grid) and ceil(m / grid)
    grid: grid size (side length of a cell)

    Returns (lower, upper): two parallel lists holding, for every cell in
    row-major order, its inclusive [row, col] lower and upper corners.
    Cells on the far edges are clipped to the domain.
    '''
    assert math.ceil(util.old_div(n, float(grid))) == num1 and math.ceil(
        util.old_div(m, float(grid))
    ) == num2, "Unable to generate cells for Ugrid: check grid number and grid size"

    lower, upper = [], []
    for i in range(num1):
        for j in range(num2):
            lower.append([int(i * grid), int(j * grid)])

            row_end = int((i + 1) * grid - 1)
            col_end = int((j + 1) * grid - 1)
            # clip the last row / column of cells to the domain boundary
            upper.append([
                int(n - 1) if row_end >= n else row_end,
                int(m - 1) if col_end >= m else col_end,
            ])

    return lower, upper
예제 #19
0
    def Run(self, W, x, eps):
        """Measure every one-dimensional marginal and infer the full vector.

        The budget ``eps`` is split evenly across the dimensions of
        ``self.domain_shape``. For each dimension ``x`` is reduced to its
        marginal; marginals shorter than 50 cells are measured with the
        identity strategy, longer ones go through a DAWA partition followed
        by a greedyH strategy. All measurements are combined by
        ``least_squares``.

        Returns:
            The estimate returned by ``least_squares``.
        """
        num_dims = len(self.domain_shape)
        eps_per_dim = util.old_div(float(eps), num_dims)

        strategies, answers, noise_scales = [], [], []
        for axis in range(num_dims):
            # Reduce the domain to the marginal along this axis
            marginal_map = marginal_partition(self.domain_shape, axis)
            x_marg = x.reduce_by_partition(marginal_map)

            if self.domain_shape[axis] < 50:
                # identity subplan
                M_axis = identity((self.domain_shape[axis], ))
                y_axis = x_marg.laplace(M_axis, eps_per_dim)
                scale = laplace_scale_factor(M_axis, eps_per_dim)
            else:
                # dawa subplan
                W_marg = W * support.expansion_matrix(marginal_map)

                dawa_map = x_marg.dawa(self.ratio, self.approx, eps_per_dim)
                x_red = x_marg.reduce_by_partition(dawa_map)
                W_red = W_marg * support.expansion_matrix(dawa_map)

                M_red = greedyH((len(set(dawa_map)), ), W_red)
                # budget given to the measurement step
                measure_eps = eps_per_dim * (1 - self.ratio)
                y_axis = x_red.laplace(M_red, measure_eps)
                scale = laplace_scale_factor(M_red, measure_eps)

                # expand the dawa reduction
                M_axis = M_red * support.reduction_matrix(dawa_map)

            # TODO: Ideally this would be just
            # M_axis * support.reduction_matrix(marginal_map), but currently
            # that returns an int type matrix because the type of P_i is int,
            # hence the product is taken on the transposes.
            reduction = support.reduction_matrix(marginal_map)
            strategies.append((reduction.T * M_axis.T).T)
            answers.append(y_axis)
            noise_scales.append(scale)

        return least_squares(strategies, answers, noise_scales)
예제 #20
0
 def get_boarder(dim_len, branching):
     """Split the index range 0..dim_len-1 into consecutive inclusive intervals.

     Returns a list of (first, last) pairs:
       * one single-index interval per position when ``branching`` exceeds
         ``dim_len``;
       * ``branching`` equal-width intervals when ``dim_len`` is a multiple
         of ``branching``;
       * otherwise ``branching`` intervals whose boundaries are placed at
         the rounded-up multiples of ``dim_len / branching``.
     """
     if branching > dim_len:
         return [(pos, pos) for pos in range(dim_len)]

     if dim_len % branching == 0:
         width = util.old_div(dim_len, branching)
         return [(k * width, (k + 1) * width - 1) for k in range(branching)]

     # uneven split: interior boundaries at ceil of the fractional width
     ideal_width = np.divide(float(dim_len), branching)
     starts = [
         np.ceil(ideal_width * (k + 1)).astype(int)
         for k in range(branching - 1)
     ]
     ends = [start - 1 for start in starts]
     return list(zip(([0] + starts), (ends + [dim_len - 1])))
예제 #21
0
    def test_old_div(self):
        """Check util.old_div on Python scalars and on numpy operands.

        Integer operands are expected to floor-divide, float operands to
        true-divide; the array cases pin the behaviour for an integer-typed
        and a float-typed divisor array.
        """
        self.assertEqual(util.old_div(1, 2), 0)
        self.assertEqual(util.old_div(2, 2), 1)
        self.assertEqual(util.old_div(3, 2), 1)
        self.assertEqual(util.old_div(1, 2.0), 0.5)

        x = np.array((1.0,))
        y = np.array((2.0,), dtype=np.int_)
        # np.float_ was removed in NumPy 2.0; np.float64 is the same dtype
        z = np.array((2.0,), dtype=np.float64)
        w = 1.0
        zero = np.zeros((1,))
        half = 0.5 * np.ones((1,))

        self.assertEqual(util.old_div(x, y), zero)
        self.assertEqual(util.old_div(x, z), half)
        self.assertEqual(util.old_div(w, z), np.array(half))
예제 #22
0
    def Run(self, Q, x, epsilon, seed):
        """Run three engines in order with given epsilons to estimate a
        dataset x to answer query set Q

        Q - the query workload
        x - the underlying dataset
        epsilon - the total privacy budget
        seed - seed for the random generator; must not be None

        Without a partition engine the call is forwarded to
        self._DirectRun with the full budget. Otherwise the data is
        partitioned, the estimate engine is run on the reformed workload and
        dataset with budget (1 - ratio) * epsilon, and the bucket counts are
        expanded back to the original domain by self._rebuild.

        Raises ValueError when a partition engine is set and the ratio is
        outside [0, 1).
        """
        assert seed is not None, 'seed must be set'
        prng = numpy.random.RandomState(seed)

        n = len(x)

        # Both sub-seeds are always drawn, in this order, so the estimation
        # seed is the second draw whether or not partitioning is used.
        pSeed = prng.randint(500000)
        eSeed = prng.randint(500000)

        if self._partition_engine is None:
            # ignore ratio when partition_engine is omitted
            return self._DirectRun(Q, x, epsilon, eSeed)

        if self._ratio < 0 or self._ratio >= 1:
            raise ValueError('ratio must in range [0, 1)')

        partition = self.Compute_partition(x, epsilon, pSeed)
        # check that partition buckets span domain
        assert min(itertools.chain(*partition)) == 0
        assert max(itertools.chain(*partition)) == (n - 1)

        # epsilon_2 in the paper: the budget left for estimation
        eps2 = (1 - self._ratio) * epsilon

        counts = self._estimate_engine.Run(
            self._workload_reform(Q, partition, n),
            self._dataset_reform(x, partition), eps2, eSeed)
        return self._rebuild(partition, counts, n)
예제 #23
0
    def __init__(self, uniformity, dom_shape, scale, seed=None):
        '''
        Generate synthetic data of varying uniformity.

        uniformity: parameter in [0,1] where 1 produces perfectly uniform
                    data and 0 is maximally non-uniform.
        dom_shape:  shape of the generated histogram.
        scale:      total that is divided among the non-zero cells.
        seed:       seed of the generator used for shuffling.

        A fraction of the cells equal to 'uniformity' (at least one cell) is
        set to the same value and all other cells are zero; the cells are
        then shuffled randomly.
        '''
        # Must run first: it captures exactly the constructor arguments.
        self.init_params = util.init_params_from_locals(locals())
        self.u = uniformity
        assert 0 <= uniformity <= 1

        domain_size = numpy.prod(dom_shape)  # total number of cells
        filled = max(1, int(uniformity * domain_size))

        cells = numpy.zeros(domain_size)
        cells[0:filled] = util.old_div(scale, filled)

        numpy.random.RandomState(seed).shuffle(cells)

        super(DatasetUniformityVaryingSynthetic, self).__init__(
            cells.reshape(dom_shape),
            reduce_to_domain_shape=None,
            dist=None)
예제 #24
0
    def infer(self, Ms, ys, scale_factors=None):
        ''' Estimate x by least squares from noisy measurements.

            Either:
            1) Ms is a single M and ys is a single y
               (scale_factors ignored) or
            2) Ms and ys are lists of M matrices and y vectors
               and scale_factors is a list of the same length.

            The solver is chosen by self.method: 'standard' (dense
            scipy lstsq, no l2 regularisation), 'lsmr' or 'lsqr' (iterative,
            damped by self.l2_reg). When self.known_total is set the problem
            is first rewritten by the known-total helper and the remaining
            entry is appended afterwards. When self.stein is set and the
            estimate has at least 3 entries it is shrunk by the James-Stein
            factor.

            Raises ValueError if self.method is not one of the three
            supported solvers.
        '''
        A, y = self._apply_scales(Ms, ys, scale_factors)

        if self.known_total is not None:
            A, y = self.__known_total_problem(A, y)

        if self.method == 'standard':
            assert self.l2_reg == 0, 'l2 reg not supported with method=standard'
            assert isinstance(
                A,
                np.ndarray), "method 'standard' only works with dense matrices"
            x_est = linalg.lstsq(A, y, lapack_driver='gelsy')[0]
        elif self.method == 'lsmr':
            x_est = lsmr(A, y, atol=0, btol=0, damp=self.l2_reg)[0]
        elif self.method == 'lsqr':
            x_est = lsqr(A, y, atol=0, btol=0, damp=self.l2_reg)[0]
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # x_est further down.
            raise ValueError(
                "unsupported inference method: %r" % (self.method, ))

        if self.known_total is not None:
            x_est = np.append(x_est, self.known_total - x_est.sum())

        x_est = x_est.reshape(A.shape[1])  # reshape to match shape of x

        # James-Stein estimation
        if self.stein and x_est.size >= 3:
            adjustment = 1.0 - util.old_div((x_est.size - 2), (x_est**2).sum())
            x_est *= adjustment

        return x_est
예제 #25
0
def variance(N, b):
    '''Computes variance given domain of size N
    and branching factor b.  Equation 3 from paper.

    The tree height h is the smallest integer with b**h >= N, and the
    result is (b - 1) * h**3 - 2 * (b + 1) * h**2 / 3.
    '''
    h = math.ceil(math.log(N, b))
    # math.log(N, b) is a quotient of two floating-point logarithms and can
    # land just above or below an exact integer (math.log(125, 5) is
    # 3.0000000000000004), so correct the rounded height exactly.
    if h > 0 and b**(h - 1) >= N:
        h -= 1
    elif b**h < N:
        h += 1

    # True division: the second term of the formula is real-valued and must
    # not be floored.
    return (b - 1) * h**3 - (2 * (b + 1) * h**2) / 3
예제 #26
0
def cantor_pairing(a, b):
    """
    Cantor pairing function: returns a unique integer for every pair (a, b)
    of non-negative integers, computed as (a + b)(a + b + 1) / 2 + b.
    """
    diagonal = a + b
    return util.old_div(diagonal * (diagonal + 1), 2) + b
예제 #27
0
 def fractionZeros(self):
     """Return the fraction of entries of self.payload that equal zero."""
     data = self.payload
     num_zero = (data == 0).sum()
     return util.old_div(float(num_zero), data.size)
예제 #28
0
        It's behavior depends on grid_shape: take (i,j) and divide by grid_shape (in each dimension)
        That becomes an identifier of the block; then assign a unique integer to it using pairing.
        """
        x = numpy.array(idx)
        y = numpy.array(grid_shape)
        return general_pairing(util.old_div(
            x, y))  # broadcasting integer division

    h = numpy.vectorize(g)

    # numpy.fromfunction builds an array of domain_shape by calling a function with each index tuple (e.g. (i,j))
    partition_array = numpy.fromfunction(h, domain_shape, dtype=int)
    # transform to canonical order
    partition_array = canonicalTransform(partition_array)
    return partition_array


if __name__ == '__main__':
    # Demo: for uniformity 0.0, 0.1, ..., 1.0 build a 10-cell synthetic
    # dataset and print its L1 distance from the perfectly uniform vector.
    scale = 10000
    for tenth in range(0, 11):
        u = util.old_div(tenth, 10.0)
        print(u)
        d = DatasetUniformityVaryingSynthetic(uniformity=u,
                                              dom_shape=(10, ),
                                              scale=scale,
                                              seed=999)
        size = d.payload.size
        # reference vector: the same total spread evenly over all cells
        unif = numpy.empty_like(d.payload)
        unif.fill(util.old_div(scale, float(size)))
        print(sum(abs(unif - d.payload)))
예제 #29
0
파일: greedyH.py 프로젝트: zshwuhan/ektelo
    def _GreedyHierByLv(self, fullQtQ, n, offset, depth=0, withRoot=False):
        """Compute the weight distribution of one node of the tree by minimizing
        error locally.
        
        fullQtQ - the same matrix as QtQ in the Run method
        n - the size of the submatrix that is corresponding
            to current node
        offset - the location of the submatrix in fullQtQ that
                 is corresponding to current node
        depth - the depth of current node in the tree
        withRoot - whether the accurate root count is given
        
        Returns: error, inv, weights, queries
        error - the variance of query on current node with epsilon=1
        inv - for the query strategy (the actual weighted queries to be asked)
              matrix A, inv is the inverse matrix of A^TA
        weights - the weights of queries to be asked
        queries - the list of queries to be asked (all with weight 1),
                  each an inclusive [lb, rb] index pair; the current node's
                  own range comes first, followed by its descendants'
        """
        # Leaf: a single cell is measured directly with weight 1.
        if n == 1:
            return numpy.linalg.norm(fullQtQ[:, offset], 2)**2, \
                   numpy.array([[1.0]]), \
                   numpy.array([1.0]), [[offset, offset]]

        # Columns of the workload Gram matrix that belong to this node.
        QtQ = fullQtQ[:, offset:offset + n]
        # If every row is constant across these columns, the node is not
        # subdivided: a single query over the whole range is returned.
        if (numpy.min(QtQ, axis=1) == numpy.max(QtQ, axis=1)).all():
            mat = numpy.zeros([n, n])
            mat.fill(util.old_div(1.0, n**2))
            return numpy.linalg.norm(QtQ[:,0], 2)**2, \
                   mat, numpy.array([1.0]), [[offset, offset+n-1]]

        # Child ranges as half-open (start, end) pairs relative to offset.
        if n <= self._branch:
            # one child per cell
            bound = list(zip(list(range(n)), list(range(1, n + 1))))
        else:
            # self._branch children: the first (branch - rem) have width
            # `step`, the remaining `rem` have width `step + 1`.
            # (n - rem) is a multiple of the branching factor.
            rem = n % self._branch
            step = util.old_div((n - rem), self._branch)
            swi = (self._branch - rem) * step
            sep = list(range(0, swi, step)) + list(range(swi, n,
                                                         step + 1)) + [n]
            bound = list(zip(sep[:-1], sep[1:]))

        # Recurse into every child (withRoot is left at its default there)
        # and regroup the per-child 4-tuples into four parallel sequences.
        serr, sinv, sdist, sq = list(
            zip(*[
                self._GreedyHierByLv(
                    fullQtQ, c[1] - c[0], offset + c[0], depth=depth + 1)
                for c in bound
            ]))
        # Column sums of each child's inverse, concatenated over children;
        # k is their grand total.
        invAuList = [c.sum(axis=0) for c in sinv]
        invAu = numpy.hstack(invAuList)
        k = invAu.sum()
        # m1: sum over children of the squared norm of the child's block of
        # QtQ applied to that child's column sums; m: the same quantity
        # computed over the whole node at once.
        m1 = sum(
            map(
                lambda rng, v: numpy.linalg.norm(
                    numpy.dot(QtQ[:, rng[0]:rng[1]], v), 2)**2, bound,
                invAuList))
        m = numpy.linalg.norm(numpy.dot(QtQ, invAu), 2)**2
        sumerr = sum(serr)

        # With the root count given, the node's own query gets weight 0 and
        # the children's results are combined unchanged.
        if withRoot:
            return sumerr, block_diag(*sinv), \
                   numpy.hstack([[0], numpy.hstack(sdist)]), \
                   [[offset, offset+n-1]] + list(itertools.chain(*sq))

        # Evaluate self._granu candidate splits of weight between this
        # node's query and its children and keep the one with least error.
        decay = util.old_div(1.0, (self._branch**(util.old_div(depth, 2.0))))
        err1 = numpy.array(list(range(self._granu, 0, -1)))**2
        err2 = numpy.array(list(range(self._granu)))**2 * decay
        toterr = 1.0 / err1 * (sumerr - ((m - m1) * decay + m1) * err2 /
                               (err1 + err2 * k))

        err = toterr.min() * self._granu**2
        # perc: share of weight passed down to the children; the node's own
        # query receives 1 - perc.
        perc = 1 - util.old_div(numpy.argmin(toterr), float(self._granu))
        inv = (util.old_div(1.0, perc))**2 * (
            block_diag(*sinv) - (1 - perc)**2 / (perc**2 + k * (1 - perc)**2) *
            numpy.dot(invAu.reshape([n, 1]), invAu.reshape([1, n])))
        dist = numpy.hstack([[1 - perc], perc * numpy.hstack(sdist)])
        return err, inv, dist, \
               [[offset, offset+n-1]] + list(itertools.chain(*sq))
예제 #30
0
 def testDatasetReduce(self):
     """A Dataset reduced by a factor of 4 reports the reduced domain shape."""
     factor = 4
     target_shape = (util.old_div(self.hist.shape[0], factor), )
     reduced = dataset.Dataset(hist=self.hist,
                               reduce_to_domain_shape=target_shape)
     self.assertEqual(reduced.domain_shape, target_shape)