示例#1
0
    def calc(self, data):
        """Calculate averages. Keys (tuples) are converted
        into linear indices based on their dimensions

        Parameters
        ----------
        data : RDD of (tuple, array) pairs, each array of shape (ncols,)
            Data to compute averages from

        Returns
        -------
        ts : array, shape (n, ncols)
            Row i is the sum of the values selected for ``self.inds[i]``
            divided by ``len(self.inds[i])``. Rows whose index list is
            empty are left as zeros.
        """

        dims = getdims(data)
        data = subtoind(data, dims.max)

        # one output row per index list, one column per value element
        ts = zeros((self.n, len(data.first()[1])))
        for i in range(self.n):
            count = len(self.inds[i])
            if count > 0:
                # Index into the (key, value) pair rather than unpacking it
                # in the lambda signature: tuple parameters are a syntax
                # error on Python 3, while indexing works on 2 and 3 alike.
                selected = self.select(data, i).map(lambda kv: kv[1])
                ts[i, :] = selected.sum() / count

        return ts
示例#2
0
    def calc(self, data):
        """Calculate averages. Keys (tuples) are converted
        into linear indices based on their dimensions

        For every non-empty index list in ``self.inds`` this computes the
        average value and the average key (subscript). The results are
        stored on the instance as ``self.values`` and ``self.keys``; rows
        belonging to empty index lists are left as zeros.

        Parameters
        ----------
        data : RDD of (tuple, array) pairs, each array of shape (ncols,)
            Data to compute averages from

        Returns
        -------
        self : returns an instance of self.
        """

        dims = getdims(data)
        data = subtoind(data, dims.max)

        # loop over indices, computing average keys and average values
        keys = zeros((self.n, len(dims.count())))
        values = zeros((self.n, len(data.first()[1])))
        for idx, indlist in enumerate(self.inds):
            count = len(indlist)
            if count > 0:
                # Index into the (key, value) pair instead of unpacking it in
                # the lambda signature (tuple parameters are invalid on
                # Python 3).
                selected = self.select(data, idx).map(lambda kv: kv[1])
                values[idx, :] = selected.sum() / count

                # indtosub is fed (index, 0) pairs and only the first element
                # of each returned pair is kept. Lists are built explicitly
                # so mean() never receives a lazy map object.
                pairs = [(ind, 0) for ind in indlist]
                subs = [pair[0] for pair in indtosub(pairs, dims.max)]
                keys[idx, :] = mean(subs, axis=0)

        self.keys = keys
        self.values = values

        return self
示例#3
0
    def calc(self, data):
        """Calculate averages. Keys (tuples) are converted
        into linear indices based on their dimensions

        Populates ``self.keys`` (average subscript per index list) and
        ``self.values`` (average value per index list). Entries for empty
        index lists remain zero.

        Parameters
        ----------
        data : RDD of (tuple, array) pairs, each array of shape (ncols,)
            Data to compute averages from

        Returns
        -------
        self : returns an instance of self.
        """

        dims = getdims(data)
        data = subtoind(data, dims.max)

        # loop over indices, computing average keys and average values
        keys = zeros((self.n, len(dims.count())))
        values = zeros((self.n, len(data.first()[1])))
        for idx, indlist in enumerate(self.inds):
            if len(indlist) > 0:
                # kv[1] picks the value of each (key, value) record; tuple
                # unpacking in the lambda signature is not valid on Python 3
                values[idx, :] = self.select(data, idx).map(
                    lambda kv: kv[1]).sum() / len(indlist)

                # Convert the linear indices back through indtosub (called
                # with (index, 0) pairs) and average the first element of
                # each resulting pair. List comprehensions keep this eager
                # on both Python 2 and Python 3.
                converted = indtosub([(ind, 0) for ind in indlist], dims.max)
                keys[idx, :] = mean([sub for sub, _ in converted], axis=0)

        self.keys = keys
        self.values = values

        return self
示例#4
0
 def test_get_dims_array(self):
     """getdims on a local (non-RDD) collection of (key, value) pairs
     reports the per-dimension max, count and min of the keys."""
     # 2 x 3 x 2 grid of 1-based subscripts, first dimension varying fastest
     subs = [(x, y, z)
             for z in (1, 2)
             for y in (1, 2, 3)
             for x in (1, 2)]
     data_local = map(lambda sub: (sub, array([1.0])), subs)
     dims = getdims(data_local)
     expected_max = (2, 3, 2)
     expected_count = (2, 3, 2)
     expected_min = (1, 1, 1)
     assert allclose(dims.max, expected_max)
     assert allclose(dims.count(), expected_count)
     assert allclose(dims.min, expected_min)
示例#5
0
 def test_get_dims_array(self):
     """Check max, count and min returned by getdims for keys covering
     a 2 x 3 x 2 grid, using a local collection rather than an RDD."""
     # keys listed plane by plane along the third dimension
     plane_one = [(1, 1, 1), (2, 1, 1), (1, 2, 1),
                  (2, 2, 1), (1, 3, 1), (2, 3, 1)]
     plane_two = [(1, 1, 2), (2, 1, 2), (1, 2, 2),
                  (2, 2, 2), (1, 3, 2), (2, 3, 2)]
     subs = plane_one + plane_two
     # every key carries the same single-element value
     data_local = map(lambda key: (key, array([1.0])), subs)
     dims = getdims(data_local)
     assert allclose(dims.max, (2, 3, 2))
     assert allclose(dims.count(), (2, 3, 2))
     assert allclose(dims.min, (1, 1, 1))
示例#6
0
    def calc(self, data):
        """Compute correlation between every data point
        and the average of a local neighborhood,
        by correlating each data point with the average of a
        local neighborhood in x and y (typically time series data)

        The neighborhood spans ``self.neighborhood`` positions on each side
        in the first two key dimensions; positions falling outside the
        observed key range are clipped to the boundary. Keys are indexed at
        positions 0, 1 and 2, so they need at least three elements.

        Parameters
        ----------
        data : RDD of (tuple, array) pairs
            The data to compute correlations on

        Returns
        -------
        corr : RDD of (tuple, float) pairs
            The local correlation for each record, sorted by keys
        """

        def clip(val, mn, mx):
            """Clip a value below by mn and above by mx"""
            if val < mn:
                return mn
            if val > mx:
                return mx
            return val

        def maptoneighborhood(ind, ts, sz, mn, mx):
            """Create a list of key value pairs with multiple shifted copies
            of the time series ts over a region specified by sz
            """
            offsets = range(-sz, sz + 1)
            out = []
            for x in offsets:
                new_x = clip(ind[0] + x, mn[0], mx[0])
                for y in offsets:
                    new_y = clip(ind[1] + y, mn[1], mx[1])
                    # third key element is carried through unchanged
                    out.append(((new_x, new_y, ind[2]), ts))
            return out

        # get boundaries using dimension keys
        dims = getdims(data)

        # Bind everything the closures need to plain locals so the functions
        # handed to Spark do not reference self.
        sz = self.neighborhood
        lower = dims.min[0:2]
        upper = dims.max[0:2]
        nneighbors = (2 * sz + 1) ** 2

        # flat map to key value pairs where the key is neighborhood
        # identifier and value is time series; records are indexed rather
        # than unpacked in the lambda signature, which Python 3 rejects
        neighbors = data.flatMap(
            lambda kv: maptoneighborhood(kv[0], kv[1], sz, lower, upper))

        # reduce by key to get the average time series for each neighborhood
        sums = neighbors.reduceByKey(lambda x, y: x + y)
        means = sums.mapValues(lambda x: x / nneighbors)

        # join with the original time series data to compute correlations
        result = data.join(means)

        # get correlations: off-diagonal entry of the 2 x 2 matrix
        corr = result.mapValues(
            lambda x: corrcoef(x[0], x[1])[0, 1]).sortByKey()

        return corr
示例#7
0
 def test_get_dims_rdd(self):
     """getdims on a parallelized collection of (key, value) pairs reports
     the per-dimension max, count and min of the keys."""
     # 2 x 3 x 2 grid of 1-based subscripts, first dimension varying fastest
     subs = [(x, y, z)
             for z in (1, 2)
             for y in (1, 2, 3)
             for x in (1, 2)]
     data_local = map(lambda sub: (sub, array([1.0])), subs)
     dims = getdims(self.sc.parallelize(data_local))
     upper, counts, lower = (2, 3, 2), (2, 3, 2), (1, 1, 1)
     assert allclose(dims.max, upper)
     assert allclose(dims.count(), counts)
     assert allclose(dims.min, lower)
示例#8
0
    def calc(self, data):
        """Calculate averages. Keys (tuples) are converted
        into linear indices based on their dimensions

        Parameters
        ----------
        data : RDD of (tuple, array) pairs, each array of shape (ncols,)
            Data to compute averages from

        Returns
        -------
        ts : array, shape (n, ncols)
            Row i is the sum of the values selected for ``self.inds[i]``
            divided by ``len(self.inds[i])``. Rows whose index list is
            empty are left as zeros.
        """

        dims = getdims(data)
        data = subtoind(data, dims.max)

        # loop over indices, averaging time series
        ts = zeros((self.n, len(data.first()[1])))
        for i in range(self.n):
            count = len(self.inds[i])
            # An empty index list has nothing to average; skipping it avoids
            # dividing by zero and leaves the row at its zero initial value.
            if count == 0:
                continue
            # Index into the (key, value) pair rather than unpacking it in
            # the lambda signature, which is a syntax error on Python 3.
            selected = self.select(data, i).map(lambda kv: kv[1])
            ts[i, :] = selected.sum() / count

        return ts