예제 #1
0
    def test_ind_to_sub_rdd(self):
        dataLocal = map(lambda x: (x, array([1.0])), range(1, 13))

        data = Series(self.sc.parallelize(dataLocal))
        subs = data.indToSub(dims=[2, 3, 2]).keys().collect()
        assert(allclose(subs, array([(1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
                                     (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2)])))
예제 #2
0
    def test_sub_to_ind_rdd(self):
        subs = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
                (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2)]
        dataLocal = map(lambda x: (x, array([1.0])), subs)

        data = Series(self.sc.parallelize(dataLocal))
        inds = array(data.subToInd().keys().collect())
        assert(allclose(inds, array(range(1, 13))))
예제 #3
0
 def test_standardization_axis1(self):
     rdd = self.sc.parallelize([(0, array([1, 2], dtype='float16')), (0, array([3, 4], dtype='float16'))])
     data = Series(rdd, dtype='float16')
     centered = data.center(1)
     standardized = data.standardize(1)
     zscored = data.zscore(1)
     assert(allclose(centered.first()[1], array([-1, -1]), atol=1e-3))
     assert(allclose(standardized.first()[1], array([1, 2]), atol=1e-3))
     assert(allclose(zscored.first()[1], array([-1, -1]), atol=1e-3))
예제 #4
0
 def test_select(self):
     rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
     data = Series(rdd, index=['label1', 'label2', 'label3', 'label4'])
     selection1 = data.select(['label1'])
     assert(allclose(selection1.first()[1], 4))
     selection1 = data.select('label1')
     assert(allclose(selection1.first()[1], 4))
     selection2 = data.select(['label1', 'label2'])
     assert(allclose(selection2.first()[1], array([4, 5])))
예제 #5
0
 def test_squelch(self):
     rdd = self.sc.parallelize([(0, array([1, 2])), (0, array([3, 4]))])
     data = Series(rdd)
     squelched = data.squelch(5)
     assert(allclose(squelched.collectValuesAsArray(), [[0, 0], [0, 0]]))
     squelched = data.squelch(3)
     assert(allclose(squelched.collectValuesAsArray(), [[0, 0], [3, 4]]))
     squelched = data.squelch(1)
     assert(allclose(squelched.collectValuesAsArray(), [[1, 2], [3, 4]]))
예제 #6
0
    def test_round_trip_rdd(self):
        subs = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
                (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2)]
        dataLocal = map(lambda x: (x, array([1.0])), subs)

        data = Series(self.sc.parallelize(dataLocal))
        start = data.keys().collect()
        stop = data.subToInd().indToSub().keys().collect()
        assert(allclose(array(start), array(stop)))
예제 #7
0
 def test_standardization_axis0(self):
     rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5], dtype='float16'))])
     data = Series(rdd, dtype='float16')
     centered = data.center(0)
     standardized = data.standardize(0)
     zscored = data.zscore(0)
     assert(allclose(centered.first()[1], array([-2, -1, 0, 1, 2]), atol=1e-3))
     assert(allclose(standardized.first()[1], array([0.70710,  1.41421,  2.12132,  2.82842,  3.53553]), atol=1e-3))
     assert(allclose(zscored.first()[1], array([-1.41421, -0.70710,  0,  0.70710,  1.41421]), atol=1e-3))
예제 #8
0
 def test_subset(self):
     rdd = self.sc.parallelize([(0, array([1, 5], dtype='float16')),
                                (0, array([1, 10], dtype='float16')),
                                (0, array([1, 15], dtype='float16'))])
     data = Series(rdd)
     assert_equal(len(data.subset(3, stat='min', thresh=0)), 3)
     assert_array_equal(data.subset(1, stat='max', thresh=10), [[1, 15]])
     assert_array_equal(data.subset(1, stat='mean', thresh=6), [[1, 15]])
     assert_array_equal(data.subset(1, stat='std', thresh=6), [[1, 15]])
     assert_array_equal(data.subset(1, thresh=6), [[1, 15]])
예제 #9
0
    def test_toImages(self):
        from lambdaimage.rdds.images import Images
        rdd = self.sc.parallelize([((0, 0), array([1])), ((0, 1), array([2])),
                                   ((1, 0), array([3])), ((1, 1), array([4]))])
        data = Series(rdd)
        imgs = data.toImages()
        assert(isinstance(imgs, Images))

        im = imgs.values().first()
        assert(allclose(im, [[1, 2], [3, 4]]))
예제 #10
0
 def test_correlate(self):
     rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5], dtype='float16'))])
     data = Series(rdd, dtype='float16')
     sig1 = [4, 5, 6, 7, 8]
     corrData = data.correlate(sig1)
     assert_equals('float64', corrData._dtype)
     corr = corrData.values().collect()
     assert(allclose(corr[0], 1))
     sig12 = [[4, 5, 6, 7, 8], [8, 7, 6, 5, 4]]
     corrs = data.correlate(sig12).values().collect()
     assert(allclose(corrs[0], [1, -1]))
예제 #11
0
 def test_seriesStats(self):
     rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5]))])
     data = Series(rdd)
     assert(allclose(data.seriesMean().first()[1], 3.0))
     assert(allclose(data.seriesSum().first()[1], 15.0))
     assert(allclose(data.seriesMedian().first()[1], 3.0))
     assert(allclose(data.seriesStdev().first()[1], 1.4142135))
     assert(allclose(data.seriesStat('mean').first()[1], 3.0))
     assert(allclose(data.seriesStats().select('mean').first()[1], 3.0))
     assert(allclose(data.seriesStats().select('count').first()[1], 5))
     assert(allclose(data.seriesPercentile(25).first()[1], 2.0))
     assert(allclose(data.seriesPercentile((25, 75)).first()[1], array([2.0, 4.0])))
예제 #12
0
    def test_meanByFixedLength(self):
        rdd = self.sc.parallelize([((0,), array([0, 1, 2, 3, 4, 5, 6, 7], dtype='float16'))])
        data = Series(rdd)

        test1 = data.meanByFixedLength(4)
        assert(test1.keys().collect() == [(0,)])
        assert(allclose(test1.index, array([0, 1, 2, 3])))
        assert(allclose(test1.values().collect(), [[2, 3, 4, 5]]))

        test2 = data.meanByFixedLength(2)
        assert(test2.keys().collect() == [(0,)])
        assert(allclose(test2.index, array([0, 1])))
        assert(allclose(test2.values().collect(), [[3, 4]]))
예제 #13
0
    def test_query_linear_singleton(self):
        dataLocal = [
            ((1,), array([1.0, 2.0, 3.0])),
            ((2,), array([2.0, 2.0, 4.0])),
            ((3,), array([4.0, 2.0, 1.0]))
        ]

        data = Series(self.sc.parallelize(dataLocal))

        inds = array([array([1, 2])])
        keys, values = data.query(inds)
        assert(allclose(values[0, :], array([1.5, 2., 3.5])))
        assert_equals(data.dtype, values[0, :].dtype)
예제 #14
0
    def test_query_subscripts(self):
        dataLocal = [
            ((1, 1), array([1.0, 2.0, 3.0])),
            ((2, 1), array([2.0, 2.0, 4.0])),
            ((1, 2), array([4.0, 2.0, 1.0]))
        ]

        data = Series(self.sc.parallelize(dataLocal))

        inds = array([array([1, 2]), array([3])])
        keys, values = data.query(inds)
        assert(allclose(values[0, :], array([1.5, 2., 3.5])))
        assert(allclose(values[1, :], array([4.0, 2.0, 1.0])))
예제 #15
0
    def test_toRowMatrix(self):
        from lambdaimage.rdds.matrices import RowMatrix
        rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
        data = Series(rdd)
        mat = data.toRowMatrix()
        assert(isinstance(mat, RowMatrix))
        assert(mat.nrows == 2)
        assert(mat.ncols == 4)

        # check a basic operation from superclass
        newmat = mat.applyValues(lambda x: x + 1)
        out = newmat.collectValuesAsArray()
        assert(array_equal(out, array([[5, 6, 7, 8], [9, 10, 11, 12]])))
예제 #16
0
    def test_index_setter_getter(self):
        dataLocal = [
            ((1,), array([1.0, 2.0, 3.0])),
            ((2,), array([2.0, 2.0, 4.0])),
            ((3,), array([4.0, 2.0, 1.0]))
        ]
        data = Series(self.sc.parallelize(dataLocal))

        assert_true(array_equal(data.index, array([0, 1, 2])))
        data.index = [3, 2, 1]
        assert_true(data.index == [3, 2, 1])

        def setIndex(data, idx):
            data.index = idx

        assert_raises(ValueError, setIndex, data, 5)
        assert_raises(ValueError, setIndex, data, [1, 2])
예제 #17
0
 def setUp(self):
     super(TestSeriesRegionMeanMethods, self).setUp()
     self.dataLocal = [
         ((0, 0), array([1.0, 2.0, 3.0])),
         ((0, 1), array([2.0, 2.0, 4.0])),
         ((1, 0), array([4.0, 2.0, 1.0])),
         ((1, 1), array([3.0, 1.0, 1.0]))
     ]
     self.series = Series(self.sc.parallelize(self.dataLocal),
                          dtype=self.dataLocal[0][1].dtype,
                          index=arange(3))
예제 #18
0
    def test_seriesAggregateByIndex(self):
        dataLocal = [((1,), arange(12))]
        index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        data = Series(self.sc.parallelize(dataLocal), index=index)

        result = data.seriesAggregateByIndex(sum)
        print result.values().first()
        assert_true(array_equal(result.values().first(), array([6, 22, 38])))
        assert_true(array_equal(result.index, array([0, 1, 2])))

        index = [
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
            [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
            [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]
        ]
        data.index = array(index).T
        
        result = data.seriesAggregateByIndex(sum, level=[0, 1])
        assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
        assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))
예제 #19
0
    def test_selectByIndex(self):
        dataLocal = [((1,), arange(12))]
        index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        data = Series(self.sc.parallelize(dataLocal), index=index)

        result = data.selectByIndex(1)
        assert_true(array_equal(result.values().first(), array([4, 5, 6, 7])))
        assert_true(array_equal(result.index, array([1, 1, 1, 1])))

        result = data.selectByIndex(1, squeeze=True)
        assert_true(array_equal(result.index, array([0, 1, 2, 3])))

        index = [
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
            [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
            [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]
        ]
        data.index = array(index).T

        result, mask = data.selectByIndex(0, level=2, returnMask=True)
        assert_true(array_equal(result.values().first(), array([0, 2, 6, 8])))
        assert_true(array_equal(result.index, array([[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]])))
        assert_true(array_equal(mask, array([1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0])))

        result = data.selectByIndex(0, level=2, squeeze=True)
        assert_true(array_equal(result.values().first(), array([0, 2, 6, 8])))
        assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

        result = data.selectByIndex([1, 0], level=[0, 1])
        assert_true(array_equal(result.values().first(), array([6, 7])))
        assert_true(array_equal(result.index, array([[1, 0, 0], [1, 0, 1]])))

        result = data.selectByIndex(val=[0, [2,3]], level=[0, 2])
        assert_true(array_equal(result.values().first(), array([4, 5])))
        assert_true(array_equal(result.index, array([[0, 1, 2], [0, 1, 3]])))

        result = data.selectByIndex(1, level=1, filter=True)
        assert_true(array_equal(result.values().first(), array([0, 1, 6, 7])))
        assert_true(array_equal(result.index, array([[0, 0, 0], [0, 0, 1], [1, 0, 0], [1, 0, 1]])))
예제 #20
0
class TestSeriesRegionMeanMethods(PySparkTestCase):
    def setUp(self):
        super(TestSeriesRegionMeanMethods, self).setUp()
        self.dataLocal = [
            ((0, 0), array([1.0, 2.0, 3.0])),
            ((0, 1), array([2.0, 2.0, 4.0])),
            ((1, 0), array([4.0, 2.0, 1.0])),
            ((1, 1), array([3.0, 1.0, 1.0]))
        ]
        self.series = Series(self.sc.parallelize(self.dataLocal),
                             dtype=self.dataLocal[0][1].dtype,
                             index=arange(3))

    def __setup_meanByRegion(self, useMask=False):
        itemIdxs = [1, 2]  # data keys for items 1 and 2 (0-based)
        keys = [self.dataLocal[idx][0] for idx in itemIdxs]

        expectedKeys = tuple(vstack(keys).mean(axis=0).astype('int16'))
        expected = vstack([self.dataLocal[idx][1] for idx in itemIdxs]).mean(axis=0)
        if useMask:
            keys = array([[0, 1], [1, 0]], dtype='uint8')
        return keys, expectedKeys, expected

    @staticmethod
    def __checkAsserts(expectedLen, expectedKeys, expected, actual):
        assert_equals(expectedLen, len(actual))
        assert_equals(expectedKeys, actual[0])
        assert_true(array_equal(expected, actual[1]))

    @staticmethod
    def __checkNestedAsserts(expectedLen, expectedKeys, expected, actual):
        assert_equals(expectedLen, len(actual))
        for i in xrange(expectedLen):
            assert_equals(expectedKeys[i], actual[i][0])
            assert_true(array_equal(expected[i], actual[i][1]))

    def __checkReturnedSeriesAttributes(self, newSeries):
        assert_true(newSeries._dims is None)  # check that new _dims is unset
        assert_equals(self.series.dtype, newSeries._dtype)  # check that new dtype is set
        assert_true(array_equal(self.series.index, newSeries._index))  # check that new index is set
        assert_is_not_none(newSeries.dims)  # check that new dims is at least calculable (expected to be meaningless)

    def __run_tst_meanOfRegion(self, useMask):
        keys, expectedKeys, expected = self.__setup_meanByRegion(useMask)
        actual = self.series.meanOfRegion(keys)
        TestSeriesRegionMeanMethods.__checkAsserts(2, expectedKeys, expected, actual)

    def test_meanOfRegion(self):
        self.__run_tst_meanOfRegion(False)

    def test_meanOfRegionWithMask(self):
        self.__run_tst_meanOfRegion(True)

    def test_meanOfRegionErrorsOnMissing(self):
        _, expectedKeys, expected = self.__setup_meanByRegion(False)
        keys = [(17, 24), (17, 25)]
        # if no records match, return None, None
        actualKey, actualVal = self.series.meanOfRegion(keys)
        assert_is_none(actualKey)
        assert_is_none(actualVal)
        # if we have only a partial match but haven't turned on validation, return a sensible value
        keys = [(0, 1), (17, 25)]
        actualKey, actualVal = self.series.meanOfRegion(keys)
        assert_equals((0, 1), actualKey)
        assert_true(array_equal(self.dataLocal[1][1], actualVal))
        # throw an error on a partial match when validation turned on
        assert_raises(ValueError, self.series.meanOfRegion, keys, validate=True)

    def test_meanByRegions_singleRegion(self):
        keys, expectedKeys, expected = self.__setup_meanByRegion()

        actualSeries = self.series.meanByRegions([keys])
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

    def test_meanByRegionsErrorsOnMissing(self):
        keys, expectedKeys, expected = self.__setup_meanByRegion()
        keys += [(17, 25)]

        # check that we get a sensible value with validation turned off:
        actualSeries = self.series.meanByRegions([keys])
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

        # throw an error on a partial match when validation turned on
        # this error will be on the workers, which propagates back to the driver
        # as something other than the ValueError that it started out life as
        assert_raises(Exception, self.series.meanByRegions([keys], validate=True).count)

    def test_meanByRegions_singleRegionWithMask(self):
        mask, expectedKeys, expected = self.__setup_meanByRegion(True)

        actualSeries = self.series.meanByRegions(mask)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

    def test_meanByRegions_twoRegions(self):
        nestedKeys, expectedKeys, expected = [], [], []
        for itemIdxs in [(0, 1), (1, 2)]:
            keys = [self.dataLocal[idx][0] for idx in itemIdxs]
            nestedKeys.append(keys)
            avgKeys = tuple(vstack(keys).mean(axis=0).astype('int16'))
            expectedKeys.append(avgKeys)
            avgVals = vstack([self.dataLocal[idx][1] for idx in itemIdxs]).mean(axis=0)
            expected.append(avgVals)

        actualSeries = self.series.meanByRegions(nestedKeys)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(2, expectedKeys, expected, actual)

    def test_meanByRegions_twoRegionsWithMask(self):
        expectedKeys, expected = [], []
        mask = array([[1, 1], [2, 0]], dtype='uint8')
        for itemIdxs in [(0, 1), (2, )]:
            keys = [self.dataLocal[idx][0] for idx in itemIdxs]
            avgKeys = tuple(vstack(keys).mean(axis=0).astype('int16'))
            expectedKeys.append(avgKeys)
            avgVals = vstack([self.dataLocal[idx][1] for idx in itemIdxs]).mean(axis=0)
            expected.append(avgVals)

        actualSeries = self.series.meanByRegions(mask)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(2, expectedKeys, expected, actual)
예제 #21
0
    def test_seriesStatByIndex(self):
        dataLocal = [((1,), arange(12))]
        index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        data = Series(self.sc.parallelize(dataLocal), index=index)

        assert_true(array_equal(data.seriesStatByIndex('sum').values().first(), array([6, 22, 38])))
        assert_true(array_equal(data.seriesStatByIndex('mean').values().first(), array([1.5, 5.5, 9.5])))
        assert_true(array_equal(data.seriesStatByIndex('min').values().first(), array([0, 4, 8])))
        assert_true(array_equal(data.seriesStatByIndex('max').values().first(), array([3, 7, 11])))
        assert_true(array_equal(data.seriesStatByIndex('count').values().first(), array([4, 4, 4])))
        assert_true(array_equal(data.seriesStatByIndex('median').values().first(), array([1.5, 5.5, 9.5])))

        assert_true(array_equal(data.seriesSumByIndex().values().first(), array([6, 22, 38])))
        assert_true(array_equal(data.seriesMeanByIndex().values().first(), array([1.5, 5.5, 9.5])))
        assert_true(array_equal(data.seriesMinByIndex().values().first(), array([0, 4, 8])))
        assert_true(array_equal(data.seriesMaxByIndex().values().first(), array([3, 7, 11])))
        assert_true(array_equal(data.seriesCountByIndex().values().first(), array([4, 4, 4])))
        assert_true(array_equal(data.seriesMedianByIndex().values().first(), array([1.5, 5.5, 9.5])))

        index = [
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
            [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
            [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]
        ]
        data.index = array(index).T

        result = data.seriesStatByIndex('sum', level=[0, 1])
        assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
        assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

        result = data.seriesSumByIndex(level=[0, 1])
        assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
        assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))
예제 #22
0
 def test_toTimeSeries(self):
     from lambdaimage.rdds.timeseries import TimeSeries
     rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
     data = Series(rdd)
     ts = data.toTimeSeries()
     assert(isinstance(ts, TimeSeries))
예제 #23
0
 def test_between(self):
     rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
     data = Series(rdd).between(0, 1)
     assert(allclose(data.index, array([0, 1])))
     assert(allclose(data.first()[1], array([4, 5])))