Example #1
    def fromNpyLocal(self, datafile, keyfile=None):
        """Loads Series data stored in the numpy save() .npy format.

        `datafile` must refer to a path visible to all workers, such as on NFS or a similar mounted shared filesystem.
        """
        data = load(datafile)
        if data.ndim > 2:
            raise IOError('Input data must be one or two dimensional')
        if keyfile:
            keys = map(lambda x: tuple(x), load(keyfile))
        else:
            keys = arange(0, data.shape[0])

        rdd = Series(self.sc.parallelize(zip(keys, data), self.minPartitions),
                     dtype=str(data.dtype))

        return rdd
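A minimal usage sketch for the loader above (hypothetical path; assumes a running SparkContext `sc` and that this method lives on Thunder's SeriesLoader class):

from numpy import save, arange
from thunder.rdds.fileio.seriesloader import SeriesLoader

save('/tmp/data.npy', arange(12).reshape(4, 3))   # 4 records, 3 time points each
loader = SeriesLoader(sc)
series = loader.fromNpyLocal('/tmp/data.npy')
print series.first()                              # (0, array([0, 1, 2]))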
Example #2
    def fromBinary(self, dataPath, ext='bin', confFilename='conf.json',
                   nkeys=None, nvalues=None, keyType=None, valueType=None,
                   newDtype='smallfloat', casting='safe'):
        """
        Load a Series object from a directory of binary files.

        Parameters
        ----------

        dataPath: string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g.
            "file://", "s3n://"). If no scheme is present, the path will be interpreted as being on the local
            filesystem. This path must be valid on all workers. `dataPath` may also refer to a single file, or to a
            range of files specified by a glob-style expression using a single wildcard character '*'.

        newDtype: dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Most methods expect Series data to be floating-point. Input data will be
            cast to the requested `newDtype` if not None - see the Data `astype()` method.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        """

        paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues, keyType, valueType)
        self.__checkBinaryParametersAreSpecified(paramsObj)

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)

        keyDtype = dtypeFunc(paramsObj.keytype)
        valDtype = dtypeFunc(paramsObj.valuetype)

        keySize = paramsObj.nkeys * keyDtype.itemsize
        recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

        lines = self.sc.newAPIHadoopFile(dataPath, 'thunder.util.io.hadoop.FixedLengthBinaryInputFormat',
                                         'org.apache.hadoop.io.LongWritable',
                                         'org.apache.hadoop.io.BytesWritable',
                                         conf={'recordLength': str(recordSize)})

        data = lines.map(lambda (_, v):
                         (tuple(int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                          frombuffer(buffer(v, keySize), dtype=valDtype)))

        return Series(data, dtype=str(valDtype), index=arange(paramsObj.nvalues)).astype(newDtype, casting)
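Before the Spark plumbing, it helps to see the record format itself. The sketch below (plain numpy, no Spark) packs and unpacks one fixed-length record the same way the map above does; the key/value dtypes and counts are arbitrary choices for illustration:

from numpy import arange, array, dtype, frombuffer

nkeys, nvalues = 3, 4
keyDtype, valDtype = dtype('int16'), dtype('float32')

# one record: nkeys keys followed by nvalues values, packed back to back
record = array([0, 1, 2], dtype=keyDtype).tostring() + \
         arange(nvalues, dtype=valDtype).tostring()

keySize = nkeys * keyDtype.itemsize
key = tuple(int(x) for x in frombuffer(record[:keySize], dtype=keyDtype))
val = frombuffer(record[keySize:], dtype=valDtype)
# key == (0, 1, 2); val == array([0., 1., 2., 3.], dtype=float32)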
Example #3
 def setUp(self):
     super(TestLinearRegression, self).setUp()
     self.X = array([[-0.4309741, 0.43440693, 0.19946369, 1.40428728],
                     [0.54587086, -1.1092286, -0.27258427, 0.35205421],
                     [-0.4432777, 0.40580108, 0.20938645, 0.26480389],
                     [-0.53239659, -0.90966912, -0.13967252, 1.38274305],
                     [0.35731376, 0.39878607, 0.07762888, 1.82299252],
                     [0.36687294, -0.17079843, -0.17765573, 0.87161138],
                     [0.3017848, 1.36537541, 0.91211512, -0.80570055],
                     [-0.72330999, 0.36319617, 0.08986615, -0.7830115],
                     [1.11477831, 0.41631623, 0.11104172, -0.90049209],
                     [-1.62162968, 0.46928843, 0.62996118, 1.08668594]])
     self.y0 = array([
         4.57058016, -4.06400691, 4.25957933, 2.01583617, 0.34791879,
         -0.9113852, 3.41167194, 5.26059279, -2.35116878, 6.28263909
     ])
     self.y = Series(self.sc.parallelize([((1, ), self.y0)]))
     self.tol = 1E-3
Example #4
File: models.py  Project: yonglehou/thunder
    def predict(self, X):
        """
        Predicts the responses given a design matrix

        Parameters
        ----------
        X: array
            Design matrix of shape n x k, where n is the number of samples and k is the
            number of regressors. Even if an intercept term was fit, should NOT include
            a column of ones.

        Returns
        -------
        yhat: Series
            Series of predictions (each of length n)
        """
        X = self._transforms.apply(X)
        return Series(self._models.mapValues(lambda v: v.predict(X)))
Example #5
File: datasets.py  Project: vjlbym/thunder
 def generate(self,
              k=5,
              npartitions=10,
              ndims=5,
              nrecords=100,
              noise=0.1,
              seed=None):
     random.seed(seed)
     centers = random.randn(k, ndims)
     genFunc = lambda i: centers[int(floor(random.rand(1, 1) * k))] + noise * random.rand(ndims)
     dataLocal = map(genFunc, range(0, nrecords))
     data = Series(
         self.sc.parallelize(self.appendKeys(dataLocal), npartitions))
     if self.returnParams is True:
         return data, centers
     else:
         return data
Example #6
    def fromStack(self, dataPath, dims, ext="stack", blockSize="150M", dtype='int16',
                  newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None, recursive=False):
        """Load a Series object directly from binary image stack files.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename.

        dims: tuple of positive int
            Dimensions of input image data, ordered with the fastest-changing dimension first.

        ext: string, optional, default "stack"
            Extension required on data files to be loaded.

        blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, optional, default "150M"
            Requested size of Series partitions in bytes (or kilobytes, megabytes, gigabytes).

        dtype: dtype or dtype specifier, optional, default 'int16'
            Numpy dtype of input stack data

        newDtype: dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Most methods expect Series data to be floating-point. Input data will be
            cast to the requested `newDtype` if not None - see the Data `astype()` method.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        startIdx, stopIdx: nonnegative int, optional
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `dataPath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If True, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).
        """
        seriesBlocks, npointsInSeries, newDtype = \
            self._getSeriesBlocksFromStack(dataPath, dims, ext=ext, blockSize=blockSize, dtype=dtype,
                                           newDtype=newDtype, casting=casting, startIdx=startIdx, stopIdx=stopIdx,
                                           recursive=recursive)
        return Series(seriesBlocks, dims=dims, dtype=newDtype, index=arange(npointsInSeries))
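A hedged usage sketch (placeholder path and dimensions; assumes, as above, a SeriesLoader built around a SparkContext `sc`):

from thunder.rdds.fileio.seriesloader import SeriesLoader

loader = SeriesLoader(sc)
# 512 x 512 x 4 int16 volumes stored as flat binary, fastest-changing dimension first
series = loader.fromStack('/data/stacks/', dims=(512, 512, 4), ext='stack', dtype='int16')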
Example #7
 def test_standardization_axis0(self):
     rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5],
                                          dtype='float16'))])
     data = Series(rdd, dtype='float16')
     centered = data.center(0)
     standardized = data.standardize(0)
     zscored = data.zscore(0)
     assert_equals('float16', centered._dtype)
     assert_equals('float16', standardized._dtype)
     assert_equals('float16', zscored._dtype)
     assert (allclose(centered.first()[1],
                      array([-2, -1, 0, 1, 2]),
                      atol=1e-3))
     assert (allclose(standardized.first()[1],
                      array([0.70710, 1.41421, 2.12132, 2.82842, 3.53553]),
                      atol=1e-3))
     assert (allclose(zscored.first()[1],
                      array([-1.41421, -0.70710, 0, 0.70710, 1.41421]),
                      atol=1e-3))
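The expected values in this test follow directly from the plain numpy transforms below (population statistics, i.e. ddof=0):

from numpy import array

x = array([1., 2., 3., 4., 5.])           # mean 3, std sqrt(2)
centered = x - x.mean()                   # [-2, -1, 0, 1, 2]
standardized = x / x.std()                # [0.707, 1.414, 2.121, 2.828, 3.536]
zscored = (x - x.mean()) / x.std()        # [-1.414, -0.707, 0, 0.707, 1.414]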
Example #8
    def test_selectByIndex(self):
        dataLocal = [((1, ), arange(12))]
        index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        data = Series(self.sc.parallelize(dataLocal), index=index)

        result = data.selectByIndex(1)
        assert_true(array_equal(result.values().first(), array([4, 5, 6, 7])))
        assert_true(array_equal(result.index, array([1, 1, 1, 1])))

        result = data.selectByIndex(1, squeeze=True)
        assert_true(array_equal(result.index, array([0, 1, 2, 3])))

        index = [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
                 [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
                 [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]]
        data.index = array(index).T

        result, mask = data.selectByIndex(0, level=2, returnMask=True)
        assert_true(array_equal(result.values().first(), array([0, 2, 6, 8])))
        assert_true(
            array_equal(result.index,
                        array([[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]])))
        assert_true(
            array_equal(mask, array([1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0])))

        result = data.selectByIndex(0, level=2, squeeze=True)
        assert_true(array_equal(result.values().first(), array([0, 2, 6, 8])))
        assert_true(
            array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

        result = data.selectByIndex([1, 0], level=[0, 1])
        assert_true(array_equal(result.values().first(), array([6, 7])))
        assert_true(array_equal(result.index, array([[1, 0, 0], [1, 0, 1]])))

        result = data.selectByIndex(val=[0, [2, 3]], level=[0, 2])
        assert_true(array_equal(result.values().first(), array([4, 5])))
        assert_true(array_equal(result.index, array([[0, 1, 2], [0, 1, 3]])))

        result = data.selectByIndex(1, level=1, filter=True)
        assert_true(array_equal(result.values().first(), array([0, 1, 6, 7])))
        assert_true(
            array_equal(result.index,
                        array([[0, 0, 0], [0, 0, 1], [1, 0, 0], [1, 0, 1]])))
Example #9
 def test_linearRegress(self):
     data = Series(
         self.sc.parallelize([(1, array([1.5, 2.3, 6.2, 5.1, 3.4, 2.1]))]))
     x = array([array([1, 0, 0, 0, 0, 0]), array([0, 1, 0, 0, 0, 0])])
     model = RegressionModel.load(x, "linear")
     result = model.fit(data)
     # check accuracy of results
     assert (allclose(
         result.select('betas').values().collect()[0], array([-2.7, -1.9])))
     assert (allclose(
         result.select('stats').values().collect()[0], array([0.42785299])))
     assert (allclose(
         result.select('resid').values().collect()[0],
         array([0, 0, 2, 0.9, -0.8, -2.1])))
     # check indexing of outputs
     assert (allclose(result.select('betas').index, array([0, 1])))
     assert (allclose(
         result.select('resid').index, array([0, 1, 2, 3, 4, 5])))
     assert (result.select('stats').index == ['stats'])
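The expected betas can be reproduced locally: the fitted model evidently includes an intercept, and ordinary least squares on the same design with a column of ones prepended gives an intercept of 4.2 and coefficients [-2.7, -1.9]:

from numpy import array, hstack, ones
from numpy.linalg import lstsq

y = array([1.5, 2.3, 6.2, 5.1, 3.4, 2.1])
X = array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]]).T
A = hstack([ones((6, 1)), X])
coef = lstsq(A, y)[0]   # [4.2, -2.7, -1.9]: intercept, then the two betas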
Example #10
    def test_seriesAggregateByIndex(self):
        dataLocal = [((1, ), arange(12))]
        index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        data = Series(self.sc.parallelize(dataLocal), index=index)

        result = data.seriesAggregateByIndex(sum)
        print result.values().first()
        assert_true(array_equal(result.values().first(), array([6, 22, 38])))
        assert_true(array_equal(result.index, array([0, 1, 2])))

        index = [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
                 [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
                 [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]]
        data.index = array(index).T

        result = data.seriesAggregateByIndex(sum, level=[0, 1])
        assert_true(
            array_equal(result.values().first(), array([1, 14, 13, 38])))
        assert_true(
            array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))
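The first assertion can be checked without Spark: group arange(12) by the repeated index and sum each group.

from numpy import arange, array

vals = arange(12)
index = array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
sums = array([vals[index == i].sum() for i in (0, 1, 2)])   # [6, 22, 38]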
Example #11
 def test_bilinearRegress(self):
     data = Series(
         self.sc.parallelize([(1, array([1.5, 2.3, 6.2, 5.1, 3.4, 2.1]))]))
     x1 = array([array([1, 0, 1, 0, 1, 0]), array([0, 1, 0, 1, 0, 1])])
     x2 = array([
         array([1, 1, 0, 0, 0, 0]),
         array([0, 0, 1, 1, 0, 0]),
         array([0, 0, 0, 0, 1, 1])
     ])
     model = RegressionModel.load((x1, x2), "bilinear")
     result = model.fit(data)
     tol = 1E-4  # to handle rounding errors
     assert (allclose(result.select('betas').values().collect()[0],
                      array([-3.1249, 5.6875, 0.4375]),
                      atol=tol))
      assert (allclose(
          result.select('stats').values().collect()[0], array([0.6735]),
          atol=tol))
     assert (allclose(result.select('resid').values().collect()[0],
                      array([0, -0.8666, 0, 1.9333, 0, -1.0666]),
                      atol=tol))
Example #12
    def localCorr(self, neighborhood):
        """
        Correlate every signal to the average of its local neighborhood.

        This algorithm computes, for every spatial record, the correlation coefficient
        between that record's series, and the average series of all records within
        a local neighborhood with a size defined by the neighborhood parameter.
        For data with three spatial keys, only neighborhoods in x and y are
        currently supported.

        Parameters
        ----------
        neighborhood : integer
            Size of the neighborhood, describing the extent n in each direction, so the
            total neighborhood is 2n + 1 per dimension.

        """

        if len(self.dims.max) not in [2, 3]:
            raise NotImplementedError(
                'keys must have 2 or 3 dimensions to compute local correlations'
            )

        # flat map to key value pairs where the key is neighborhood identifier and value is time series
        neighbors = self.mapToNeighborhood(neighborhood)

        # reduce by key to get the average time series for each neighborhood
        means = neighbors.rdd.reduceByKey(lambda x, y: x + y).mapValues(
            lambda x: x / ((2 * neighborhood + 1)**2))

        # join with the original time series data to compute correlations
        result = self.rdd.join(means)

        # get correlations
        corr = result.mapValues(lambda x: corrcoef(x[0], x[1])[0, 1])

        # force sorting, but reverse keys for correct ordering
        output = corr.map(lambda (k, v): (k[::-1], v)).sortByKey().map(
            lambda (k, v): (k[::-1], v))
        return Series(output, index='correlation').__finalize__(self)
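The per-record computation is an ordinary Pearson correlation. A minimal numpy sketch of what mapValues applies to each (series, neighborhood mean) pair, with made-up values:

from numpy import array, corrcoef

series = array([1.0, 2.0, 3.0, 4.0])
neighborhoodMean = array([1.1, 1.9, 3.2, 3.8])
r = corrcoef(series, neighborhoodMean)[0, 1]   # Pearson r, in [-1, 1]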
Example #13
    def fromArraysAsImages(self, arrays):
        """Create a Series object from a sequence of numpy ndarrays resident in memory on the driver.

        The arrays will be interpreted as though each represents a single time point - effectively the same
        as if converting Images to a Series, with each array representing a volume image at a particular
        point in time. Thus in the resulting Series, the value of the record with key (0,0,0) will be
        array([arrays[0][0,0,0], arrays[1][0,0,0], ..., arrays[n][0,0,0]]).

        The dimensions of the resulting Series will be the *reverse* of those of the passed numpy arrays. Their
        dtype will not be changed.
        """
        # if passed a single array, cast it to a sequence of length 1
        if isinstance(arrays, ndarray):
            arrays = [arrays]

        # check that shapes of passed arrays are consistent
        shape = arrays[0].shape
        dtype = arrays[0].dtype
        for ary in arrays:
            if not ary.shape == shape:
                raise ValueError(
                    "Inconsistent array shapes: first array had shape %s, but other array has shape %s"
                    % (str(shape), str(ary.shape)))
            if not ary.dtype == dtype:
                raise ValueError(
                    "Inconsistent array dtypes: first array had dtype %s, but other array has dtype %s"
                    % (str(dtype), str(ary.dtype)))

        # get indices so that fastest index changes first
        shapeiters = (xrange(n) for n in shape)
        keys = [idx[::-1] for idx in itertools.product(*shapeiters)]

        values = vstack([ary.ravel() for ary in arrays]).T

        dims = Dimensions.fromTuple(shape[::-1])

        return Series(self.sc.parallelize(zip(keys, values),
                                          self.minPartitions),
                      dims=dims,
                      dtype=str(dtype))
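A self-contained sketch of the resulting layout for two 2x2 arrays: each spatial position becomes one record whose value is that position's series across the input arrays, with keys reported fastest-changing dimension first:

import itertools
from numpy import array, vstack

arrays = [array([[0, 1], [2, 3]]), array([[4, 5], [6, 7]])]
shape = arrays[0].shape
keys = [idx[::-1] for idx in itertools.product(*(range(n) for n in shape))]
values = vstack([a.ravel() for a in arrays]).T
# keys[1] == (1, 0) and values[1] == array([1, 5]), i.e. arrays[t][0, 1] over time t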
Example #14
    def test_meanByRegions_twoRegions(self):
        dataLocal = [((0, 0), array([1.0, 2.0, 3.0])),
                     ((0, 1), array([2.0, 2.0, 4.0])),
                     ((1, 0), array([4.0, 2.0, 1.0])),
                     ((1, 1), array([3.0, 1.0, 1.0]))]
        series = Series(self.sc.parallelize(dataLocal))
        nestedKeys, expectedKeys, expected = [], [], []
        for itemIdxs in [(0, 1), (1, 2)]:
            keys = [dataLocal[idx][0] for idx in itemIdxs]
            nestedKeys.append(keys)
            avgKeys = tuple(vstack(keys).mean(axis=0).astype('int16'))
            expectedKeys.append(avgKeys)
            avgVals = vstack([dataLocal[idx][1]
                              for idx in itemIdxs]).mean(axis=0)
            expected.append(avgVals)

        actualSeries = series.meanByRegion(nestedKeys)
        actual = actualSeries.collect()
        assert_equals(2, len(actual))
        for regionIdx in xrange(2):
            assert_equals(expectedKeys[regionIdx], actual[regionIdx][0])
            assert_true(array_equal(expected[regionIdx], actual[regionIdx][1]))
Example #15
File: tuning.py  Project: mfcabrera/thunder
    def fit(self, data):
        """
        Fit a mass univariate tuning model.

        Parameters
        ----------
        data : Series or a subclass (e.g. RowMatrix)
            The data to fit tuning models to, a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        -------
        params : RDD of (tuple, array) pairs
            Fitted tuning parameters for each record
        """

        if not (isinstance(data, Series)):
            raise Exception(
                'Input must be Series or a subclass (e.g. RowMatrix)')

        return Series(data.rdd.mapValues(lambda x: self.get(x)),
                      index=['center', 'spread']).__finalize__(data)
Example #16
File: models.py  Project: yonglehou/thunder
    def score(self, X, y):
        """
        Computes R-squared values for a single design matrix and multiple responses.

        Parameters
        ----------
        X: array
            Design matrix of shape n x k, where n is the number of samples and k is the
            number of regressors. Even if an intercept term was fit, should NOT include
            a column of ones.

        y: Series
            Series of response variables, where each record is a vector of length n
            (the number of samples).

        Returns
        -------
        scores: Series
            Series of R-squared values.
        """
        X = self._transforms.apply(X)
        joined = self._models.join(y.rdd)
        newrdd = joined.mapValues(lambda (model, y): model.stats(X, y))
        return Series(newrdd)
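model.stats(X, y) is not shown here, but for a single record an R-squared presumably falls out of the usual definition; a local numpy sketch:

from numpy import array

def rsquared(yhat, y):
    ssRes = ((y - yhat) ** 2).sum()       # residual sum of squares
    ssTot = ((y - y.mean()) ** 2).sum()   # total sum of squares
    return 1.0 - ssRes / ssTot

rsquared(array([1., 2., 3.]), array([1., 2., 3.]))   # perfect fit -> 1.0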
Example #17
    def fit(self, data, featureset=None):
        """
        Run classification on each record in a data set

        Parameters
        ----------
        data: Series or a subclass (e.g. RowMatrix)
            Data to perform classification on, must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        featureset : array, optional, default = None
            Which features to use

        Returns
        -------
        perf : Series
            The performance of the classifier for each record
        """

        if not isinstance(data, Series):
            raise Exception(
                'Input must be Series or a subclass (e.g. RowMatrix)')

        if self.nfeatures == 1:
            perf = data.rdd.mapValues(lambda x: [self.get(x)])
        else:
            if featureset is None:
                featureset = [[self.features[0]]]
            for i in featureset:
                assert array([item in i for item in self.features
                              ]).sum() != 0, "Feature set invalid"
            perf = data.rdd.mapValues(
                lambda x: asarray(map(lambda i: self.get(x, i), featureset)))

        return Series(perf, index='performance').__finalize__(data)
Example #18
    def fit(self, mat):
        """
        Calculate the non-negative matrix decomposition.

        Parameters
        ----------
        mat : Series or a subclass (e.g. RowMatrix)
            Data to estimate the decomposition from, must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        -------
        self : returns an instance of self.
        """

        from numpy import add, any, diag, dot, inf, maximum, outer, sqrt, apply_along_axis
        from numpy.linalg import inv, norm, pinv
        from numpy.random import rand

        if not (isinstance(mat, Series)):
            raise Exception(
                'Input must be Series or a subclass (e.g. RowMatrix)')

        mat = mat.rdd

        # a helper function to take the Frobenius norm of two zippable RDDs
        def rddFrobeniusNorm(A, B):
            return sqrt(
                A.zip(B).map(lambda ((keyA, x), (keyB, y)): sum(
                    (x - y)**2)).reduce(add))

        # input checking
        k = self.k
        if k < 1:
            raise ValueError("Supplied k must be greater than 1.")
        m = mat.values().first().size
        if self.h0 is not None:
            if any(self.h0 < 0):
                raise ValueError("Supplied h0 contains negative entries.")

        # alternating least-squares implementation
        if self.method == "als":

            # initialize NMF and begin als algorithm
            if self.verbose:
                print "Initializing NMF"
            alsIter = 0
            hConvCurr = 100

            if self.h0 is None:
                # noinspection PyUnresolvedReferences
                self.h0 = rand(k, m)

            h = self.h0
            w = None

            # goal is to solve R = WH subject to all entries of W,H >= 0
            # by iteratively updating W and H with least squares and clipping negative values
            while (alsIter < self.maxIter) and (hConvCurr > self.tol):
                # update values on iteration
                hOld = h
                wOld = w

                # precompute pinv(H) = inv(H' x H) * H' (easy here because h is an np array)
                # the rows of H should be a basis of dimension k, so in principle we could just compute directly
                pinvH = pinv(h)

                # update W using least squares row-wise with R * pinv(H); then clip negative values to 0
                w = mat.mapValues(lambda x: dot(x, pinvH))

                # clip negative values of W
                # noinspection PyUnresolvedReferences
                w = w.mapValues(lambda x: maximum(x, 0))

                # precompute inv(W' * W) to get inv_gramian_w, a np array
                # We have chosen k to be small, i.e., rank(W) = k, so W'*W is invertible
                gramianW = w.values().map(lambda x: outer(x, x)).reduce(add)
                invGramianW = inv(gramianW)

                # pseudoinverse of W is inv(W' * W) * W' = inv_gramian_w * w
                pinvW = w.mapValues(lambda x: dot(invGramianW, x))

                # update H using least squares row-wise with inv(W' * W) * W * R (same as pinv(W) * R)
                h = pinvW.values().zip(
                    mat.values()).map(lambda (x, y): outer(x, y)).reduce(add)

                # clip negative values of H
                # noinspection PyUnresolvedReferences
                h = maximum(h, 0)

                # normalize the rows of H
                # noinspection PyUnresolvedReferences
                h = dot(diag(1 / maximum(apply_along_axis(norm, 1, h), 0.001)),
                        h)

                # estimate convergence
                hConvCurr = norm(h - hOld)
                self.hConvergence.append(hConvCurr)
                if self.wConvergence is not None:
                    if wOld is not None:
                        self.wConvergence.append(rddFrobeniusNorm(w, wOld))
                    else:
                        self.wConvergence.append(inf)

                # calculate reconstruction error
                if self.reconHist == 'all':
                    recData = w.mapValues(lambda x: dot(x, h))
                    self.reconErr.append(rddFrobeniusNorm(mat, recData))

                # report progress
                if self.verbose:
                    print "finished als iteration %d with convergence = %.6f in H" % (
                        alsIter, hConvCurr)

                # increment count
                alsIter += 1

            # report on convergence
            if self.verbose:
                if hConvCurr <= self.tol:
                    print "Converged to specified tolerance."
                else:
                    print "Warning: reached maxiter without converging to specified tolerance."

            # calculate reconstruction error
            if self.reconHist == 'final':
                recData = w.mapValues(lambda x: dot(x, h))
                self.reconErr = rddFrobeniusNorm(mat, recData)

            # report results
            self.h = h
            # TODO: need to propagate metadata through to this new Series object
            self.w = Series(w)

        else:
            raise Exception("Algorithm %s is not supported" % self.method)

        return self
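The distributed updates above mirror a simple local algorithm. For reference, here is the same alternating least-squares loop on a small dense matrix in plain numpy (a sketch, not the library's implementation): solve for W against pinv(H), clip negatives, solve for H against pinv(W), clip, then row-normalize H.

from numpy import diag, dot, maximum
from numpy.random import rand, randn
from numpy.linalg import norm, pinv

R = abs(randn(20, 8))                    # nonnegative data, 20 records x 8 samples
k = 3
h = rand(k, 8)
for i in range(50):
    w = maximum(dot(R, pinv(h)), 0)      # W <- clip(R * pinv(H), 0)
    h = maximum(dot(pinv(w), R), 0)      # H <- clip(pinv(W) * R, 0)
    h = dot(diag(1.0 / maximum(norm(h, axis=1), 0.001)), h)   # normalize rows of H
# dot(w, h) now approximates R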
Example #19
 def test_toTimeSeries(self):
     from thunder.rdds.timeseries import TimeSeries
     rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
     data = Series(rdd)
     ts = data.toTimeSeries()
     assert(isinstance(ts, TimeSeries))
Example #20
 def test_detrend(self):
     rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5]))])
     data = Series(rdd).detrend('linear')
     # detrending linearly increasing data should yield all 0s
     assert (allclose(data.first()[1], array([0, 0, 0, 0, 0])))
Example #21
File: nmf.py  Project: mfcabrera/thunder
    def fit(self, mat):
        """
        Calculate the non-negative matrix decomposition.

        Parameters
        ----------
        mat : Series or a subclass (e.g. RowMatrix)
            Data to estimate the decomposition from, must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        -------
        self : returns an instance of self.
        """

        import numpy as np

        if not (isinstance(mat, Series)):
            raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

        mat = mat.rdd

        # a helper function to take the Frobenius norm of two zippable RDDs
        def rddFrobeniusNorm(A, B):
            return np.sqrt(A.zip(B).map(lambda ((key_a, x), (key_b, y)): sum((x - y) ** 2)).reduce(np.add))

        # input checking
        k = self.k
        if k < 1:
            raise ValueError("Supplied k must be greater than 1.")
        m = mat.values().first().size
        if self.h0 is not None:
            if np.any(self.h0 < 0):
                raise ValueError("Supplied h0 contains negative entries.")

        # alternating least-squares implementation
        if self.method == "als":

            # initialize NMF and begin als algorithm
            print "Initializing NMF"
            als_iter = 0
            h_conv_curr = 100

            if self.h0 is None:
                # noinspection PyUnresolvedReferences
                self.h0 = np.random.rand(k, m)

            h = self.h0
            w = None

            # goal is to solve R = WH subject to all entries of W,H >= 0
            # by iteratively updating W and H with least squares and clipping negative values
            while (als_iter < self.maxiter) and (h_conv_curr > self.tol):
                # update values on iteration
                h_old = h
                w_old = w

                # precompute pinv(H) = inv(H' x H) * H' (easy here because h is an np array)
                # the rows of H should be a basis of dimension k, so in principle we could just compute directly
                pinv_h = np.linalg.pinv(h)

                # update W using least squares row-wise with R * pinv(H); then clip negative values to 0
                w = mat.mapValues(lambda x: np.dot(x, pinv_h))

                # clip negative values of W
                # noinspection PyUnresolvedReferences
                w = w.mapValues(lambda x: np.maximum(x, 0))

                # precompute inv(W' * W) to get inv_gramian_w, a np array
                # We have chosen k to be small, i.e., rank(W) = k, so W'*W is invertible
                gramian_w = w.values().map(lambda x: np.outer(x, x)).reduce(np.add)
                inv_gramian_w = np.linalg.inv(gramian_w)

                # pseudoinverse of W is inv(W' * W) * W' = inv_gramian_w * w
                pinv_w = w.mapValues(lambda x: np.dot(inv_gramian_w, x))

                # update H using least squares row-wise with inv(W' * W) * W * R (same as pinv(W) * R)
                h = pinv_w.values().zip(mat.values()).map(lambda (x, y): np.outer(x, y)).reduce(np.add)

                # clip negative values of H
                # noinspection PyUnresolvedReferences
                h = np.maximum(h, 0)

                # normalize the rows of H
                # noinspection PyUnresolvedReferences
                h = np.dot(np.diag(1 / np.maximum(np.linalg.norm(h, axis=1), 0.001)), h)

                # estimate convergence
                h_conv_curr = np.linalg.norm(h-h_old)
                self.h_convergence.append(h_conv_curr)
                if self.w_convergence is not None:
                    if w_old is not None:
                        self.w_convergence.append(rddFrobeniusNorm(w, w_old))
                    else:
                        self.w_convergence.append(np.inf)

                # calculate reconstruction error
                if self.recon_hist == 'all':
                    rec_data = w.mapValues(lambda x: np.dot(x, h))
                    self.recon_err.append(rddFrobeniusNorm(mat, rec_data))

                # report progress
                print "finished als iteration %d with convergence = %.6f in H" % (als_iter, h_conv_curr)

                # increment count
                als_iter += 1

            # report on convergence
            if h_conv_curr <= self.tol:
                print "Converged to specified tolerance."
            else:
                print "Warning: reached maxiter without converging to specified tolerance."

            # calculate reconstruction error
            if self.recon_hist == 'final':
                rec_data = w.mapValues(lambda x: np.dot(x, h))
                self.recon_err = rddFrobeniusNorm(mat, rec_data)

            # report results
            self.h = h
            self.w = Series(w)

        else:
            print "Error: %s is not a supported algorithm." % self.method

        return self
Example #22
 def get_local_corr(self, data, neighborhood, images=False):
     rdd = self.sc.parallelize(data)
     imgs = Images(rdd) if images else Series(rdd).toImages()
     return imgs.localCorr(neighborhood=neighborhood)
Example #23
    def test_seriesStatByIndex(self):
        dataLocal = [((1, ), arange(12))]
        index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        data = Series(self.sc.parallelize(dataLocal), index=index)

        assert_true(
            array_equal(
                data.seriesStatByIndex('sum').values().first(),
                array([6, 22, 38])))
        assert_true(
            array_equal(
                data.seriesStatByIndex('mean').values().first(),
                array([1.5, 5.5, 9.5])))
        assert_true(
            array_equal(
                data.seriesStatByIndex('min').values().first(),
                array([0, 4, 8])))
        assert_true(
            array_equal(
                data.seriesStatByIndex('max').values().first(),
                array([3, 7, 11])))
        assert_true(
            array_equal(
                data.seriesStatByIndex('count').values().first(),
                array([4, 4, 4])))
        assert_true(
            array_equal(
                data.seriesStatByIndex('median').values().first(),
                array([1.5, 5.5, 9.5])))

        assert_true(
            array_equal(data.seriesSumByIndex().values().first(),
                        array([6, 22, 38])))
        assert_true(
            array_equal(data.seriesMeanByIndex().values().first(),
                        array([1.5, 5.5, 9.5])))
        assert_true(
            array_equal(data.seriesMinByIndex().values().first(),
                        array([0, 4, 8])))
        assert_true(
            array_equal(data.seriesMaxByIndex().values().first(),
                        array([3, 7, 11])))
        assert_true(
            array_equal(data.seriesCountByIndex().values().first(),
                        array([4, 4, 4])))
        assert_true(
            array_equal(data.seriesMedianByIndex().values().first(),
                        array([1.5, 5.5, 9.5])))

        index = [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
                 [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
                 [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]]
        data.index = array(index).T

        result = data.seriesStatByIndex('sum', level=[0, 1])
        assert_true(
            array_equal(result.values().first(), array([1, 14, 13, 38])))
        assert_true(
            array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

        result = data.seriesSumByIndex(level=[0, 1])
        assert_true(
            array_equal(result.values().first(), array([1, 14, 13, 38])))
        assert_true(
            array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))
Example #24
 def test_between(self):
     rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])),
                                (1, array([8, 9, 10, 11]))])
     data = Series(rdd).between(0, 1)
     assert (allclose(data.index, array([0, 1])))
     assert (allclose(data.first()[1], array([4, 5])))