예제 #1
0
 def _prepare_data(self):
     """
     Standardize or demean data.
     """
     adj_data = self._adjusted_data
     self._mu = nanmean(adj_data, axis=0)
     self._sigma = np.sqrt(nanmean((adj_data - self._mu)**2.0, axis=0))
     if self._standardize:
         data = (adj_data - self._mu) / self._sigma
     elif self._demean:
         data = (adj_data - self._mu)
     else:
         data = adj_data
     return data / np.sqrt(self.weights)
예제 #2
0
파일: pca.py 프로젝트: zznelson/statsmodels
    def _prepare_data(self):
        """
        Standardize or demean data.
        """
        adj_data = self._adjusted_data
        if np.all(np.isnan(adj_data)):
            return np.empty(adj_data.shape[1]).fill(np.nan)

        self._mu = nanmean(adj_data, axis=0)
        self._sigma = np.sqrt(nanmean((adj_data - self._mu)**2.0, axis=0))
        if self._standardize:
            data = (adj_data - self._mu) / self._sigma
        elif self._demean:
            data = (adj_data - self._mu)
        else:
            data = adj_data
        return data / np.sqrt(self.weights)
예제 #3
0
    def _prepare_data(self):
        """
        Standardize or demean data.
        """
        adj_data = self._adjusted_data
        if np.all(np.isnan(adj_data)):
            return np.empty(adj_data.shape[1]).fill(np.nan)

        self._mu = nanmean(adj_data, axis=0)
        self._sigma = np.sqrt(nanmean((adj_data - self._mu) ** 2.0, axis=0))
        if self._standardize:
            data = (adj_data - self._mu) / self._sigma
        elif self._demean:
            data = (adj_data - self._mu)
        else:
            data = adj_data
        return data / np.sqrt(self.weights)
예제 #4
0
파일: pca.py 프로젝트: thinks520/CodeRecord
    def _fill_missing_em(self):
        """
        EM algorithm to fill missing values
        """
        non_missing = np.logical_not(np.isnan(self.data))

        # If nothing missing, return without altering the data
        if np.all(non_missing):
            return self.data

        # 1. Standardized data as needed
        data = self.transformed_data = np.asarray(self._prepare_data())

        ncomp = self._ncomp

        # 2. Check for all nans
        col_non_missing = np.sum(non_missing, 1)
        row_non_missing = np.sum(non_missing, 0)
        if np.any(col_non_missing < ncomp) or np.any(row_non_missing < ncomp):
            raise ValueError('Implementation requires that all columns and '
                             'all rows have at least ncomp non-missing values')
        # 3. Get mask
        mask = np.isnan(data)

        # 4. Compute mean
        mu = nanmean(data, 0)

        # 5. Replace missing with mean
        projection = np.ones((self._nobs, 1)) * mu
        projection_masked = projection[mask]
        data[mask] = projection_masked

        # 6. Compute eigenvalues and fit
        diff = 1.0
        _iter = 0
        while diff > self._tol_em and _iter < self._max_em_iter:
            last_projection_masked = projection_masked
            # Set transformed data to compute eigenvalues
            self.transformed_data = data
            # Call correct eig function here
            self._compute_eig()
            # Call function to compute factors and projection
            self._compute_pca_from_eig()
            projection = np.asarray(
                self.project(transform=False, unweight=False))
            projection_masked = projection[mask]
            data[mask] = projection_masked
            delta = last_projection_masked - projection_masked
            diff = _norm(delta) / _norm(projection_masked)
            _iter += 1
        # Must copy to avoid overwriting original data since replacing values
        data = self._adjusted_data + 0.0
        projection = np.asarray(self.project())
        data[mask] = projection[mask]

        return data
예제 #5
0
    def _fill_missing_em(self):
        """
        EM algorithm to fill missing values
        """
        non_missing = np.logical_not(np.isnan(self.data))

        # If nothing missing, return without altering the data
        if np.all(non_missing):
            return self.data

        # 1. Standardized data as needed
        data = self.transformed_data = self._prepare_data()

        ncomp = self._ncomp

        # 2. Check for all nans
        col_non_missing = np.sum(non_missing, 1)
        row_non_missing = np.sum(non_missing, 0)
        if np.any(col_non_missing < ncomp) or np.any(row_non_missing < ncomp):
            raise ValueError('Implementation requires that all columns and '
                             'all rows have at least ncomp non-missing values')
        # 3. Get mask
        mask = np.isnan(data)

        # 4. Compute mean
        mu = nanmean(data, 0)

        # 5. Replace missing with mean
        projection = np.ones((self._nobs, 1)) * mu
        projection_masked = projection[mask]
        data[mask] = projection_masked

        # 6. Compute eigenvalues and fit
        diff = 1.0
        _iter = 0
        while diff > self._tol_em and _iter < self._max_em_iter:
            last_projection_masked = projection_masked
            # Set transformed data to compute eigenvalues
            self.transformed_data = data
            # Call correct eig function here
            self._compute_eig()
            # Call function to compute factors and projection
            self._compute_pca_from_eig()
            projection = self.project(transform=False, unweight=False)
            projection_masked = projection[mask]
            data[mask] = projection_masked
            delta = last_projection_masked - projection_masked
            diff = _norm(delta) / _norm(projection_masked)
            _iter += 1
        # Must copy to avoid overwriting original data since replacing values
        data = self._adjusted_data + 0.0
        projection = self.project()
        data[mask] = projection[mask]

        return data
예제 #6
0
    def test_replace_missing(self):
        x = self.x.copy()
        x[::5, ::7] = np.nan

        pc = PCA(x, missing='drop-row')
        x_dropped_row = x[np.logical_not(np.any(np.isnan(x), 1))]
        pc_dropped = PCA(x_dropped_row)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-col')
        x_dropped_col = x[:, np.logical_not(np.any(np.isnan(x), 0))]
        pc_dropped = PCA(x_dropped_col)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-min')
        if x_dropped_row.size > x_dropped_col.size:
            x_dropped_min = x_dropped_row
        else:
            x_dropped_min = x_dropped_col
        pc_dropped = PCA(x_dropped_min)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, ncomp=3, missing='fill-em')
        missing = np.isnan(x)
        mu = nanmean(x, axis=0)
        errors = x - mu
        sigma = np.sqrt(nanmean(errors ** 2, axis=0))
        x_std = errors / sigma
        x_std[missing] = 0.0
        last = x_std[missing]
        delta = 1.0
        count = 0
        while delta > 5e-8:
            pc_temp = PCA(x_std, ncomp=3, standardize=False, demean=False)
            x_std[missing] = pc_temp.projection[missing]
            current = x_std[missing]
            diff = current - last
            delta = np.sqrt(np.sum(diff ** 2)) / np.sqrt(np.sum(current ** 2))
            last = current
            count += 1
        x = self.x + 0.0
        projection = pc_temp.projection * sigma + mu
        x[missing] = projection[missing]
        assert_allclose(pc._adjusted_data, x)
        # Check data for no changes
        assert_equal(self.x, self.x_copy)

        x = self.x
        pc = PCA(x)
        pc_dropped = PCA(x, missing='drop-row')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-col')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-min')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc = PCA(x, ncomp=3)
        pc_dropped = PCA(x, ncomp=3, missing='fill-em')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        # Test too many missing for missing='fill-em'
        x = self.x.copy()
        x[:, :] = np.nan
        assert_raises(ValueError, PCA, x, missing='drop-row')
        assert_raises(ValueError, PCA, x, missing='drop-col')
        assert_raises(ValueError, PCA, x, missing='drop-min')
        assert_raises(ValueError, PCA, x, missing='fill-em')
예제 #7
0
    def test_replace_missing(self):
        x = self.x.copy()
        x[::5, ::7] = np.nan

        pc = PCA(x, missing='drop-row')
        x_dropped_row = x[np.logical_not(np.any(np.isnan(x), 1))]
        pc_dropped = PCA(x_dropped_row)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-col')
        x_dropped_col = x[:, np.logical_not(np.any(np.isnan(x), 0))]
        pc_dropped = PCA(x_dropped_col)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-min')
        if x_dropped_row.size > x_dropped_col.size:
            x_dropped_min = x_dropped_row
        else:
            x_dropped_min = x_dropped_col
        pc_dropped = PCA(x_dropped_min)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, ncomp=3, missing='fill-em')
        missing = np.isnan(x)
        mu = nanmean(x, axis=0)
        errors = x - mu
        sigma = np.sqrt(nanmean(errors**2, axis=0))
        x_std = errors / sigma
        x_std[missing] = 0.0
        last = x_std[missing]
        delta = 1.0
        count = 0
        while delta > 5e-8:
            pc_temp = PCA(x_std, ncomp=3, standardize=False, demean=False)
            x_std[missing] = pc_temp.projection[missing]
            current = x_std[missing]
            diff = current - last
            delta = np.sqrt(np.sum(diff**2)) / np.sqrt(np.sum(current**2))
            last = current
            count += 1
        x = self.x + 0.0
        projection = pc_temp.projection * sigma + mu
        x[missing] = projection[missing]
        assert_allclose(pc._adjusted_data, x)
        # Check data for no changes
        assert_equal(self.x, self.x_copy)

        x = self.x
        pc = PCA(x)
        pc_dropped = PCA(x, missing='drop-row')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-col')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-min')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc = PCA(x, ncomp=3)
        pc_dropped = PCA(x, ncomp=3, missing='fill-em')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        # Test too many missing for missing='fill-em'
        x = self.x.copy()
        x[:, :] = np.nan
        assert_raises(ValueError, PCA, x, missing='drop-row')
        assert_raises(ValueError, PCA, x, missing='drop-col')
        assert_raises(ValueError, PCA, x, missing='drop-min')
        assert_raises(ValueError, PCA, x, missing='fill-em')