Example #1
0
    def __compute_statistics(self):
        """Compute per-variable summary statistics over the attribute,
        class-variable and meta matrices.

        Stores the results on `self._min`, `self._dispersion`,
        `self._missing`, `self._max`, `self._center` and `self._median`,
        and caches variable types / lower-cased names in
        `self._variable_types` and `self._variable_names`.
        """
        # Since data matrices can be of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0.
        # NOTE(review): the `tup[1]` access implies each entry is a
        # (variables, matrix) tuple — confirm against the attribute
        # definitions outside this view.
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        # Cache the variable types and lower-cased names as arrays.
        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array(
            [var.name.lower() for var in self.variables])
        # Column-wise minima for discrete, continuous and time variables.
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        # Dispersion: entropy for discrete variables, coefficient of
        # variation for continuous ones; other types get no dispersion.
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=coefficient_of_variation,
        )
        # Missing-value counts; string variables are compared against the
        # StringVariable.Unknown sentinel instead of counting NaNs.
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
            # NOTE(review): `len(matrices[0])` is the length of the first
            # (variables, matrix) tuple, not a row count — confirm this is
            # the intended default.
            default_val=len(matrices[0]) if matrices else 0)
        # Column-wise maxima, mirroring the minima above.
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )

        # Since scipy apparently can't do mode on sparse matrices, cast it to
        # dense. This can be very inefficient for large matrices, and should
        # be changed
        def __mode(x, *args, **kwargs):
            # Both this definition and the call below sit in the class body,
            # so the `__mode` identifier is name-mangled consistently.
            if sp.issparse(x):
                x = x.todense(order="C")
            # return ss.mode(x, *args, **kwargs)[0]
            # Temporary replacement for scipy
            return ut.nanmode(x, *args, **kwargs)[0]

        # Center: mean for continuous/time; discrete_f=None, so discrete
        # variables get no center value here.
        self._center = self.__compute_stat(
            matrices,
            discrete_f=None,
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )

        # Median: mode for discrete variables (densified above), nanmedian
        # for continuous and time variables.
        self._median = self.__compute_stat(
            matrices,
            discrete_f=lambda x: __mode(x, axis=0),
            continuous_f=lambda x: ut.nanmedian(x, axis=0),
            time_f=lambda x: ut.nanmedian(x, axis=0),
        )
Example #2
0
 def test_nanmedian_more_nonzeros(self, array):
     """Median is correct when a column mixes NaNs and explicit zeros."""
     dense = np.ones((10, 10))
     dense[:5, 0] = np.nan
     dense[:6, 1] = 0
     expected = np.nanmedian(dense)
     np.testing.assert_array_equal(nanmedian(array(dense)), expected)
Example #3
0
 def test_nanmedian_more_nonzeros(self, array):
     """Sparse nanmedian agrees with numpy when NaNs and zeros coexist."""
     data = np.ones((10, 10))
     data[:5, 0] = np.nan
     data[:6, 1] = 0
     sparse_data = array(data)
     result = nanmedian(sparse_data)
     np.testing.assert_array_equal(result, np.nanmedian(data))
 def fit(self, X, Y=None):
     """
     Infer row normalization parameters from the data.
     :param X: Continuous data matrix.
     :param Y: Grouping values
     :return:
     """
     # Equalize based on read depth per library / match mean read count per cell
     # Must not store indices
     if Y is None:
         # Single library: target is the median row sum over all rows.
         self.target_row_mean = nanmedian(nansum(X, axis=1))
     else:
         # Group rows by library label and take each library's median depth.
         groups = {label: np.where(Y == label)[0] for label in set(Y)}
         lib_sizes = {
             label: nanmedian(nansum(X[rows, :], axis=1))
             for label, rows in groups.items()
         }
         # Scale every library down to the smallest median depth.
         self.target_row_mean = min(lib_sizes.values())
         for label in groups:
             self.size_factors[label] = self.target_row_mean / lib_sizes[label]
Example #5
0
 def test_nanmedian(self, array):
     """Sparse nanmedian matches numpy's nanmedian on every fixture."""
     for dense in self.data:
         sparse = array(dense)
         expected = np.nanmedian(dense)
         np.testing.assert_array_equal(nanmedian(sparse), expected)
Example #6
0
 def test_nanmedian(self, array):
     """Compare nanmedian on the sparse form against numpy on the dense one."""
     for data in self.data:
         result = nanmedian(array(data))
         np.testing.assert_array_equal(result, np.nanmedian(data))