def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])

    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )

    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=coefficient_of_variation,
    )

    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
        default_val=len(matrices[0]) if matrices else 0,
    )

    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )

    # Since scipy apparently can't do mode on sparse matrices, cast it to
    # dense. This can be very inefficient for large matrices, and should
    # be changed.
    def __mode(x, *args, **kwargs):
        if sp.issparse(x):
            x = x.todense(order="C")
        # return ss.mode(x, *args, **kwargs)[0]
        return ut.nanmode(x, *args, **kwargs)[0]  # Temporary replacement for scipy

    self._center = self.__compute_stat(
        matrices,
        discrete_f=None,
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )

    self._median = self.__compute_stat(
        matrices,
        discrete_f=lambda x: __mode(x, axis=0),
        continuous_f=lambda x: ut.nanmedian(x, axis=0),
        time_f=lambda x: ut.nanmedian(x, axis=0),
    )
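
# A standalone sketch of the densify-then-mode workaround used by
# __mode above: scipy.stats.mode does not accept sparse input, so the
# matrix is cast to dense first. The helper name and toy data below are
# illustrative assumptions, not part of the widget.
import numpy as np
import scipy.sparse as sp
from scipy import stats

def sparse_column_mode(x):
    """Column-wise mode of a (possibly sparse) matrix.

    Densifies sparse input because scipy has no sparse mode support;
    fine for small matrices, wasteful for large ones (as noted above).
    """
    if sp.issparse(x):
        x = np.asarray(x.todense(order="C"))
    return stats.mode(x, axis=0, nan_policy="omit").mode

X = sp.csr_matrix([[1., 0., 2.],
                   [1., 3., 2.],
                   [0., 3., 2.]])
print(sparse_column_mode(X))  # column-wise modes: 1., 3., 2.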
def test_nanmedian_more_nonzeros(self, array):
    X = np.ones((10, 10))
    X[:5, 0] = np.nan
    X[:6, 1] = 0
    X_sparse = array(X)

    np.testing.assert_array_equal(
        nanmedian(X_sparse),
        np.nanmedian(X)
    )
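
# For context: the test above checks that nanmedian agrees with
# np.nanmedian even when a column's implicit zeros outnumber its stored
# values. A naive reference implementation (an assumption-laden sketch,
# not Orange's ut.nanmedian) densifies and delegates to NumPy:
import numpy as np
import scipy.sparse as sp

def naive_nanmedian(x):
    """Median over all entries, ignoring NaNs.

    Densifying makes implicit zeros count as real values -- correct but
    memory-hungry; a production version would work on x.data directly.
    """
    if sp.issparse(x):
        x = np.asarray(x.todense())
    return np.nanmedian(x)

X = np.ones((10, 10))
X[:5, 0] = np.nan  # NaNs must be ignored
X[:6, 1] = 0       # implicit zeros are real values and must count
assert naive_nanmedian(sp.csr_matrix(X)) == np.nanmedian(X)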
def fit(self, X, Y=None):
    """
    Infer row normalization parameters from the data.

    :param X: Continuous data matrix.
    :param Y: Grouping values.
    :return:
    """
    # Equalize based on read depth per library / match mean read count
    # per cell. Must not store indices.
    if Y is not None:
        libraries = {lib: np.where(Y == lib)[0] for lib in set(Y)}
        lib_sizes = {}
        for lib, rows in libraries.items():
            lib_sizes[lib] = nanmedian(nansum(X[rows, :], axis=1))
        self.target_row_mean = min(lib_sizes.values())
        for lib in libraries:
            self.size_factors[lib] = self.target_row_mean / lib_sizes[lib]
    else:
        self.target_row_mean = nanmedian(nansum(X, axis=1))
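
# fit() only infers target_row_mean and size_factors; a minimal sketch
# of the scaling a matching transform step might apply (hypothetical
# helper, assuming each row should be rescaled to the learned target):
import numpy as np

def equalize_library_size(X, target_row_mean):
    """Scale each row so its total (library size) equals target_row_mean.

    Rows summing to zero are left unscaled to avoid division by zero.
    """
    row_sums = np.nansum(X, axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    return X * (target_row_mean / row_sums)

X = np.array([[2.0, 2.0, 0.0],
              [1.0, 1.0, 2.0]])
X_norm = equalize_library_size(X, target_row_mean=8.0)
print(X_norm.sum(axis=1))  # both rows now sum to 8.0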
def test_nanmedian(self, array):
    for X in self.data:
        X_sparse = array(X)
        np.testing.assert_array_equal(
            nanmedian(X_sparse), np.nanmedian(X))