def normalize(self, table: AnyArray, group_col: np.ndarray) -> AnyArray:
    """
    Normalize each row of ``table`` by the total signal of its group.

    :param table: dense or sparse 2D array; rows are samples.
    :param group_col: non-negative integer group label per row
        (``np.bincount`` requires non-negative ints).
    :return: table with rows divided by their group sum and scaled by
        the minimum group median (Median method) or 1e6 otherwise.
    """
    # Total signal per group: bincount with per-row sums as weights,
    # so bin ``g`` holds the sum over all rows labelled ``g``.
    group_sums = np.bincount(group_col, ut.nansum(table, axis=1))
    group_sums[group_sums == 0] = 1  # avoid division by zero errors
    # Must be float: np.zeros_like(group_col) would inherit the integer
    # dtype of the label column and truncate the sums on assignment.
    group_sums_row = np.zeros(len(group_col), dtype=float)
    medians = []
    row_sums = ut.nansum(table, axis=1)
    for value in np.unique(group_col):
        mask = group_col == value
        # Index group_sums by label value: bincount bins are positional,
        # so zipping them with np.unique would mispair sums whenever the
        # labels are non-contiguous (e.g. groups {0, 2}).
        group_sums_row[mask] = group_sums[int(value)]
        if self.method == NormalizeGroups.Median:
            medians.append(np.nanmedian(row_sums[mask]))

    if self.method == NormalizeGroups.Median:
        factor = np.min(medians)
    else:
        factor = 1e6  # counts-per-million style scaling

    if sp.issparse(table):
        table = sp.diags(1 / group_sums_row) @ table
    else:
        table = table / group_sums_row[:, None]

    table *= factor

    return table
    def transform(self, data):
        """
        Transform data based on inferred parameters.

        :param data: Data table with expression values as counts.
                    Columns are genes and rows are cells.
        :return: Data table with normalized values.
        """
        # Result in expected number of reads
        Xeq = data.X.copy()
        n = Xeq.shape[0]

        # Normalize cell profiles
        if self.normalize_cells:
            # Each cell is normalized independently by default
            if sp.isspmatrix(Xeq):
                rs = Xeq.sum(axis=1).astype(float)
            else:
                rs = nansum(Xeq, axis=1).astype(float)
            rs[rs == 0] = 1.0  # guard against division by zero on empty rows
            rsm = np.ones((n, ), dtype=float) * self.target_row_mean
            factors = rsm / rs

            # Override with library size factor, if provided. Else, each row is
            # treated as a separate group
            if self.equalize_var is not None:
                vals = np.array(
                    list(
                        map(lambda lib: self.size_factors.get(lib, np.nan),
                            data.get_column_view(self.equalize_var)[0])))
                inxs = np.logical_not(np.isnan(vals))
                factors[inxs] = vals[inxs]

            # Row scaling via a diagonal matrix works for both dense and
            # sparse layouts without densifying.
            Xd = sp.dia_matrix((factors.ravel(), 0), shape=(n, n), dtype=float)
            Xeq = Xd.dot(Xeq)

        # Log transform log(1 + x)
        if self.log_base is not None:
            if sp.isspmatrix(Xeq):
                Xeq = Xeq.log1p() / np.log(self.log_base)
            else:
                Xeq = np.log(1 + Xeq) / np.log(self.log_base)

        # Binary transform;
        # potential change to sparsity structure;
        if self.bin_thresh is not None:
            if sp.isspmatrix(Xeq):
                Xeq.data = (Xeq.data > self.bin_thresh).astype(int)
                Xeq.eliminate_zeros()
            else:
                # Cast to int so the dense output matches the sparse branch,
                # which stores ints rather than booleans.
                Xeq = (Xeq > self.bin_thresh).astype(int)

        # Preserve sparsity
        X_new = Xeq.tocsr() if sp.isspmatrix(Xeq) else Xeq
        data_new = Table.from_numpy(domain=data.domain,
                                    X=X_new,
                                    Y=data.Y,
                                    W=data.W,
                                    metas=data.metas)
        return data_new
# Example no. 3 (score: 0)
 def fit(self, X, Y=None):
     """
     Infer row normalization parameters from the data.
     :param X: Continuous data matrix.
     :param Y: Grouping values
     :return:
     """
     # Equalize based on read depth per library / match mean read count per cell
     # Must not store indices
     if Y is None:
         # Single group: target is the median of all row sums.
         self.target_row_mean = nanmedian(nansum(X, axis=1))
         return
     # Median row sum per library (group of rows sharing a label in Y).
     lib_sizes = {
         lib: nanmedian(nansum(X[np.where(Y == lib)[0], :], axis=1))
         for lib in set(Y)
     }
     # Scale every library down to the smallest one.
     self.target_row_mean = min(lib_sizes.values())
     for lib, size in lib_sizes.items():
         self.size_factors[lib] = self.target_row_mean / size
    def normalize(self, table: AnyArray) -> AnyArray:
        """Scale every row to a common total (median row sum or 1e6)."""
        totals = ut.nansum(table, axis=1)
        totals[totals == 0] = 1  # avoid division by zero errors

        # Median method rescales to the median row sum; otherwise CPM-style.
        scale = (np.nanmedian(totals)
                 if self.method == NormalizeSamples.Median else 1e6)

        if sp.issparse(table):
            # Diagonal multiply keeps the matrix sparse.
            normalized = sp.diags(1 / totals) @ table
        else:
            normalized = table / totals[:, None]

        normalized *= scale

        return normalized
# Example no. 5 (score: 0)
 def test_nansum(self, array):
     """nansum on the sparse form must match np.nansum on the dense data."""
     for dense in self.data:
         sparse = array(dense)
         expected = np.nansum(dense)
         np.testing.assert_array_equal(nansum(sparse), expected)
# Example no. 6 (score: 0)
 def test_nansum(self, array):
     """Sparse nansum agrees with the NumPy reference on dense input."""
     for dense_matrix in self.data:
         converted = array(dense_matrix)
         actual = nansum(converted)
         np.testing.assert_array_equal(actual, np.nansum(dense_matrix))
# Example no. 7 (score: 0)
class Pivot:
    """
    Build group-by and pivot aggregation tables from an Orange ``Table``.

    Rows are grouped by ``row_var`` (and optionally ``col_var``); each
    selected aggregation function is applied per group and per value
    variable. Results are exposed through ``group_table`` and the
    ``pivot_*`` properties.
    """
    Functions = AggregationFunctionsEnum
    (Count, Count_defined, Sum, Mean, Min, Max, Mode, Median, Var,
     Majority) = Functions

    # Functions that need no value variable at all.
    AutonomousFunctions = (Count, )
    # Functions applicable to any variable type.
    AnyVarFunctions = (Count_defined, )
    ContVarFunctions = (Sum, Mean, Min, Max, Mode, Median, Var)
    DiscVarFunctions = (Majority, )
    # Subset of continuous functions that keep time semantics.
    TimeVarFunctions = (Mean, Min, Max, Mode, Median)
    # Functions whose result is a plain float even for time variables.
    FloatFunctions = (Count, Count_defined, Sum, Var)

    class Tables:
        # Container for the four result tables of one aggregation pass.
        table = None  # type: Table
        total_h = None  # type: Table
        total_v = None  # type: Table
        total = None  # type: Table

        def __call__(self):
            return self.table, self.total_h, self.total_v, self.total

    def __init__(self,
                 table: Table,
                 agg_funs: Iterable[Functions],
                 row_var: Variable,
                 col_var: Variable = None,
                 val_var: Variable = None):
        """
        :param table: input data table.
        :param agg_funs: aggregation functions to apply.
        :param row_var: primitive variable defining the pivot rows.
        :param col_var: discrete variable defining the columns;
            defaults to ``row_var`` (single-variable grouping).
        :param val_var: variable whose values are aggregated.
        """
        self._group_tables = self.Tables()
        self._pivot_tables = self.Tables()
        self._table = table
        self._row_var = row_var
        self._col_var = col_var if col_var else row_var
        self.renamed = []

        if not table:
            return
        if not self._row_var.is_primitive():
            raise TypeError("Row variable should be DiscreteVariable"
                            " or ContinuousVariable")
        if self._col_var and not self._col_var.is_discrete:
            raise TypeError("Column variable should be DiscreteVariable")

        # builtin float: np.float was removed in NumPy 1.20+/1.24.
        self._row_var_col = table.get_column_view(row_var)[0].astype(float)
        self._col_var_col = table.get_column_view(self._col_var)[0].astype(
            float)
        self._row_var_groups = nanunique(self._row_var_col)
        self._col_var_groups = nanunique(self._col_var_col)

        self._total_var = DiscreteVariable("Total", values=("total", ))
        self._current_agg_functions = sorted(agg_funs)
        # Column index bookkeeping for already-computed aggregations.
        self._indepen_agg_done = {}  # type: Dict[Functions, int]
        self._depen_agg_done = {}  # type: Dict[Functions, Dict[Variable, int]]

        self._initialize(agg_funs, val_var)

    @property
    def group_table(self) -> Table:
        """Group-by table restricted to the currently selected functions."""
        table = self._group_tables.table
        if not table or len(table) == 0:
            return None
        # Leading columns hold the grouping variable(s).
        indices = [0, 1] if not self.single_var_grouping else [0]
        for f in self._current_agg_functions:
            if f in self._indepen_agg_done:
                indices.append(self._indepen_agg_done[f])
        for v in self._table.domain.variables + self._table.domain.metas:
            for f in self._current_agg_functions:
                if f in self._depen_agg_done and v in self._depen_agg_done[f]:
                    indices.append(self._depen_agg_done[f][v])
        return table[:, indices]

    @property
    def pivot_table(self) -> Table:
        return self._pivot_tables.table

    @property
    def pivot_total_h(self) -> Table:
        return self._pivot_tables.total_h

    @property
    def pivot_total_v(self) -> Table:
        return self._pivot_tables.total_v

    @property
    def pivot_total(self) -> Table:
        return self._pivot_tables.total

    @property
    def pivot_tables(self) -> Table:
        return self._pivot_tables()

    @property
    def single_var_grouping(self) -> bool:
        # True when rows and columns are grouped by the same variable.
        return self._row_var is self._col_var

    def update_group_table(self,
                           agg_funs: Iterable[Functions],
                           val_var: Variable = None):
        """Recompute group tables for a new selection of functions."""
        # NOTE(review): a Tables() instance is always truthy, so this guard
        # looks like it never fires — confirm the intended check.
        if not self._group_tables:
            return
        self._current_agg_functions = sorted(agg_funs)
        # Keep previously computed aggregations so they are not redone.
        agg_funs = set(self._indepen_agg_done.keys()) | \
            set(self._depen_agg_done.keys()) | set(agg_funs)
        self._initialize(sorted(agg_funs), val_var)

    def _initialize(self, agg_funs, val_var):
        # Full pipeline: group tables, index bookkeeping, pivot tables.
        var_indep_funs, var_dep_funs = self.__group_aggregations(agg_funs)
        self._create_group_tables(var_indep_funs, var_dep_funs)
        self.__reference_aggregations(var_indep_funs, var_dep_funs)
        self._create_pivot_tables(val_var)

    def __group_aggregations(self, agg_funs):
        # Split functions into variable-independent and (var, fun) pairs.
        auto_funcs = self.AutonomousFunctions
        var_indep_funs = [fun for fun in agg_funs if fun in auto_funcs]
        var_dep_funs = []
        attrs = self._table.domain.variables + self._table.domain.metas
        prod = product(filter_visible(attrs),
                       [fun for fun in agg_funs if fun not in auto_funcs])
        for var, fun in prod:
            if self.__include_aggregation(fun, var):
                var_dep_funs.append((var, fun))
        return var_indep_funs, var_dep_funs

    def __include_aggregation(self, fun, var):
        # A function applies when its type class matches the variable type.
        return fun in self.ContVarFunctions and var.is_continuous or \
               fun in self.DiscVarFunctions and var.is_discrete or \
               fun in self.AnyVarFunctions

    def __reference_aggregations(self, var_indep_funs, var_dep_funs):
        # Record the group-table column index of each computed aggregation.
        self._indepen_agg_done = {}
        self._depen_agg_done = defaultdict(dict)
        i = 1 - int(bool(self.single_var_grouping))
        for i, fun in enumerate(var_indep_funs, i + 1):
            self._indepen_agg_done[fun] = i
        for j, (var, fun) in enumerate(var_dep_funs, i + 1):
            self._depen_agg_done[fun].update({var: j})

    def _create_group_tables(self, var_indep_funs, var_dep_funs):
        # Build one output column per aggregation.
        attrs = [
            ContinuousVariable(f"({str(fun).lower()})")
            for fun in var_indep_funs
        ]
        for var, fun in var_dep_funs:
            name = f"{var.name} ({str(fun).lower()})"
            if fun in self.DiscVarFunctions:
                attrs.append(DiscreteVariable(name, var.values))
            else:
                # Preserve time semantics for time-compatible functions.
                if isinstance(var, TimeVariable) and \
                        fun in self.TimeVarFunctions:
                    attrs.append(
                        TimeVariable(name,
                                     have_date=var.have_date,
                                     have_time=var.have_time))
                else:
                    attrs.append(ContinuousVariable(name))
        args = (var_indep_funs, var_dep_funs, attrs)
        for t, var in (("table", None), ("total_h", self._col_var),
                       ("total_v", self._row_var), ("total", self._total_var)):
            setattr(self._group_tables, t, self.__get_group_table(var, *args))

    def __get_group_table(self, var, var_indep_funs, var_dep_funs, attrs):
        # Select the group combinations and row-subset getter for the
        # requested table flavour (total / total_v / total_h / main).
        if var is self._total_var:
            group_tab = self._group_tables.total
            offset = int(bool(not self.single_var_grouping))
            leading_vars = [self._total_var]
            combs = np.array([[0]])
            sub_table_getter = lambda x: \
                self._table[np.where((~np.isnan(self._row_var_col)) &
                                     (~np.isnan(self._col_var_col)))[0]]
        elif var is self._row_var or self.single_var_grouping:
            group_tab = self._group_tables.total_v
            offset = int(bool(not self.single_var_grouping))
            leading_vars = [self._row_var]
            combs = self._row_var_groups[:, None]
            sub_table_getter = lambda x: \
                self._table[np.where((~np.isnan(self._col_var_col)) &
                                     (self._row_var_col == x[0]))[0]]
        elif var is self._col_var:
            group_tab = self._group_tables.total_h
            offset = int(bool(not self.single_var_grouping))
            leading_vars = [self._col_var]
            combs = self._col_var_groups[:, None]
            sub_table_getter = lambda x: \
                self._table[np.where((~np.isnan(self._row_var_col)) &
                                     (self._col_var_col == x[0]))[0]]
        else:
            group_tab = self._group_tables.table
            offset = 0
            leading_vars = [self._row_var, self._col_var]
            combs = np.array(
                list(product(self._row_var_groups, self._col_var_groups)))
            sub_table_getter = lambda x: \
                self._table[np.where((self._row_var_col == x[0])
                                     & (self._col_var_col == x[1]))[0]]

        if not combs.shape[0]:
            return None

        n = len(var_indep_funs) + len(var_dep_funs)
        X = np.zeros((len(combs), n), dtype=float)
        for i, comb in enumerate(combs):
            sub_table = sub_table_getter(comb)
            j = -1
            for j, fun in enumerate(var_indep_funs):
                if fun in self._indepen_agg_done:
                    # Reuse the already-computed column from a prior pass.
                    # TODO - optimize - after this line is executed,
                    # the whole column is already set
                    X[:, j] = group_tab.X[:,
                                          self._indepen_agg_done[fun] - offset]
                else:
                    X[i, j] = fun(sub_table)
            for k, (v, fun) in enumerate(var_dep_funs, j + 1):
                if fun in self._depen_agg_done:
                    X[:,
                      k] = group_tab.X[:,
                                       self._depen_agg_done[fun][v] - offset]
                else:
                    X[i, k] = fun(sub_table.get_column_view(v)[0])

        # rename leading vars (seems the easiest) if needed
        current = [var.name for var in attrs]
        uniq_leading_vars = []
        for v in leading_vars:
            uniq = get_unique_names(current, v.name)
            if uniq != v.name:
                self.renamed.append(v.name)
                v = v.copy(name=uniq)
            uniq_leading_vars.append(v)
            current.append(uniq)

        return Table(Domain(uniq_leading_vars + attrs), np.hstack((combs, X)))

    def update_pivot_table(self, val_var: Variable):
        """Recompute only the pivot tables for a new value variable."""
        self._create_pivot_tables(val_var)

    def _create_pivot_tables(self, val_var):
        if not self._group_tables.table:
            self._pivot_tables = self.Tables()
            return

        # Keep only functions applicable without/with the value variable.
        agg_funs = [
            fun for fun in self._current_agg_functions
            if fun in self.AutonomousFunctions
            or val_var and self.__include_aggregation(fun, val_var)
        ]
        X, X_h, X_v, X_t = self.__get_pivot_tab_x(val_var, agg_funs)
        dom, dom_h, dom_v, dom_t = self.__get_pivot_tab_domain(
            val_var, X, X_h, X_v, X_t, agg_funs)
        for t, d, x in (("table", dom, X), ("total_h", dom_h, X_h),
                        ("total_v", dom_v, X_v), ("total", dom_t, X_t)):
            setattr(self._pivot_tables, t, Table(d, x))

    # pylint: disable=invalid-name
    def __get_pivot_tab_domain(self, val_var, X, X_h, X_v, X_t, agg_funs):
        def map_values(index, _X):
            # Replace string values in column `index` with their codes,
            # in place, and return the list of distinct values.
            values = np.unique(_X[:, index])
            values = np.delete(values, np.where(values == "nan")[0])
            for j, value in enumerate(values):
                _X[:, index][_X[:, index] == value] = j
            return values

        create_time_var = \
            isinstance(val_var, TimeVariable) and \
            all(fun in self.TimeVarFunctions for fun in agg_funs)
        create_cont_var = \
            not val_var or val_var.is_continuous and \
            (not isinstance(val_var, TimeVariable) or
             all(fun in self.FloatFunctions for fun in agg_funs))

        vals = np.array(self._col_var.values)[self._col_var_groups.astype(int)]
        if create_time_var:
            kwargs = {
                "have_date": val_var.have_date,
                "have_time": val_var.have_time
            }
            attrs = [[TimeVariable(f"{v}", **kwargs) for v in vals]] * 2
            attrs.extend([[TimeVariable("Total", **kwargs)]] * 2)
        elif create_cont_var:
            attrs = [[ContinuousVariable(f"{v}", 1) for v in vals]] * 2
            attrs.extend([[ContinuousVariable("Total", 1)]] * 2)
        else:
            attrs = []
            for x in (X, X_h):
                attrs.append([
                    DiscreteVariable(f"{v}", map_values(i, x))
                    for i, v in enumerate(vals, 2)
                ])
            for x in (X_v, X_t):
                attrs.append([DiscreteVariable("Total", map_values(0, x))])
        row_var_h = DiscreteVariable(self._row_var.name, values=["Total"])
        aggr_attr = DiscreteVariable('Aggregate', [str(f) for f in agg_funs])

        same_row_col = self._col_var is self._row_var

        # Deduplicate names across the leading vars and column attrs.
        extra_vars = [self._row_var, aggr_attr]
        uniq_a = get_unique_names_duplicates([v.name for v in extra_vars] +
                                             [atr.name for atr in attrs[0]])
        for (idx, var), u in zip(enumerate(chain(extra_vars, attrs[0])),
                                 uniq_a):
            if var.name == u:
                continue
            if idx == 0:
                self.renamed.append(self._row_var.name)
                self._row_var = self._row_var.copy(name=u)
                if same_row_col:
                    self._col_var = self._row_var
                row_var_h = row_var_h.copy(name=u)
            elif idx == 1:
                self.renamed.append(aggr_attr.name)
                aggr_attr = aggr_attr.copy(name=u)
            else:
                self.renamed.append(var.name)
                attrs[0][idx - 2] = var.copy(name=u)
                attrs[1][idx - 2] = var.copy(name=u)

        if same_row_col:
            vals = tuple(v.name for v in attrs[0])
            self._row_var.make(self._row_var.name, values=vals)
            vals = tuple(v.name for v in attrs[2])
            row_var_h.make(row_var_h.name, vals)

        return (Domain([self._row_var, aggr_attr] + attrs[0]),
                Domain([row_var_h, aggr_attr] + attrs[1]), Domain(attrs[2]),
                Domain(attrs[3]))

    def __get_pivot_tab_x(self, val_var, agg_funs):
        # Assemble the numeric payloads for the four pivot tables.
        gt = self._group_tables
        n_fun = len(agg_funs)
        n_rows, n_cols = len(self._row_var_groups), len(self._col_var_groups)
        is_float_type = not val_var or val_var.is_continuous
        if isinstance(val_var, TimeVariable):
            is_float_type = \
                all(fun in self.TimeVarFunctions for fun in agg_funs) or \
                all(fun in self.FloatFunctions for fun in agg_funs)
        kwargs = {"fill_value": np.nan, "dtype": float} if is_float_type \
            else {"fill_value": "", "dtype": object}
        X = np.full((n_rows * n_fun, 2 + n_cols), **kwargs)
        X_h = np.full((n_fun, 2 + n_cols), **kwargs)
        X_v = np.full((n_rows * n_fun, 1), **kwargs)
        X_t = np.full((n_fun, 1), **kwargs)
        for i, fun in enumerate(agg_funs):
            # Hoisted: the list lookup is loop-invariant per function.
            fun_idx = agg_funs.index(fun)
            args = (val_var, fun, is_float_type)
            X[i::n_fun, 2:] = self.__rows_for_function(n_rows, n_cols, *args)
            X[i::n_fun, :2] = np.array([[row_val, fun_idx]
                                        for row_val in self._row_var_groups])
            X_h[i, :2] = 0, fun_idx
            X_h[i, 2:] = self.__total_for_function(gt.total_h, *args)
            X_v[i::n_fun, 0] = self.__total_for_function(gt.total_v, *args)
            X_t[i] = self.__total_for_function(gt.total, *args)
        return X, X_h, X_v, X_t

    def __total_for_function(self, group_tab, val_var, fun, is_float_type):
        ref = self._indepen_agg_done.get(fun, None) \
              or self._depen_agg_done[fun][val_var]
        ref -= int(bool(not self.single_var_grouping))
        return self.__check_continuous(val_var, group_tab.X[:, ref], fun,
                                       is_float_type)

    def __rows_for_function(self, n_rows, n_cols, val_var, fun, is_float_type):
        ref = self._indepen_agg_done.get(fun, None) \
              or self._depen_agg_done[fun][val_var]
        column = self._group_tables.table.X[:, ref]
        if self.single_var_grouping:
            # Only the diagonal is defined when rows == columns variable.
            rows = np.full((n_rows, n_cols), fun(np.array([]), ), dtype=float)
            rows[np.diag_indices_from(rows)] = column
        else:
            rows = column.reshape(n_rows, n_cols)
        return self.__check_continuous(val_var, rows, fun, is_float_type)

    def __check_continuous(self, val_var, column, fun, is_float_type):
        # Convert numeric codes back to readable values where needed.
        if val_var and not val_var.is_continuous:
            column = column.astype(str)
            if fun in self.DiscVarFunctions:
                for j, val in enumerate(val_var.values):
                    column[column == str(float(j))] = val
        elif isinstance(val_var, TimeVariable) and not is_float_type:
            shape = column.shape
            column = column.flatten()
            column_ = column.astype(str)
            if fun in self.TimeVarFunctions:
                for i in range(column.shape[0]):
                    if not np.isnan(column[i]):
                        column_[i] = val_var.repr_val(column[i])
            return column_.reshape(shape)
        return column

    @staticmethod
    def count_defined(x):
        """Count non-missing entries per column of ``x``."""
        if x.shape[0] == 0:
            return 0
        if x.size and np.issubdtype(x.dtype, np.number) and not sp.issparse(x):
            nans = np.isnan(x).sum(axis=0)
        elif sp.issparse(x) and x.size:
            nans = np.bincount(x.nonzero()[1], minlength=x.shape[1])
            x = x.tocsc()
        else:
            nans = ((x_str == "nan") | (x_str == "")).sum(axis=0) \
                if x.size else np.zeros(x.shape[1]) \
                if (x_str := x.astype(str)) is not None else None
        return x.shape[0] - nans

    @staticmethod
    def stat(x, f):
        """Apply NaN-aware statistic ``f`` columnwise; NaN on empty input."""
        # builtin float: np.float was removed in NumPy 1.20+/1.24.
        return f(x.astype(float), axis=0) if x.shape[0] > 0 else np.nan

    @staticmethod
    def mode(x):
        return Pivot.stat(x, nanmode).mode if x.shape[0] > 0 else np.nan

    @staticmethod
    def majority(x):
        if x.shape[0] == 0:
            return np.nan
        counts = bincount(x)[0]
        return np.argmax(counts) if counts.shape[0] else np.nan

    # Bind concrete implementations to the enum members; each receives the
    # (column-view,) tuple used throughout this module.
    Count.func = lambda x: len(x[0])
    Count_defined.func = lambda x: Pivot.count_defined(x[0])
    Sum.func = lambda x: nansum(x[0], axis=0) if x[0].shape[0] > 0 else 0
    Mean.func = lambda x: Pivot.stat(x[0], nanmean)
    Min.func = lambda x: Pivot.stat(x[0], nanmin)
    Max.func = lambda x: Pivot.stat(x[0], nanmax)
    Median.func = lambda x: Pivot.stat(x[0], nanmedian)
    Mode.func = lambda x: Pivot.mode(x[0])
    Var.func = lambda x: Pivot.stat(x[0], nanvar)
    Majority.func = lambda x: Pivot.majority(x[0])