Exemplo n.º 1
0
    def __compute_statistics(self):
        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array(
            [var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=coefficient_of_variation,
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
            default_val=len(matrices[0]) if matrices else 0)
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )

        # Since scipy apparently can't do mode on sparse matrices, cast it to
        # dense. This can be very inefficient for large matrices, and should
        # be changed
        def __mode(x, *args, **kwargs):
            if sp.issparse(x):
                x = x.todense(order="C")
            # return ss.mode(x, *args, **kwargs)[0]
            # Temporary replacement for scipy
            return ut.nanmode(x, *args, **kwargs)[0]

        self._center = self.__compute_stat(
            matrices,
            discrete_f=None,
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )

        self._median = self.__compute_stat(
            matrices,
            discrete_f=lambda x: __mode(x, axis=0),
            continuous_f=lambda x: ut.nanmedian(x, axis=0),
            time_f=lambda x: ut.nanmedian(x, axis=0),
        )
Exemplo n.º 2
0
    def __compute_statistics(self):
        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array([var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )

        # Since scipy apparently can't do mode on sparse matrices, cast it to
        # dense. This can be very inefficient for large matrices, and should
        # be changed
        def __mode(x, *args, **kwargs):
            if sp.issparse(x):
                x = x.todense(order="C")
            # return ss.mode(x, *args, **kwargs)[0]
            return ut.nanmode(x, *args, **kwargs)[0]  # Temporary replacement for scipy

        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: __mode(x, axis=0),
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Exemplo n.º 3
0
    def test_nanmin_nanmax(self):
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                np.testing.assert_array_equal(nanmin(X, axis=axis),
                                              np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(nanmin(X_sparse, axis=axis),
                                              np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(nanmax(X, axis=axis),
                                              np.nanmax(X, axis=axis))

                np.testing.assert_array_equal(nanmax(X_sparse, axis=axis),
                                              np.nanmax(X, axis=axis))
Exemplo n.º 4
0
    def _get_colors(self):
        """Compute colors for different kinds of histograms."""
        if self.target_var and self.target_var.is_discrete:
            colors = [[QColor(*color)
                       for color in self.target_var.colors]] * self.n_bins

        elif self.target_var and self.target_var.is_continuous:
            palette = ContinuousPaletteGenerator(*self.target_var.colors)

            bins = np.arange(self.n_bins)[:, np.newaxis]
            edges = self.edges if self.attribute.is_discrete else self.edges[
                1:-1]
            # Need to digitize on `right` here so the samples will be assigned
            # to the correct bin for coloring
            bin_indices = ut.digitize(self.x, bins=edges, right=True)
            mask = bin_indices == bins

            colors = []
            for bin_idx in range(self.n_bins):
                biny = self.y[mask[bin_idx]]
                if np.isfinite(biny).any():
                    mean = ut.nanmean(biny) / ut.nanmax(self.y)
                else:
                    mean = 0  # bin is empty, color does not matter
                colors.append([palette[mean]])

        else:
            colors = [[QColor('#ccc')]] * self.n_bins

        return colors
Exemplo n.º 5
0
 def update_sel_range(self, y_data):
     if y_data is None:
         curve1 = curve2 = pg.PlotDataItem(x=self.x_data, y=self.__mean)
     else:
         curve1 = pg.PlotDataItem(x=self.x_data, y=nanmin(y_data, axis=0))
         curve2 = pg.PlotDataItem(x=self.x_data, y=nanmax(y_data, axis=0))
     self.sel_range.setCurves(curve1, curve2)
Exemplo n.º 6
0
    def _get_colors(self):
        """Compute colors for different kinds of histograms."""
        if self.target_var and self.target_var.is_discrete:
            colors = [[QColor(*color) for color in self.target_var.colors]] * self.n_bins

        elif self.target_var and self.target_var.is_continuous:
            palette = ContinuousPaletteGenerator(*self.target_var.colors)

            bins = np.arange(self.n_bins)[:, np.newaxis]
            edges = self.edges if self.attribute.is_discrete else self.edges[1:-1]
            # Need to digitize on `right` here so the samples will be assigned
            # to the correct bin for coloring
            bin_indices = ut.digitize(self.x, bins=edges, right=True)
            mask = bin_indices == bins

            colors = []
            for bin_idx in range(self.n_bins):
                biny = self.y[mask[bin_idx]]
                if np.isfinite(biny).any():
                    mean = ut.nanmean(biny) / ut.nanmax(self.y)
                else:
                    mean = 0  # bin is empty, color does not matter
                colors.append([palette[mean]])

        else:
            colors = [[QColor('#ccc')]] * self.n_bins

        return colors
Exemplo n.º 7
0
    def test_nanmin_nanmax(self):
        warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                np.testing.assert_array_equal(nanmin(X, axis=axis),
                                              np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(nanmin(X_sparse, axis=axis),
                                              np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(nanmax(X, axis=axis),
                                              np.nanmax(X, axis=axis))

                np.testing.assert_array_equal(nanmax(X_sparse, axis=axis),
                                              np.nanmax(X, axis=axis))
Exemplo n.º 8
0
 def _get_range_curve(self):
     color = QColor(self.color)
     color.setAlpha(LinePlotStyle.RANGE_ALPHA)
     bottom, top = nanmin(self.y_data, axis=0), nanmax(self.y_data, axis=0)
     return pg.FillBetweenItem(pg.PlotDataItem(x=self.x_data, y=bottom),
                               pg.PlotDataItem(x=self.x_data, y=top),
                               brush=color)
Exemplo n.º 9
0
 def update_sel_range(self, y_data):
     if y_data is None:
         curve1 = curve2 = pg.PlotDataItem(x=self.x_data, y=self.__mean)
     else:
         curve1 = pg.PlotDataItem(x=self.x_data, y=nanmin(y_data, axis=0))
         curve2 = pg.PlotDataItem(x=self.x_data, y=nanmax(y_data, axis=0))
     self.sel_range.setCurves(curve1, curve2)
Exemplo n.º 10
0
    def _get_histogram_edges(self):
        """Get the edges in the histogram based on the attribute type.

        In case of a continuous variable, we split the variable range into
        n bins. In case of a discrete variable, bins don't make sense, so we
        just return the attribute values.

        This will return the staring and ending edge, not just the edges in
        between (in the case of a continuous variable).

        Returns
        -------
        np.ndarray

        """
        if self.attribute.is_discrete:
            return np.array(
                [self.attribute.to_val(v) for v in self.attribute.values])
        else:
            edges = np.linspace(ut.nanmin(self.x), ut.nanmax(self.x),
                                self.n_bins)
            edge_diff = edges[1] - edges[0]
            edges = np.hstack((edges, [edges[-1] + edge_diff]))

            # If the variable takes on a single value, we still need to spit
            # out some reasonable bin edges
            if np.all(edges == edges[0]):
                edges = np.array([edges[0] - 1, edges[0], edges[0] + 1])

            return edges
Exemplo n.º 11
0
    def _get_colors(self):
        """Compute colors for different kinds of histograms."""
        target = self.target_var
        if target and target.is_discrete:
            colors = [list(target.palette)[:len(target.values)]] * self.n_bins

        elif self.target_var and self.target_var.is_continuous:
            palette = self.target_var.palette

            bins = np.arange(self.n_bins)[:, np.newaxis]
            edges = self.edges if self.attribute.is_discrete else self.edges[
                1:-1]
            bin_indices = ut.digitize(self.x, bins=edges)
            mask = bin_indices == bins

            colors = []
            for bin_idx in range(self.n_bins):
                biny = self.y[mask[bin_idx]]
                if np.isfinite(biny).any():
                    mean = ut.nanmean(biny) / ut.nanmax(self.y)
                else:
                    mean = 0  # bin is empty, color does not matter
                colors.append([palette.value_to_qcolor(mean)])

        else:
            colors = [[QColor('#ccc')]] * self.n_bins

        return colors
Exemplo n.º 12
0
    def _get_histogram_edges(self):
        """Get the edges in the histogram based on the attribute type.

        In case of a continuous variable, we split the variable range into
        n bins. In case of a discrete variable, bins don't make sense, so we
        just return the attribute values.

        This will return the staring and ending edge, not just the edges in
        between (in the case of a continuous variable).

        Returns
        -------
        np.ndarray

        """
        if self.attribute.is_discrete:
            return np.array([self.attribute.to_val(v) for v in self.attribute.values])
        else:
            edges = np.linspace(ut.nanmin(self.x), ut.nanmax(self.x), self.n_bins)
            edge_diff = edges[1] - edges[0]
            edges = np.hstack((edges, [edges[-1] + edge_diff]))

            # If the variable takes on a single value, we still need to spit
            # out some reasonable bin edges
            if np.all(edges == edges[0]):
                edges = np.array([edges[0] - 1, edges[0], edges[0] + 1])

            return edges
Exemplo n.º 13
0
 def _get_range_curve(self):
     color = QColor(self.color)
     color.setAlpha(LinePlotStyle.RANGE_ALPHA)
     bottom, top = nanmin(self.y_data, axis=0), nanmax(self.y_data, axis=0)
     return pg.FillBetweenItem(
         pg.PlotDataItem(x=self.x_data, y=bottom),
         pg.PlotDataItem(x=self.x_data, y=top), brush=color
     )
Exemplo n.º 14
0
 def _get_range_curve(self):
     color = QColor(self.color)
     color.setAlpha(self.graph.range_settings[Updater.ALPHA_LABEL])
     bottom, top = nanmin(self.y_data, axis=0), nanmax(self.y_data, axis=0)
     return pg.FillBetweenItem(
         pg.PlotDataItem(x=self.x_data, y=bottom),
         pg.PlotDataItem(x=self.x_data, y=top), brush=color
     )
Exemplo n.º 15
0
    def test_nanmin_nanmax(self):
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                np.testing.assert_array_equal(
                    nanmin(X, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmin(X_sparse, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X, axis=axis),
                    np.nanmax(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X_sparse, axis=axis),
                    np.nanmax(X, axis=axis))
Exemplo n.º 16
0
    def test_nanmin_nanmax(self):
        warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                np.testing.assert_array_equal(
                    nanmin(X, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmin(X_sparse, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X, axis=axis),
                    np.nanmax(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X_sparse, axis=axis),
                    np.nanmax(X, axis=axis))
Exemplo n.º 17
0
    def __compute_statistics(self):
        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array(
            [var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(
                x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Exemplo n.º 18
0
    def __compute_statistics(self):
        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array([var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Exemplo n.º 19
0
 def __call__(self, data: Table, attribute):
     values, _ = data.get_column_view(attribute)
     points = []
     if values.size:
         mn, mx = ut.nanmin(values), ut.nanmax(values)
         if not np.isnan(mn):
             minf = int(1 + np.floor(mn / self.width))
             maxf = int(1 + np.floor(mx / self.width))
             if maxf - minf - 1 >= 100:
                 raise TooManyIntervals
             points = [i * self.width for i in range(minf, maxf)]
     return Discretizer.create_discretized_var(data.domain[attribute],
                                               points,
                                               ndigits=self.digits)
Exemplo n.º 20
0
 def __call__(self, data, attribute, fixed=None):
     if fixed:
         min, max = fixed[attribute.name]
         points = self._split_eq_width(min, max)
     else:
         if type(data) == SqlTable:
             stats = BasicStats(data, attribute)
             points = self._split_eq_width(stats.min, stats.max)
         else:
             values = data[:, attribute]
             values = values.X if values.X.size else values.Y
             min, max = ut.nanmin(values), ut.nanmax(values)
             points = self._split_eq_width(min, max)
     return Discretizer.create_discretized_var(
         data.domain[attribute], points)
Exemplo n.º 21
0
 def __call__(self, data: Table, attribute, fixed=None):
     if fixed:
         mn, mx = fixed[attribute.name]
         points = self._split_eq_width(mn, mx)
     else:
         if type(data) == SqlTable:
             stats = BasicStats(data, attribute)
             points = self._split_eq_width(stats.min, stats.max)
         else:
             values, _ = data.get_column_view(attribute)
             if values.size:
                 mn, mx = ut.nanmin(values), ut.nanmax(values)
                 points = self._split_eq_width(mn, mx)
             else:
                 points = []
     return Discretizer.create_discretized_var(data.domain[attribute],
                                               points)
Exemplo n.º 22
0
    def calculate_log_reg_coefficients(self):
        self.log_reg_coeffs = []
        self.log_reg_cont_data_extremes = []
        self.b0 = None
        if self.classifier is None or self.domain is None:
            return
        if not isinstance(self.classifier, LogisticRegressionClassifier):
            return

        self.domain = self.reconstruct_domain(self.classifier.original_domain,
                                              self.domain)
        self.data = Table.from_table(self.domain,
                                     self.classifier.original_data)
        attrs, ranges, start = self.domain.attributes, [], 0
        for attr in attrs:
            stop = start + len(attr.values) if attr.is_discrete else start + 1
            ranges.append(slice(start, stop))
            start = stop

        self.b0 = self.classifier.intercept
        coeffs = self.classifier.coefficients
        if len(self.domain.class_var.values) == 2:
            self.b0 = np.hstack((self.b0 * (-1), self.b0))
            coeffs = np.vstack((coeffs * (-1), coeffs))
        self.log_reg_coeffs = [coeffs[:, ranges[i]] for i in range(len(attrs))]
        self.log_reg_coeffs_orig = self.log_reg_coeffs.copy()

        min_values = nanmin(self.data.X, axis=0)
        max_values = nanmax(self.data.X, axis=0)

        for i, min_t, max_t in zip(range(len(self.log_reg_coeffs)), min_values,
                                   max_values):
            if self.log_reg_coeffs[i].shape[1] == 1:
                coef = self.log_reg_coeffs[i]
                self.log_reg_coeffs[i] = np.hstack(
                    (coef * min_t, coef * max_t))
                self.log_reg_cont_data_extremes.append(
                    [sorted([min_t, max_t], reverse=(c < 0)) for c in coef])
            else:
                self.log_reg_cont_data_extremes.append([None])
Exemplo n.º 23
0
    def calculate_log_reg_coefficients(self):
        self.log_reg_coeffs = []
        self.log_reg_cont_data_extremes = []
        self.b0 = None
        if self.classifier is None or self.domain is None:
            return
        if not isinstance(self.classifier, LogisticRegressionClassifier):
            return

        self.domain = self.reconstruct_domain(self.classifier.original_domain,
                                              self.domain)
        self.data = self.classifier.original_data.transform(self.domain)
        attrs, ranges, start = self.domain.attributes, [], 0
        for attr in attrs:
            stop = start + len(attr.values) if attr.is_discrete else start + 1
            ranges.append(slice(start, stop))
            start = stop

        self.b0 = self.classifier.intercept
        coeffs = self.classifier.coefficients
        if len(self.domain.class_var.values) == 2:
            self.b0 = np.hstack((self.b0 * (-1), self.b0))
            coeffs = np.vstack((coeffs * (-1), coeffs))
        self.log_reg_coeffs = [coeffs[:, ranges[i]] for i in range(len(attrs))]
        self.log_reg_coeffs_orig = self.log_reg_coeffs.copy()

        min_values = nanmin(self.data.X, axis=0)
        max_values = nanmax(self.data.X, axis=0)

        for i, min_t, max_t in zip(range(len(self.log_reg_coeffs)),
                                   min_values, max_values):
            if self.log_reg_coeffs[i].shape[1] == 1:
                coef = self.log_reg_coeffs[i]
                self.log_reg_coeffs[i] = np.hstack((coef * min_t, coef * max_t))
                self.log_reg_cont_data_extremes.append(
                    [sorted([min_t, max_t], reverse=(c < 0)) for c in coef])
            else:
                self.log_reg_cont_data_extremes.append([None])
Exemplo n.º 24
0
 def __call__(self, data: Table, attribute):
     fmt = [
         "%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%y %b %d %H:%M",
         "%H:%M:%S"
     ][self.unit]
     values, _ = data.get_column_view(attribute)
     times = []
     if values.size:
         mn, mx = ut.nanmin(values), ut.nanmax(values)
         if not np.isnan(mn):
             mn = utc_from_timestamp(mn).timetuple()
             mx = utc_from_timestamp(mx).timetuple()
             times = _time_range(mn, mx, self.unit, self.width, 0, 100)
             if times is None:
                 raise TooManyIntervals
     times = [time.struct_time(t + (0, 0, 0)) for t in times][1:-1]
     points = np.array([calendar.timegm(t) for t in times])
     values = [time.strftime(fmt, t) for t in times]
     values = _simplified_time_intervals(values)
     var = data.domain[attribute]
     return DiscreteVariable(name=var.name,
                             values=values,
                             compute_value=Discretizer(var, points),
                             sparse=var.sparse)
    def __compute_statistics(self):
        # We will compute statistics over all data at once
        matrices = [self._data.X, self._data._Y, self._data.metas]

        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = zip([
            self._domain.attributes, self._domain.class_vars, self._domain.metas
        ], matrices)
        # Filter out any matrices with size 0, filter the zipped matrices to 
        # eliminate variables in a single swoop
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        def _apply_to_types(attrs_x_pair, discrete_f=None, continuous_f=None,
                            time_f=None, string_f=None, default_val=np.nan):
            """Apply functions to variable types e.g. discrete_f to discrete 
            variables. Default value is returned if there is no function 
            defined for specific variable types."""
            attrs, x = attrs_x_pair
            result = np.full(len(attrs), default_val)
            disc_var_idx, cont_var_idx, time_var_idx, str_var_idx = self._attr_indices(attrs)
            if discrete_f and x[:, disc_var_idx].size:
                result[disc_var_idx] = discrete_f(x[:, disc_var_idx].astype(np.float64))
            if continuous_f and x[:, cont_var_idx].size:
                result[cont_var_idx] = continuous_f(x[:, cont_var_idx].astype(np.float64))
            if time_f and x[:, time_var_idx].size:
                result[time_var_idx] = time_f(x[:, time_var_idx].astype(np.float64))
            if string_f and x[:, str_var_idx].size:
                result[str_var_idx] = string_f(x[:, str_var_idx].astype(np.object))
            return result

        self._variable_types = [type(var) for var in self._attributes]
        self._variable_names = [var.name.lower() for var in self._attributes]

        # Compute the center
        _center = partial(
            _apply_to_types,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
        )
        self._center = np.hstack(map(_center, matrices))

        # Compute the dispersion
        def _entropy(x):
            p = [ut.bincount(row)[0] for row in x.T]
            p = [pk / np.sum(pk) for pk in p]
            return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)
        _dispersion = partial(
            _apply_to_types,
            discrete_f=lambda x: _entropy(x),
            continuous_f=lambda x: ut.nanvar(x, axis=0),
        )
        self._dispersion = np.hstack(map(_dispersion, matrices))

        # Compute minimum values
        _max = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._max = np.hstack(map(_max, matrices))

        # Compute maximum values
        _min = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._min = np.hstack(map(_min, matrices))

        # Compute # of missing values
        _missing = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._missing = np.hstack(map(_missing, matrices))