Пример #1
0
    def __compute_statistics(self):
        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array([var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )

        # Since scipy apparently can't do mode on sparse matrices, cast it to
        # dense. This can be very inefficient for large matrices, and should
        # be changed
        def __mode(x, *args, **kwargs):
            if sp.issparse(x):
                x = x.todense(order="C")
            # return ss.mode(x, *args, **kwargs)[0]
            return ut.nanmode(x, *args, **kwargs)[0]  # Temporary replacement for scipy

        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: __mode(x, axis=0),
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Пример #2
0
    def test_nanmin_nanmax(self):
        warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                np.testing.assert_array_equal(nanmin(X, axis=axis),
                                              np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(nanmin(X_sparse, axis=axis),
                                              np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(nanmax(X, axis=axis),
                                              np.nanmax(X, axis=axis))

                np.testing.assert_array_equal(nanmax(X_sparse, axis=axis),
                                              np.nanmax(X, axis=axis))
Пример #3
0
 def update_sel_range(self, y_data):
     if y_data is None:
         curve1 = curve2 = pg.PlotDataItem(x=self.x_data, y=self.__mean)
     else:
         curve1 = pg.PlotDataItem(x=self.x_data, y=nanmin(y_data, axis=0))
         curve2 = pg.PlotDataItem(x=self.x_data, y=nanmax(y_data, axis=0))
     self.sel_range.setCurves(curve1, curve2)
Пример #4
0
    def _get_histogram_edges(self):
        """Get the edges in the histogram based on the attribute type.

        In case of a continuous variable, we split the variable range into
        n bins. In case of a discrete variable, bins don't make sense, so we
        just return the attribute values.

        This will return the staring and ending edge, not just the edges in
        between (in the case of a continuous variable).

        Returns
        -------
        np.ndarray

        """
        if self.attribute.is_discrete:
            return np.array([self.attribute.to_val(v) for v in self.attribute.values])
        else:
            edges = np.linspace(ut.nanmin(self.x), ut.nanmax(self.x), self.n_bins)
            edge_diff = edges[1] - edges[0]
            edges = np.hstack((edges, [edges[-1] + edge_diff]))

            # If the variable takes on a single value, we still need to spit
            # out some reasonable bin edges
            if np.all(edges == edges[0]):
                edges = np.array([edges[0] - 1, edges[0], edges[0] + 1])

            return edges
Пример #5
0
 def update_sel_range(self, y_data):
     if y_data is None:
         curve1 = curve2 = pg.PlotDataItem(x=self.x_data, y=self.__mean)
     else:
         curve1 = pg.PlotDataItem(x=self.x_data, y=nanmin(y_data, axis=0))
         curve2 = pg.PlotDataItem(x=self.x_data, y=nanmax(y_data, axis=0))
     self.sel_range.setCurves(curve1, curve2)
Пример #6
0
    def _get_histogram_edges(self):
        """Get the edges in the histogram based on the attribute type.

        In case of a continuous variable, we split the variable range into
        n bins. In case of a discrete variable, bins don't make sense, so we
        just return the attribute values.

        This will return the staring and ending edge, not just the edges in
        between (in the case of a continuous variable).

        Returns
        -------
        np.ndarray

        """
        if self.attribute.is_discrete:
            return np.array(
                [self.attribute.to_val(v) for v in self.attribute.values])
        else:
            edges = np.linspace(ut.nanmin(self.x), ut.nanmax(self.x),
                                self.n_bins)
            edge_diff = edges[1] - edges[0]
            edges = np.hstack((edges, [edges[-1] + edge_diff]))

            # If the variable takes on a single value, we still need to spit
            # out some reasonable bin edges
            if np.all(edges == edges[0]):
                edges = np.array([edges[0] - 1, edges[0], edges[0] + 1])

            return edges
Пример #7
0
 def _get_range_curve(self):
     color = QColor(self.color)
     color.setAlpha(LinePlotStyle.RANGE_ALPHA)
     bottom, top = nanmin(self.y_data, axis=0), nanmax(self.y_data, axis=0)
     return pg.FillBetweenItem(
         pg.PlotDataItem(x=self.x_data, y=bottom),
         pg.PlotDataItem(x=self.x_data, y=top), brush=color
     )
Пример #8
0
 def _get_range_curve(self):
     color = QColor(self.color)
     color.setAlpha(LinePlotStyle.RANGE_ALPHA)
     bottom, top = nanmin(self.y_data, axis=0), nanmax(self.y_data, axis=0)
     return pg.FillBetweenItem(
         pg.PlotDataItem(x=self.x_data, y=bottom),
         pg.PlotDataItem(x=self.x_data, y=top), brush=color
     )
Пример #9
0
 def _get_range_curve(self):
     color = QColor(self.color)
     color.setAlpha(self.graph.range_settings[Updater.ALPHA_LABEL])
     bottom, top = nanmin(self.y_data, axis=0), nanmax(self.y_data, axis=0)
     return pg.FillBetweenItem(
         pg.PlotDataItem(x=self.x_data, y=bottom),
         pg.PlotDataItem(x=self.x_data, y=top), brush=color
     )
Пример #10
0
    def test_nanmin_nanmax(self):
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                np.testing.assert_array_equal(
                    nanmin(X, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmin(X_sparse, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X, axis=axis),
                    np.nanmax(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X_sparse, axis=axis),
                    np.nanmax(X, axis=axis))
Пример #11
0
    def test_nanmin_nanmax(self):
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                np.testing.assert_array_equal(
                    nanmin(X, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmin(X_sparse, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X, axis=axis),
                    np.nanmax(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X_sparse, axis=axis),
                    np.nanmax(X, axis=axis))
Пример #12
0
    def test_nanmin_nanmax(self):
        warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
        for X in self.data:
            X_sparse = csr_matrix(X)
            for axis in [None, 0, 1]:
                np.testing.assert_array_equal(
                    nanmin(X, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmin(X_sparse, axis=axis),
                    np.nanmin(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X, axis=axis),
                    np.nanmax(X, axis=axis))

                np.testing.assert_array_equal(
                    nanmax(X_sparse, axis=axis),
                    np.nanmax(X, axis=axis))
Пример #13
0
    def __compute_statistics(self):
        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array(
            [var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(
                x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Пример #14
0
    def __compute_statistics(self):
        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array([var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Пример #15
0
 def __call__(self, data: Table, attribute):
     values, _ = data.get_column_view(attribute)
     points = []
     if values.size:
         mn, mx = ut.nanmin(values), ut.nanmax(values)
         if not np.isnan(mn):
             minf = int(1 + np.floor(mn / self.width))
             maxf = int(1 + np.floor(mx / self.width))
             if maxf - minf - 1 >= 100:
                 raise TooManyIntervals
             points = [i * self.width for i in range(minf, maxf)]
     return Discretizer.create_discretized_var(data.domain[attribute],
                                               points,
                                               ndigits=self.digits)
Пример #16
0
 def __call__(self, data, attribute, fixed=None):
     if fixed:
         min, max = fixed[attribute.name]
         points = self._split_eq_width(min, max)
     else:
         if type(data) == SqlTable:
             stats = BasicStats(data, attribute)
             points = self._split_eq_width(stats.min, stats.max)
         else:
             values = data[:, attribute]
             values = values.X if values.X.size else values.Y
             min, max = ut.nanmin(values), ut.nanmax(values)
             points = self._split_eq_width(min, max)
     return Discretizer.create_discretized_var(
         data.domain[attribute], points)
Пример #17
0
 def __call__(self, data: Table, attribute, fixed=None):
     if fixed:
         mn, mx = fixed[attribute.name]
         points = self._split_eq_width(mn, mx)
     else:
         if type(data) == SqlTable:
             stats = BasicStats(data, attribute)
             points = self._split_eq_width(stats.min, stats.max)
         else:
             values, _ = data.get_column_view(attribute)
             if values.size:
                 mn, mx = ut.nanmin(values), ut.nanmax(values)
                 points = self._split_eq_width(mn, mx)
             else:
                 points = []
     return Discretizer.create_discretized_var(data.domain[attribute],
                                               points)
Пример #18
0
    def calculate_log_reg_coefficients(self):
        self.log_reg_coeffs = []
        self.log_reg_cont_data_extremes = []
        self.b0 = None
        if self.classifier is None or self.domain is None:
            return
        if not isinstance(self.classifier, LogisticRegressionClassifier):
            return

        self.domain = self.reconstruct_domain(self.classifier.original_domain,
                                              self.domain)
        self.data = Table.from_table(self.domain,
                                     self.classifier.original_data)
        attrs, ranges, start = self.domain.attributes, [], 0
        for attr in attrs:
            stop = start + len(attr.values) if attr.is_discrete else start + 1
            ranges.append(slice(start, stop))
            start = stop

        self.b0 = self.classifier.intercept
        coeffs = self.classifier.coefficients
        if len(self.domain.class_var.values) == 2:
            self.b0 = np.hstack((self.b0 * (-1), self.b0))
            coeffs = np.vstack((coeffs * (-1), coeffs))
        self.log_reg_coeffs = [coeffs[:, ranges[i]] for i in range(len(attrs))]
        self.log_reg_coeffs_orig = self.log_reg_coeffs.copy()

        min_values = nanmin(self.data.X, axis=0)
        max_values = nanmax(self.data.X, axis=0)

        for i, min_t, max_t in zip(range(len(self.log_reg_coeffs)), min_values,
                                   max_values):
            if self.log_reg_coeffs[i].shape[1] == 1:
                coef = self.log_reg_coeffs[i]
                self.log_reg_coeffs[i] = np.hstack(
                    (coef * min_t, coef * max_t))
                self.log_reg_cont_data_extremes.append(
                    [sorted([min_t, max_t], reverse=(c < 0)) for c in coef])
            else:
                self.log_reg_cont_data_extremes.append([None])
Пример #19
0
    def calculate_log_reg_coefficients(self):
        self.log_reg_coeffs = []
        self.log_reg_cont_data_extremes = []
        self.b0 = None
        if self.classifier is None or self.domain is None:
            return
        if not isinstance(self.classifier, LogisticRegressionClassifier):
            return

        self.domain = self.reconstruct_domain(self.classifier.original_domain,
                                              self.domain)
        self.data = self.classifier.original_data.transform(self.domain)
        attrs, ranges, start = self.domain.attributes, [], 0
        for attr in attrs:
            stop = start + len(attr.values) if attr.is_discrete else start + 1
            ranges.append(slice(start, stop))
            start = stop

        self.b0 = self.classifier.intercept
        coeffs = self.classifier.coefficients
        if len(self.domain.class_var.values) == 2:
            self.b0 = np.hstack((self.b0 * (-1), self.b0))
            coeffs = np.vstack((coeffs * (-1), coeffs))
        self.log_reg_coeffs = [coeffs[:, ranges[i]] for i in range(len(attrs))]
        self.log_reg_coeffs_orig = self.log_reg_coeffs.copy()

        min_values = nanmin(self.data.X, axis=0)
        max_values = nanmax(self.data.X, axis=0)

        for i, min_t, max_t in zip(range(len(self.log_reg_coeffs)),
                                   min_values, max_values):
            if self.log_reg_coeffs[i].shape[1] == 1:
                coef = self.log_reg_coeffs[i]
                self.log_reg_coeffs[i] = np.hstack((coef * min_t, coef * max_t))
                self.log_reg_cont_data_extremes.append(
                    [sorted([min_t, max_t], reverse=(c < 0)) for c in coef])
            else:
                self.log_reg_cont_data_extremes.append([None])
Пример #20
0
 def __call__(self, data: Table, attribute):
     fmt = [
         "%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%y %b %d %H:%M",
         "%H:%M:%S"
     ][self.unit]
     values, _ = data.get_column_view(attribute)
     times = []
     if values.size:
         mn, mx = ut.nanmin(values), ut.nanmax(values)
         if not np.isnan(mn):
             mn = utc_from_timestamp(mn).timetuple()
             mx = utc_from_timestamp(mx).timetuple()
             times = _time_range(mn, mx, self.unit, self.width, 0, 100)
             if times is None:
                 raise TooManyIntervals
     times = [time.struct_time(t + (0, 0, 0)) for t in times][1:-1]
     points = np.array([calendar.timegm(t) for t in times])
     values = [time.strftime(fmt, t) for t in times]
     values = _simplified_time_intervals(values)
     var = data.domain[attribute]
     return DiscreteVariable(name=var.name,
                             values=values,
                             compute_value=Discretizer(var, points),
                             sparse=var.sparse)
    def __compute_statistics(self):
        # We will compute statistics over all data at once
        matrices = [self._data.X, self._data._Y, self._data.metas]

        # Since data matrices can of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = zip([
            self._domain.attributes, self._domain.class_vars, self._domain.metas
        ], matrices)
        # Filter out any matrices with size 0, filter the zipped matrices to 
        # eliminate variables in a single swoop
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        def _apply_to_types(attrs_x_pair, discrete_f=None, continuous_f=None,
                            time_f=None, string_f=None, default_val=np.nan):
            """Apply functions to variable types e.g. discrete_f to discrete 
            variables. Default value is returned if there is no function 
            defined for specific variable types."""
            attrs, x = attrs_x_pair
            result = np.full(len(attrs), default_val)
            disc_var_idx, cont_var_idx, time_var_idx, str_var_idx = self._attr_indices(attrs)
            if discrete_f and x[:, disc_var_idx].size:
                result[disc_var_idx] = discrete_f(x[:, disc_var_idx].astype(np.float64))
            if continuous_f and x[:, cont_var_idx].size:
                result[cont_var_idx] = continuous_f(x[:, cont_var_idx].astype(np.float64))
            if time_f and x[:, time_var_idx].size:
                result[time_var_idx] = time_f(x[:, time_var_idx].astype(np.float64))
            if string_f and x[:, str_var_idx].size:
                result[str_var_idx] = string_f(x[:, str_var_idx].astype(np.object))
            return result

        self._variable_types = [type(var) for var in self._attributes]
        self._variable_names = [var.name.lower() for var in self._attributes]

        # Compute the center
        _center = partial(
            _apply_to_types,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
        )
        self._center = np.hstack(map(_center, matrices))

        # Compute the dispersion
        def _entropy(x):
            p = [ut.bincount(row)[0] for row in x.T]
            p = [pk / np.sum(pk) for pk in p]
            return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)
        _dispersion = partial(
            _apply_to_types,
            discrete_f=lambda x: _entropy(x),
            continuous_f=lambda x: ut.nanvar(x, axis=0),
        )
        self._dispersion = np.hstack(map(_dispersion, matrices))

        # Compute minimum values
        _max = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._max = np.hstack(map(_max, matrices))

        # Compute maximum values
        _min = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._min = np.hstack(map(_min, matrices))

        # Compute # of missing values
        _missing = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._missing = np.hstack(map(_missing, matrices))