Пример #1
0
 def _disc_no_vals_column(data: np.ndarray, col: int, **_) -> \
         _ColumnProperties:
     """
     Build column properties for a discrete column without declared values.

     The value map is derived from the column itself: the distinct entries,
     minus the empty string, in natural sort order. Extra keyword arguments
     are accepted and ignored.
     """
     column_values, column_type = _TableBuilder._disc_column(data, col)
     # The empty string is excluded from the set of category labels.
     distinct_labels = set(column_values) - {""}
     return _ColumnProperties(
         valuemap=natural_sorted(distinct_labels),
         values=column_values,
         coltype=column_type,
         orig_values=column_values,
     )
Пример #2
0
    def test_sort_table(self):
        """
        Sorting by the first column must order the titles naturally,
        in both ascending and descending direction.
        """
        view = self.widget.view
        model = view.model()
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        self.send_signal(self.widget.Inputs.words, self.words)

        # step=1 keeps the natural order, step=-1 reverses it
        for order, step in ((Qt.AscendingOrder, 1), (Qt.DescendingOrder, -1)):
            view.horizontalHeader().setSortIndicator(0, order)
            shown = [
                model.data(model.index(row, 0))
                for row in range(model.rowCount())
            ]
            expected = natural_sorted(self.corpus.titles)[::step]
            self.assertListEqual(shown, expected)
Пример #3
0
    def test_guess_data_type_discrete(self):
        """Columns with few distinct values are guessed as DiscreteVariable."""
        # (input column, expected value map)
        small_cases = (
            ([1, 2, 1, 2], [1, 2]),
            (["1", "2", "1", "2", "a"], ["1", "2", "a"]),
        )
        for column, expected_map in small_cases:
            # pass a fresh copy so the comparison below uses untouched data
            valuemap, values, coltype = guess_data_type(list(column))
            self.assertEqual(DiscreteVariable, coltype)
            self.assertEqual(expected_map, valuemap)
            np.testing.assert_array_equal(column, values)

        # just below the threshold for string variable
        many_values = [str(i) + "a" for i in range(24)] + ["a"] * 76
        valuemap, values, coltype = guess_data_type(many_values)
        self.assertEqual(DiscreteVariable, coltype)
        self.assertEqual(natural_sorted(set(many_values)), valuemap)
        np.testing.assert_array_equal(many_values, values)
Пример #4
0
def guess_data_type(orig_values, namask=None):
    """
    Use heuristics to guess data type.

    The column is classified in this order:

    1. discrete, when ``is_discrete_values(orig_values)`` returns a truthy
       result (that result is natural-sorted into the value map);
    2. continuous, when every non-missing entry can be cast to float;
    3. string otherwise.

    A column that did not end up continuous is then re-tested as a time
    column: if every non-missing entry is accepted by
    ``TimeVariable.parse_exact_iso`` the column becomes a ``TimeVariable``.

    Args:
        orig_values: the column's values; converted internally to a string
            array for parsing.
        namask: boolean array marking missing entries. When ``None`` it is
            computed with ``isnastr`` from the string array.

    Returns:
        tuple ``(valuemap, values, coltype)`` where

        - ``valuemap`` is the natural-sorted value list for a discrete
          column and ``None`` for every other type;
        - ``values`` is the input as given (discrete), a float array with
          NaN at missing positions (continuous and time), or the string
          array (string);
        - ``coltype`` is one of ``DiscreteVariable``, ``ContinuousVariable``,
          ``StringVariable`` or ``TimeVariable``.
    """
    valuemap, values = None, orig_values
    # Must run on the raw input, before it is coerced to strings below.
    is_discrete = is_discrete_values(orig_values)
    orig_values = np.asarray(orig_values, dtype=str)
    if namask is None:
        namask = isnastr(orig_values)
    if is_discrete:
        valuemap = natural_sorted(is_discrete)
        coltype = DiscreteVariable
    else:
        # try to parse as float
        values = np.empty_like(orig_values, dtype=float)
        values[namask] = np.nan
        try:
            np.copyto(values, orig_values, where=~namask, casting="unsafe")
        except ValueError:
            values = orig_values
            coltype = StringVariable
        else:
            coltype = ContinuousVariable

    if coltype is not ContinuousVariable:
        # when not continuous variable it can still be time variable even it
        # was before recognized as a discrete
        tvar = TimeVariable('_')
        # introducing new variable prevent overwriting orig_values and values
        temp_values = np.empty_like(orig_values, dtype=float)
        # np.empty_like leaves memory uninitialized; mark missing entries
        # explicitly so they are NaN rather than garbage in the result.
        temp_values[namask] = np.nan
        try:
            temp_values[~namask] = [
                tvar.parse_exact_iso(i) for i in orig_values[~namask]
            ]
        except ValueError:
            # at least one entry is not an ISO time: keep the earlier guess
            pass
        else:
            valuemap = None
            coltype = TimeVariable
            values = temp_values
    return valuemap, values, coltype
Пример #5
0
    def get_domain(self, domain, data, deduplicate=False):
        """
        Create domain (and dataset) from changes made in the widget.

        Each entry of ``self.model().variables`` is paired, in order, with
        the original domain's attributes, class variables and metas. For
        every pair that is not skipped, a (possibly renamed or retyped)
        variable and its converted data column are produced and placed in
        the role chosen in the widget.

        Args:
            domain (Domain): original domain
            data (Table): original data
            deduplicate (bool): if True, variable names are deduplicated and
               the result contains an additional list with names of renamed
               variables

        Returns:
            (new_domain, [attribute_columns, class_var_columns, meta_columns])
            or
            (new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
        """
        # Allow type-checking with type() instead of isinstance() for exact comparison
        # pylint: disable=unidiomatic-typecheck
        # Each entry is unpacked below as
        # (name, type, place, <unused>, may_be_numeric).
        variables = self.model().variables
        places = [[], [], []]  # attributes, class_vars, metas
        cols = [[], [], []]  # Xcols, Ycols, Mcols

        def numbers_are_round(var, col_data):
            # True only for a plain ContinuousVariable whose non-NaN values
            # all equal their integer cast; False for every other type.
            if type(var) == ContinuousVariable:
                data = np.asarray(col_data.data)  # Works for dense and sparse
                data = data[~np.isnan(data)]
                return (data == data.astype(int)).all()
            return False

        # Exit early with original domain if the user didn't actually change anything
        if all((name == orig_var.name and tpe == type(orig_var)
                and place == orig_plc)
               for (name, tpe, place, _, _), (orig_var, orig_plc) in zip(
                   variables,
                   chain(((at, Place.feature) for at in domain.attributes), (
                       (cl, Place.class_var) for cl in domain.class_vars), (
                           (mt, Place.meta) for mt in domain.metas)))):
            if deduplicate:
                return domain, [data.X, data.Y, data.metas], []
            else:
                return domain, [data.X, data.Y, data.metas]

        # Names of the variables that are kept, in model order; the iterator
        # below is advanced once per kept variable in the main loop.
        relevant_names = [var[0] for var in variables if var[2] != Place.skip]
        if deduplicate:
            renamed_iter = iter(get_unique_names_duplicates(relevant_names))
        else:
            renamed_iter = iter(relevant_names)
        renamed = []
        for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
                zip(variables,
                    chain([(at, Place.feature) for at in domain.attributes],
                          [(cl, Place.class_var)
                           for cl in domain.class_vars],
                          [(mt, Place.meta) for mt in domain.metas])):
            if place == Place.skip:
                continue

            # Record each original name once if deduplication changed it.
            new_name = next(renamed_iter)
            if new_name != name and name not in renamed:
                renamed.append(name)

            col_data = self._get_column(data, orig_var, orig_plc)
            is_sparse = sp.issparse(col_data)

            if new_name == orig_var.name and tpe == type(orig_var):
                # Nothing changed for this variable: reuse it as is.
                var = orig_var
            elif tpe == type(orig_var):
                # Same type, new name only.
                var = orig_var.copy(name=new_name)
            elif tpe == DiscreteVariable:
                # Convert to discrete: the non-missing unique values (as
                # strings, naturally sorted) become the categories and each
                # cell is replaced by the index of its value.
                values = natural_sorted(
                    list(
                        str(i) for i in unique(col_data)
                        if not self._is_missing(i)))
                round_numbers = numbers_are_round(orig_var, col_data)
                col_data = [
                    np.nan if self._is_missing(x) else values.index(str(x))
                    for x in self._iter_vals(col_data)
                ]
                # Indices were computed against the unmodified labels above;
                # only afterwards are whole-number labels shown as integers.
                if round_numbers:
                    values = [str(int(float(v))) for v in values]
                var = tpe(new_name, values)
                col_data = self._to_column(col_data, is_sparse)
            elif tpe == StringVariable:
                var = tpe.make(new_name)
                if type(orig_var) in [DiscreteVariable, TimeVariable]:
                    # Use the variable's own text representation; NaN
                    # becomes an empty string.
                    col_data = [
                        orig_var.repr_val(x) if not np.isnan(x) else ""
                        for x in self._iter_vals(col_data)
                    ]
                elif type(orig_var) == ContinuousVariable:
                    # Whole-number columns are written without a decimal
                    # part; otherwise the variable's representation is used.
                    round_numbers = numbers_are_round(orig_var, col_data)
                    col_data = [
                        '' if np.isnan(x) else
                        str(int(x)) if round_numbers else orig_var.repr_val(x)
                        for x in self._iter_vals(col_data)
                    ]
                # don't obey sparsity for StringVariable since they are
                # in metas which are transformed to dense below
                col_data = self._to_column(col_data, False, dtype=object)
            elif tpe == ContinuousVariable and type(
                    orig_var) == DiscreteVariable:
                var = tpe.make(new_name)
                # When flagged as numeric, each cell is replaced by its
                # category label parsed as float; otherwise the column is
                # passed on to _to_column unchanged.
                if may_be_numeric:
                    col_data = [
                        np.nan if self._is_missing(x) else float(
                            orig_var.values[int(x)])
                        for x in self._iter_vals(col_data)
                    ]
                col_data = self._to_column(col_data, is_sparse)
            else:
                # Any other type change: new variable, column data untouched.
                var = tpe(new_name)
            places[place].append(var)
            cols[place].append(col_data)

        # merge columns for X, Y and metas
        feats = cols[Place.feature]
        # With no feature columns, X is an empty array with one row per
        # data instance.
        X = self._merge(feats) if feats else np.empty((len(data), 0))
        Y = self._merge(cols[Place.class_var], force_dense=True)
        m = self._merge(cols[Place.meta], force_dense=True)
        domain = Domain(*places)
        if deduplicate:
            return domain, [X, Y, m], renamed
        else:
            return domain, [X, Y, m]
Пример #6
0
 def test_natural_sorted_numbers(self):
     """Plain integers come back in ascending numeric order."""
     shuffled = [1, 20, 2, 12]
     self.assertListEqual([1, 2, 12, 20], natural_sorted(shuffled))
Пример #7
0
 def test_natural_sorted_numbers_str(self):
     """Numeric strings are ordered by value, not lexicographically."""
     shuffled = ["1", "20", "2", "12"]
     self.assertListEqual(["1", "2", "12", "20"], natural_sorted(shuffled))
Пример #8
0
 def test_natural_sorted_text(self):
     """Purely alphabetic strings are ordered alphabetically."""
     shuffled = ["b", "aa", "c", "dd"]
     self.assertListEqual(["aa", "b", "c", "dd"], natural_sorted(shuffled))
Пример #9
0
 def test_natural_sorted(self):
     """A shared text prefix followed by a number sorts by the number."""
     shuffled = ["something1", "something20", "something2", "something12"]
     expected = ["something1", "something2", "something12", "something20"]
     self.assertListEqual(expected, natural_sorted(shuffled))