def _disc_no_vals_column(data: np.ndarray, col: int, **_) -> \
        _ColumnProperties:
    """
    Build column properties for a discrete column without declared values.

    The column is read through `_TableBuilder._disc_column`; the value map
    is the naturally sorted set of distinct entries, excluding the empty
    string. Extra keyword arguments are accepted and ignored.
    """
    column_values, column_type = _TableBuilder._disc_column(data, col)
    # the empty string is not a category, so keep it out of the value map
    distinct = set(column_values) - {""}
    return _ColumnProperties(
        valuemap=natural_sorted(distinct),
        values=column_values,
        coltype=column_type,
        orig_values=column_values,
    )
def test_sort_table(self):
    """
    Test if first column of the table is sorted naturally
    in both ascending and descending order
    """
    view = self.widget.view
    model = view.model()
    self.send_signal(self.widget.Inputs.corpus, self.corpus)
    self.send_signal(self.widget.Inputs.words, self.words)

    # (sort order to apply, whether the expected list is reversed)
    for order, reverse in ((Qt.AscendingOrder, False),
                           (Qt.DescendingOrder, True)):
        view.horizontalHeader().setSortIndicator(0, order)
        shown = [
            model.data(model.index(row, 0))
            for row in range(model.rowCount())
        ]
        expected = natural_sorted(self.corpus.titles)
        if reverse:
            expected = expected[::-1]
        self.assertListEqual(shown, expected)
def test_guess_data_type_discrete(self):
    """Columns with few distinct values are guessed as DiscreteVariable."""
    # should be DiscreteVariable
    valuemap, values, coltype = guess_data_type([1, 2, 1, 2])
    self.assertEqual(DiscreteVariable, coltype)
    self.assertEqual([1, 2], valuemap)
    np.testing.assert_array_equal([1, 2, 1, 2], values)

    valuemap, values, coltype = guess_data_type(["1", "2", "1", "2", "a"])
    self.assertEqual(DiscreteVariable, coltype)
    self.assertEqual(["1", "2", "a"], valuemap)
    np.testing.assert_array_equal(['1', '2', '1', '2', 'a'], values)

    # just below the threshold for string variable
    distinct_part = [f"{i}a" for i in range(24)]
    repeated_part = ["a"] * 76
    in_values = distinct_part + repeated_part
    valuemap, values, coltype = guess_data_type(in_values)
    self.assertEqual(DiscreteVariable, coltype)
    self.assertEqual(natural_sorted(set(in_values)), valuemap)
    np.testing.assert_array_equal(in_values, values)
def guess_data_type(orig_values, namask=None):
    """
    Use heuristics to guess data type.

    The values are first tested with `is_discrete_values`. If that fails,
    they are parsed as floats, falling back to strings when parsing raises.
    Any column that did not end up continuous is then tried as exact ISO
    date/time; if every non-missing value parses, the time interpretation
    replaces the earlier guess.

    Args:
        orig_values: sequence of raw column values
        namask: optional boolean array marking missing entries; when None
            it is computed with `isnastr` on the string form of the values

    Returns:
        (valuemap, values, coltype) where
        - valuemap is the naturally sorted list of categories for a
          discrete column and None otherwise,
        - values is the untouched input for a discrete column, a float
          array (NaN at missing entries) for continuous and time columns,
          and the string array for a string column,
        - coltype is one of DiscreteVariable, ContinuousVariable,
          StringVariable or TimeVariable.
    """
    valuemap, values = None, orig_values
    is_discrete = is_discrete_values(orig_values)
    orig_values = np.asarray(orig_values, dtype=str)
    if namask is None:
        namask = isnastr(orig_values)
    if is_discrete:
        # a truthy result is the collection of distinct values itself
        valuemap = natural_sorted(is_discrete)
        coltype = DiscreteVariable
    else:
        # try to parse as float
        values = np.empty_like(orig_values, dtype=float)
        values[namask] = np.nan
        try:
            np.copyto(values, orig_values, where=~namask, casting="unsafe")
        except ValueError:
            values = orig_values
            coltype = StringVariable
        else:
            coltype = ContinuousVariable

    if coltype is not ContinuousVariable:
        # when not continuous variable it can still be time variable even it
        # was before recognized as a discrete
        tvar = TimeVariable('_')
        # A separate buffer keeps orig_values and values intact if parsing
        # fails. It starts as all-NaN so that missing entries, which are
        # never written below, do not expose uninitialised memory.
        temp_values = np.full_like(orig_values, np.nan, dtype=float)
        known = ~namask
        try:
            temp_values[known] = [
                tvar.parse_exact_iso(i) for i in orig_values[known]
            ]
        except ValueError:
            pass
        else:
            valuemap = None
            coltype = TimeVariable
            values = temp_values
    return valuemap, values, coltype
def get_domain(self, domain, data, deduplicate=False):
    """
    Create domain (and dataset) from changes made in the widget.

    Each row of the editor model is paired, in order, with the
    corresponding variable of the original domain (attributes, then class
    variables, then metas). Skipped rows are dropped; the remaining
    columns are renamed and/or converted to the requested variable type
    and collected under their new place.

    Args:
        domain (Domain): original domain
        data (Table): original data
        deduplicate (bool): if True, variable names are deduplicated and
            the result contains an additional list with names of renamed
            variables

    Returns:
        (new_domain, [attribute_columns, class_var_columns, meta_columns])
        or
        (new_domain, [attribute_columns, class_var_columns, meta_columns],
        renamed)
    """
    # Allow type-checking with type() instead of isinstance() for exact comparison
    # pylint: disable=unidiomatic-typecheck
    variables = self.model().variables
    # Both lists are indexed by the `place` value of each variable.
    places = [[], [], []]  # attributes, class_vars, metas
    cols = [[], [], []]  # Xcols, Ycols, Mcols

    def numbers_are_round(var, col_data):
        # True only for an exact ContinuousVariable whose non-NaN values
        # all equal their integer cast. The local `data` shadows the
        # outer parameter inside this helper only.
        if type(var) == ContinuousVariable:
            data = np.asarray(col_data.data)  # Works for dense and sparse
            data = data[~np.isnan(data)]
            return (data == data.astype(int)).all()
        return False

    # Exit early with original domain if the user didn't actually change anything
    if all((name == orig_var.name and tpe == type(orig_var) and place == orig_plc)
           for (name, tpe, place, _, _), (orig_var, orig_plc) in
           zip(variables,
               chain(((at, Place.feature) for at in domain.attributes),
                     ((cl, Place.class_var) for cl in domain.class_vars),
                     ((mt, Place.meta) for mt in domain.metas)))):
        if deduplicate:
            return domain, [data.X, data.Y, data.metas], []
        else:
            return domain, [data.X, data.Y, data.metas]

    # Names of the variables that are kept; the iterator below hands out
    # the final name for each kept variable in the same order.
    relevant_names = [var[0] for var in variables if var[2] != Place.skip]
    if deduplicate:
        renamed_iter = iter(get_unique_names_duplicates(relevant_names))
    else:
        renamed_iter = iter(relevant_names)
    renamed = []
    for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
            zip(variables,
                chain([(at, Place.feature) for at in domain.attributes],
                      [(cl, Place.class_var) for cl in domain.class_vars],
                      [(mt, Place.meta) for mt in domain.metas])):
        if place == Place.skip:
            continue

        new_name = next(renamed_iter)
        # record each changed original name once
        if new_name != name and name not in renamed:
            renamed.append(name)

        col_data = self._get_column(data, orig_var, orig_plc)
        is_sparse = sp.issparse(col_data)

        if new_name == orig_var.name and tpe == type(orig_var):
            # nothing changed for this variable: reuse it as is
            var = orig_var
        elif tpe == type(orig_var):
            # same type, new name only
            var = orig_var.copy(name=new_name)
        elif tpe == DiscreteVariable:
            # categories are the string forms of the distinct non-missing
            # values, in natural order
            values = natural_sorted(
                list(str(i) for i in unique(col_data)
                     if not self._is_missing(i))
            )
            round_numbers = numbers_are_round(orig_var, col_data)
            # replace each value by the index of its category
            col_data = [np.nan if self._is_missing(x) else values.index(str(x))
                        for x in self._iter_vals(col_data)]
            if round_numbers:
                # relabel whole-number categories as integers ("1.0" -> "1")
                values = [str(int(float(v))) for v in values]
            var = tpe(new_name, values)
            col_data = self._to_column(col_data, is_sparse)
        elif tpe == StringVariable:
            var = tpe.make(new_name)
            if type(orig_var) in [DiscreteVariable, TimeVariable]:
                col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
                            for x in self._iter_vals(col_data)]
            elif type(orig_var) == ContinuousVariable:
                round_numbers = numbers_are_round(orig_var, col_data)
                # NaN -> "", whole-number columns -> integer text,
                # otherwise the variable's own representation
                col_data = ['' if np.isnan(x) else
                            str(int(x)) if round_numbers else
                            orig_var.repr_val(x)
                            for x in self._iter_vals(col_data)]
            # don't obey sparsity for StringVariable since they are
            # in metas which are transformed to dense below
            col_data = self._to_column(col_data, False, dtype=object)
        elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
            var = tpe.make(new_name)
            if may_be_numeric:
                # map each category index to the float value of its label
                col_data = [np.nan if self._is_missing(x) else float(orig_var.values[int(x)])
                            for x in self._iter_vals(col_data)]
                # NOTE(review): nesting of this call under `if may_be_numeric`
                # is inferred from the collapsed source; confirm
                col_data = self._to_column(col_data, is_sparse)
        else:
            var = tpe(new_name)
        places[place].append(var)
        cols[place].append(col_data)

    # merge columns for X, Y and metas
    feats = cols[Place.feature]
    # with no feature columns, X is an empty (n_rows, 0) array
    X = self._merge(feats) if feats else np.empty((len(data), 0))
    Y = self._merge(cols[Place.class_var], force_dense=True)
    m = self._merge(cols[Place.meta], force_dense=True)
    domain = Domain(*places)
    if deduplicate:
        return domain, [X, Y, m], renamed
    else:
        return domain, [X, Y, m]
def test_natural_sorted_numbers(self):
    """Integers are ordered by numeric value."""
    unordered = [1, 20, 2, 12]
    expected = [1, 2, 12, 20]
    actual = natural_sorted(unordered)
    self.assertListEqual(expected, actual)
def test_natural_sorted_numbers_str(self):
    """Numeric strings are ordered by numeric value, not lexically."""
    unordered = ["1", "20", "2", "12"]
    expected = ["1", "2", "12", "20"]
    actual = natural_sorted(unordered)
    self.assertListEqual(expected, actual)
def test_natural_sorted_text(self):
    """Plain text entries are ordered alphabetically."""
    unordered = ["b", "aa", "c", "dd"]
    expected = ["aa", "b", "c", "dd"]
    actual = natural_sorted(unordered)
    self.assertListEqual(expected, actual)
def test_natural_sorted(self):
    """Text with a numeric suffix is ordered by the number in the suffix."""
    unordered = ["something1", "something20", "something2", "something12"]
    expected = ["something1", "something2", "something12", "something20"]
    actual = natural_sorted(unordered)
    self.assertListEqual(expected, actual)