Пример #1
0
    def commit(self):
        """Turn ``self.data`` into a table and send it to the data output.

        The rename warning is cleared first. When ``self.data`` is empty,
        ``None`` is sent. Otherwise column 2 of the data is treated as the
        class column and the leading column(s) as features: columns 0 and 1
        when ``self.hasAttr2`` is set, only column 0 otherwise. A discrete
        class variable (proposed name "Class", values from
        ``self.class_model``) is added only when the class column holds at
        least two distinct values. Clashing names are made unique; in that
        case a warning is shown and the plot axis labels are refreshed.
        """
        self.Warning.renamed_vars.clear()

        if not self.data:
            self.Outputs.data.send(None)
            return

        points = np.array(self.data)
        labels = points[:, 2]
        if self.hasAttr2:
            coords = points[:, :2]
            names = [self.attr1.strip(), self.attr2.strip()]
        else:
            # np.newaxis keeps the single feature as a 2d column
            coords = points[:, np.newaxis, 0]
            names = [self.attr1.strip()]

        has_class = len(np.unique(labels)) >= 2
        if has_class:
            names.append("Class")
        unique_names, renamed = get_unique_names_duplicates(names, True)

        if has_class:
            class_var = DiscreteVariable(unique_names[-1],
                                         values=tuple(self.class_model))
            features = [ContinuousVariable(name)
                        for name in unique_names[:-1]]
            table = Table.from_numpy(Domain(features, class_var),
                                     coords, labels)
        else:
            features = [ContinuousVariable(name) for name in unique_names]
            table = Table.from_numpy(Domain(features), coords)

        if renamed:
            self.Warning.renamed_vars(", ".join(renamed))
            bottom_axis = self.plot.getAxis("bottom")
            left_axis = self.plot.getAxis("left")
            bottom_axis.setLabel(unique_names[0])
            left_axis.setLabel(unique_names[1])

        table.name = self.table_name
        self.Outputs.data.send(table)
Пример #2
0
 def test_get_unique_names_from_duplicates(self):
     """Unique names pass through; repeated names get numbered suffixes."""
     cases = (
         (["foo", "bar", "baz"],
          ["foo", "bar", "baz"]),
         (["foo", "bar", "baz", "bar"],
          ["foo", "bar (1)", "baz", "bar (2)"]),
     )
     for names, expected in cases:
         self.assertEqual(get_unique_names_duplicates(names), expected)
Пример #3
0
 def merge_data(self, domain, values, ids=None):
     """Build a table from per-role variable lists and column blocks.

     Parameters
     ----------
     domain : dict
         Keyword arguments for ``Domain``; each value is a mutable list
         of variables. Variables whose names clash within one list are
         replaced in place by copies carrying a unique name.
     values : dict
         May contain the keys 'attributes', 'metas' and 'class_vars';
         each present value is a sequence of arrays that is stacked
         horizontally.
     ids : optional
         When not None, assigned to ``table.ids``.

     Returns
     -------
     Table
         Table created with ``Table.from_numpy`` from the merged parts.
     """
     X, metas, class_vars = None, None, None
     renamed = []
     for variables in domain.values():
         names = [var.name for var in variables]
         unique_names = get_unique_names_duplicates(names)
         for idx, (name, unique_name) in enumerate(zip(names, unique_names)):
             if name != unique_name:
                 variables[idx] = variables[idx].copy(name=unique_name)
                 renamed.append(name)
     if renamed:
         self.Warning.renamed_vars(', '.join(renamed))
     if 'attributes' in values:
         X = np.hstack(values['attributes'])
     if 'metas' in values:
         metas = np.hstack(values['metas'])
     if 'class_vars' in values:
         class_vars = np.hstack(values['class_vars'])
     if X is None:
         # No attribute columns: size the empty block by whichever other
         # part exists (class_vars first, then metas). Fall back to zero
         # rows instead of hitting an unbound row count when both are
         # missing.
         n_rows = next(
             (len(block) for block in (class_vars, metas)
              if block is not None),
             0)
         X = np.empty((n_rows, 0))
     table = Table.from_numpy(Domain(**domain), X, class_vars, metas)
     if ids is not None:
         table.ids = ids
     return table
Пример #4
0
    def create_scores_table(self, labels):
        """Return the ranking scores as a table, or None if there are none.

        ``labels`` is a tuple of proposed score-column names; 'Feature' is
        appended as the proposed name of the string meta column. Clashing
        names are made unique and reported through the
        ``renamed_variables`` warning, which is cleared on every call.
        """
        self.Warning.renamed_variables.clear()
        rows = self.ranksModel.tolist()
        if not rows:
            return None
        if len(rows[0]) == 1:
            # Only the n_values column is present
            return None

        names, duplicated = get_unique_names_duplicates(
            labels + ('Feature', ), return_duplicated=True)
        if duplicated:
            self.Warning.renamed_variables(', '.join(duplicated))

        *score_names, meta_name = names
        score_vars = [ContinuousVariable(name) for name in score_names]
        domain = Domain(score_vars, metas=[StringVariable(meta_name)])

        # Clip to the float64 range so that no score stays at +/-inf;
        # the first column of every row is left out
        limits = np.finfo(np.float64)
        raw_scores = np.array(rows)[:, 1:]
        scores = np.clip(raw_scores, limits.min, limits.max)

        # Table does not like 1d arrays, so make the names a 2d column
        attr_names = [attr.name for attr in self.data.domain.attributes]
        feature_names = np.array(attr_names)[:, np.newaxis]

        scores_table = Table(domain, scores, metas=feature_names)
        scores_table.name = "Feature Scores"
        return scores_table
Пример #5
0
    def __get_pivot_tab_domain(self, val_var, X, X_h, X_v, X_t, agg_funs):
        """Build the four domains used for the parts of the pivot table.

        Args:
            val_var: value variable or a falsy value. When it is falsy or
                continuous, all value columns are ContinuousVariables;
                otherwise they are DiscreteVariables whose values are read
                from the arrays below.
            X, X_h, X_v, X_t: 2d arrays; only read in the discrete branch,
                where the processed columns are rewritten IN PLACE (each
                value is replaced by its index among the column's values).
            agg_funs: iterable; ``str()`` of each item becomes a value of
                the 'Aggregate' variable.

        Returns:
            Tuple of four Domains:
            ``[row_var, Aggregate] + attrs[0]``,
            ``[row_var_h, Aggregate] + attrs[1]``,
            ``attrs[2]`` and ``attrs[3]``.

        Side effects:
            Names of variables that had to be renamed are appended to
            ``self.renamed``; ``self._row_var`` (and ``self._col_var`` when
            it is the same object) may be replaced by a renamed copy.
        """
        def map_values(index, _X):
            """Return the distinct values of column `index` (without the
            string "nan") and overwrite each occurrence in that column
            with the value's position in the returned array."""
            values = np.unique(_X[:, index])
            values = np.delete(values, np.where(values == "nan")[0])
            for j, value in enumerate(values):
                _X[:, index][_X[:, index] == value] = j
            return values

        # Column-variable values selected by the group indices
        vals = np.array(self._col_var.values)[self._col_var_groups.astype(int)]
        if not val_var or val_var.is_continuous:
            cv = ContinuousVariable
            # NOTE: `[...] * 2` repeats a reference, so attrs[0] and
            # attrs[1] are the same list object here (likewise attrs[2]
            # and attrs[3]).
            attrs = [[cv(f"{v}", 1) for v in vals]] * 2
            attrs.extend([[cv("Total", 1)]] * 2)
        else:
            attrs = []
            for x in (X, X_h):
                # Value columns are read starting at column index 2 --
                # presumably the first two columns hold the row variable
                # and the aggregate; TODO confirm against the caller.
                attrs.append([
                    DiscreteVariable(f"{v}", map_values(i, x))
                    for i, v in enumerate(vals, 2)
                ])
            for x in (X_v, X_t):
                attrs.append([DiscreteVariable("Total", map_values(0, x))])
        row_var_h = DiscreteVariable(self._row_var.name, values=["Total"])
        aggr_attr = DiscreteVariable('Aggregate', [str(f) for f in agg_funs])

        # Remember identity before self._row_var is possibly replaced below
        same_row_col = self._col_var is self._row_var

        # Make the names of row variable, aggregate and value columns
        # unique among themselves
        extra_vars = [self._row_var, aggr_attr]
        uniq_a = get_unique_names_duplicates([v.name for v in extra_vars] +
                                             [atr.name for atr in attrs[0]])
        for (idx, var), u in zip(enumerate(chain(extra_vars, attrs[0])),
                                 uniq_a):
            if var.name == u:
                continue
            if idx == 0:
                # Row variable was renamed; keep the header row variable
                # and (if shared) the column variable in sync
                self.renamed.append(self._row_var.name)
                self._row_var = self._row_var.copy(name=u)
                if same_row_col:
                    self._col_var = self._row_var
                row_var_h = row_var_h.copy(name=u)
            elif idx == 1:
                self.renamed.append(aggr_attr.name)
                aggr_attr = aggr_attr.copy(name=u)
            else:
                # idx counts the two extra variables first, hence idx - 2.
                # Both attrs[0] and attrs[1] receive a copy of the
                # variable taken from attrs[0].
                self.renamed.append(var.name)
                attrs[0][idx - 2] = var.copy(name=u)
                attrs[1][idx - 2] = var.copy(name=u)

        if same_row_col:
            # NOTE(review): the results of both `make` calls are discarded;
            # confirm whether `make` is relied on for a side effect here.
            vals = tuple(v.name for v in attrs[0])
            self._row_var.make(self._row_var.name, values=vals)
            vals = tuple(v.name for v in attrs[2])
            row_var_h.make(row_var_h.name, vals)

        return (Domain([self._row_var, aggr_attr] + attrs[0]),
                Domain([row_var_h, aggr_attr] + attrs[1]), Domain(attrs[2]),
                Domain(attrs[3]))
Пример #6
0
 def __init__(self, headers: List):
     """
     Split the header rows into names, types and flags.

     Names are made unique before they are stored.

     Parameters
     ----------
     headers: List
         Header rows, to be used for constructing domain.
     """
     raw_names, self.types, self.flags = self.create_header_data(headers)
     self.names = get_unique_names_duplicates(raw_names)
Пример #7
0
def _merge_domains(domains):
    """Return the union of `domains` with variable names made unique.

    Attributes, class variables and metas are each combined with
    ``set.union`` through ``_get_part``; names are then deduplicated across
    all three parts together and clashing variables are replaced by renamed
    copies.
    """
    roles = ("attributes", "class_vars", "metas")
    parts = [_get_part(domains, set.union, role) for role in roles]

    proposed = [var.name for part in parts for var in part]
    unique_names = iter(get_unique_names_duplicates(proposed))

    for part in parts:
        # `part` must come first in zip: zip stops as soon as `part` is
        # exhausted and so never takes a name that belongs to the next part.
        for position, (var, unique_name) in enumerate(zip(part, unique_names)):
            if var.name != unique_name:
                part[position] = var.renamed(unique_name)
    return Domain(*parts)
Пример #8
0
    def merge_domains(self, domains):
        """Merge `domains` into one domain with unique variable names.

        Parts are combined with a set union when ``self.merge_type`` is
        ``OWConcatenate.MergeUnion`` and with a set intersection otherwise.
        Names are deduplicated across attributes, class variables and metas
        together; every rename raises the ``renamed_variables`` warning.
        """
        if self.merge_type == OWConcatenate.MergeUnion:
            combine = set.union
        else:
            combine = set.intersection

        roles = ("attributes", "class_vars", "metas")
        parts = [self._get_part(domains, combine, role) for role in roles]

        proposed = [var.name for var in chain.from_iterable(parts)]
        unique_names = iter(get_unique_names_duplicates(proposed))

        for part in parts:
            # `part` goes first so that zip stops without consuming a name
            # meant for the following part.
            for position, (var, unique_name) in enumerate(
                    zip(part, unique_names)):
                if var.name == unique_name:
                    continue
                part[position] = var.renamed(unique_name)
                self.Warning.renamed_variables()

        return Orange.data.Domain(*parts)
Пример #9
0
    def _domain_rename_duplicates(self, attributes, class_vars, metas):
        """Build a domain in which all variable names are unique.

        Names are checked across attributes, class variables and metas
        together. A variable whose name clashes is replaced by a copy with
        a unique name; the other variables are reused as they are. The
        original names that had to be changed are reported through the
        ``renamed_vars`` warning, in order of first appearance.

        Parameters
        ----------
        attributes, class_vars, metas
            Sequences of variables of the same sequence type (they are
            concatenated with ``+``).

        Returns
        -------
        Orange.data.Domain
            Domain built from the (possibly renamed) variables.
        """
        attrs, cvars, mets = [], [], []
        n_attrs, n_cvars, n_metas = len(attributes), len(class_vars), len(metas)
        # One target list per variable, parallel to all_vars, so a single
        # loop can route every variable back to its own role
        targets = [attrs] * n_attrs + [cvars] * n_cvars + [mets] * n_metas

        all_vars = attributes + class_vars + metas
        proposed_names = [var.name for var in all_vars]
        unique_names = get_unique_names_duplicates(proposed_names)
        # A list (not a set) keeps the warning text stable between runs
        duplicates = []
        for p_name, u_name, var, target in zip(proposed_names, unique_names,
                                               all_vars, targets):
            if p_name != u_name:
                if p_name not in duplicates:
                    duplicates.append(p_name)
                var = var.copy(name=u_name)
            target.append(var)
        if duplicates:
            self.Warning.renamed_vars(", ".join(duplicates))
        return Orange.data.Domain(attrs, cvars, mets)
Пример #10
0
 def test_get_unique_names_from_duplicates(self):
     """Check names that are already unique and names with one repeat."""
     already_unique = get_unique_names_duplicates(["foo", "bar", "baz"])
     self.assertEqual(already_unique, ["foo", "bar", "baz"])

     with_repeat = get_unique_names_duplicates(["foo", "bar", "baz", "bar"])
     self.assertEqual(with_repeat, ["foo", "bar (1)", "baz", "bar (2)"])
Пример #11
0
    def get_domain(self, domain, data, deduplicate=False):
        """
        Create domain (and dataset) from changes made in the widget.

        Each entry of the model's ``variables`` is paired positionally with
        the original variables (attributes, then class variables, then
        metas). Entries placed in ``Place.skip`` are left out; the others
        are kept, renamed and/or converted to the requested variable type,
        and their columns are converted accordingly.

        Args:
            domain (Domain): original domain
            data (Table): original data
            deduplicate (bool): if True, variable names are deduplicated and
               the result contains an additional list with names of renamed
               variables

        Returns:
            (new_domain, [attribute_columns, class_var_columns, meta_columns])
            or
            (new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
        """
        # Allow type-checking with type() instead of isinstance() for exact comparison
        # pylint: disable=unidiomatic-typecheck
        # Each entry is unpacked below as (name, type, place, _, may_be_numeric)
        variables = self.model().variables
        places = [[], [], []]  # attributes, class_vars, metas
        cols = [[], [], []]  # Xcols, Ycols, Mcols

        def numbers_are_round(var, col_data):
            """Return True if `var` is exactly a ContinuousVariable and all
            non-NaN values of `col_data` equal their integer cast."""
            if type(var) == ContinuousVariable:
                data = np.asarray(col_data.data)  # Works for dense and sparse
                data = data[~np.isnan(data)]
                return (data == data.astype(int)).all()
            return False

        # Exit early with original domain if the user didn't actually change anything
        if all((name == orig_var.name and tpe == type(orig_var)
                and place == orig_plc)
               for (name, tpe, place, _, _), (orig_var, orig_plc) in zip(
                   variables,
                   chain(((at, Place.feature) for at in domain.attributes), (
                       (cl, Place.class_var) for cl in domain.class_vars), (
                           (mt, Place.meta) for mt in domain.metas)))):
            if deduplicate:
                return domain, [data.X, data.Y, data.metas], []
            else:
                return domain, [data.X, data.Y, data.metas]

        # Only names of variables that are kept take part in deduplication;
        # the iterator is advanced once per kept variable in the loop below
        relevant_names = [var[0] for var in variables if var[2] != Place.skip]
        if deduplicate:
            renamed_iter = iter(get_unique_names_duplicates(relevant_names))
        else:
            renamed_iter = iter(relevant_names)
        renamed = []
        for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
                zip(variables,
                    chain([(at, Place.feature) for at in domain.attributes],
                          [(cl, Place.class_var)
                           for cl in domain.class_vars],
                          [(mt, Place.meta) for mt in domain.metas])):
            if place == Place.skip:
                continue

            # Record each original name at most once when dedup changed it
            new_name = next(renamed_iter)
            if new_name != name and name not in renamed:
                renamed.append(name)

            col_data = self._get_column(data, orig_var, orig_plc)
            is_sparse = sp.issparse(col_data)

            if new_name == orig_var.name and tpe == type(orig_var):
                # Nothing changed for this variable: reuse it
                var = orig_var
            elif tpe == type(orig_var):
                # Same type, different name
                var = orig_var.copy(name=new_name)
            elif tpe == DiscreteVariable:
                # Convert to discrete: the distinct non-missing values (as
                # strings, naturally sorted) become the variable's values
                # and the column is replaced by indices into that list
                values = natural_sorted(
                    list(
                        str(i) for i in unique(col_data)
                        if not self._is_missing(i)))
                round_numbers = numbers_are_round(orig_var, col_data)
                col_data = [
                    np.nan if self._is_missing(x) else values.index(str(x))
                    for x in self._iter_vals(col_data)
                ]
                if round_numbers:
                    # Show whole numbers without a decimal part
                    values = [str(int(float(v))) for v in values]
                var = tpe(new_name, values)
                col_data = self._to_column(col_data, is_sparse)
            elif tpe == StringVariable:
                # Convert to string: NaN becomes an empty string
                var = tpe.make(new_name)
                if type(orig_var) in [DiscreteVariable, TimeVariable]:
                    col_data = [
                        orig_var.repr_val(x) if not np.isnan(x) else ""
                        for x in self._iter_vals(col_data)
                    ]
                elif type(orig_var) == ContinuousVariable:
                    round_numbers = numbers_are_round(orig_var, col_data)
                    col_data = [
                        '' if np.isnan(x) else
                        str(int(x)) if round_numbers else orig_var.repr_val(x)
                        for x in self._iter_vals(col_data)
                    ]
                # don't obey sparsity for StringVariable since they are
                # in metas which are transformed to dense below
                col_data = self._to_column(col_data, False, dtype=object)
            elif tpe == ContinuousVariable and type(
                    orig_var) == DiscreteVariable:
                # Discrete to continuous: when the values may be numeric,
                # each index is replaced by the float of its value string;
                # otherwise the stored values are passed on unchanged
                var = tpe.make(new_name)
                if may_be_numeric:
                    col_data = [
                        np.nan if self._is_missing(x) else float(
                            orig_var.values[int(x)])
                        for x in self._iter_vals(col_data)
                    ]
                col_data = self._to_column(col_data, is_sparse)
            else:
                # Any other type change: new variable, column data as is
                var = tpe(new_name)
            places[place].append(var)
            cols[place].append(col_data)

        # merge columns for X, Y and metas
        feats = cols[Place.feature]
        X = self._merge(feats) if feats else np.empty((len(data), 0))
        Y = self._merge(cols[Place.class_var], force_dense=True)
        m = self._merge(cols[Place.meta], force_dense=True)
        domain = Domain(*places)
        if deduplicate:
            return domain, [X, Y, m], renamed
        else:
            return domain, [X, Y, m]
Пример #12
0
    def test_get_unique_names_from_duplicates(self):
        """Check deduplicated names, with and without the duplicate list.

        Each case is (proposed names, keyword arguments, expected result);
        cases are checked in the listed order.
        """
        plain = {}
        with_duplicated = {"return_duplicated": True}
        cases = [
            (["foo", "bar", "baz"], plain,
             ["foo", "bar", "baz"]),
            (["foo", "bar", "baz", "bar"], plain,
             ["foo", "bar (1)", "baz", "bar (2)"]),
            (["x", "x", "x (1)"], plain,
             ["x (2)", "x (3)", "x (1)"]),
            (["x (2)", "x", "x", "x (2)", "x (3)"], plain,
             ["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"]),
            (["iris", "iris", "iris (1)"], plain,
             ["iris (2)", "iris (3)", "iris (1)"]),

            (["foo", "bar", "baz"], with_duplicated,
             (["foo", "bar", "baz"], [])),
            (["foo", "bar", "baz", "bar"], with_duplicated,
             (["foo", "bar (1)", "baz", "bar (2)"], ["bar"])),
            (["x", "x", "x (1)"], with_duplicated,
             (["x (2)", "x (3)", "x (1)"], ["x"])),
            (["x (2)", "x", "x", "x (2)", "x (3)"], with_duplicated,
             (["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"],
              ["x (2)", "x"])),
            (["x", "", "", None, None, "x"], plain,
             ["x (1)", "", "", None, None, "x (2)"]),
            (["iris", "iris", "iris (1)", "iris (2)"], with_duplicated,
             (["iris (3)", "iris (4)", "iris (1)", "iris (2)"], ["iris"])),

            (["iris (1) (1)", "iris (1)", "iris (1)"], plain,
             ["iris (1) (1)", "iris (1) (2)", "iris (1) (3)"]),

            (["iris (1) (1)", "iris (1)", "iris (1)", "iris", "iris"], plain,
             ["iris (1) (1)", "iris (1) (2)", "iris (1) (3)", "iris (2)",
              "iris (3)"]),
        ]
        for names, options, expected in cases:
            self.assertEqual(
                get_unique_names_duplicates(names, **options), expected)