Пример #1
0
 def setUpClass(cls):
     """Load the iris data once and derive a copy of it without class vars."""
     super().setUpClass()
     iris = Table("iris.tab")
     cls.iris = iris
     # Same attributes as iris, but with an empty list of class variables.
     attributes_only = Domain(iris.domain.attributes, [])
     cls.iris_no_class = Table(attributes_only, iris)
Пример #2
0
    def test_has_discrete(self):
        """
        Check `Domain.has_discrete_attributes` with and without its flags.

        The expectations below treat `race` and `gender` as discrete and
        `age` and `incomeA` as non-discrete variables.
        """

        def expect(outcome, domain, *flags):
            # Dispatch to assertTrue/assertFalse so that truthiness, not
            # identity, is what gets checked.
            verify = self.assertTrue if outcome else self.assertFalse
            verify(domain.has_discrete_attributes(*flags))

        # No flags: class variables do not count.
        expect(False, Domain([]))
        expect(False, Domain([], [age]))
        expect(False, Domain([], race))

        # No flags: regular attributes decide the outcome.
        expect(False, Domain([age], None))
        expect(True, Domain([race], None))
        expect(True, Domain([age, race], None))
        expect(True, Domain([race, age], None))

        # First flag set: class variables are considered as well.
        expect(False, Domain([], [age]), True)
        expect(True, Domain([], [race]), True)
        expect(False, Domain([age], None), True)
        expect(True, Domain([race], None), True)
        expect(True, Domain([age], race), True)
        expect(True, Domain([race], age), True)
        expect(True, Domain([], [race, age]), True)

        # Second flag set: meta attributes are considered as well.
        expect(True, Domain([], None, [gender]), False, True)
        expect(False, Domain([], None, [age]), False, True)
        expect(True, Domain([], [age], [gender]), True, True)
        expect(False, Domain([], [incomeA], [age]), True, True)
Пример #3
0
    def test_has_time(self):
        """Check detection of time variables in attributes, class and metas."""
        # With default arguments, a time variable placed anywhere but in the
        # regular attributes is not reported.
        not_reported = (
            Domain([]),
            Domain([], [age]),
            Domain([], [race]),
            Domain([], [arrival]),
            Domain([], [], [arrival]),
        )
        for domain in not_reported:
            self.assertFalse(domain.has_time_attributes())

        as_attribute = Domain([arrival], [])
        self.assertTrue(as_attribute.has_time_attributes())
        as_class = Domain([], [arrival])
        self.assertTrue(as_class.has_time_attributes(include_class=True))
        as_meta = Domain([], [], [arrival])
        self.assertTrue(as_meta.has_time_attributes(include_metas=True))

        # `has_time_class` is read as an attribute, not called.
        self.assertFalse(Domain([arrival], []).has_time_class)
        self.assertTrue(Domain([], [arrival]).has_time_class)
        self.assertFalse(Domain([], [], [arrival]).has_time_class)
Пример #4
0
 def test_wrong_vartypes(self):
     """
     `Domain` must raise TypeError when `ssn` is used as a class variable
     or appears among the attributes.

     NOTE(review): `ssn` is presumably a non-primitive (string) variable;
     confirm against the module-level fixtures.
     """
     attributes = (age, gender, income)
     # Each entry is the tuple of positional arguments for Domain(). The
     # trailing commas matter: without them the last two entries would be
     # plain 4-tuples of variables, which `*args` would spread over
     # separate Domain parameters instead of passing them as the
     # attribute list.
     for args in ((attributes, ssn),
                  (attributes + (ssn, ), ),
                  ((ssn, ) + attributes, )):
         with self.assertRaises(TypeError):
             Domain(*args)
Пример #5
0
 def test_get_item(self):
     """Index a domain by variable, by name string and by integer position."""
     domain = Domain((age, gender, income), metas=(ssn, race))
     keys = [age, "AGE", 0, income, "income", 2, ssn, "SSN", -1, -2]
     expected = [age, age, age, income, income, income, ssn, ssn, ssn, race]
     # Negative integers are expected to address the meta attributes.
     for key, variable in zip(keys, expected):
         self.assertEqual(domain[key], variable)
Пример #6
0
    def __call__(self, G):
        """
        Compute node embeddings for network `G` and return them as a table.

        Random walks are simulated on the network, converted to lists of
        strings and used as sentences for training a `Word2Vec` model. The
        learned vectors become `self.emb_size` continuous features named
        ``<feature_prefix>_<i>``.

        If `G.nodes` is a `Table`, its attributes, class variables and metas
        are carried over and the embedding features are appended after the
        attributes. Attributes are kept once per name (the first occurrence
        wins); attributes whose names collide with the embedding feature
        names are dropped and replaced by the new features. Otherwise the
        result contains the embedding features only.

        Args:
            G: network; node transition probabilities are set up for all
                nodes and edge transition probabilities for the edges in
                `G.edges[0]`.

        Returns:
            Table: one row per node.

        Raises:
            ValueError: if the network has no edges.
        """
        if not G.edges:
            raise ValueError("Network has no edges")

        num_nodes = G.number_of_nodes()
        nodes = np.arange(num_nodes)
        # Node->node probas are needed for the initial step, when there's no
        # previous edge to condition on
        self._node_probas = self.setup_nodes(G, nodes)

        edges_coo = G.edges[0].edges.tocoo(copy=False)
        edges = np.column_stack((edges_coo.row, edges_coo.col))
        self._edge_probas = self.setup_edges(G, edges)

        walks = self._simulate_walks(G)
        walks = [list(map(str, walk)) for walk in walks]

        # gensim changed "size" param to "vector_size" in v. 4.0.0
        # https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
        params = dict(window=self.window_size,
                      min_count=0,
                      sg=1,
                      workers=4,
                      callbacks=self.callbacks)
        # Compare the major version as a number: comparing version strings
        # lexicographically would sort e.g. "10.0.0" before "4.0.0".
        gensim_major = int(gensim.__version__.split(".")[0])
        if gensim_major < 4:
            params["size"] = self.emb_size
            params["iter"] = self.num_epochs
        else:
            params["vector_size"] = self.emb_size
            params["epochs"] = self.num_epochs
        model = Word2Vec(walks, **params)

        emb_names = [
            "{}_{}".format(self.feature_prefix, i)
            for i in range(self.emb_size)
        ]

        items = G.nodes
        kept_attrs = []
        # Defaults for networks without tabular node data: zero-width blocks
        new_data = np.empty((num_nodes, 0))
        class_vars, meta_vars = [], []
        class_data = np.empty((num_nodes, 0))
        meta_data = np.empty((num_nodes, 0))
        if isinstance(items, Table):
            # Names that may no longer be used by an existing attribute:
            # the embedding features override same-named attributes, and
            # every other name is kept only at its first occurrence.
            taken = set(emb_names)
            attrs_mask = []
            for attr in items.domain.attributes:
                keep = attr.name not in taken
                attrs_mask.append(keep)
                if keep:
                    taken.add(attr.name)
                    kept_attrs.append(attr)

            new_data = items.X[:, np.array(attrs_mask, dtype=bool)]
            class_vars, meta_vars = items.domain.class_vars, items.domain.metas
            class_data, meta_data = items.Y, items.metas

        # Walk tokens are stringified node indices, hence the str() lookup
        embeddings = np.array(
            [model.wv[str(curr_node)] for curr_node in nodes])
        new_data = np.hstack((new_data, embeddings))

        new_attrs = kept_attrs + [
            ContinuousVariable(name) for name in emb_names
        ]
        new_domain = Domain(new_attrs, class_vars, meta_vars)
        new_items = Table(new_domain, new_data, class_data, meta_data)
        return new_items
Пример #7
0
    def data_table(self, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
        as well **have it sorted column-major**, e.g. ``order='F'``).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it.

        Columns are distributed among attributes, class variables, metas and
        weights according to their flags; string columns always become metas
        and columns flagged as ignored are skipped. Non-empty names that
        occur more than once are suffixed with ``_1``, ``_2``, ...

        Raises:
            ValueError: if a column declared as continuous contains a value
                that cannot be converted to float.
        """
        if not headers:
            headers, data = self.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if 3 == len(headers):
            names, types, flags = map(list, headers)
        else:
            if 1 == len(headers):
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[
                    i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else
                    ('', i) for i in headers[0]
                ])
                names = list(names)
            elif 2 == len(headers):
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            # In the combined specification, upper-case letters give the
            # type and lower-case letters the flags
            types = [
                ''.join(filter(str.isupper, flag)).lower() for flag in _flags
            ]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        def _equal_length(lst):
            # Pad in place with empty strings up to the current `rowlen`
            lst.extend([''] * (rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array.
        # The input here is always a freshly built list, so a new array has
        # to be allocated anyway; `copy=False` must not be passed because
        # NumPy >= 2 raises ValueError when a copy cannot be avoided.
        data = np.array([_equal_length(list(row)) for row in data if any(row)],
                        dtype=object,
                        order='F')

        # Data may actually be longer than headers were
        try:
            rowlen = data.shape[1]
        except IndexError:
            # No data rows: the array is one-dimensional
            pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        # Column indices and variables for attributes (X), metas (M) and
        # class variables (Y); weight columns (W) have no variables
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Rename variables if necessary
        # Reusing across files still works if both files have same duplicates
        name_counts = Counter(names)
        del name_counts[""]
        if len(name_counts) != len(names) and name_counts:
            uses = {
                name: 0
                for name, count in name_counts.items() if count > 1
            }
            for i, name in enumerate(names):
                if name in uses:
                    uses[name] += 1
                    names[i] = "{}_{}".format(name, uses[name])

        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i:
                continue

            type_flag = types and types[col].strip()
            try:
                orig_values = [
                    np.nan if i in MISSING_VALUES else i
                    for i in (i.strip() for i in data[:, col])
                ]
            except IndexError:
                # No data instances leads here
                orig_values = []
                # In this case, coltype could be anything. It's set as-is
                # only to satisfy test_table.TableTestCase.test_append
                coltype = DiscreteVariable

            coltype_kwargs = {}
            valuemap = []
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    # Find the first offending row for the error message
                    for row, num in enumerate(orig_values):
                        try:
                            float(num)
                        except ValueError:
                            break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(
                                         row + len(headers) + 1, col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable

            elif (type_flag in DiscreteVariable.TYPE_HEADERS
                  or _RE_DISCRETE_LIST.match(type_flag)):
                coltype = DiscreteVariable
                if _RE_DISCRETE_LIST.match(type_flag):
                    # The type header itself lists the values, in order
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {np.nan})

            else:
                # No known type specified, use heuristics
                valuemap, values, coltype = guess_data_type(orig_values)

            # Metas and string columns take precedence over weights, and
            # weights over class variables
            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            cols, domain_vars = append_to
            cols.append(col)

            existing_var, new_var_name, column = None, None, None
            if domain_vars is not None:
                existing_var = names and names[col]
                if not existing_var:
                    # Unnamed column: generate a "Feature <n>" name
                    new_var_name = next(NAMEGEN)

            values, var = sanitize_variable(valuemap, values, orig_values,
                                            coltype, coltype_kwargs,
                                            domain_vars, existing_var,
                                            new_var_name, data)
            if domain_vars is not None:
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

            # Write back the changed data. This is needed to pass the
            # correct, converted values into Table.from_numpy below
            try:
                data[:, col] = values
            except IndexError:
                # No data rows; nothing to write back
                pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        table = Table.from_numpy(domain, data[:, Xcols].astype(float,
                                                               order='C'),
                                 data[:, Ycols].astype(float, order='C'),
                                 data[:, Mcols].astype(object, order='C'),
                                 data[:, Wcols].astype(float, order='C'))
        return table
Пример #8
0
    def test_nonunique(self):
        """
        Check the `nonunique_left`/`nonunique_right` errors and the data
        output for various matching keys and join types.
        """
        widget = self.widget
        x = ContinuousVariable("x")
        d = DiscreteVariable("d", values=list("abc"))
        domain = Domain([x, d], [])
        left = Table.from_numpy(domain, np.array([[1.0, 0], [1, 1], [2, 1]]))
        right = Table.from_numpy(domain, np.array([[1.0, 0], [2, 1], [3, 1]]))
        right.ids = left.ids
        self.send_signal(widget.Inputs.data, left)
        self.send_signal(widget.Inputs.extra_data, right)
        widget.merging = widget.InnerJoin

        def check(left_error, right_error, has_output=None):
            # `has_output=None` skips the check of the output signal
            verify = self.assertTrue if left_error else self.assertFalse
            verify(widget.Error.nonunique_left.is_shown())
            verify = self.assertTrue if right_error else self.assertFalse
            verify(widget.Error.nonunique_right.is_shown())
            if has_output is None:
                return
            if has_output:
                self.assertIsNotNone(self.get_output(widget.Outputs.data))
            else:
                self.assertIsNone(self.get_output(widget.Outputs.data))

        check(False, False)

        # Matching by instance id
        widget.attr_boxes.set_state([(INSTANCEID, INSTANCEID)])
        widget.unconditional_commit()
        check(False, False, True)

        # Matching by row index
        widget.attr_boxes.set_state([(INDEX, INDEX)])
        widget.unconditional_commit()
        check(False, False, True)

        # Matching by x: expected to be reported for the left table only
        widget.attr_boxes.set_state([(x, x)])
        widget.unconditional_commit()
        check(True, False, False)

        widget.merging = widget.LeftJoin
        widget.unconditional_commit()
        check(False, False, True)

        # Matching by the combination of x and d
        widget.merging = widget.InnerJoin
        widget.attr_boxes.set_state([(x, x), (d, d)])
        widget.unconditional_commit()
        check(False, False, True)

        # Matching by d: expected to be reported for both tables
        widget.attr_boxes.set_state([(d, d)])
        widget.unconditional_commit()
        check(True, True, False)

        widget.merging = widget.LeftJoin
        widget.unconditional_commit()
        check(False, True, False)

        widget.merging = widget.InnerJoin
        widget.unconditional_commit()
        check(True, True, False)

        # Removing the inputs clears the errors
        self.send_signal(widget.Inputs.data, None)
        self.send_signal(widget.Inputs.extra_data, None)
        check(False, False, False)
Пример #9
0
 def effective_data(self):
     """
     Return `self.data` transformed to a domain whose attributes are
     `self.effective_variables`; class variables and metas are kept.
     """
     original = self.data.domain
     reduced = Domain(self.effective_variables, original.class_vars,
                      original.metas)
     return self.data.transform(reduced)
 def test_does_not_crash_on_empty_domain(self):
     """Sending data whose domain has no variables must not raise."""
     iris = Table('iris')
     without_columns = iris.transform(Domain([]))
     self.send_signal(self.widget.Inputs.data, without_columns)
Пример #11
0
    def test_match_attr_name(self):
        """
        Check which entry the right-hand combo ends up on after an entry is
        chosen in the left-hand combo of the first matching row.
        """
        widget = self.widget
        first_row = widget.attr_boxes.rows[0]
        data_combo = first_row.left_combo
        extra_combo = first_row.right_combo

        left_attrs = [
            DiscreteVariable("dA1", ("a", "b", "c", "d")),
            DiscreteVariable("dA2", ("aa", "bb")),
            DiscreteVariable("dA3", ("aa", "bb")),
        ]
        left_class = DiscreteVariable("cls", ("aaa", "bbb", "ccc"))
        left_metas = [
            DiscreteVariable("mA1", ("cc", "dd")),
            StringVariable("mA2"),
        ]
        left_domain = Domain(left_attrs, left_class, left_metas)
        left_x = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 0], [3, 1, 0]])
        left_y = np.array([0, 1, 2, np.nan])
        left_m = np.array([[0.0, "m1"], [1.0, "m2"], [np.nan, "m3"],
                           [0.0, "m4"]]).astype(object)

        right_attrs = [
            DiscreteVariable("dB1", values=("a", "b", "c")),
            ContinuousVariable("dA2"),
        ]
        right_metas = [
            StringVariable("cls"),
            DiscreteVariable("dA1", ("m4", "m5")),
        ]
        right_domain = Domain(right_attrs, None, right_metas)
        right_x = np.array([[0, 0], [1, 1], [2, np.nan]])
        right_y = np.empty((3, 0))
        right_m = np.array([[np.nan, np.nan], [1, 1], [0, 0]]).astype(object)

        left = Table(left_domain, left_x, left_y, left_m)
        left.name = 'dataA'
        left.attributes = 'dataA attributes'
        right = Table(right_domain, right_x, right_y, right_m)
        right.name = 'dataB'
        right.attributes = 'dataB attributes'

        self.send_signal(widget.Inputs.data, left)
        self.send_signal(widget.Inputs.extra_data, right)

        def choose(combo, index):
            # Select the entry and emit the signal, as a user action would
            combo.setCurrentIndex(index)
            combo.activated.emit(index)

        # (preset index of the right combo, index chosen in the left combo,
        #  expected index of the right combo afterwards)
        scenarios = (
            # match variable if available and the other combo is Row Index
            (0, 2, 5),
            # match variable if available and the other combo is ID
            (1, 2, 5),
            # don't match variable if other combo is set
            (4, 2, 4),
            # don't match if nothing to match to
            (0, 4, 0),
            # don't match numeric with non-numeric
            (0, 3, 0),
            # allow matching string with discrete
            (0, 5, 4),
        )
        for preset, chosen, expected in scenarios:
            choose(extra_combo, preset)
            choose(data_combo, chosen)
            self.assertEqual(extra_combo.currentIndex(), expected)
Пример #12
0
 def test_continuous_metas(self):
     """
     Send data that has no regular attributes: all iris attributes but the
     last are moved to metas, next to a string variable.
     """
     iris_domain = self.iris.domain
     meta_vars = iris_domain.attributes[:-1] + (StringVariable("str"), )
     metas_only = Domain([], iris_domain.class_var, meta_vars)
     table = Table.from_table(metas_only, self.iris)
     self.send_signal(self.widget.Inputs.data, table)
    def from_file(cls, filename):
        """
        Load distance matrix from a file

        The file should preferably be encoded in ascii/utf-8. White space at
        the beginning and end of lines is ignored.

        The first line of the file starts with the matrix dimension. It
        can be followed by a list of flags

        - *axis=<number>*: the axis number
        - *symmetric*: the matrix is symmetric; when reading the element (i, j)
          its value is also assigned to (j, i)
        - *asymmetric*: the matrix is asymmetric
        - *row_labels*: the file contains row labels
        - *col_labels*: the file contains column labels

        By default, matrices are symmetric, have axis 1 and no labels are given.
        Flags *labeled* and *labelled* are obsolete aliases for *row_labels*.

        If the file has column labels, they follow in the second line.
        Row labels appear at the beginning of each row.
        Labels are arbitrary strings that cannot contain newlines and
        tabulators. Labels are stored as instances of `Table` with a single
        meta attribute named "label".

        The remaining lines contain tab-separated numbers, preceded with labels,
        if present. Lines are padded with zeros if necessary. If the matrix is
        symmetric, the file contains the lower triangle; any data above the
        diagonal is ignored.

        Args:
            filename: file name

        Returns:
            an instance of `cls` constructed from the matrix, row labels,
            column labels and axis

        Raises:
            ValueError: if the file is empty, does not begin with the
                dimension, contains an invalid flag, has a mismatching
                number of column labels, too many rows or columns, or an
                element that is not a number
        """
        with open(filename, encoding=detect_encoding(filename)) as fle:
            line = fle.readline()
            if not line:
                raise ValueError("empty file")
            data = line.strip().split()
            # A first line consisting of white space only yields no tokens;
            # report it like any other missing dimension
            if not data or not data[0].isdigit():
                raise ValueError("distance file must begin with dimension")
            n = int(data.pop(0))
            symmetric = True
            axis = 1
            # None means "no labels"; a list is filled while reading
            col_labels = row_labels = None
            for flag in data:
                if flag in ("labelled", "labeled", "row_labels"):
                    row_labels = []
                elif flag == "col_labels":
                    col_labels = []
                elif flag == "symmetric":
                    symmetric = True
                elif flag == "asymmetric":
                    symmetric = False
                else:
                    flag_data = flag.split("=")
                    if len(flag_data) == 2:
                        name, value = map(str.strip, flag_data)
                    else:
                        name, value = "", None
                    if name == "axis" and value.isdigit():
                        axis = int(value)
                    else:
                        raise ValueError("invalid flag '{}'".format(flag))
            if col_labels is not None:
                col_labels = [
                    x.strip() for x in fle.readline().strip().split("\t")
                ]
                if len(col_labels) != n:
                    raise ValueError("mismatching number of column labels")

            matrix = np.zeros((n, n))
            for i, line in enumerate(fle):
                if i >= n:
                    raise ValueError("too many rows")
                line = line.strip().split("\t")
                if row_labels is not None:
                    row_labels.append(line.pop(0).strip())
                if len(line) > n:
                    raise ValueError(
                        "too many columns in matrix row {}".format(
                            "'{}'".format(row_labels[i]) if row_labels else i +
                            1))
                # For symmetric matrices only the lower triangle is read
                for j, e in enumerate(line[:i + 1 if symmetric else n]):
                    try:
                        matrix[i, j] = float(e)
                    except ValueError as exc:
                        raise ValueError(
                            "invalid element at row {}, column {}".format(
                                "'{}'".format(row_labels[i])
                                if row_labels else i + 1,
                                "'{}'".format(col_labels[j])
                                if col_labels else j + 1,
                            )) from exc
                    if symmetric:
                        matrix[j, i] = matrix[i, j]
        if col_labels:
            col_labels = Table.from_list(
                Domain([], metas=[StringVariable("label")]),
                [[item] for item in col_labels],
            )
        if row_labels:
            row_labels = Table.from_list(
                Domain([], metas=[StringVariable("label")]),
                [[item] for item in row_labels],
            )
        return cls(matrix, row_labels, col_labels, axis)
    def apply(self):
        """
        Build the learner, fit it to the selected x/y pair, refresh the plot
        and send the learner, model, coefficients and data to the outputs.

        Without data, only the learner is built; the model and coefficients
        outputs are sent None. If every row has a missing x or y value, the
        `all_none` error is shown, the plot is cleared and nothing is sent.
        """
        degree = int(self.polynomialexpansion)
        # Fall back to plain linear regression when no learner is set
        learner = self.LEARNER(preprocessors=self.preprocessors,
                               degree=degree,
                               learner=LinearRegressionLearner()
                               if self.learner is None else self.learner)
        learner.name = self.learner_name
        predictor = None

        self.Error.all_none.clear()

        if self.data is not None:
            attributes = self.x_var_model[self.x_var_index]
            class_var = self.y_var_model[self.y_var_index]
            # Reduce the data to the single chosen x and the chosen y
            data_table = Table(Domain([attributes], class_vars=[class_var]),
                               self.data)

            # Every row has a missing x or y value
            if sum(
                    math.isnan(line[0]) or math.isnan(line.get_class())
                    for line in data_table) == len(data_table):
                self.Error.all_none()
                self.clear_plot()
                return

            predictor = learner(data_table)

            # Apply the learner's preprocessors so that the plotted points
            # are the preprocessed ones
            preprocessed_data = data_table
            for preprocessor in learner.active_preprocessors:
                preprocessed_data = preprocessor(preprocessed_data)

            x = preprocessed_data.X.ravel()
            y = preprocessed_data.Y.ravel()

            # 1000 evenly spaced x values over the data range, used for
            # drawing the regression line
            linspace = np.linspace(np.nanmin(x), np.nanmax(x),
                                   1000).reshape(-1, 1)
            values = predictor(linspace, predictor.Value)

            # calculate prediction for x from data
            predicted = TestOnTrainingData(preprocessed_data, [learner])
            self.rmse = round(RMSE(predicted)[0], 6)
            self.mae = round(MAE(predicted)[0], 6)

            # plot error bars
            self.plot_error_bars(x, predicted.actual,
                                 predicted.predicted.ravel())

            # plot data points
            self.plot_scatter_points(x, y)

            # plot regression line
            self.plot_regression_line(linspace.ravel(), values.ravel())

            x_label = self.x_var_model[self.x_var_index]
            axis = self.plot.getAxis("bottom")
            axis.setLabel(x_label)

            y_label = self.y_var_model[self.y_var_index]
            axis = self.plot.getAxis("left")
            axis.setLabel(y_label)

            self.set_range(x, y)

        self.Outputs.learner.send(learner)
        self.Outputs.model.send(predictor)

        # Send model coefficients
        model = None
        if predictor is not None:
            # Unwrap the fitted model if the predictor's model wraps one
            model = predictor.model
            if hasattr(model, "model"):
                model = model.model
            elif hasattr(model, "skl_model"):
                model = model.skl_model
        if model is not None and hasattr(model, "coef_"):
            domain = Domain([ContinuousVariable("coef")],
                            metas=[StringVariable("name")])
            # The intercept is added to the first coefficient, which is
            # named "1" below.
            coefs = [model.intercept_ + model.coef_[0]] + list(model.coef_[1:])
            # `x_label` is bound here: a predictor exists only if the
            # branch for `self.data is not None` above was executed
            names = ["1", x_label] + \
                    ["{}^{}".format(x_label, i) for i in range(2, degree + 1)]
            coef_table = Table(domain, list(zip(coefs, names)))
            self.Outputs.coefficients.send(coef_table)
        else:
            self.Outputs.coefficients.send(None)

        self.send_data()
Пример #15
0
    def test_colors_diff_domain(self):
        """
        Test whether the color selection for values is correct.

        Two predictors whose class variables share the name but may differ
        in values, order of values or colors are sent to the widget and the
        result of `_get_colors` is compared with the expected colors.
        """
        # pylint: disable=protected-access
        self.send_signal(self.widget.Inputs.data, self.iris)
        idom = self.iris.domain
        class_var = idom.class_var

        def domain_with_values(values):
            # Iris attributes with a new class variable of the same name
            return Domain(idom.attributes,
                          DiscreteVariable(class_var.name, values))

        def colors_for(data1, data2):
            # Fit a constant model on each table, send both as predictors
            # and return the colors chosen by the widget
            first = ConstantLearner()(data1)
            second = ConstantLearner()(data2)
            self.send_signal(self.widget.Inputs.predictors, first)
            self.send_signal(self.widget.Inputs.predictors, second, 1)
            return self.widget._get_colors()

        # case 1: two domains one subset other
        dom1 = domain_with_values(class_var.values)
        dom2 = domain_with_values(class_var.values[:2])
        iris1 = self.iris[:100].transform(dom1)
        iris2 = self.iris[:100].transform(dom2)
        np.testing.assert_array_equal(
            colors_for(iris1, iris2), iris1.domain.class_var.colors)

        # case 2: two domains one subset other - different color order
        reversed_colors = class_var.colors[::-1]
        dom1 = domain_with_values(class_var.values)
        dom2 = domain_with_values(class_var.values[:2])
        dom1.class_var.colors = reversed_colors
        dom2.class_var.colors = reversed_colors[:2]
        iris1 = self.iris[:100].transform(dom1)
        iris2 = self.iris[:100].transform(dom2)
        np.testing.assert_array_equal(
            colors_for(iris1, iris2), iris1.domain.class_var.colors)

        # case 3: domain color, values miss-match - use default colors
        dom1 = domain_with_values(class_var.values)
        dom2 = domain_with_values(class_var.values)
        dom1.class_var.colors = dom1.class_var.colors[::-1]
        iris1 = self.iris.transform(dom1)
        iris2 = self.iris.transform(dom2)
        np.testing.assert_array_equal(
            colors_for(iris1, iris2), LimitedDiscretePalette(3).palette)

        # case 4: two domains different values order, matching colors
        # this way we know that default colors are not used
        custom_colors = LimitedDiscretePalette(5).palette[2:]
        dom1 = domain_with_values(class_var.values)
        dom2 = domain_with_values(class_var.values[::-1])
        dom1.class_var.colors = custom_colors
        # colors are reversed in the same way as the values
        dom2.class_var.colors = custom_colors[::-1]
        iris1 = self.iris[:100].transform(dom1)
        iris2 = self.iris[:100].transform(dom2)
        np.testing.assert_array_equal(
            colors_for(iris1, iris2), iris1.domain.class_var.colors)
Пример #16
0
 def _get_projection_data(self):
     """Return the data with the projection's attributes appended as metas.

     Returns ``None`` when either the data or the projection is missing.
     """
     data = self.data
     if data is None:
         return None
     projection = self.projection
     if projection is None:
         return None

     source = data.domain
     return data.transform(
         Domain(source.attributes,
                source.class_vars,
                source.metas + projection.domain.attributes))
Пример #17
0
    def send_data(self):
        """Send the annotated data table and the table of centroids.

        Both outputs receive ``None`` when there is no data, or when the
        clustering stored for the chosen ``k`` is missing or is a string.
        Otherwise the input data gets two extra meta columns (cluster index
        and silhouette score) and the centroids are sent as a separate table
        carrying the same two meta columns.
        """
        if not self.optimize_k:
            k = self.k
        else:
            row = self.selected_row()
            k = None if row is None else self.k_from + row

        km = self.clusterings.get(k)
        if self.data is None or km is None or isinstance(km, str):
            self.Outputs.annotated_data.send(None)
            self.Outputs.centroids.send(None)
            return

        domain = self.data.domain
        cluster_var = DiscreteVariable(
            get_unique_names(domain, "Cluster"),
            values=["C%d" % (x + 1) for x in range(km.k)])
        clust_ids = km.labels
        silhouette_var = ContinuousVariable(
            get_unique_names(domain, "Silhouette"))

        samples = km.silhouette_samples
        if samples is None:
            self.Warning.no_silhouettes()
            scores = np.nan
            clust_scores = np.full((km.k, 1), np.nan)
        else:
            self.Warning.no_silhouettes.clear()
            # arctan squashes the raw values into the open interval (0, 1)
            scores = np.arctan(samples) / np.pi + 0.5
            members = (clust_ids == i for i in range(km.k))
            # a cluster without any members gets a score of 0
            clust_scores = np.atleast_2d(
                [np.mean(scores[mask]) if mask.any() else 0.
                 for mask in members]).T

        extra_metas = [cluster_var, silhouette_var]
        new_table = self.data.transform(add_columns(domain, metas=extra_metas))
        new_table.get_column_view(cluster_var)[0][:] = clust_ids
        new_table.get_column_view(silhouette_var)[0][:] = scores

        # Where a model attribute merely replaces unknowns of an attribute
        # from the input domain, report the centroid under that input
        # attribute.
        domain_attributes = set(domain.attributes)
        centroid_attributes = []
        for attr in km.domain.attributes:
            compute = attr.compute_value
            if isinstance(compute, ReplaceUnknowns) \
                    and compute.variable in domain_attributes:
                attr = compute.variable
            centroid_attributes.append(attr)

        centroid_domain = add_columns(
            Domain(centroid_attributes, [], domain.metas),
            metas=[cluster_var, silhouette_var])
        # original metas are unknown for centroids; then cluster index, score
        centroid_metas = np.hstack((
            np.full((km.k, len(domain.metas)), np.nan),
            np.arange(km.k).reshape(km.k, 1),
            clust_scores))
        centroids = Table(centroid_domain, km.centroids, None, centroid_metas)
        centroids.name = (
            "centroids" if self.data.name == Table.name
            else f"{self.data.name} centroids")

        self.Outputs.annotated_data.send(new_table)
        self.Outputs.centroids.send(centroids)
Пример #18
0
 def test_cls_with_single_instance(self):
     """Row clustering runs on data where one class has a single instance."""
     domain = Domain([ContinuousVariable("c1")],
                     [DiscreteVariable("c2", values=("a", "b"))])
     x = np.array([[1], [2], [3]])
     y = np.array([[0], [0], [1]])  # value "b" occurs exactly once
     self.send_signal(self.widget.Inputs.data, Table(domain, x, y))
     self.widget.set_row_clustering(Clustering.Clustering)
Пример #19
0
    def _test_predictions_with_absent_class(self, sparse):
        """Empty classes should not affect predictions.

        ``sparse`` is either ``None`` (keep the arrays dense) or a callable
        that converts a dense array into the matrix type under test.
        """
        features = np.array([
            [1, 0, 0],
            [0, np.nan, 0],
            [0, 1, 0],
            [0, 0, 0],
            [1, 2, 0],
            [1, 1, 0],
            [1, 2, 0],
            [0, 1, 0]])
        if sparse is not None:
            features = sparse(features)

        # class index 1 ("b") never occurs
        classes = np.array([0, 0, 0, 2, 2, 2, 3, 3])
        domain = Domain(
            [DiscreteVariable("a", values="ab"),
             DiscreteVariable("b", values="abc"),
             DiscreteVariable("c", values="a")],
            DiscreteVariable("y", values="abcd"))
        data = Table.from_numpy(domain, features, classes)

        model = self.learner(data)
        np.testing.assert_almost_equal(
            model.class_prob, [4/11, 0, 4/11, 3/11])
        # expected value of exp(log_cont_prob[i]) * class_prob per attribute i
        expected_per_attribute = (
            [[3/7, 2/7], [0, 0], [2/7, 3/7], [2/7, 2/7]],
            [[2/5, 1/3, 1/5], [0, 0, 0], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]],
            [[4/11], [0], [4/11], [3/11]],
        )
        for column, expected in enumerate(expected_per_attribute):
            np.testing.assert_almost_equal(
                np.exp(model.log_cont_prob[column])
                * model.class_prob[:, None],
                expected)

        test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
        # Classifiers reject csc matrices in the base class
        # Naive bayesian classifier supports them if predict_storage is
        # called directly, which we do below
        if sparse is not None and sparse is not sp.csc_matrix:
            test_x = sparse(test_x)
        test_y = np.full((6, ), np.nan)
        # The following was computed manually, too
        exp_probs = np.array([
            [0.47368421052632, 0, 0.31578947368421, 0.21052631578947],
            [0.39130434782609, 0, 0.26086956521739, 0.34782608695652],
            [0.24324324324324, 0, 0.32432432432432, 0.43243243243243],
            [0.31578947368421, 0, 0.47368421052632, 0.21052631578947],
            [0.26086956521739, 0, 0.39130434782609, 0.34782608695652],
            [0.15000000000000, 0, 0.45000000000000, 0.40000000000000]
        ])
        exp_values = np.argmax(exp_probs, axis=1)

        def check_batch(batch):
            """Verify probabilities, values and both at once for a batch."""
            probs = model(batch, ret=model.Probs)
            np.testing.assert_almost_equal(exp_probs, probs)
            values = model(batch)
            np.testing.assert_equal(values, exp_values)
            values, probs = model(batch, ret=model.ValueProbs)
            np.testing.assert_almost_equal(exp_probs, probs)
            np.testing.assert_equal(values, exp_values)

        # Test the faster algorithm for Table (numpy matrices)
        check_batch(Table.from_numpy(domain, test_x, test_y))

        # Test the slower algorithm for non-Table data (iteration in Python)
        test_data = NotATable.from_numpy(domain, test_x, test_y)
        check_batch(test_data)

        # Test prediction directly on numpy
        check_batch(test_x)

        # Test prediction on instances
        for inst, exp_prob in zip(test_data, exp_probs):
            best = np.argmax(exp_prob)
            np.testing.assert_almost_equal(
                model(inst, ret=model.Probs), exp_prob)
            self.assertEqual(model(inst), best)
            value, prob = model(inst, ret=model.ValueProbs)
            np.testing.assert_almost_equal(prob, exp_prob)
            self.assertEqual(value, np.argmax(exp_prob))

        # Test prediction by directly calling predict. This is needed to test
        # csc_matrix, but doesn't hurt others
        if sparse is sp.csc_matrix:
            test_x = sparse(test_x)
        values, probs = model.predict(test_x)
        np.testing.assert_almost_equal(exp_probs, probs)
        np.testing.assert_equal(values, exp_values)
Пример #20
0
    def commit(self):
        """Send the selected clusters and the annotated data to the outputs.

        Both outputs receive ``None`` when there are no items. When nothing is
        selected, the selected-data output receives ``None`` and the annotated
        output receives either an annotated table with an empty selection or
        ``None``, depending on the selection method and the matrix axis.

        For a table clustered by rows (``matrix.axis == 1``) a "Cluster" meta
        variable is added; for a table clustered by columns
        (``matrix.axis == 0``) the attributes are reordered by cluster and
        tagged with a ``"cluster"`` entry in their ``attributes`` dict.
        For any other kind of items both outputs receive ``None``.
        """
        items = getattr(self.matrix, "items", self.items)
        if not items:
            self.Outputs.selected_data.send(None)
            self.Outputs.annotated_data.send(None)
            return

        selection = self.dendrogram.selected_nodes()
        selection = sorted(selection, key=lambda c: c.value.first)

        indices = [leaf.value.index for leaf in leaves(self.root)]

        # one list of item indices per selected cluster
        maps = [indices[node.value.first:node.value.last]
                for node in selection]

        selected_indices = list(chain.from_iterable(maps))
        unselected_indices = sorted(set(range(self.root.value.last)) -
                                    set(selected_indices))

        if not selected_indices:
            self.Outputs.selected_data.send(None)
            annotated_data = create_annotated_table(items, []) \
                if self.selection_method == 0 and self.matrix.axis else None
            self.Outputs.annotated_data.send(annotated_data)
            return

        # Initialise both outputs: when neither branch below applies (items
        # that are not a table, or an unexpected axis) the sends at the end
        # must not fail on an unbound name.
        selected_data = None
        annotated_data = None

        if isinstance(items, Orange.data.Table) and self.matrix.axis == 1:
            # Select rows
            c = np.zeros(self.matrix.shape[0])

            for i, cluster_rows in enumerate(maps):
                c[cluster_rows] = i
            # unselected rows get the extra, last value ("Other")
            c[unselected_indices] = len(maps)

            mask = c != len(maps)

            domain = items.domain
            attrs = domain.attributes
            classes = domain.class_vars
            metas = domain.metas

            var_name = get_unique_names(domain, "Cluster")
            values = [f"C{i + 1}" for i in range(len(maps))]

            clust_var = Orange.data.DiscreteVariable(
                var_name, values=values + ["Other"])
            domain = Orange.data.Domain(attrs, classes, metas + (clust_var,))
            data = items.transform(domain)
            with data.unlocked(data.metas):
                data.get_column_view(clust_var)[0][:] = c

            # selected_indices cannot be empty here; that case returned above.
            # The selected subset uses a variable without the "Other" value.
            selected_data = data[mask]
            clust_var = Orange.data.DiscreteVariable(
                var_name, values=values)
            selected_data.domain = Orange.data.Domain(
                attrs, classes, metas + (clust_var, ))

            annotated_data = create_annotated_table(data, selected_indices)

        elif isinstance(items, Orange.data.Table) and self.matrix.axis == 0:
            # Select columns
            attrs = []
            # selected clusters are numbered from 1; unselected columns get 0
            for clust, columns in chain(enumerate(maps, start=1),
                                        [(0, unselected_indices)]):
                for i in columns:
                    attr = items.domain[i].copy()
                    attr.attributes["cluster"] = clust
                    attrs.append(attr)
            domain = Orange.data.Domain(
                # len(unselected_indices) can be 0
                attrs[:len(attrs) - len(unselected_indices)],
                items.domain.class_vars, items.domain.metas)
            selected_data = items.from_table(domain, items)

            domain = Orange.data.Domain(
                attrs,
                items.domain.class_vars, items.domain.metas)
            annotated_data = items.from_table(domain, items)

        self.Outputs.selected_data.send(selected_data)
        self.Outputs.annotated_data.send(annotated_data)
Пример #21
0
 def test_init_source_class(self):
     """Names and indices are resolved through the ``source`` domain."""
     source = Domain((age, gender, income), (education, race))
     derived = Domain(["Gender", 0], "income", source=source)
     expected = (gender, age, income)
     self.assertEqual(derived.variables, expected)
Пример #22
0
 def __call__(self, data, *_):
     """Raise ``ValueError`` for any data; build an empty-domain model for None.

     Extra positional arguments are accepted and ignored.
     """
     if data is None:
         return Model(Domain([]))
     raise ValueError("boom")
Пример #23
0
 def test_wrong_vartypes_w_source(self):
     """A non-iterable variable spec is rejected even with a source domain."""
     source = Domain((age, gender), metas=(ssn, ))
     self.assertRaises(TypeError, Domain, -1, source=source)
Пример #24
0
 def extract_col(data, var):
     """Return the column of ``var`` from ``data``.

     The data is transformed into a domain whose only attribute is ``var``
     and the first column of the resulting ``X`` is returned.
     """
     single_var_domain = Domain([var])
     return data.transform(single_var_domain).X[:, 0]
Пример #25
0
 def test_get_item_slices(self):
     """Slicing a domain yields the matching tuple of attributes."""
     d = Domain((age, gender, income, race), metas=(ssn, race))
     cases = (
         (slice(None, 2), (age, gender)),
         (slice(1, 3), (gender, income)),
         (slice(2, None), (income, race)),
     )
     for index, expected in cases:
         self.assertEqual(d[index], expected)
Пример #26
0
    def get_domain(self, domain, data):
        """Build the domain (and data columns) reflecting edits in the widget.

        Parameters
        ----------
        domain : old domain
        data : source data

        Returns
        -------
        (new_domain, [attribute_columns, class_var_columns, meta_columns])
            When nothing was changed, the old domain is returned together
            with ``data.X``, ``data.Y`` and ``data.metas``.
        """
        # Exact type comparison is intended throughout, hence type() rather
        # than isinstance()
        # pylint: disable=unidiomatic-typecheck

        def numbers_are_round(var, col_data):
            """Tell whether a continuous column holds only whole numbers."""
            if type(var) != ContinuousVariable:
                return False
            known = np.asarray(col_data.data)  # Works for dense and sparse
            known = known[~np.isnan(known)]
            return (known == known.astype(int)).all()

        variables = self.model().variables
        places = [[], [], []]  # attributes, class_vars, metas
        cols = [[], [], []]  # Xcols, Ycols, Mcols

        # original variables paired with the place they currently occupy
        originals = list(chain(
            ((at, Place.feature) for at in domain.attributes),
            ((cl, Place.class_var) for cl in domain.class_vars),
            ((mt, Place.meta) for mt in domain.metas)))

        # Exit early with original domain if the user didn't actually change
        # anything
        unchanged = all(
            name == orig_var.name and tpe == type(orig_var)
            and place == orig_plc
            for (name, tpe, place, _, _), (orig_var, orig_plc)
            in zip(variables, originals))
        if unchanged:
            return domain, [data.X, data.Y, data.metas]

        for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) \
                in zip(variables, originals):
            if place == Place.skip:
                continue

            col_data = self._get_column(data, orig_var, orig_plc)
            is_sparse = sp.issparse(col_data)
            orig_type = type(orig_var)

            if tpe == orig_type:
                if name != orig_var.name:
                    # change the name so that all_vars will get the correct
                    # name; note that this renames the original variable
                    orig_var.name = name
                var = orig_var
            elif tpe == DiscreteVariable:
                values = [str(i) for i in unique(col_data)
                          if not self._is_missing(i)]
                # must be determined before col_data is replaced by indices
                round_numbers = numbers_are_round(orig_var, col_data)
                col_data = [
                    np.nan if self._is_missing(x) else values.index(str(x))
                    for x in self._iter_vals(col_data)
                ]
                if round_numbers:
                    values = [str(int(float(v))) for v in values]
                var = tpe(name, values)
                col_data = self._to_column(col_data, is_sparse)
            elif tpe == StringVariable:
                var = tpe.make(name)
                if orig_type == DiscreteVariable:
                    col_data = [
                        "" if np.isnan(x) else orig_var.repr_val(x)
                        for x in self._iter_vals(col_data)
                    ]
                elif orig_type == ContinuousVariable:
                    as_int = numbers_are_round(orig_var, col_data)
                    col_data = [
                        "" if np.isnan(x)
                        else (str(int(x)) if as_int else orig_var.repr_val(x))
                        for x in self._iter_vals(col_data)
                    ]
                # don't obey sparsity for StringVariable since they are
                # in metas which are transformed to dense below
                col_data = self._to_column(col_data, False, dtype=object)
            elif tpe == ContinuousVariable and orig_type == DiscreteVariable:
                var = tpe.make(name)
                if may_be_numeric:
                    # interpret the labels of the discrete values as numbers
                    col_data = [
                        np.nan if self._is_missing(x)
                        else float(orig_var.values[int(x)])
                        for x in self._iter_vals(col_data)
                    ]
                col_data = self._to_column(col_data, is_sparse)
            else:
                var = tpe(name)
            places[place].append(var)
            cols[place].append(col_data)

        # merge columns for X, Y and metas
        feats = cols[Place.feature]
        if feats:
            X = self._merge(feats)
        else:
            X = np.empty((len(data), 0))
        Y = self._merge(cols[Place.class_var], force_dense=True)
        m = self._merge(cols[Place.meta], force_dense=True)
        domain = Domain(*places)
        return domain, [X, Y, m]
Пример #27
0
    def test_has_continuous(self):
        """``has_continuous_attributes`` with and without the optional flags."""
        def check(domain, expected, *flags):
            verdict = self.assertTrue if expected else self.assertFalse
            verdict(domain.has_continuous_attributes(*flags))

        # called without arguments
        check(Domain([]), False)
        check(Domain([], [age]), False)
        check(Domain([], [race]), False)

        check(Domain([age], None), True)
        check(Domain([race], None), False)
        check(Domain([age, race], None), True)
        check(Domain([race, age], None), True)

        # first flag set
        check(Domain([], [age]), True, True)
        check(Domain([], [race]), False, True)
        check(Domain([age], None), True, True)
        check(Domain([race], None), False, True)
        check(Domain([age], race), True, True)
        check(Domain([race], age), True, True)
        check(Domain([], [race, age]), True, True)

        # second flag set, with and without the first
        check(Domain([], None, [age]), True, False, True)
        check(Domain([], None, [gender]), False, False, True)
        check(Domain([], [gender], [age]), True, True, True)
        check(Domain([], [race], [gender]), False, True, True)
    def setSeries(self, timeseries, attr, xdim, ydim, fagg):
        """Aggregate ``attr`` over a grid of time categories and chart it.

        Every time value is mapped to an (x, y) cell by the functions carried
        in ``xdim.value`` and ``ydim.value``; the values of ``attr`` that fall
        into a cell are reduced with ``fagg``. Cells without a usable
        aggregate are labelled ``'NaN'`` and drawn white, the others are
        coloured by their position between the smallest and the largest
        aggregate.

        The chart is cleared instead when there is no time series, no
        attribute, or no cell produced a value.

        Parameters
        ----------
        timeseries : the data; must provide ``domain`` and ``time_variable``
        attr : variable(s) to aggregate, resolved against ``timeseries.domain``
        xdim, ydim : axis descriptors whose ``value`` is a pair
            ``(categories, function)``; empty categories are derived from
            the data
        fagg : callable reducing the values of one cell to a single number
        """
        if timeseries is None or not attr:
            self.clear()
            return
        # TODO: support discrete variables
        if isinstance(xdim, str) and xdim.isdigit():
            xdim = [str(i) for i in range(1, int(xdim) + 1)]
        if isinstance(ydim, str) and ydim.isdigit():
            ydim = [str(i) for i in range(1, int(ydim) + 1)]

        # NOTE(review): a digit-string dimension becomes a list above, and a
        # list has no ``.value`` -- confirm that callers never pass one.
        xvals, xfunc = xdim.value
        yvals, yfunc = ydim.value

        values = Timeseries(Domain([], [], attr, source=timeseries.domain), timeseries).metas
        time_values = np.ravel(timeseries[:, timeseries.time_variable])
        time_values = [datetime.fromtimestamp(i) for i in time_values]

        if not yvals:
            yvals = sorted(set(yfunc(i) for i in time_values))
        if not xvals:
            xvals = sorted(set(xfunc(i) for i in time_values))

        # row indices of the data grouped by the cell they belong to
        indices = defaultdict(list)
        for i, tval in enumerate(time_values):
            indices[(xfunc(tval), yfunc(tval))].append(i)

        series = []
        aggvals = []
        self.indices = []
        xname = self.AxesCategories.name_it(xdim)
        yname = self.AxesCategories.name_it(ydim)
        for yval in yvals:
            data = []
            series.append(dict(name=yname(yval), data=data))
            self.indices.append([])
            for xval in xvals:
                inds = indices.get((xval, yval), ())
                self.indices[-1].append(inds)
                point = dict(y=1)
                data.append(point)
                if inds:
                    try:
                        aggval = fagg(values[inds])
                    except ValueError:
                        aggval = np.nan
                else:
                    aggval = np.nan
                if np.isnan(aggval):
                    aggval = 'NaN'
                    point['select'] = ''
                    point['color'] = 'white'
                else:
                    aggvals.append(aggval)
                point['n'] = aggval

        # TODO: allow scaling over just rows or cols instead of all values as currently
        try:
            maxval, minval = np.max(aggvals), np.min(aggvals)
        except ValueError:
            # no cell produced a value
            self.clear()
            return
        ptpval = maxval - minval
        color = GradientPaletteGenerator('#ffcccc', '#cc0000')
        selected_color = GradientPaletteGenerator('#ccffcc', '#006600')
        for serie in series:
            for point in serie['data']:
                n = point['n']
                if isinstance(n, Number):
                    # When all aggregates are equal the range is zero and the
                    # division would give NaN as the palette position; use
                    # the middle of the palette instead.
                    val = (n - minval) / ptpval if ptpval else 0.5
                    point['color'] = color[val]
                    point['states'] = dict(select=dict(color=selected_color[val]))

        # TODO: make a white hole in the middle. Center w/o data.
        self.chart(series=series,
                   xAxis_categories=[xname(i) for i in xvals],
                   yAxis_categories=[yname(i) for i in reversed(yvals)])
Пример #29
0
 def test_unpickling_recreates_known_domains(self):
     """A domain restored from a pickle has a ``_known_domains`` attribute."""
     Variable._clear_all_caches()
     payload = pickle.dumps(Domain([]))
     restored = pickle.loads(payload)
     self.assertTrue(hasattr(restored, "_known_domains"))
Пример #30
0
 def effective_data(self):
     """Return the data transformed to a domain of the effective variables."""
     transform = self.data.transform
     return transform(Domain(self.effective_variables))