def setUpClass(cls):
    """Load the iris data once and derive a variant without class variables."""
    super().setUpClass()
    iris = Table("iris.tab")
    cls.iris = iris
    # Same attributes as iris, but with an empty list of class variables
    attributes_only = Domain(iris.domain.attributes, [])
    cls.iris_no_class = Table(attributes_only, iris)
def test_has_discrete(self):
    """Check `Domain.has_discrete_attributes` with and without its two flags.

    The positional flags are presumably "include class" and "include metas"
    (that is what the expectations below imply) -- confirm against `Domain`.
    """

    def check(expected, domain_args, *flags):
        # Each case builds its own Domain and queries it immediately
        outcome = Domain(*domain_args).has_discrete_attributes(*flags)
        if expected:
            self.assertTrue(outcome)
        else:
            self.assertFalse(outcome)

    # No flags given: a discrete class variable alone is not reported
    for expected, domain_args in (
            (False, ([], )),
            (False, ([], [age])),
            (False, ([], race)),
            (False, ([age], None)),
            (True, ([race], None)),
            (True, ([age, race], None)),
            (True, ([race, age], None)),
    ):
        check(expected, domain_args)

    # First flag set: class variables are taken into account as well
    for expected, domain_args in (
            (False, ([], [age])),
            (True, ([], [race])),
            (False, ([age], None)),
            (True, ([race], None)),
            (True, ([age], race)),
            (True, ([race], age)),
            (True, ([], [race, age])),
    ):
        check(expected, domain_args, True)

    # Second flag set: meta attributes are taken into account
    for expected, domain_args, flags in (
            (True, ([], None, [gender]), (False, True)),
            (False, ([], None, [age]), (False, True)),
            (True, ([], [age], [gender]), (True, True)),
            (False, ([], [incomeA], [age]), (True, True)),
    ):
        check(expected, domain_args, *flags)
def test_has_time(self):
    """Check `Domain.has_time_attributes` and the `has_time_class` attribute."""

    def check(expected, outcome):
        if expected:
            self.assertTrue(outcome)
        else:
            self.assertFalse(outcome)

    # Default call: only a time variable among the attributes is reported
    for expected, domain_args in (
            (False, ([], )),
            (False, ([], [age])),
            (False, ([], [race])),
            (False, ([], [arrival])),
            (False, ([], [], [arrival])),
            (True, ([arrival], [])),
    ):
        check(expected, Domain(*domain_args).has_time_attributes())

    # Class variables and metas are considered only on request
    check(True,
          Domain([], [arrival]).has_time_attributes(include_class=True))
    check(True,
          Domain([], [], [arrival]).has_time_attributes(include_metas=True))

    # has_time_class is true only when the time variable is the class
    for expected, domain_args in (
            (False, ([arrival], [])),
            (True, ([], [arrival])),
            (False, ([], [], [arrival])),
    ):
        check(expected, Domain(*domain_args).has_time_class)
def test_wrong_vartypes(self):
    """Constructing a `Domain` from each argument set must raise TypeError."""
    attributes = (age, gender, income)
    # NOTE(review): the last two entries are plain 4-tuples of variables, so
    # `Domain(*args)` spreads them over four positional parameters instead of
    # passing them as a single attribute list. Possibly a trailing comma was
    # intended -- confirm before changing; behaviour is kept as-is here.
    bad_argument_sets = [
        (attributes, ssn),
        attributes + (ssn, ),
        (ssn, ) + attributes,
    ]
    for args in bad_argument_sets:
        with self.assertRaises(TypeError):
            Domain(*args)
def test_get_item(self):
    """Index a `Domain` by variable, by string and by integer position.

    The negative positions in the cases below resolve to the meta variables.
    """
    d = Domain((age, gender, income), metas=(ssn, race))
    lookups = (
        (age, age), ("AGE", age), (0, age),
        (income, income), ("income", income), (2, income),
        (ssn, ssn), ("SSN", ssn), (-1, ssn),
        (-2, race),
    )
    for key, expected in lookups:
        self.assertEqual(d[key], expected)
def __call__(self, G):
    """Embed the nodes of network `G` and return them as a new `Table`.

    Random walks are simulated over the network, fed to gensim's `Word2Vec`
    as sentences of node ids, and the resulting vectors are appended as
    `self.emb_size` continuous features named `<feature_prefix>_<i>`.
    If `G.nodes` is a `Table`, its attributes, class variables and metas are
    carried over; otherwise the result contains only the embedding columns.

    Args:
        G: network object; the code uses `G.edges`, `G.number_of_nodes()`
            and `G.nodes`.

    Returns:
        Table with one row per node.

    Raises:
        ValueError: if the network has no edges.
    """
    if not G.edges:
        raise ValueError("Network has no edges")

    num_nodes = G.number_of_nodes()
    nodes = np.arange(num_nodes)
    # Node->node probas are needed for the initial step, when there's no
    # previous edge to condition on
    self._node_probas = self.setup_nodes(G, nodes)
    edges_coo = G.edges[0].edges.tocoo(copy=False)
    edges = np.column_stack((edges_coo.row, edges_coo.col))
    self._edge_probas = self.setup_edges(G, edges)

    walks = self._simulate_walks(G)
    walks = [list(map(str, walk)) for walk in walks]

    # gensim changed "size" param to "vector_size" in v. 4.0.0
    # https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
    params = dict(window=self.window_size, min_count=0, sg=1, workers=4,
                  callbacks=self.callbacks)
    # Compare the numeric major version: comparing version strings is
    # lexicographic and would e.g. rank "10.0.0" below "4.0.0"
    try:
        gensim_major = int(gensim.__version__.split(".", 1)[0])
    except ValueError:
        # Unparsable version string: assume the current (>= 4) API
        gensim_major = 4
    if gensim_major < 4:
        params["size"] = self.emb_size
        params["iter"] = self.num_epochs
    else:
        params["vector_size"] = self.emb_size
        params["epochs"] = self.num_epochs
    model = Word2Vec(walks, **params)

    items = G.nodes
    # Maps attribute name -> (position in the new domain, variable)
    new_attrs = {}
    new_data = np.array([[] for _ in range(num_nodes)])
    class_vars, meta_vars = [], []
    class_data, meta_data = np.array([
        [] for _ in range(num_nodes)
    ]), np.array([[] for _ in range(num_nodes)])
    if isinstance(items, Table):
        # Keep only the first column for every distinct attribute name
        attrs_mask = []
        for attr in items.domain.attributes:
            attrs_mask.append(attr.name not in new_attrs)
            new_attrs[attr.name] = new_attrs.get(attr.name,
                                                 (len(new_attrs), attr))
        new_data = items.X[:, np.array(attrs_mask, dtype=bool)]
        class_vars, meta_vars = items.domain.class_vars, items.domain.metas
        class_data, meta_data = items.Y, items.metas

    # override existing continuous vars with same names
    # NOTE(review): when an embedding name collides with an existing
    # attribute, the dict entry is replaced but receives a fresh index while
    # the old data column is kept, so positions and columns look inconsistent
    # in that case -- confirm and handle separately.
    for i in range(self.emb_size):
        new_name = "{}_{}".format(self.feature_prefix, i)
        new_attrs[new_name] = (len(new_attrs), ContinuousVariable(new_name))

    new_data = np.hstack(
        (new_data,
         np.array([model.wv[str(curr_node)] for curr_node in nodes])))
    ordered_attrs = [None] * len(new_attrs)
    for idx, attr in new_attrs.values():
        ordered_attrs[idx] = attr

    new_domain = Domain(ordered_attrs, class_vars, meta_vars)
    new_items = Table(new_domain, new_data, class_data, meta_data)
    return new_items
def data_table(self, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
    as well **have it sorted column-major**, e.g. ``order='F'``).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.

    Args:
        data: rows of cell values; rows for which ``any(row)`` is false are
            skipped.
        headers: zero to three header rows, or a false value to have them
            extracted with `self.parse_headers`.

    Returns:
        Table created with `Table.from_numpy`, or an empty table created
        with `Table.from_domain(domain, 0)` when there are no data cells.

    Raises:
        ValueError: if a column declared as continuous contains a value
            that cannot be converted to float.
    """
    if not headers:
        headers, data = self.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if 3 == len(headers):
        names, types, flags = map(list, headers)
    else:
        if 1 == len(headers):
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[
                i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else ('', i)
                for i in headers[0]
            ])
            names = list(names)
        elif 2 == len(headers):
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # In the combined flag string, upper-case letters give the type and
        # lower-case letters give the flags
        types = [
            ''.join(filter(str.isupper, flag)).lower() for flag in _flags
        ]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    def _equal_length(lst):
        # Pad `lst` in place with empty strings up to the current `rowlen`
        lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array
    data = np.array([_equal_length(list(row)) for row in data if any(row)],
                    copy=False, dtype=object, order='F')

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        # The array has no second dimension (no data rows were kept)
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    # Column indices and variables, collected per role
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Rename variables if necessary
    # Reusing across files still works if both files have same duplicates
    name_counts = Counter(names)
    del name_counts[""]
    if len(name_counts) != len(names) and name_counts:
        # Names occurring more than once get a running "_<n>" suffix
        uses = {
            name: 0 for name, count in name_counts.items() if count > 1
        }
        for i, name in enumerate(names):
            if name in uses:
                uses[name] += 1
                names[i] = "{}_{}".format(name, uses[name])

    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        # Columns carrying the `i` flag are skipped entirely
        if flag.i:
            continue

        type_flag = types and types[col].strip()
        try:
            orig_values = [
                np.nan if i in MISSING_VALUES else i
                for i in (i.strip() for i in data[:, col])
            ]
        except IndexError:
            # No data instances leads here
            orig_values = []
            # In this case, coltype could be anything. It's set as-is
            # only to satisfy test_table.TableTestCase.test_append
            coltype = DiscreteVariable

        coltype_kwargs = {}
        valuemap = []
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            try:
                values = [float(i) for i in orig_values]
            except ValueError:
                # Locate the first offending row so that the error message
                # can point at it
                for row, num in enumerate(orig_values):
                    try:
                        float(num)
                    except ValueError:
                        break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))

        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable

        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            coltype = DiscreteVariable
            if _RE_DISCRETE_LIST.match(type_flag):
                # The type field itself lists the values
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {np.nan})

        else:
            # No known type specified, use heuristics
            valuemap, values, coltype = guess_data_type(orig_values)

        # Choose the role: string columns always become metas; otherwise the
        # m / w / c flags select metas, weights or class, and the rest are
        # ordinary attributes
        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        cols, domain_vars = append_to
        cols.append(col)

        existing_var, new_var_name, column = None, None, None
        if domain_vars is not None:
            existing_var = names and names[col]
            if not existing_var:
                # Unnamed column: generate a name from NAMEGEN
                new_var_name = next(NAMEGEN)

        values, var = sanitize_variable(valuemap, values, orig_values,
                                        coltype, coltype_kwargs, domain_vars,
                                        existing_var, new_var_name, data)
        if domain_vars is not None:
            var.attributes.update(flag.attributes)
            domain_vars.append(var)

        # Write back the changed data. This is needed to pass the
        # correct, converted values into Table.from_numpy below
        try:
            data[:, col] = values
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    table = Table.from_numpy(domain,
                             data[:, Xcols].astype(float, order='C'),
                             data[:, Ycols].astype(float, order='C'),
                             data[:, Mcols].astype(object, order='C'),
                             data[:, Wcols].astype(float, order='C'))
    return table
def test_nonunique(self):
    """Non-unique matching values must raise the left/right errors.

    Goes through several match settings and join types and checks which of
    `nonunique_left` / `nonunique_right` is shown and whether data is output.
    """
    widget = self.widget

    def expect(left_shown, right_shown, has_output=None):
        # has_output: True -> output present, False -> output is None,
        # None -> the output is not inspected at all
        shown = widget.Error.nonunique_left.is_shown()
        (self.assertTrue if left_shown else self.assertFalse)(shown)
        shown = widget.Error.nonunique_right.is_shown()
        (self.assertTrue if right_shown else self.assertFalse)(shown)
        if has_output is None:
            return
        output = self.get_output(widget.Outputs.data)
        if has_output:
            self.assertIsNotNone(output)
        else:
            self.assertIsNone(output)

    def match_on(*pairs):
        widget.attr_boxes.set_state(list(pairs))
        widget.unconditional_commit()

    x = ContinuousVariable("x")
    d = DiscreteVariable("d", values=list("abc"))
    domain = Domain([x, d], [])
    dataA = Table.from_numpy(
        domain, np.array([[1.0, 0], [1, 1], [2, 1]]))
    dataB = Table.from_numpy(
        domain, np.array([[1.0, 0], [2, 1], [3, 1]]))
    dataB.ids = dataA.ids
    self.send_signal(widget.Inputs.data, dataA)
    self.send_signal(widget.Inputs.extra_data, dataB)
    widget.merging = widget.InnerJoin
    expect(False, False)

    # Matching by instance id or by row index never reports duplicates
    match_on((INSTANCEID, INSTANCEID))
    expect(False, False, has_output=True)

    match_on((INDEX, INDEX))
    expect(False, False, has_output=True)

    # x has a repeated value only in the left table
    match_on((x, x))
    expect(True, False, has_output=False)

    widget.merging = widget.LeftJoin
    widget.unconditional_commit()
    expect(False, False, has_output=True)

    # The combination of x and d is unique in both tables
    widget.merging = widget.InnerJoin
    match_on((x, x), (d, d))
    expect(False, False, has_output=True)

    # d has repeated values in both tables
    match_on((d, d))
    expect(True, True, has_output=False)

    widget.merging = widget.LeftJoin
    widget.unconditional_commit()
    expect(False, True, has_output=False)

    widget.merging = widget.InnerJoin
    widget.unconditional_commit()
    expect(True, True, has_output=False)

    # Removing both inputs clears the errors
    self.send_signal(widget.Inputs.data, None)
    self.send_signal(widget.Inputs.extra_data, None)
    expect(False, False, has_output=False)
def effective_data(self):
    """Return `self.data` transformed to use `self.effective_variables`.

    Class variables and metas of the original domain are kept unchanged.
    """
    data = self.data
    effective_domain = Domain(self.effective_variables,
                              self.data.domain.class_vars,
                              self.data.domain.metas)
    return data.transform(effective_domain)
def test_does_not_crash_on_empty_domain(self):
    """Sending data whose domain has no variables must not raise."""
    no_variables = Domain([])
    empty_data = Table('iris').transform(no_variables)
    self.send_signal(self.widget.Inputs.data, empty_data)
def test_match_attr_name(self):
    """Choosing a variable on the left should pre-select its match on the right.

    Each scenario sets the right ("extra") combo first, then the left
    ("data") combo, and checks where the right combo ends up.
    """
    widget = self.widget
    row = widget.attr_boxes.rows[0]
    data_combo, extra_combo = row.left_combo, row.right_combo

    def choose(combo, index):
        # Emulate a user picking an entry: set it and emit `activated`
        combo.setCurrentIndex(index)
        combo.activated.emit(index)

    domainA = Domain(
        [DiscreteVariable("dA1", ("a", "b", "c", "d")),
         DiscreteVariable("dA2", ("aa", "bb")),
         DiscreteVariable("dA3", ("aa", "bb"))],
        DiscreteVariable("cls", ("aaa", "bbb", "ccc")),
        [DiscreteVariable("mA1", ("cc", "dd")), StringVariable("mA2")])
    XA = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 0], [3, 1, 0]])
    yA = np.array([0, 1, 2, np.nan])
    metasA = np.array([[0.0, "m1"], [1.0, "m2"], [np.nan, "m3"],
                       [0.0, "m4"]]).astype(object)

    domainB = Domain(
        [DiscreteVariable("dB1", values=("a", "b", "c")),
         ContinuousVariable("dA2")],
        None,
        [StringVariable("cls"), DiscreteVariable("dA1", ("m4", "m5"))])
    XB = np.array([[0, 0], [1, 1], [2, np.nan]])
    yB = np.empty((3, 0))
    metasB = np.array([[np.nan, np.nan], [1, 1], [0, 0]]).astype(object)

    dataA = Table(domainA, XA, yA, metasA)
    dataA.name = 'dataA'
    dataA.attributes = 'dataA attributes'
    dataB = Table(domainB, XB, yB, metasB)
    dataB.name = 'dataB'
    dataB.attributes = 'dataB attributes'

    self.send_signal(widget.Inputs.data, dataA)
    self.send_signal(widget.Inputs.extra_data, dataB)

    # (initial extra index, chosen data index, expected extra index)
    scenarios = (
        # match variable if available and the other combo is Row Index
        (0, 2, 5),
        # match variable if available and the other combo is ID
        (1, 2, 5),
        # don't match variable if other combo is set
        (4, 2, 4),
        # don't match if nothing to match to
        (0, 4, 0),
        # don't match numeric with non-numeric
        (0, 3, 0),
        # allow matching string with discrete
        (0, 5, 4),
    )
    for extra_start, data_choice, extra_expected in scenarios:
        choose(extra_combo, extra_start)
        choose(data_combo, data_choice)
        self.assertEqual(extra_combo.currentIndex(), extra_expected)
def test_continuous_metas(self):
    """Send data whose only non-class columns are metas; must not raise.

    The metas are all but the last iris attribute plus one string variable.
    """
    iris_domain = self.iris.domain
    metas = iris_domain.attributes[:-1] + (StringVariable("str"), )
    metas_only = Domain([], iris_domain.class_var, metas)
    data = Table.from_table(metas_only, self.iris)
    self.send_signal(self.widget.Inputs.data, data)
def from_file(cls, filename):
    """
    Load distance matrix from a file

    The file should preferably be encoded in ascii/utf-8. White space at
    the beginning and end of lines is ignored.

    The first line of the file starts with the matrix dimension. It
    can be followed by a list of flags

    - *axis=<number>*: the axis number
    - *symmetric*: the matrix is symmetric; when reading the element (i, j)
      its value is also assigned to (j, i)
    - *asymmetric*: the matrix is asymmetric
    - *row_labels*: the file contains row labels
    - *col_labels*: the file contains column labels

    By default, matrices are symmetric, have axis 1 and no labels are given.
    Flags *labeled* and *labelled* are obsolete aliases for *row_labels*.

    If the file has column labels, they follow in the second line.
    Row labels appear at the beginning of each row.
    Labels are arbitrary strings that cannot contain newlines and
    tabulators. Labels are stored as instances of `Table` with a single
    meta attribute named "label".

    The remaining lines contain tab-separated numbers, preceded with labels,
    if present. Lines are padded with zeros if necessary. If the matrix is
    symmetric, the file contains the lower triangle; any data above the
    diagonal is ignored.

    Args:
        filename: file name

    Returns:
        an instance created as `cls(matrix, row_labels, col_labels, axis)`

    Raises:
        ValueError: if the file is empty, does not start with the
            dimension, contains an unknown flag, has a wrong number of
            column labels, too many rows or columns, or a non-numeric
            element.
    """
    with open(filename, encoding=detect_encoding(filename)) as fle:
        line = fle.readline()
        if not line:
            raise ValueError("empty file")
        data = line.strip().split()
        # `not data` covers a first line containing only white space, which
        # would otherwise fail with IndexError instead of ValueError
        if not data or not data[0].isdigit():
            raise ValueError("distance file must begin with dimension")
        n = int(data.pop(0))
        symmetric = True
        axis = 1
        col_labels = row_labels = None
        for flag in data:
            if flag in ("labelled", "labeled", "row_labels"):
                row_labels = []
            elif flag == "col_labels":
                col_labels = []
            elif flag == "symmetric":
                symmetric = True
            elif flag == "asymmetric":
                symmetric = False
            else:
                flag_data = flag.split("=")
                if len(flag_data) == 2:
                    name, value = map(str.strip, flag_data)
                else:
                    name, value = "", None
                if name == "axis" and value.isdigit():
                    axis = int(value)
                else:
                    raise ValueError("invalid flag '{}'".format(flag))
        if col_labels is not None:
            col_labels = [
                x.strip() for x in fle.readline().strip().split("\t")
            ]
            if len(col_labels) != n:
                raise ValueError("mismatching number of column labels")

        matrix = np.zeros((n, n))
        for i, line in enumerate(fle):
            if i >= n:
                raise ValueError("too many rows")
            line = line.strip().split("\t")
            if row_labels is not None:
                row_labels.append(line.pop(0).strip())
            if len(line) > n:
                raise ValueError(
                    "too many columns in matrix row {}".format(
                        "'{}'".format(row_labels[i]) if row_labels
                        else i + 1))
            # For symmetric matrices only the lower triangle (including the
            # diagonal) is read; anything beyond it is ignored
            for j, e in enumerate(line[:i + 1 if symmetric else n]):
                try:
                    matrix[i, j] = float(e)
                except ValueError as exc:
                    raise ValueError(
                        "invalid element at row {}, column {}".format(
                            "'{}'".format(row_labels[i]) if row_labels
                            else i + 1,
                            "'{}'".format(col_labels[j]) if col_labels
                            else j + 1,
                        )) from exc
                if symmetric:
                    matrix[j, i] = matrix[i, j]

    # Wrap label lists into tables with a single string meta named "label"
    if col_labels:
        col_labels = Table.from_list(
            Domain([], metas=[StringVariable("label")]),
            [[item] for item in col_labels],
        )
    if row_labels:
        row_labels = Table.from_list(
            Domain([], metas=[StringVariable("label")]),
            [[item] for item in row_labels],
        )
    return cls(matrix, row_labels, col_labels, axis)
def apply(self):
    """Fit the polynomial learner, refresh the plot and send all outputs.

    With data present, the model is fitted on the chosen x/y variables, the
    error measures are stored in `self.rmse` / `self.mae` and the plot is
    redrawn. If every row has a missing x or y, the `all_none` error is
    shown, the plot is cleared and nothing is sent. Otherwise the learner,
    the model, the coefficient table (or None) and the data are sent.
    """
    degree = int(self.polynomialexpansion)
    learner = self.LEARNER(
        preprocessors=self.preprocessors,
        degree=degree,
        learner=(LinearRegressionLearner() if self.learner is None
                 else self.learner))
    learner.name = self.learner_name
    predictor = None

    self.Error.all_none.clear()

    if self.data is not None:
        attributes = self.x_var_model[self.x_var_index]
        class_var = self.y_var_model[self.y_var_index]
        xy_domain = Domain([attributes], class_vars=[class_var])
        data_table = Table(xy_domain, self.data)

        # all lines has nan
        incomplete_rows = sum(
            1 for line in data_table
            if math.isnan(line[0]) or math.isnan(line.get_class()))
        if incomplete_rows == len(data_table):
            self.Error.all_none()
            self.clear_plot()
            return

        predictor = learner(data_table)

        # Run the data through the learner's preprocessors, in order
        preprocessed_data = data_table
        for preprocessor in learner.active_preprocessors:
            preprocessed_data = preprocessor(preprocessed_data)

        x = preprocessed_data.X.ravel()
        y = preprocessed_data.Y.ravel()

        # 1000 evenly spaced points over the range of x, as a column
        x_min, x_max = np.nanmin(x), np.nanmax(x)
        linspace = np.linspace(x_min, x_max, 1000).reshape(-1, 1)
        values = predictor(linspace, predictor.Value)

        # calculate prediction for x from data
        predicted = TestOnTrainingData(preprocessed_data, [learner])
        self.rmse = round(RMSE(predicted)[0], 6)
        self.mae = round(MAE(predicted)[0], 6)

        # plot error bars
        self.plot_error_bars(
            x, predicted.actual, predicted.predicted.ravel())

        # plot data points
        self.plot_scatter_points(x, y)

        # plot regression line
        self.plot_regression_line(linspace.ravel(), values.ravel())

        x_label = self.x_var_model[self.x_var_index]
        bottom_axis = self.plot.getAxis("bottom")
        bottom_axis.setLabel(x_label)

        y_label = self.y_var_model[self.y_var_index]
        left_axis = self.plot.getAxis("left")
        left_axis.setLabel(y_label)

        self.set_range(x, y)

    self.Outputs.learner.send(learner)
    self.Outputs.model.send(predictor)

    # Send model coefficents
    model = None
    if predictor is not None:
        # Unwrap to the innermost fitted model that is available
        model = predictor.model
        if hasattr(model, "model"):
            model = model.model
        elif hasattr(model, "skl_model"):
            model = model.skl_model

    if model is None or not hasattr(model, "coef_"):
        self.Outputs.coefficients.send(None)
    else:
        domain = Domain([ContinuousVariable("coef")],
                        metas=[StringVariable("name")])
        # The first entry is the intercept combined with the first coefficient
        coefs = [model.intercept_ + model.coef_[0]]
        coefs = coefs + list(model.coef_[1:])
        names = ["1", x_label] + \
            ["{}^{}".format(x_label, i) for i in range(2, degree + 1)]
        coef_table = Table(domain, list(zip(coefs, names)))
        self.Outputs.coefficients.send(coef_table)

    self.send_data()
def test_colors_diff_domain(self):
    """
    Test whether the color selection for values is correct.
    """
    # pylint: disable=protected-access
    self.send_signal(self.widget.Inputs.data, self.iris)
    idom = self.iris.domain

    def domain_with_class_values(values):
        # Iris attributes plus a fresh class variable with the given values
        return Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, values))

    def send_constant_models(data1, data2):
        # Fit both models first, then send them to inputs 0 and 1
        model1 = ConstantLearner()(data1)
        model2 = ConstantLearner()(data2)
        self.send_signal(self.widget.Inputs.predictors, model1)
        self.send_signal(self.widget.Inputs.predictors, model2, 1)

    # case 1: two domains one subset other
    dom1 = domain_with_class_values(idom.class_var.values)
    dom2 = domain_with_class_values(idom.class_var.values[:2])
    iris1 = self.iris[:100].transform(dom1)
    iris2 = self.iris[:100].transform(dom2)
    send_constant_models(iris1, iris2)
    colors = self.widget._get_colors()
    np.testing.assert_array_equal(colors, iris1.domain.class_var.colors)

    # case 2: two domains one subset other - different color order
    colors = idom.class_var.colors[::-1]
    dom1 = domain_with_class_values(idom.class_var.values)
    dom2 = domain_with_class_values(idom.class_var.values[:2])
    dom1.class_var.colors = colors
    dom2.class_var.colors = colors[:2]
    iris1 = self.iris[:100].transform(dom1)
    iris2 = self.iris[:100].transform(dom2)
    send_constant_models(iris1, iris2)
    colors = self.widget._get_colors()
    np.testing.assert_array_equal(colors, iris1.domain.class_var.colors)

    # case 3: domain color, values miss-match - use default colors
    dom1 = domain_with_class_values(idom.class_var.values)
    dom2 = domain_with_class_values(idom.class_var.values)
    dom1.class_var.colors = dom1.class_var.colors[::-1]
    iris1 = self.iris.transform(dom1)
    iris2 = self.iris.transform(dom2)
    send_constant_models(iris1, iris2)
    colors = self.widget._get_colors()
    np.testing.assert_array_equal(
        colors, LimitedDiscretePalette(3).palette)

    # case 4: two domains different values order, matching colors
    # this way we know that default colors are not used
    colors = LimitedDiscretePalette(5).palette[2:]
    dom1 = domain_with_class_values(idom.class_var.values)
    dom2 = domain_with_class_values(idom.class_var.values[::-1])
    dom1.class_var.colors = colors
    dom2.class_var.colors = colors[::-1]  # colors mixed same than values
    iris1 = self.iris[:100].transform(dom1)
    iris2 = self.iris[:100].transform(dom2)
    send_constant_models(iris1, iris2)
    colors = self.widget._get_colors()
    np.testing.assert_array_equal(colors, iris1.domain.class_var.colors)
def _get_projection_data(self):
    """Return the data with the projection's attributes appended as metas.

    Returns None when either the data or the projection is missing.
    """
    if self.data is None or self.projection is None:
        return None
    target_domain = Domain(
        self.data.domain.attributes,
        self.data.domain.class_vars,
        self.data.domain.metas + self.projection.domain.attributes)
    return self.data.transform(target_domain)
def send_data(self):
    """Send the annotated data and the centroid table for the chosen k.

    Both outputs are None when there is no data or no usable clustering for
    the selected k (the stored entry is missing or is a string). Otherwise
    the data gets two extra metas, cluster and silhouette, and the centroids
    are sent as a separate table carrying the same two metas.
    """
    if self.optimize_k:
        row = self.selected_row()
        k = None if row is None else self.k_from + row
    else:
        k = self.k

    km = self.clusterings.get(k)
    if self.data is None or km is None or isinstance(km, str):
        self.Outputs.annotated_data.send(None)
        self.Outputs.centroids.send(None)
        return

    domain = self.data.domain
    cluster_var = DiscreteVariable(
        get_unique_names(domain, "Cluster"),
        values=["C%d" % (x + 1) for x in range(km.k)])
    clust_ids = km.labels
    silhouette_var = ContinuousVariable(
        get_unique_names(domain, "Silhouette"))

    if km.silhouette_samples is None:
        self.Warning.no_silhouettes()
        scores = np.nan
        clust_scores = np.full((km.k, 1), np.nan)
    else:
        self.Warning.no_silhouettes.clear()
        # Squash the silhouette values with arctan and shift them by 0.5
        scores = np.arctan(km.silhouette_samples) / np.pi + 0.5
        # Mean score per cluster; clusters without members get 0
        membership = [clust_ids == i for i in range(km.k)]
        clust_scores = [
            np.mean(scores[in_clust]) if in_clust.any() else 0.
            for in_clust in membership]
        clust_scores = np.atleast_2d(clust_scores).T

    extra_metas = [cluster_var, silhouette_var]
    new_domain = add_columns(domain, metas=extra_metas)
    new_table = self.data.transform(new_domain)
    new_table.get_column_view(cluster_var)[0][:] = clust_ids
    new_table.get_column_view(silhouette_var)[0][:] = scores

    domain_attributes = set(domain.attributes)

    def original_attribute(attr):
        # Use the source variable when `attr` merely replaces unknowns in a
        # variable of the input domain; otherwise keep `attr` itself
        compute = attr.compute_value
        if isinstance(compute, ReplaceUnknowns) \
                and compute.variable in domain_attributes:
            return compute.variable
        return attr

    centroid_attributes = [
        original_attribute(attr) for attr in km.domain.attributes]
    centroid_domain = add_columns(
        Domain(centroid_attributes, [], domain.metas),
        metas=[cluster_var, silhouette_var])
    # Original metas are unknown for centroids; then cluster index and score
    centroid_metas = np.hstack((
        np.full((km.k, len(domain.metas)), np.nan),
        np.arange(km.k).reshape(km.k, 1),
        clust_scores))
    centroids = Table(centroid_domain, km.centroids, None, centroid_metas)
    if self.data.name == Table.name:
        centroids.name = "centroids"
    else:
        centroids.name = f"{self.data.name} centroids"

    self.Outputs.annotated_data.send(new_table)
    self.Outputs.centroids.send(centroids)
def test_cls_with_single_instance(self):
    """Row clustering must cope with a class value that has a single row."""
    domain = Domain(
        [ContinuousVariable("c1")],
        [DiscreteVariable("c2", values=("a", "b"))])
    x = np.array([[1], [2], [3]])
    y = np.array([[0], [0], [1]])
    table = Table(domain, x, y)
    self.send_signal(self.widget.Inputs.data, table)
    self.widget.set_row_clustering(Clustering.Clustering)
def _test_predictions_with_absent_class(self, sparse):
    """Empty classes should not affect predictions"""
    assert_almost_equal = np.testing.assert_almost_equal
    assert_equal = np.testing.assert_equal

    x = np.array([
        [1, 0, 0],
        [0, np.nan, 0],
        [0, 1, 0],
        [0, 0, 0],
        [1, 2, 0],
        [1, 1, 0],
        [1, 2, 0],
        [0, 1, 0]])
    if sparse is not None:
        x = sparse(x)

    # Class value with index 1 ("b") never occurs in y
    y = np.array([0, 0, 0, 2, 2, 2, 3, 3])
    domain = Domain(
        [DiscreteVariable("a", values="ab"),
         DiscreteVariable("b", values="abc"),
         DiscreteVariable("c", values="a")],
        DiscreteVariable("y", values="abcd"))
    data = Table.from_numpy(domain, x, y)

    model = self.learner(data)
    assert_almost_equal(model.class_prob, [4/11, 0, 4/11, 3/11])
    expected_cond_probs = (
        [[3/7, 2/7], [0, 0], [2/7, 3/7], [2/7, 2/7]],
        [[2/5, 1/3, 1/5], [0, 0, 0], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]],
        [[4/11], [0], [4/11], [3/11]],
    )
    for attr_idx, expected in enumerate(expected_cond_probs):
        assert_almost_equal(
            np.exp(model.log_cont_prob[attr_idx])
            * model.class_prob[:, None],
            expected)

    test_x = np.array(
        [[va, vb, 0] for va in [0, 1] for vb in [0, 1, 2]])
    # Classifiers reject csc matrices in the base class
    # Naive bayesian classifier supports them if predict_storage is
    # called directly, which we do below
    if sparse is not None and sparse is not sp.csc_matrix:
        test_x = sparse(test_x)
    test_y = np.full((6, ), np.nan)

    # The following was computed manually, too
    exp_probs = np.array([
        [0.47368421052632, 0, 0.31578947368421, 0.21052631578947],
        [0.39130434782609, 0, 0.26086956521739, 0.34782608695652],
        [0.24324324324324, 0, 0.32432432432432, 0.43243243243243],
        [0.31578947368421, 0, 0.47368421052632, 0.21052631578947],
        [0.26086956521739, 0, 0.39130434782609, 0.34782608695652],
        [0.15000000000000, 0, 0.45000000000000, 0.40000000000000]
    ])
    exp_values = np.argmax(exp_probs, axis=1)

    def check_model_on(inputs):
        # Probabilities only, values only, then both at once
        probs = model(inputs, ret=model.Probs)
        assert_almost_equal(exp_probs, probs)
        values = model(inputs)
        assert_equal(values, exp_values)
        values, probs = model(inputs, ret=model.ValueProbs)
        assert_almost_equal(exp_probs, probs)
        assert_equal(values, exp_values)

    # Test the faster algorithm for Table (numpy matrices)
    test_data = Table.from_numpy(domain, test_x, test_y)
    check_model_on(test_data)

    # Test the slower algorithm for non-Table data (iteration in Python)
    test_data = NotATable.from_numpy(domain, test_x, test_y)
    check_model_on(test_data)

    # Test prediction directly on numpy
    check_model_on(test_x)

    # Test prediction on instances
    for inst, exp_prob in zip(test_data, exp_probs):
        exp_value = np.argmax(exp_prob)
        assert_almost_equal(model(inst, ret=model.Probs), exp_prob)
        self.assertEqual(model(inst), exp_value)
        value, prob = model(inst, ret=model.ValueProbs)
        assert_almost_equal(prob, exp_prob)
        self.assertEqual(value, exp_value)

    # Test prediction by directly calling predict. This is needed to test
    # csc_matrix, but doesn't hurt others
    if sparse is sp.csc_matrix:
        test_x = sparse(test_x)
    values, probs = model.predict(test_x)
    assert_almost_equal(exp_probs, probs)
    assert_equal(values, exp_values)
def commit(self):
    """Send the current dendrogram selection to the widget outputs.

    Two outputs are produced: ``selected_data`` and ``annotated_data``.

    * With no items, both outputs receive ``None``.
    * With no selected nodes, ``selected_data`` receives ``None`` and
      ``annotated_data`` receives an annotated table with an empty
      selection, but only when ``selection_method == 0`` and
      ``matrix.axis`` is truthy (otherwise ``None``).
    * When items are a table and ``matrix.axis == 1`` (rows clustered), a
      discrete meta variable with values ``C1..Cn`` plus ``"Other"`` is
      appended; ``selected_data`` holds only the selected rows and its
      cluster variable has no ``"Other"`` value.
    * When items are a table and ``matrix.axis == 0`` (columns clustered),
      each attribute is copied and tagged with ``attributes["cluster"]``
      (``0`` for unselected columns); ``selected_data`` keeps only the
      selected columns.
    * For any other kind of items both outputs receive ``None``.
    """
    items = getattr(self.matrix, "items", self.items)
    if not items:
        self.Outputs.selected_data.send(None)
        self.Outputs.annotated_data.send(None)
        return

    selection = self.dendrogram.selected_nodes()
    selection = sorted(selection, key=lambda c: c.value.first)

    # Leaf order of the dendrogram -> index of the original item
    indices = [leaf.value.index for leaf in leaves(self.root)]

    # One list of item indices per selected cluster
    maps = [indices[node.value.first:node.value.last]
            for node in selection]

    selected_indices = list(chain(*maps))
    unselected_indices = sorted(set(range(self.root.value.last)) -
                                set(selected_indices))

    if not selected_indices:
        self.Outputs.selected_data.send(None)
        annotated_data = create_annotated_table(items, []) \
            if self.selection_method == 0 and self.matrix.axis else None
        self.Outputs.annotated_data.send(annotated_data)
        return

    selected_data = None
    # Initialised so that items that are not a table (or an unexpected
    # axis) result in None outputs instead of an UnboundLocalError below.
    annotated_data = None

    if isinstance(items, Orange.data.Table) and self.matrix.axis == 1:
        # Select rows
        c = np.zeros(self.matrix.shape[0])

        for i, cluster_rows in enumerate(maps):
            c[cluster_rows] = i
        # Unselected rows get the index of the extra "Other" value
        c[unselected_indices] = len(maps)

        mask = c != len(maps)

        data, domain = items, items.domain
        attrs = domain.attributes
        classes = domain.class_vars
        metas = domain.metas

        var_name = get_unique_names(domain, "Cluster")
        values = [f"C{i + 1}" for i in range(len(maps))]

        clust_var = Orange.data.DiscreteVariable(
            var_name, values=values + ["Other"])
        domain = Orange.data.Domain(attrs, classes, metas + (clust_var,))
        data = items.transform(domain)
        with data.unlocked(data.metas):
            data.get_column_view(clust_var)[0][:] = c

        # selected_indices is known to be non-empty at this point
        selected_data = data[mask]
        clust_var = Orange.data.DiscreteVariable(
            var_name, values=values)
        selected_data.domain = Domain(
            attrs, classes, metas + (clust_var, ))

        annotated_data = create_annotated_table(data, selected_indices)

    elif isinstance(items, Orange.data.Table) and self.matrix.axis == 0:
        # Select columns
        attrs = []
        # Selected clusters are numbered from 1; unselected columns get 0
        # and are placed last.
        for clust, cluster_cols in chain(enumerate(maps, start=1),
                                         [(0, unselected_indices)]):
            for i in cluster_cols:
                attr = items.domain[i].copy()
                attr.attributes["cluster"] = clust
                attrs.append(attr)
        domain = Orange.data.Domain(
            # len(unselected_indices) can be 0
            attrs[:len(attrs) - len(unselected_indices)],
            items.domain.class_vars, items.domain.metas)
        selected_data = items.from_table(domain, items)

        domain = Orange.data.Domain(
            attrs,
            items.domain.class_vars, items.domain.metas)
        annotated_data = items.from_table(domain, items)

    self.Outputs.selected_data.send(selected_data)
    self.Outputs.annotated_data.send(annotated_data)
def test_init_source_class(self):
    """Names and indices are resolved through the ``source`` domain."""
    source = Domain((age, gender, income), (education, race))
    derived = Domain(["Gender", 0], "income", source=source)
    expected = (gender, age, income)
    self.assertEqual(derived.variables, expected)
def __call__(self, data, *_):
    """Return a model over an empty domain; fail when any data is given.

    Raises
    ------
    ValueError
        If ``data`` is not ``None``.
    """
    if data is None:
        return Model(Domain([]))
    raise ValueError("boom")
def test_wrong_vartypes_w_source(self):
    """An invalid attribute specification raises TypeError with a source."""
    source = Domain((age, gender), metas=(ssn, ))
    self.assertRaises(TypeError, Domain, -1, source=source)
def extract_col(data, var):
    """Return the column of ``var`` taken from ``data``.

    ``data`` is transformed into a domain whose only attribute is ``var``;
    the first column of the resulting ``X`` is returned.
    """
    single_var_data = data.transform(Domain([var]))
    return single_var_data.X[:, 0]
def test_get_item_slices(self):
    """Slicing a domain returns the matching tuple of variables."""
    d = Domain((age, gender, income, race), metas=(ssn, race))
    cases = (
        (slice(None, 2), (age, gender)),
        (slice(1, 3), (gender, income)),
        (slice(2, None), (income, race)),
    )
    for item_slice, expected in cases:
        self.assertEqual(d[item_slice], expected)
def get_domain(self, domain, data):
    """Create domain (and dataset) from changes made in the widget.

    The edited variable descriptions are read from ``self.model().variables``
    and paired, in order, with the variables of ``domain`` (attributes, then
    class variables, then metas). Variables placed at ``Place.skip`` are
    dropped; columns whose type changed are converted.

    Note that a variable that is only renamed is modified in place: the
    ``name`` of the original variable object is overwritten.

    Parameters
    ----------
    domain : old domain
    data : source data

    Returns
    -------
    (new_domain, [attribute_columns, class_var_columns, meta_columns])
        If nothing was changed, the original ``domain`` is returned together
        with ``[data.X, data.Y, data.metas]``.
    """
    # Allow type-checking with type() instead of isinstance() for exact comparison
    # pylint: disable=unidiomatic-typecheck
    variables = self.model().variables
    places = [[], [], []]  # attributes, class_vars, metas
    cols = [[], [], []]  # Xcols, Ycols, Mcols

    def numbers_are_round(var, col_data):
        # True when `var` is exactly a ContinuousVariable and all of its
        # non-NaN values are whole numbers; False for any other type.
        if type(var) == ContinuousVariable:
            data = np.asarray(col_data.data)  # Works for dense and sparse
            data = data[~np.isnan(data)]
            return (data == data.astype(int)).all()
        return False

    # Exit early with original domain if the user didn't actually change anything
    if all((name == orig_var.name and tpe == type(orig_var) and place == orig_plc)
           for (name, tpe, place, _, _), (orig_var, orig_plc) in zip(
               variables,
               chain(((at, Place.feature) for at in domain.attributes), (
                   (cl, Place.class_var) for cl in domain.class_vars), (
                       (mt, Place.meta) for mt in domain.metas)))):
        return domain, [data.X, data.Y, data.metas]

    # Each edited row is (name, type, place, _, may_be_numeric).
    # NOTE(review): may_be_numeric presumably tells whether the discrete
    # values can be parsed as numbers -- confirm against the model.
    for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
            zip(variables,
                chain([(at, Place.feature) for at in domain.attributes],
                      [(cl, Place.class_var) for cl in domain.class_vars],
                      [(mt, Place.meta) for mt in domain.metas])):
        if place == Place.skip:
            continue

        col_data = self._get_column(data, orig_var, orig_plc)
        is_sparse = sp.issparse(col_data)

        if name == orig_var.name and tpe == type(orig_var):
            # Unchanged name and type: reuse the variable and its column
            var = orig_var
        elif tpe == type(orig_var):
            # change the name so that all_vars will get the correct name
            orig_var.name = name
            var = orig_var
        elif tpe == DiscreteVariable:
            # Any type -> discrete: the distinct non-missing values become
            # the categories and the column holds their indices.
            values = list(
                str(i) for i in unique(col_data) if not self._is_missing(i))
            round_numbers = numbers_are_round(orig_var, col_data)
            col_data = [
                np.nan if self._is_missing(x) else values.index(str(x))
                for x in self._iter_vals(col_data)
            ]
            if round_numbers:
                # Show whole numbers without a decimal part ("1", not "1.0")
                values = [str(int(float(v))) for v in values]
            var = tpe(name, values)
            col_data = self._to_column(col_data, is_sparse)
        elif tpe == StringVariable:
            var = tpe.make(name)
            if type(orig_var) == DiscreteVariable:
                # Discrete -> string: textual value, "" for missing
                col_data = [
                    orig_var.repr_val(x) if not np.isnan(x) else ""
                    for x in self._iter_vals(col_data)
                ]
            elif type(orig_var) == ContinuousVariable:
                # Continuous -> string: whole numbers are written as ints
                round_numbers = numbers_are_round(orig_var, col_data)
                col_data = [
                    '' if np.isnan(x) else
                    str(int(x)) if round_numbers else orig_var.repr_val(x)
                    for x in self._iter_vals(col_data)
                ]
            # don't obey sparsity for StringVariable since they are
            # in metas which are transformed to dense below
            col_data = self._to_column(col_data, False, dtype=object)
        elif tpe == ContinuousVariable and type(
                orig_var) == DiscreteVariable:
            var = tpe.make(name)
            if may_be_numeric:
                # Parse the category labels as floats; otherwise the
                # column keeps the original value indices.
                col_data = [
                    np.nan if self._is_missing(x) else float(
                        orig_var.values[int(x)])
                    for x in self._iter_vals(col_data)
                ]
            col_data = self._to_column(col_data, is_sparse)
        else:
            # Remaining type changes: new variable, column left as it is
            var = tpe(name)
        places[place].append(var)
        cols[place].append(col_data)

    # merge columns for X, Y and metas
    feats = cols[Place.feature]
    X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
    Y = self._merge(cols[Place.class_var], force_dense=True)
    m = self._merge(cols[Place.meta], force_dense=True)
    domain = Domain(*places)
    return domain, [X, Y, m]
def test_has_continuous(self):
    """Check ``has_continuous_attributes`` for attributes, classes, metas.

    Each case is (expected result, Domain arguments, method arguments).
    """
    cases = (
        # attributes only (default arguments)
        (False, ([], ), ()),
        (False, ([], [age]), ()),
        (False, ([], [race]), ()),
        (True, ([age], None), ()),
        (False, ([race], None), ()),
        (True, ([age, race], None), ()),
        (True, ([race, age], None), ()),
        # including class variables
        (True, ([], [age]), (True, )),
        (False, ([], [race]), (True, )),
        (True, ([age], None), (True, )),
        (False, ([race], None), (True, )),
        (True, ([age], race), (True, )),
        (True, ([race], age), (True, )),
        (True, ([], [race, age]), (True, )),
        # including metas
        (True, ([], None, [age]), (False, True)),
        (False, ([], None, [gender]), (False, True)),
        (True, ([], [gender], [age]), (True, True)),
        (False, ([], [race], [gender]), (True, True)),
    )
    for expected, domain_args, call_args in cases:
        check = self.assertTrue if expected else self.assertFalse
        d = Domain(*domain_args)
        check(d.has_continuous_attributes(*call_args))
def setSeries(self, timeseries, attr, xdim, ydim, fagg):
    """Aggregate ``attr`` over two time-derived axes and draw the chart.

    Every row of ``timeseries`` is assigned to a cell by applying the axis
    functions to its time value. The values of ``attr`` in each cell are
    aggregated with ``fagg``; cells are coloured on a gradient between the
    smallest and the largest aggregate. Row indices of each cell are stored
    in ``self.indices`` (one list per y value, one entry per x value).

    Parameters
    ----------
    timeseries
        Source data; the chart is cleared when it is ``None``.
    attr
        Variable(s) to aggregate; the chart is cleared when empty.
    xdim, ydim
        Axis descriptions whose ``value`` is a ``(values, func)`` pair.
        Empty ``values`` are replaced by the sorted distinct results of
        ``func`` over all time values.
    fagg
        Aggregation function called with the values of one cell. A
        ``ValueError`` raised by it is treated as a missing value.
    """
    if timeseries is None or not attr:
        self.clear()
        return
    # TODO: support discrete variables
    # NOTE(review): a list has no ``.value``, so if a digit string is ever
    # passed, the unpacking below fails -- confirm whether this is reachable.
    if isinstance(xdim, str) and xdim.isdigit():
        xdim = [str(i) for i in range(1, int(xdim) + 1)]
    if isinstance(ydim, str) and ydim.isdigit():
        ydim = [str(i) for i in range(1, int(ydim) + 1)]

    xvals, xfunc = xdim.value
    yvals, yfunc = ydim.value

    values = Timeseries(Domain([], [], attr, source=timeseries.domain),
                        timeseries).metas
    time_values = np.ravel(timeseries[:, timeseries.time_variable])

    fromtimestamp = datetime.fromtimestamp
    time_values = [fromtimestamp(i) for i in time_values]

    if not yvals:
        yvals = sorted(set(yfunc(i) for i in time_values))
    if not xvals:
        xvals = sorted(set(xfunc(i) for i in time_values))

    # (x value, y value) -> row indices falling into that cell
    indices = defaultdict(list)
    for i, tval in enumerate(time_values):
        indices[(xfunc(tval), yfunc(tval))].append(i)

    series = []
    aggvals = []
    self.indices = []
    xname = self.AxesCategories.name_it(xdim)
    yname = self.AxesCategories.name_it(ydim)
    for yval in yvals:
        data = []
        series.append(dict(name=yname(yval), data=data))
        self.indices.append([])
        for xval in xvals:
            inds = indices.get((xval, yval), ())
            self.indices[-1].append(inds)
            point = dict(y=1)
            data.append(point)
            if inds:
                try:
                    aggval = fagg(values[inds])
                except ValueError:
                    aggval = np.nan
            else:
                aggval = np.nan
            if np.isnan(aggval):
                # Cells without a value are drawn white and not selectable
                aggval = 'NaN'
                point['select'] = ''
                point['color'] = 'white'
            else:
                aggvals.append(aggval)
            point['n'] = aggval

    # TODO: allow scaling over just rows or cols instead of all values as currently
    try:
        maxval, minval = np.max(aggvals), np.min(aggvals)
    except ValueError:
        # No cell has a value
        self.clear()
        return
    ptpval = maxval - minval

    color = GradientPaletteGenerator('#ffcccc', '#cc0000')
    selected_color = GradientPaletteGenerator('#ccffcc', '#006600')
    for serie in series:
        for point in serie['data']:
            n = point['n']
            if isinstance(n, Number):
                # When all aggregates are equal the range is zero; use the
                # middle of the gradient instead of dividing by zero.
                val = (n - minval) / ptpval if ptpval else 0.5
                point['color'] = color[val]
                point['states'] = dict(
                    select=dict(color=selected_color[val]))

    # TODO: make a white hole in the middle. Center w/o data.
    self.chart(series=series,
               xAxis_categories=[xname(i) for i in xvals],
               yAxis_categories=[yname(i) for i in reversed(yvals)])
def test_unpickling_recreates_known_domains(self):
    """A domain restored from a pickle has a ``_known_domains`` attribute."""
    Variable._clear_all_caches()
    pickled = pickle.dumps(Domain([]))
    restored = pickle.loads(pickled)
    self.assertTrue(hasattr(restored, '_known_domains'))
def effective_data(self):
    """Return ``self.data`` transformed to the effective variables only."""
    reduced_domain = Domain(self.effective_variables)
    return self.data.transform(reduced_domain)