def read(self):
    """Load a MATLAB file and convert its arrays to an Orange table.

    The numeric array with the most elements becomes ``X``. The first
    remaining array whose shape is ``(X.shape[1],)`` or ``(1, X.shape[1])``
    supplies the attribute names; without one, attributes are named by
    column index. Every remaining array whose length equals the number of
    rows becomes a string meta column, in sorted name order.

    When the file contains no numeric array, the row count is the most
    common array length and ``X`` has zero columns.

    Returns:
        The table produced by ``Orange.data.Table.from_numpy``.

    Raises:
        IOError: if ``matlab.whosmat`` reports nothing for the file.
    """
    who = matlab.whosmat(self.filename)
    if not who:
        raise IOError("Couldn't load matlab file " + self.filename)

    loaded = matlab.loadmat(self.filename, chars_as_strings=True)
    arrays = {name: value for name, value in loaded.items()
              if isinstance(value, np.ndarray)}

    # X is the biggest numeric array
    numeric_sizes = [(name, con.size) for name, con in arrays.items()
                     if issubclass(con.dtype.type, numbers.Number)]
    X = None
    if numeric_sizes:
        name_x = max(numeric_sizes, key=lambda pair: pair[1])[0]
        X = arrays.pop(name_x)

    # find an array with compatible shapes
    attributes = []
    if X is not None:
        compatible = [(X.shape[1], ), (1, X.shape[1])]
        name_attributes = None
        for name, con in arrays.items():
            if con.shape in compatible:
                name_attributes = name
                break
        if name_attributes:
            attribute_names = arrays.pop(name_attributes).ravel()
        else:
            attribute_names = range(X.shape[1])
        # strip because of numpy char array
        attributes = [ContinuousVariable(name=str(a).strip())
                      for a in attribute_names]

    # Number of rows every meta column must have.
    size_metas = None
    if X is None:
        counts = defaultdict(list)
        for name, con in arrays.items():
            counts[len(con)].append(name)
        if counts:
            size_metas = max(counts.keys(), key=lambda x: len(counts[x]))
    else:
        size_metas = len(X)

    meta_names = []
    if size_metas:
        meta_names = sorted(name for name, con in arrays.items()
                            if len(con) == size_metas)

    meta_attributes = []
    meta_columns = []
    for name in meta_names:
        column = arrays[name]
        meta_attributes.append(StringVariable(name))
        # In-place reshape to a single column, as in the original code.
        column.resize(size_metas, 1)
        meta_columns.append(column)

    # np.hstack raises on an empty sequence, so only stack when there is
    # at least one meta column; None lets from_numpy use its default metas.
    metadata = np.hstack(tuple(meta_columns)) if meta_columns else None

    domain = Domain(attributes, metas=meta_attributes)
    if X is None:
        # size_metas is None when the file held no usable array at all.
        X = np.zeros((size_metas or 0, 0))
    return Orange.data.Table.from_numpy(domain, X, Y=None, metas=metadata)
class TestInstance(unittest.TestCase):
    """Tests for constructing, indexing, printing and comparing ``Instance``."""

    attributes = ["Feature %i" % i for i in range(10)]
    class_vars = ["Class %i" % i for i in range(1)]
    metas = [DiscreteVariable("Meta 1", values="XYZ"),
             ContinuousVariable("Meta 2"),
             StringVariable("Meta 3")]

    def mock_domain(self, with_classes=False, with_metas=False):
        """Return a ``MagicMock`` of ``Domain`` built from the class-level names."""
        attributes = self.attributes
        class_vars = self.class_vars if with_classes else []
        metas = self.metas if with_metas else []
        variables = attributes + class_vars
        return MagicMock(Domain,
                         attributes=attributes,
                         class_vars=class_vars,
                         metas=metas,
                         variables=variables)

    def create_domain(self, attributes=(), classes=(), metas=()):
        """Return a real ``Domain``.

        Strings are turned into variables: continuous ones for attributes
        and classes, discrete ones with values "0".."4" for metas.
        Anything that is not a string is used as given.
        """
        attr_vars = [ContinuousVariable(name=a) if isinstance(a, str) else a
                     for a in attributes]
        class_vars = [ContinuousVariable(name=c) if isinstance(c, str) else c
                      for c in classes]
        # A concrete list rather than a one-shot ``map`` iterator.
        meta_vars = [DiscreteVariable(name=m,
                                      values=[str(i) for i in range(5)])
                     if isinstance(m, str) else m
                     for m in metas]
        domain = Domain(attr_vars, class_vars, meta_vars)
        return domain

    def test_init_x_no_data(self):
        domain = self.mock_domain()
        inst = Instance(domain)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape, (len(self.attributes), ))
        self.assertEqual(inst._x.shape, (len(self.attributes), ))
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))
        self.assertTrue(all(isnan(x) for x in inst._values))
        self.assertTrue(all(isnan(x) for x in inst._x))

    def test_init_xy_no_data(self):
        domain = self.mock_domain(with_classes=True)
        inst = Instance(domain)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape,
                         (len(self.attributes) + len(self.class_vars), ))
        self.assertEqual(inst._x.shape, (len(self.attributes), ))
        self.assertEqual(inst._y.shape, (len(self.class_vars), ))
        self.assertEqual(inst._metas.shape, (0, ))
        self.assertTrue(all(isnan(x) for x in inst._values))
        self.assertTrue(all(isnan(x) for x in inst._x))
        self.assertTrue(all(isnan(x) for x in inst._y))

    def test_init_xym_no_data(self):
        domain = self.mock_domain(with_classes=True, with_metas=True)
        inst = Instance(domain)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape,
                         (len(self.attributes) + len(self.class_vars), ))
        self.assertEqual(inst._x.shape, (len(self.attributes), ))
        self.assertEqual(inst._y.shape, (len(self.class_vars), ))
        self.assertEqual(inst._metas.shape, (3, ))
        self.assertTrue(all(isnan(x) for x in inst._values))
        self.assertTrue(all(isnan(x) for x in inst._x))
        self.assertTrue(all(isnan(x) for x in inst._y))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", FutureWarning)
            assert_array_equal(inst._metas,
                               np.array([Unknown, Unknown, None]))

    def test_init_x_arr(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")])
        vals = np.array([42, 0])
        inst = Instance(domain, vals)
        assert_array_equal(inst._values, vals)
        assert_array_equal(inst._x, vals)
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))

        domain = self.create_domain()
        inst = Instance(domain, np.empty((0,)))
        self.assertEqual(inst._x.shape, (0, ))
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))

    def test_init_x_list(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")])
        lst = [42, 0]
        vals = np.array(lst)
        # Pass the list itself: this test covers list input, the array
        # input is covered by test_init_x_arr.
        inst = Instance(domain, lst)
        assert_array_equal(inst._values, vals)
        assert_array_equal(inst._x, vals)
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))

        domain = self.create_domain()
        inst = Instance(domain, [])
        self.assertEqual(inst._x.shape, (0, ))
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))

    def test_init_xy_arr(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")])
        vals = np.array([42, 0, 1])
        inst = Instance(domain, vals)
        assert_array_equal(inst._values, vals)
        assert_array_equal(inst._x, vals[:2])
        self.assertEqual(inst._y.shape, (1, ))
        self.assertEqual(inst._y[0], 1)
        self.assertEqual(inst._metas.shape, (0, ))

    def test_init_xy_list(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")])
        lst = [42, "M", "C"]
        vals = np.array([42, 0, 2])
        # Pass the list with symbolic values; ``vals`` holds the expected
        # numeric encoding ("M" -> 0, "C" -> 2).
        inst = Instance(domain, lst)
        assert_array_equal(inst._values, vals)
        assert_array_equal(inst._x, vals[:2])
        self.assertEqual(inst._y.shape, (1, ))
        self.assertEqual(inst._y[0], 2)
        self.assertEqual(inst._metas.shape, (0, ))

    def test_init_xym_arr(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = np.array([42, "M", "B", "X", 43, "Foo"], dtype=object)
        inst = Instance(domain, vals)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape, (3, ))
        self.assertEqual(inst._x.shape, (2, ))
        self.assertEqual(inst._y.shape, (1, ))
        self.assertEqual(inst._metas.shape, (3, ))
        assert_array_equal(inst._values, np.array([42, 0, 1]))
        assert_array_equal(inst._x, np.array([42, 0]))
        self.assertEqual(inst._y[0], 1)
        assert_array_equal(inst._metas,
                           np.array([0, 43, "Foo"], dtype=object))

    def test_init_xym_list(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape, (3, ))
        self.assertEqual(inst._x.shape, (2, ))
        self.assertEqual(inst._y.shape, (1, ))
        self.assertEqual(inst._metas.shape, (3, ))
        assert_array_equal(inst._values, np.array([42, 0, 1]))
        assert_array_equal(inst._x, np.array([42, 0]))
        self.assertEqual(inst._y[0], 1)
        assert_array_equal(inst._metas,
                           np.array([0, 43, "Foo"], dtype=object))

    def test_init_inst(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)

        inst2 = Instance(domain, inst)
        assert_array_equal(inst2._values, np.array([42, 0, 1]))
        assert_array_equal(inst2._x, np.array([42, 0]))
        self.assertEqual(inst2._y[0], 1)
        assert_array_equal(inst2._metas,
                           np.array([0, 43, "Foo"], dtype=object))

        # A domain that reshuffles variables between attributes and metas
        # and adds the new variables "z" and "w".
        domain2 = self.create_domain(["z", domain[1], self.metas[1]],
                                     domain.class_vars,
                                     [self.metas[0], "w", domain[0]])
        inst2 = Instance(domain2, inst)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", FutureWarning)
            assert_array_equal(inst2._values,
                               np.array([Unknown, 0, 43, 1]))
            assert_array_equal(inst2._x, np.array([Unknown, 0, 43]))
            self.assertEqual(inst2._y[0], 1)
            assert_array_equal(inst2._metas,
                               np.array([0, Unknown, 42], dtype=object))

    def test_get_item(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)

        val = inst[0]
        self.assertIsInstance(val, Value)
        self.assertEqual(inst[0], 42)
        self.assertEqual(inst["x"], 42)
        self.assertEqual(inst[domain[0]], 42)

        val = inst[1]
        self.assertIsInstance(val, Value)
        self.assertEqual(inst[1], "M")
        self.assertEqual(inst["g"], "M")
        self.assertEqual(inst[domain[1]], "M")

        val = inst[2]
        self.assertIsInstance(val, Value)
        self.assertEqual(inst[2], "B")
        self.assertEqual(inst["y"], "B")
        self.assertEqual(inst[domain.class_var], "B")

        val = inst[-2]
        self.assertIsInstance(val, Value)
        self.assertEqual(inst[-2], 43)
        self.assertEqual(inst["Meta 2"], 43)
        self.assertEqual(inst[self.metas[1]], 43)

        # NOTE(review): these two checks assign rather than read, although
        # this is the get-item test - confirm whether reads were intended.
        with self.assertRaises(ValueError):
            inst["asdf"] = 42
        with self.assertRaises(ValueError):
            inst[ContinuousVariable("asdf")] = 42

    def test_set_item(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)

        inst[0] = 43
        self.assertEqual(inst[0], 43)
        inst["x"] = 44
        self.assertEqual(inst[0], 44)
        inst[domain[0]] = 45
        self.assertEqual(inst[0], 45)

        inst[1] = "F"
        self.assertEqual(inst[1], "F")
        inst["g"] = "M"
        self.assertEqual(inst[1], "M")
        with self.assertRaises(ValueError):
            inst[1] = "N"
        with self.assertRaises(ValueError):
            inst["asdf"] = 42

        inst[2] = "C"
        self.assertEqual(inst[2], "C")
        inst["y"] = "A"
        self.assertEqual(inst[2], "A")
        inst[domain.class_var] = "B"
        self.assertEqual(inst[2], "B")

        inst[-1] = "Y"
        self.assertEqual(inst[-1], "Y")
        inst["Meta 1"] = "Z"
        self.assertEqual(inst[-1], "Z")
        inst[domain.metas[0]] = "X"
        self.assertEqual(inst[-1], "X")

    def test_str(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")])
        inst = Instance(domain, [42, 0])
        self.assertEqual(str(inst), "[42.000, M]")

        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")])
        inst = Instance(domain, [42, "M", "B"])
        self.assertEqual(str(inst), "[42.000, M | B]")

        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        inst = Instance(domain, [42, "M", "B", "X", 43, "Foo"])
        self.assertEqual(str(inst), "[42.000, M | B] {X, 43.000, Foo}")

        domain = self.create_domain([],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        inst = Instance(domain, ["B", "X", 43, "Foo"])
        self.assertEqual(str(inst), "[ | B] {X, 43.000, Foo}")

        domain = self.create_domain([], [], self.metas)
        inst = Instance(domain, ["X", 43, "Foo"])
        self.assertEqual(str(inst), "[] {X, 43.000, Foo}")

        domain = self.create_domain(self.attributes)
        inst = Instance(domain, range(len(self.attributes)))
        self.assertEqual(str(inst),
                         "[0.000, 1.000, 2.000, 3.000, 4.000, ...]")

        for attr in domain:
            attr.number_of_decimals = 0
        self.assertEqual(str(inst), "[0, 1, 2, 3, 4, ...]")

    def test_eq(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)
        inst2 = Instance(domain, vals)
        self.assertTrue(inst == inst2)
        self.assertTrue(inst2 == inst)

        inst2[0] = 43
        self.assertFalse(inst == inst2)

        inst2[0] = Unknown
        self.assertFalse(inst == inst2)

        inst2 = Instance(domain, vals)
        inst2[2] = "C"
        self.assertFalse(inst == inst2)

        inst2 = Instance(domain, vals)
        inst2[-1] = "Y"
        self.assertFalse(inst == inst2)

        inst2 = Instance(domain, vals)
        inst2[-2] = "33"
        self.assertFalse(inst == inst2)

        inst2 = Instance(domain, vals)
        inst2[-3] = "Bar"
        self.assertFalse(inst == inst2)
def test_title_selection_strategy_title_heading(self):
    """
    When there is a title, heading or filename attribute, it is selected
    as the default title variable.
    """
    # (names of the three string metas, name expected as the title variable)
    cases = (
        (("title", "b", "c"), "title"),
        (("Title", "b", "c"), "Title"),
        # when title and heading are both present, title is selected
        (("Title", "Heading", "c"), "Title"),
        (("Heading", "Title", "c"), "Title"),
        (("Heading", "Filename", "c"), "Heading"),
    )
    for meta_names, expected in cases:
        domain = Domain([], metas=[StringVariable(name)
                                   for name in meta_names])
        # The same three rows of meta values are used in every case.
        rows = [["a" * 100, "a" * 40, "a" * 40],
                ["b" * 100, "a" * 40, "b" * 30],
                ["c" * 100, "a" * 40, "b" * 40]]
        data = Table(domain, np.empty((3, 0)), metas=rows)

        self.send_signal(self.widget.Inputs.data, data)
        self.wait_until_finished()

        self.assertEqual(data.domain[expected], self.widget.title_variable)
        self.check_output(expected)
# Generated pickling test cases: each entry is (case name, variable factory).
PickleDiscreteVariable = create_pickling_tests(
    "PickleDiscreteVariable",
    ("with_name",
     lambda: DiscreteVariable(name="Feature 0")),
    ("with_int_values",
     lambda: DiscreteVariable(name="Feature 0", values=[1, 2, 3])),
    ("with_str_value",
     lambda: DiscreteVariable(name="Feature 0", values=["F", "M"])),
    ("ordered",
     lambda: DiscreteVariable(name="Feature 0", values=["F", "M"],
                              ordered=True)),
    ("with_base_value",
     lambda: DiscreteVariable(name="Feature 0", values=["F", "M"],
                              base_value=0)),
)


PickleStringVariable = create_pickling_tests(
    "PickleStringVariable",
    ("with_name",
     lambda: StringVariable(name="Feature 0")),
)


@variabletest(DiscreteVariable)
class VariableTestMakeProxy(unittest.TestCase):
    """Proxies of a discrete variable share its master and compare equal."""

    def test_make_proxy_disc(self):
        original = DiscreteVariable("abc", values="abc", ordered=True)
        first_proxy = original.make_proxy()
        second_proxy = first_proxy.make_proxy()

        # A proxy of a proxy still points at the original as its master.
        for variable in (original, first_proxy, second_proxy):
            self.assertIs(variable.master, original)

        pairs = ((original, first_proxy),
                 (original, second_proxy),
                 (first_proxy, second_proxy))
        for left, right in pairs:
            self.assertEqual(left, right)
def test_string(self):
    """Run the shared checks in ``_test_common`` on a string variable."""
    variable = StringVariable("S")
    self._test_common(variable)
return [datetime.datetime(year, 1, 1).timestamp() if not np.isnan(year) else np.nan for year in years] time_full = VarDataPair( TimeVariable('time_full'), np.array(_to_timestamps([2000, 2001, 2002, 2003, 2004]), dtype=float), ) time_missing = VarDataPair( TimeVariable('time_missing'), np.array(_to_timestamps([2000, np.nan, 2001, 2003, 2004]), dtype=float), ) # String variable variations string_full = VarDataPair( StringVariable('string_full'), np.array(['a', 'b', 'c', 'd', 'e'], dtype=object), ) string_missing = VarDataPair( StringVariable('string_missing'), np.array(['a', 'b', 'c', StringVariable.Unknown, 'e'], dtype=object), ) def make_table(attributes, target=None, metas=None): """Build an instance of a table given various variables. Parameters ---------- attributes : Iterable[Tuple[Variable, np.array] target : Optional[Iterable[Tuple[Variable, np.array]]
TimeVariable('time_missing'), np.array([0, np.nan, 2, 3, 4], dtype=float), ] time_all_missing = [ TimeVariable('time_all_missing'), np.array([np.nan] * 5, dtype=float), ] time_same = [ TimeVariable('time_same'), np.array([4] * 5, dtype=float), ] time = [time_full, time_missing, time_all_missing, time_same] # String variable variations string_full = [ StringVariable('string_full'), np.array(['a', 'b', 'c', 'd', 'e'], dtype=object), ] string_missing = [ StringVariable('string_missing'), np.array(['a', 'b', 'c', StringVariable.Unknown, 'e'], dtype=object), ] string_all_missing = [ StringVariable('string_all_missing'), np.array([StringVariable.Unknown] * 5, dtype=object), ] string_same = [ StringVariable('string_same'), np.array(['a'] * 5, dtype=object), ] string = [string_full, string_missing, string_all_missing, string_same]
def setUp(self) -> None:
    """Create the lookup transform used by the tests.

    It maps the string variable "S" through a ``DictMissingConst`` that
    sends "" to NaN, "a" to 0 and "b" to 1, and is constructed with NaN
    as its first argument.
    """
    variable = StringVariable("S")
    mapping = DictMissingConst(np.nan, {"": np.nan, "a": 0, "b": 1})
    self.lookup = LookupMappingTransform(variable, mapping, dtype=float)
"""
input: Corpus preprocessed with Preprocess Text. Tokenizer is set to Sentences.
output: Corpus where sentences are now documents.
requires: Text add-on
"""
import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text.corpus import Corpus

tokens = in_data.tokens
# The first meta variable whose attributes contain a "title" entry.
title = [i for i in in_data.domain.metas if "title" in i.attributes][0]

# One variable object is shared by the domain and the text-feature setting.
sentences = StringVariable('Sentences')
new_domain = Domain(attributes=[], metas=[sentences, title])

titles = []
content = []
for i, doc in enumerate(tokens):
    # Every sentence of a document carries that document's title.
    doc_title = in_data[i][title.name].value
    for t in doc:
        titles.append(doc_title)
        content.append(t)

metas = np.column_stack((content, titles))
out_data = Corpus.from_numpy(domain=new_domain,
                             X=np.empty((len(content), 0)),
                             metas=metas)
out_data.set_text_features([sentences])
out_data.set_title_variable(title)
def from_file(cls, filename):
    """
    Load distance matrix from a file

    The file should preferably be encoded in ascii/utf-8. White space at
    the beginning and end of lines is ignored.

    The first line of the file starts with the matrix dimension. It
    can be followed by a list of flags

    - *axis=<number>*: the axis number
    - *symmetric*: the matrix is symmetric; when reading the element (i, j)
      its value is also assigned to (j, i)
    - *asymmetric*: the matrix is asymmetric
    - *row_labels*: the file contains row labels
    - *col_labels*: the file contains column labels

    By default, matrices are symmetric, have axis 1 and no labels are given.
    Flags *labeled* and *labelled* are obsolete aliases for *row_labels*.

    If the file has column labels, they follow in the second line.
    Row labels appear at the beginning of each row.
    Labels are arbitrary strings that cannot contain newlines and
    tabulators. Labels are stored as instances of `Table` with a single
    meta attribute named "label".

    The remaining lines contain tab-separated numbers, preceded with labels,
    if present. Lines are padded with zeros if necessary. If the matrix is
    symmetric, the file contains the lower triangle; any data above the
    diagonal is ignored.

    Args:
        filename: file name

    Returns:
        ``cls(matrix, row_labels, col_labels, axis)``, where the labels are
        `Table` instances or `None`.

    Raises:
        ValueError: if the file is empty, does not begin with the
            dimension, contains an invalid flag, has a mismatching number
            of column labels, has too many rows or columns, or contains an
            element that is not a number.
    """
    with open(filename, encoding=detect_encoding(filename)) as fle:
        line = fle.readline()
        if not line:
            raise ValueError("empty file")
        data = line.strip().split()
        # `data` is empty when the first line holds only white space.
        if not data or not data[0].isdigit():
            raise ValueError("distance file must begin with dimension")
        n = int(data.pop(0))

        symmetric = True
        axis = 1
        col_labels = row_labels = None
        for flag in data:
            if flag in ("labelled", "labeled", "row_labels"):
                row_labels = []
            elif flag == "col_labels":
                col_labels = []
            elif flag == "symmetric":
                symmetric = True
            elif flag == "asymmetric":
                symmetric = False
            else:
                flag_data = flag.split("=")
                if len(flag_data) == 2:
                    name, value = map(str.strip, flag_data)
                else:
                    name, value = "", None
                # `value` is only None when `name` is "", so the
                # short-circuit keeps `isdigit` from being called on None.
                if name == "axis" and value.isdigit():
                    axis = int(value)
                else:
                    raise ValueError("invalid flag '{}'".format(flag))

        if col_labels is not None:
            col_labels = [x.strip()
                          for x in fle.readline().strip().split("\t")]
            if len(col_labels) != n:
                raise ValueError("mismatching number of column labels")

        matrix = np.zeros((n, n))
        for i, line in enumerate(fle):
            if i >= n:
                raise ValueError("too many rows")
            line = line.strip().split("\t")
            if row_labels is not None:
                row_labels.append(line.pop(0).strip())
            if len(line) > n:
                raise ValueError(
                    "too many columns in matrix row {}".format(
                        "'{}'".format(row_labels[i]) if row_labels
                        else i + 1))
            # For symmetric matrices only the lower triangle is read.
            for j, e in enumerate(line[:i + 1 if symmetric else n]):
                try:
                    matrix[i, j] = float(e)
                except ValueError as exc:
                    raise ValueError(
                        "invalid element at row {}, column {}".format(
                            "'{}'".format(row_labels[i]) if row_labels
                            else i + 1,
                            "'{}'".format(col_labels[j]) if col_labels
                            else j + 1)) from exc
                if symmetric:
                    matrix[j, i] = matrix[i, j]

    if col_labels:
        col_labels = Table.from_list(
            Domain([], metas=[StringVariable("label")]),
            [[item] for item in col_labels])
    if row_labels:
        row_labels = Table.from_list(
            Domain([], metas=[StringVariable("label")]),
            [[item] for item in row_labels])
    return cls(matrix, row_labels, col_labels, axis)
# FDR threshold for gene sets; only sets with FDR at or below it are kept
FDR = 0.25

# Read the meta columns of the Orange table into a data frame and keep the
# gene sets that pass the FDR threshold
columns = [meta.name for meta in in_data.domain.metas]
data = pd.DataFrame(in_data.metas, columns=columns)
data = data.loc[data['FDR'] <= FDR, :]

# Build a table of gene sets (rows) and genes (columns);
# a cell is 1 when the gene is in the gene set
gene_enrichment = pd.DataFrame()
for _, gene_set_data in data.iterrows():
    gene_set = gene_set_data['GO Term Name']
    for gene in gene_set_data['Genes'].split(','):
        gene_enrichment.loc[gene_set, gene] = 1

# Genes that are not in a gene set were left as NA; replace them with 0
gene_enrichment = gene_enrichment.fillna(0)

# Orange table: one continuous column per gene, gene set name as meta
domain_columns = [ContinuousVariable(name=col)
                  for col in gene_enrichment.columns]
meta_columns = [StringVariable(name='Gene set')]
out_data = Table.from_numpy(
    domain=Domain(domain_columns, metas=meta_columns),
    X=gene_enrichment.to_numpy(),
    metas=pd.DataFrame(gene_enrichment.index).to_numpy())
def network2tables(self):
    """Convert ``self.network`` into a vertices table and an edges table.

    Vertices (stored in ``self.vertices``):

    - nodes given as a table with an "id" attribute are used unchanged;
    - nodes given as a table without attributes, or without an "id"
      attribute, get a new leading "id" column holding the row index;
    - nodes given as a label array become a table with an "id" column,
      "x" and "y" columns when the network has coordinates, and the
      labels in a "name" meta column.

    Edges (stored in ``self.edges``): one row per stored entry of every
    edge matrix, with columns "source", "target", "weight" and
    "isDirected". "isDirected" is 1 on every row as soon as any edge set
    is directed.
    """
    network = self.network
    nodes = network.nodes

    # --- vertices -------------------------------------------------------
    if not isinstance(nodes, Table):
        # A label array: add an id column and keep the labels as names.
        names = nodes.reshape(len(nodes), 1)
        ids = np.array(range(len(names)))
        coordinates = network.coordinates
        if coordinates is None:
            X = ids.reshape(len(ids), 1)
            domain = Domain([ContinuousVariable("id")],
                            None, [StringVariable("name")])
        else:
            X = np.array([ids, coordinates[:, 0], coordinates[:, 1]]).T
            domain = Domain([ContinuousVariable("id"),
                             ContinuousVariable("x"),
                             ContinuousVariable("y")],
                            None, [StringVariable("name")])
        self.vertices = Table.from_numpy(domain, X, None, names, None)
        self.Information.inform(
            "Label array to vertices table, an id column is added.")
    elif len(nodes.domain.attributes) == 0:
        # A table without attributes: the id becomes its only attribute.
        ids = np.array(range(nodes.metas.shape[0]))
        X = ids.reshape(len(ids), 1)
        domain = Domain([ContinuousVariable("id")],
                        nodes.domain.class_vars, nodes.domain.metas)
        self.vertices = Table.from_numpy(
            domain, X, nodes.Y, nodes.metas, nodes.W)
        self.Information.inform(
            "No attribute column of the vertices table, "
            "an id column is added.")
    elif any(attr.name == "id" for attr in nodes.domain.attributes):
        # The table already has an id column.
        self.vertices = nodes
    else:
        # A table with attributes but no id: prepend the id column.
        ids = np.array(range(nodes.X.shape[0]))
        X = np.insert(nodes.X, 0, values=ids, axis=1)
        attrs = list(nodes.domain.attributes)
        attrs.insert(0, ContinuousVariable("id"))
        domain = Domain(attrs, nodes.domain.class_vars, nodes.domain.metas)
        self.vertices = Table.from_numpy(
            domain, X, nodes.Y, nodes.metas, nodes.W)
        self.Information.inform(
            "No id column of the vertices table, an id column is added.")

    # --- edges, read from the sparse matrices ----------------------------
    sources, targets, weights = [], [], []
    is_directed = 0
    for edge in network.edges:
        rows = edge.edges
        if edge.directed:
            is_directed = 1
        for row_index in range(rows.shape[0]):
            coo = rows[row_index].tocoo()
            weights.extend(coo.data.tolist())
            sources.extend([row_index] * coo.nnz)
            # NOTE(review): presumably coo.row is all zeros for a one-row
            # slice, making r + c the column index - confirm.
            targets.extend(r + c for r, c in zip(coo.row, coo.col))

    directed = np.array([is_directed] * len(sources))
    X = np.array([sources, targets, weights, directed]).T
    domain = Domain([ContinuousVariable("source"),
                     ContinuousVariable("target"),
                     ContinuousVariable("weight"),
                     ContinuousVariable("isDirected")],
                    None, None)
    self.edges = Table.from_numpy(domain, X, None, None, None)
self.set_selected_words() elif len(self.word_list_library) > self.word_list_index and \ self.word_list_library[self.word_list_index] != self.words: self.commit() def _save_state(self): self.word_list_library = [s.as_dict() for s in self.library_model] self.words = self.words_model[:] def send_report(self): library = self.library_model[self.word_list_index].name \ if self.library_model else "/" settings = [("Library", library)] if self.__input_words: self.report_data("Input Words", self.__input_words) settings.append(("Word variable", self.words_var)) rule = UpdateRules.ITEMS[self.update_rule_index] settings.append(("Update", rule)) self.report_items("Settings", settings) self.report_paragraph("Words", ", ".join(self.words_model[:])) if __name__ == "__main__": from Orange.widgets.utils.widgetpreview import WidgetPreview words_vars = [StringVariable("S1"), StringVariable("S2")] lst = [["foo", "A"], ["bar", "B"], ["foobar", "C"]] input_table = Table.from_list(Domain([], metas=words_vars), lst) # WidgetPreview(OWWordList).run(set_words=input_table) WidgetPreview(OWWordList).run()
def to_data_table(self, selected_genes: Optional[List[str]] = None) -> Table:
    """
    Transform GeneMatcher results to Orange data table.

    Every gene becomes one row whose values are all stored as string
    metas. When a list of genes (Entrez Ids) is provided, only those
    genes appear in the table.

    Parameters
    ----------
    selected_genes: list
        List of Entrez Ids

    Returns
    -------
    Orange.data.Table
        Summary of Gene info in tabular format
    """
    column_names = [
        'Input gene ID',
        ENTREZ_ID,
        'Symbol',
        'Synonyms',
        'Description',
        'Other IDs',
        'Type of gene',
        'Chromosome',
        'Map location',
        'Locus tag',
        'Symbol from nomenclature authority',
        'Full name from nomenclature authority',
        'Nomenclature status',
        'Other designations',
        'Species',
        'Taxonomy ID',
    ]
    domain = Domain([], metas=[StringVariable(name) for name in column_names])

    if selected_genes is None:
        genes: List[Gene] = self.genes
    else:
        wanted = set(selected_genes)
        genes = [gene for gene in self.genes if str(gene.gene_id) in wanted]

    rows = []
    for gene in genes:
        if gene.db_refs:
            other_ids = ', '.join(
                '{}: {}'.format(key, val) for (key, val) in gene.db_refs.items()
            )
        else:
            other_ids = ''
        synonyms = ', '.join(gene.synonyms) if gene.synonyms else ''

        # Values follow the order of `column_names`.
        rows.append([
            gene.input_identifier,
            gene.gene_id,
            gene.symbol,
            synonyms,
            gene.description,
            other_ids,
            gene.type_of_gene,
            gene.chromosome,
            gene.map_location,
            gene.locus_tag,
            gene.symbol_from_nomenclature_authority,
            gene.full_name_from_nomenclature_authority,
            gene.nomenclature_status,
            gene.other_designations,
            species_name_to_taxid(gene.species),
            gene.tax_id,
        ])

    table = Table(domain, rows)
    table.name = 'Gene Matcher Results'
    annotations = table.attributes
    annotations[TableAnnotation.tax_id] = self.tax_id
    annotations[TableAnnotation.gene_as_attr_name] = False
    annotations[TableAnnotation.gene_id_column] = ENTREZ_ID
    return table
def _to_orange_data_table(self, report_genes=True, merge_function=spots_mean,
                          sample_type=None, transpose=False):
    """Convert parsed GEO format to orange, save by genes or by spots.

    Parameters
    ----------
    report_genes : bool
        If true, values of all spots of a gene are merged with
        `merge_function` and the table is organised by genes; otherwise
        it is organised by spots.
    merge_function : callable
        Called with a sequence of spot values; returns the merged value.
    sample_type : str, optional
        Passed to `_sample_to_class` (transposed) or `_sample_annotations`
        (not transposed). In the transposed table it also names the class
        variable; if the samples have exactly one annotation type, that
        type is used instead.
    transpose : bool
        If true, samples are in rows and genes/spots in columns, with the
        sample class as the class variable and the remaining sample
        annotations as discrete metas. Otherwise genes/spots are in rows,
        samples in columns, and the gene/spot name is a string meta.

    Returns
    -------
    The result of `create_table` for the constructed domain and data.
    """
    spots = self.genes if report_genes else self.spots

    if transpose:  # samples in rows
        sample2class = self._sample_to_class(sample_type)
        # Drop None before sorting: Python 3 cannot order None against
        # the class names and would raise TypeError.
        cvalues = sorted(
            {value for value in sample2class.values() if value is not None})

        samp_ann = self._sample_annotations()
        # annotation type -> set of values seen over all samples
        ad = defaultdict(set)
        for d in samp_ann.values():
            for n, v in d.items():
                ad[n].add(v)

        # auto-select sample type if there is only one
        if len(ad) == 1:
            sample_type = list(ad.keys())[0]

        classvar = DiscreteVariable(name=sample_type or "class",
                                    values=cvalues)
        atts = [ContinuousVariable(name=gene) for gene in spots]
        # Annotation types other than the class become discrete metas.
        meta_names = [n for n in ad if n != sample_type]
        metasvar = [DiscreteVariable(name=n, values=sorted(ad[n]))
                    for n in meta_names]

        X = []
        Y = []
        metas = []
        for i, sampleid in enumerate(self.info["samples"]):
            if report_genes:
                vals = [merge_function([self.gds_data[spot].data[i]
                                        for spot in self.gene2spots[gene]])
                        for gene in spots]
            else:
                vals = [self.gds_data[gene].data[i] for gene in spots]
            X.append(vals)
            Y.append(sample2class.get(sampleid, None))
            metas.append([samp_ann[sampleid].get(n, None)
                          for n in meta_names])

        domain = Domain(atts, classvar, metas=metasvar)
        return create_table(domain, X, Y, metas)

    # genes in rows
    annotations = self._sample_annotations(sample_type)
    samples = self.info["samples"]
    atts = [ContinuousVariable(name=ss) for ss in samples]
    for att, sample in zip(atts, samples):
        setattr(att, "attributes", annotations[sample])

    geneatname = "gene" if report_genes else "spot"
    metasvar = [StringVariable(geneatname)]

    X = []
    for g in spots:
        if report_genes:
            # Merge the spots of the gene sample by sample.
            X.append(list(map(
                lambda *x: merge_function(x),
                *[self.gds_data[spot].data for spot in self.gene2spots[g]])))
        else:
            X.append(self.gds_data[g].data)
    metas = [[a] for a in spots]

    domain = Domain(atts, [], metas=metasvar)
    return create_table(domain, X, None, metas)
def to_data_table(self, selected_genes=None):
    """Transform gene matcher results to an Orange data table.

    NCBI info is loaded for every reported gene. Each gene becomes one
    row of metas: its input name, match status and NCBI details.

    Parameters
    ----------
    selected_genes : iterable of str, optional
        NCBI ids; when given, only genes whose id (as a string) is among
        them are included.

    Returns
    -------
    Table named 'Gene Matcher Results'. Its `tax_id` annotation is one of
    the taxonomy ids of the reported genes that is not None, or None when
    there is no such id.
    """
    metas = [
        StringVariable('Input gene ID'),
        DiscreteVariable('Match result',
                         values=['Matched', 'Match Conflict', 'Unmatched']),
        StringVariable(NCBI_ID),
        StringVariable('Symbol'),
        StringVariable('Synonyms'),
        StringVariable('Description'),
        StringVariable('Other IDs'),
        StringVariable('Type of gene'),
        StringVariable('Chromosome'),
        StringVariable('Map location'),
        StringVariable('Locus tag'),
        StringVariable('Symbol from nomenclature authority'),
        StringVariable('Full name from nomenclature authority'),
        StringVariable('Nomenclature status'),
        StringVariable('Other designations'),
        StringVariable('Taxonomy ID'),
    ]
    domain = Domain([], metas=metas)

    genes = self.genes
    if selected_genes is not None:
        # A set makes the membership test O(1) per gene.
        selected = set(selected_genes)
        genes = [gene for gene in self.genes if str(gene.ncbi_id) in selected]

    tax_ids = set()
    data_x = []
    for gene in genes:
        gene.load_ncbi_info()
        tax_ids.add(gene.tax_id)

        match_status = self.gene_match_status(gene)
        db_refs = ', '.join(
            '{}: {}'.format(key, val)
            for (key, val) in gene.db_refs.items()) if gene.db_refs else ''
        synonyms = ', '.join(gene.synonyms) if gene.synonyms else ''

        # Values follow the order of `metas`.
        data_x.append([
            gene.input_name,
            match_status,
            gene.ncbi_id,
            gene.symbol,
            synonyms,
            gene.description,
            db_refs,
            gene.type_of_gene,
            gene.chromosome,
            gene.map_location,
            gene.locus_tag,
            gene.symbol_from_nomenclature_authority,
            gene.full_name_from_nomenclature_authority,
            gene.nomenclature_status,
            gene.other_designations,
            gene.tax_id,
        ])

    # An explicit `is not None` test instead of `None.__ne__`, which
    # returns NotImplemented and relies on its truthiness. The default
    # keeps an empty result from raising StopIteration.
    tax_id = next((tid for tid in tax_ids if tid is not None), None)

    table = Table(domain, data_x)
    table.name = 'Gene Matcher Results'
    table.attributes[OrangeTableAnnotations.tax_id] = tax_id
    table.attributes[OrangeTableAnnotations.gene_as_attribute_name] = False
    table.attributes[OrangeTableAnnotations.gene_id_column] = NCBI_ID
    return table
def generateGraph(self, N_changed=False):
    """
    Rebuild ``self.graph`` from ``self.matrix`` and refresh the info labels.

    Cells of the matrix with a value ``<= self.epsilon`` become edges;
    edge weights are ``max(selected) - value``. If the histogram-based
    estimate of the number of edges exceeds 200000, no graph is built and
    an error is shown instead.

    :param N_changed: when true, ``self.node_selection`` is reset to
        ``NodeSelection.COMPONENTS`` before the graph is rebuilt.
    """
    self.Error.clear()
    self.Warning.clear()

    if N_changed:
        self.node_selection = NodeSelection.COMPONENTS

    if self.matrix is None:
        if hasattr(self, "infoa"):
            self.infoa.setText("No data loaded.")
        if hasattr(self, "infob"):
            self.infob.setText("")
        if hasattr(self, "infoc"):
            self.infoc.setText("")
        self.pconnected = 0
        self.nedges = 0
        self.graph = None
        self.sendSignals()
        return

    nEdgesEstimate = 2 * sum(
        y for x, y in zip(self.histogram.xData, self.histogram.yData)
        if x <= self.epsilon)

    if nEdgesEstimate > 200000:
        self.graph = None
        self.Error.number_of_edges(nEdgesEstimate)
    else:
        items = None
        matrix = self.matrix
        if matrix.row_items is not None:
            row_items = matrix.row_items
            if isinstance(row_items, Table):
                if matrix.axis == 1:
                    items = row_items
                else:
                    items = [[v.name]
                             for v in row_items.domain.attributes]
            else:
                items = [[str(x)] for x in row_items]
        # `items` is still None when the matrix carries no row items;
        # only compare lengths when there is something to measure.
        if items is not None and len(items) != matrix.shape[0]:
            self.Warning.invalid_number_of_items()
            items = None
        if items is None:
            items = list(range(matrix.shape[0]))
        if not isinstance(items, Table):
            items = Table(Domain([], metas=[StringVariable('label')]),
                          items)

        mask = self.matrix <= self.epsilon
        weights = matrix[mask]
        if weights.size:
            # Turn the selected values into weights relative to the
            # largest selected value.
            weights = np.max(weights) - weights
        edges = sp.csr_matrix((weights, mask.nonzero()))
        self.graph = Network(items, edges)
        self.graph_matrix = self.matrix

    if self.graph is None:
        self.pconnected = 0
        self.nedges = 0
    else:
        self.pconnected = self.graph.number_of_nodes()
        self.nedges = self.graph.number_of_edges()

    n_items = self.matrix.shape[0]
    if hasattr(self, "infoa"):
        self.infoa.setText("Data items on input: %d" % n_items)
    if hasattr(self, "infob"):
        # Guard against an empty matrix, mirroring the edges/node guard.
        percent = self.pconnected / float(n_items) * 100 if n_items else 0
        self.infob.setText("Network nodes: %d (%3.1f%%)"
                           % (self.pconnected, percent))
    if hasattr(self, "infoc"):
        edges_per_node = (self.nedges / float(self.pconnected)
                          if self.pconnected else 0)
        self.infoc.setText("Network edges: %d (%.2f edges/node)"
                           % (self.nedges, edges_per_node))

    self.Warning.large_number_of_nodes.clear()
    if self.pconnected > 1000 or self.nedges > 2000:
        self.Warning.large_number_of_nodes()

    self.sendSignals()
    self.histogram.setRegion(0, self.epsilon)
) from Orange.widgets.widget import Output from orangecontrib.text.corpus import Corpus from orangecontrib.text.import_documents import ImportDocuments, \ NoDocumentsException try: from orangecanvas.preview.previewbrowser import TextLabel except ImportError: from Orange.canvas.preview.previewbrowser import TextLabel # domain for skipped images output SKIPPED_DOMAIN = Domain([], metas=[ StringVariable("name"), StringVariable("path") ]) def prettifypath(path): home = os.path.expanduser("~/") if path.startswith(home): # case sensitivity! path = os.path.join("~", os.path.relpath(path, home)) return path log = logging.getLogger(__name__) class RuntimeEvent(QEvent):
def read_pajek(path, encoding='UTF-8', project=False, auto_table=False):
    """Reimplemented method for reading Pajek files; written in C++ for
    maximum performance.

    :param path: File or file name to read.
    :type path: string

    :param encoding: Encoding of input text file, default 'UTF-8'. Used
        when the file is re-read to collect vertex labels and coordinates.
    :type encoding: string

    :param project: Determines whether the input file is a Pajek project
        file, possibly containing multiple networks and other data. If
        :obj:`True`, a list of networks is returned instead of just a
        network. Default is :obj:`False`.
        NOTE(review): this function body does not itself use ``project``.
    :type project: boolean.

    :param auto_table: If :obj:`True` and a vertex table could be built,
        it is attached to the network with ``G.set_items``.
    :type auto_table: boolean.

    Return the network (or a list of networks if project=:obj:`True`) of
    type :obj:`Orange.network.Graph` or :obj:`Orange.network.DiGraph`.

    :raises PajekBug: if the number of parsed vertices differs from the
        number of nodes in the network.

    Examples

    >>> G = orangecontrib.network.nx.path_graph(4)
    >>> orangecontrib.network.readwrite.write_pajek(G, "test.net")
    >>> G = orangecontrib.network.readwrite.read_pajek("test.net")

    To create a Graph instead of a MultiGraph use

    >>> G1 = orangecontrib.network.Graph(G)

    References

    See http://vlado.fmf.uni-lj.si/pub/networks/pajek/doc/draweps.htm
    for format information.

    """
    path = _check_network_dir(path)
    G = _wrap(rwpajek.read_pajek(path))

    # Additionally read values into Table; needed to get G nodes properly sorted
    # (Consult OWNxFile.readDataFile(), orangeom.GraphLayout.readPajek(), and the Pajek format spec)
    import shlex
    import numpy as np
    rows, metas, remapping = [], [], {}
    # Honour the caller's encoding instead of a hard-coded 'utf-8'.
    with open(path, encoding=encoding) as f:
        # Skip ahead to the "*Vertices <n>" header.
        for line in f:
            if line.lower().startswith('*vertices'):
                nvertices = int(line.split()[1])
                break
        # Read vertices lines
        for line in f:
            parts = shlex.split(line)[:4]
            if len(parts) == 1:
                i = label = parts[0]
            elif len(parts) == 2:
                i, label = parts
                metas.append((label, ))
            elif len(parts) == 4:
                i, label, x, y = parts
                # The format specification was never set in stone, it seems
                try:
                    x, y = float(x), float(y)
                except ValueError:
                    metas.append((label, x, y))
                else:
                    rows.append((x, y))
                    metas.append((label, ))
            i = int(i) - 1  # -1 because pajek is 1-indexed
            remapping[label] = i
            nvertices -= 1
            if not nvertices:
                break

    from Orange.data import Domain, ContinuousVariable, StringVariable
    # Construct x-y-label table (added in OWNxFile.readDataFile())
    table = None
    attr_vars = ([ContinuousVariable('x'), ContinuousVariable('y')]
                 if rows else [])
    meta_vars = [StringVariable('label ' + str(i))
                 for i in range(len(metas[0]) if metas else 0)]
    if rows or metas:
        domain = Domain(attr_vars, metas=meta_vars)
        table = Table.from_numpy(
            domain,
            np.array(rows, dtype=float).reshape(
                len(metas), len(rows[0]) if rows else 0),
            metas=np.array(metas, dtype=str))
    if table is not None and auto_table:
        G.set_items(table)

    # Relabel nodes to integers, sorted by appearance
    for node in G.node:
        G.node[node]['label'] = node
    nx.relabel_nodes(G, remapping, copy=False)

    if table is not None and len(table) != G.number_of_nodes():
        raise PajekBug(
            "There is a bug in your version of NetworkX reading Pajek files. "
            "Please update your NetworkX installation.")
    return G
pass def coordinates(tweet, _, __, dim): coord = tweet.geo.get("coordinates", None) if tweet.geo else None return coord["coordinates"][dim] if coord else None def country_code(tweet, _, places): place_id = tweet.geo.get("place_id", None) if tweet.geo else None return places[place_id].country_code if place_id else "" tv = TimeVariable("Date") METAS = [ (StringVariable("Content"), lambda doc, _, __: doc.text), ( DiscreteVariable("Author"), lambda doc, users, _: "@" + users[doc.author_id].username, ), (tv, lambda doc, _, __: tv.parse(doc.created_at.isoformat())), (DiscreteVariable("Language"), lambda doc, _, __: doc.lang), (DiscreteVariable("Location"), country_code), ( ContinuousVariable("Number of Likes", number_of_decimals=0), lambda doc, _, __: doc.public_metrics["like_count"], ), ( ContinuousVariable("Number of Retweets", number_of_decimals=0), lambda doc, _, __: doc.public_metrics["retweet_count"], ),
def test_match_attr_name(self):
    """Selecting a data variable should pre-select a matching extra variable
    only when the extra combo is not already set to a variable."""
    widget = self.widget
    first_row = widget.attr_boxes.rows[0]
    data_combo = first_row.left_combo
    extra_combo = first_row.right_combo

    def choose(combo, index):
        # Emulate the user picking `index` in `combo`.
        combo.setCurrentIndex(index)
        combo.activated.emit(index)

    domainA = Domain(
        [DiscreteVariable("dA1", ("a", "b", "c", "d")),
         DiscreteVariable("dA2", ("aa", "bb")),
         DiscreteVariable("dA3", ("aa", "bb"))],
        DiscreteVariable("cls", ("aaa", "bbb", "ccc")),
        [DiscreteVariable("mA1", ("cc", "dd")),
         StringVariable("mA2")])
    XA = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 0], [3, 1, 0]])
    yA = np.array([0, 1, 2, np.nan])
    metasA = np.array([[0.0, "m1"], [1.0, "m2"], [np.nan, "m3"],
                       [0.0, "m4"]]).astype(object)

    domainB = Domain(
        [DiscreteVariable("dB1", values=("a", "b", "c")),
         ContinuousVariable("dA2")],
        None,
        [StringVariable("cls"),
         DiscreteVariable("dA1", ("m4", "m5"))])
    XB = np.array([[0, 0], [1, 1], [2, np.nan]])
    yB = np.empty((3, 0))
    metasB = np.array([[np.nan, np.nan], [1, 1], [0, 0]]).astype(object)

    dataA = Table(domainA, XA, yA, metasA)
    dataA.name = 'dataA'
    dataA.attributes = 'dataA attributes'
    dataB = Table(domainB, XB, yB, metasB)
    dataB.name = 'dataB'
    dataB.attributes = 'dataB attributes'

    self.send_signal(widget.Inputs.data, dataA)
    self.send_signal(widget.Inputs.extra_data, dataB)

    # (index set in extra combo, index chosen in data combo,
    #  expected index of extra combo afterwards)
    scenarios = (
        # match variable if available and the other combo is Row Index
        (0, 2, 5),
        # match variable if available and the other combo is ID
        (1, 2, 5),
        # don't match variable if other combo is set
        (4, 2, 4),
        # don't match if nothing to match to
        (0, 4, 0),
        # don't match numeric with non-numeric
        (0, 3, 0),
        # allow matching string with discrete
        (0, 5, 4),
    )
    for extra_index, data_index, expected in scenarios:
        choose(extra_combo, extra_index)
        choose(data_combo, data_index)
        self.assertEqual(extra_combo.currentIndex(), expected)
def coefficients(self) -> Table:
    """Return the parameters as a table with one "coef" column and the
    parameter names in a "name" meta column."""
    domain = Domain(
        [ContinuousVariable("coef")],
        metas=[StringVariable("name")],
    )
    # [:, None] turns each 1-d sequence into a single-column 2-d array.
    values = self.__parameters[:, None]
    names = np.array(self.__parameters_names)[:, None]
    return Table(domain, values, metas=names)
def test_val(self):
    """Check conversion and textual rendering of string values."""
    var = StringVariable("a")
    unknown_text = "?"

    # None is converted to the empty string ...
    self.assertEqual(var.to_val(None), "")

    # ... and the empty string is rendered as unknown, raw or wrapped.
    self.assertEqual(var.str_val(""), unknown_text)
    empty_value = Value(var, "")
    self.assertEqual(var.str_val(empty_value), unknown_text)

    # repr_val puts the text in double quotes.
    foo_value = Value(var, "foo")
    self.assertEqual(var.repr_val(foo_value), '"foo"')
annotated = create_annotated_table(self.corpus, self.selection) self.Outputs.matching_docs.send(matched) self.Outputs.other_docs.send(other) self.Outputs.corpus.send(annotated) def send_report(self): if not self.corpus: return self.report_data("Corpus", self.corpus) if self.words is not None: self.report_paragraph("Words", ", ".join(self.words)) self.report_table(self._list_view, num_format="{:.3f}") def copy_to_clipboard(self): text = self._web_view.selectedText() QApplication.clipboard().setText(text) if __name__ == "__main__": # pylint: disable=ungrouped-imports from Orange.widgets.utils.widgetpreview import WidgetPreview words_var_ = StringVariable(WORDS_COLUMN_NAME) words_var_.attributes = {"type": "words"} lists = [[w] for w in ["human", "graph", "minors", "trees"]] words_ = Table.from_list(Domain([], metas=[words_var_]), lists) words_.name = "Words" WidgetPreview(OWSemanticViewer).run( set_corpus=Corpus.from_file("deerwester"), # deerwester book-excerpts set_words=words_)
def from_numpy(cls, X, Y=None, metas=None):
    """
    Create a domain corresponding to the given numpy arrays. This method
    is usually invoked from :meth:`Orange.data.Table.from_numpy`.

    All attributes are assumed to be continuous and are named
    "Feature <n>". Target variables are discrete if the only two values
    are 0 and 1; otherwise they are continuous. Discrete
    targets are named "Class <n>" and continuous are named "Target <n>".
    In every group <n> is the 1-based column position, zero-padded to the
    number of digits of the column count; a group with a single column
    gets the bare name without a number.
    Domain is marked as :attr:`anonymous`, so data from any other domain of
    the same shape can be converted into this one and vice-versa.

    :param `numpy.ndarray` X: 2-dimensional array with data
    :param Y: 1- or 2-dimensional data for target
    :type Y: `numpy.ndarray` or None
    :param `numpy.ndarray` metas: meta attributes
    :type metas: `numpy.ndarray` or None
    :return: a new domain
    :rtype: :class:`Domain`
    :raises ValueError: if X is not 2-dimensional or Y is neither 1- nor
        2-dimensional
    """
    def get_places(max_index):
        # Width of the largest index in decimal digits. Counting the
        # digits of the decimal string avoids the floating-point error of
        # int(log(n, 10)) + 1 (log(1000, 10) is 2.9999999999999996) and
        # does not fail for arrays with zero columns.
        return 0 if max_index <= 1 else len(str(max_index))

    def get_name(base, index, places):
        # `index` is 0-based; the generated name is 1-based.
        return base if not places \
            else "{} {:0{}}".format(base, index + 1, places)

    if X.ndim != 2:
        raise ValueError('X must be a 2-dimensional array')
    n_attrs = X.shape[1]
    places = get_places(n_attrs)
    attr_vars = [ContinuousVariable(name=get_name("Feature", a, places))
                 for a in range(n_attrs)]

    class_vars = []
    if Y is not None:
        if Y.ndim == 1:
            Y = Y.reshape(len(Y), 1)
        elif Y.ndim != 2:
            raise ValueError('Y has invalid shape')
        n_classes = Y.shape[1]
        places = get_places(n_classes)
        for i, values in enumerate(Y.T):
            if set(values) == {0, 1}:
                name = get_name('Class', i, places)
                values = ['v1', 'v2']
                class_vars.append(DiscreteVariable(name, values))
            else:
                # get_name already converts to a 1-based number; passing
                # i + 1 here numbered continuous targets one too high.
                name = get_name('Target', i, places)
                class_vars.append(ContinuousVariable(name))

    if metas is not None:
        n_metas = metas.shape[1]
        places = get_places(n_metas)
        meta_vars = [StringVariable(get_name("Meta", m, places))
                     for m in range(n_metas)]
    else:
        meta_vars = []

    domain = cls(attr_vars, class_vars, meta_vars)
    domain.anonymous = True
    return domain
def test_string_meta(self):
    """Check widget for dataset with only one string meta"""
    n_rows = 6
    string_meta = StringVariable("m")
    # No attribute columns at all; a single meta column filled with "meta".
    no_attributes = np.empty((n_rows, 0))
    meta_column = np.array([["meta"]] * n_rows)
    data = Table(Domain([], metas=[string_meta]),
                 no_attributes,
                 metas=meta_column)
    self.send_signal(self.widget.Inputs.data, data)
def test_continuous_metas(self):
    """Send iris with all but its last attribute moved to metas, plus one
    string meta, and no regular attributes."""
    iris_domain = self.iris.domain
    meta_vars = iris_domain.attributes[:-1] + (StringVariable("str"), )
    metas_only_domain = Domain([], iris_domain.class_var, meta_vars)
    converted = Table.from_table(metas_only_domain, self.iris)
    self.send_signal(self.widget.Inputs.data, converted)
def test_output(self):
    """Follow the output corpus while documents are added, edited and
    removed through the widget's editors."""

    def press_button(editor):
        # Click the push button found inside the given editor.
        editor.findChild(QPushButton).click()

    def check_output(titles, documents, metas):
        # Fetch the current output and compare its content.
        corpus = self.get_output(self.widget.Outputs.corpus)
        np.testing.assert_array_equal(titles, corpus.titles)
        self.assertListEqual(documents, corpus.documents)
        np.testing.assert_array_equal(metas, corpus.metas)

    # start with 1 editor
    press_button(self.widget.editors[-1])
    press_button(self.widget.editors[-1])
    corpus = self.get_output(self.widget.Outputs.corpus)
    self.assertEqual(0, len(corpus.domain.attributes))
    self.assertTupleEqual(
        (StringVariable("Title"), StringVariable("Document")),
        corpus.domain.metas)
    np.testing.assert_array_equal(["?"], corpus.titles)
    self.assertListEqual(["?"], corpus.documents)
    np.testing.assert_array_equal([["", ""]], corpus.metas)

    # two more editors, then fill in all three
    self.add_document_btn.click()
    self.add_document_btn.click()
    editor1, editor2, editor3 = self.widget.editors
    editors = (editor1, editor2, editor3)
    for number, editor in enumerate(editors, start=1):
        editor.title_le.setText("Document %d" % number)
    for number, editor in enumerate(editors, start=1):
        editor.text_area.setPlainText("Test %d" % number)
    for editor in editors:
        editor.text_area.editingFinished.emit()
    check_output(
        ["Document 1", "Document 2", "Document 3"],
        ["Test 1", "Test 2", "Test 3"],
        [
            ["Document 1", "Test 1"],
            ["Document 2", "Test 2"],
            ["Document 3", "Test 3"],
        ],
    )

    # drop the middle document
    press_button(editor2)
    check_output(
        ["Document 1", "Document 3"],
        ["Test 1", "Test 3"],
        [
            ["Document 1", "Test 1"],
            ["Document 3", "Test 3"],
        ],
    )

    # a freshly added document is empty
    self.add_document_btn.click()
    check_output(
        ["Document 1", "Document 3", "?"],
        ["Test 1", "Test 3", "?"],
        [["Document 1", "Test 1"], ["Document 3", "Test 3"], ["", ""]],
    )

    # drop the first document
    press_button(self.widget.editors[0])
    check_output(
        ["Document 3", "?"],
        ["Test 3", "?"],
        [["Document 3", "Test 3"], ["", ""]],
    )

    # drop the last (empty) document
    press_button(self.widget.editors[-1])
    check_output(
        ["Document 3"],
        ["Test 3"],
        [["Document 3", "Test 3"]],
    )
def vars_from_df(df, role=None, force_nominal=False):
    """
    Derive Orange variables and data arrays from a pandas data frame.

    Each column is assigned a variable and a role: columns listed in
    ``df.orange_variables`` reuse a copy of their original variable and go
    to the part selected by the effective role; discrete, datetime and
    numeric columns become attributes; everything else becomes a string
    meta.

    :param df: the data frame to convert.
    :param role: role to apply to columns with known Orange variables;
        when None, ``df.orange_role`` is used if the frame has it.
    :param force_nominal: forwarded to ``_is_discrete``; it also disables
        the "single contiguous block" export below.
    :return: a pair ``(XYM, domain)`` where ``XYM`` is a three-element list
        with the attribute, class and meta data (an array, a sparse
        matrix, or None for an empty class/meta part) and ``domain`` is
        ``Domain(attrs, class_vars, metas)``.
    """
    if role is None and hasattr(df, 'orange_role'):
        _role = df.orange_role
    else:
        _role = role

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not any(str(i).startswith('_o') for i in df.index) \
            and not (df.index.is_integer() and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    # Per part (X = attributes, Y = classes, M = metas): column names,
    # optional conversion callables (None = take the column as it is),
    # and the variables themselves. The three lists of a part stay aligned.
    Xcols, Ycols, Mcols = [], [], []
    Xexpr, Yexpr, Mexpr = [], [], []
    attrs, class_vars, metas = [], [], []

    contains_strings = _role == Role.Meta

    for column in df.columns:
        s = df[column]
        if hasattr(df, 'orange_variables') and column in df.orange_variables:
            # Known column: reuse its variable (without compute_value) and
            # file it under the part selected by the effective role.
            original_var = df.orange_variables[column]
            var = original_var.copy(compute_value=None)
            if _role == Role.Attribute:
                Xcols.append(column)
                Xexpr.append(None)
                attrs.append(var)
            elif _role == Role.ClassAttribute:
                Ycols.append(column)
                Yexpr.append(None)
                class_vars.append(var)
            else:  # if role == Role.Meta:
                Mcols.append(column)
                Mexpr.append(None)
                metas.append(var)
        elif _is_discrete(s, force_nominal):
            discrete = s.astype('category').cat
            var = DiscreteVariable(str(column), discrete.categories.astype(str).tolist())
            attrs.append(var)
            Xcols.append(column)
            # Category codes become the values; code -1 is replaced by NaN.
            Xexpr.append(lambda s, _: np.asarray(
                s.astype('category').cat.codes.replace(-1, np.nan)
            ))
        elif _is_datetime(s):
            var = TimeVariable(str(column))
            # NOTE(review): the converted series is not used afterwards;
            # the conversion callable below receives df[column] again.
            s = pd.to_datetime(s, infer_datetime_format=True)
            attrs.append(var)
            Xcols.append(column)
            # Values are parsed from their string form by the variable.
            Xexpr.append(lambda s, v: np.asarray(
                s.astype('str').replace('NaT', np.nan).map(v.parse)
            ))
        elif is_numeric_dtype(s):
            var = ContinuousVariable(
                # set number of decimals to 0 if int else keeps default behaviour
                str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
            )
            attrs.append(var)
            Xcols.append(column)
            Xexpr.append(None)
        else:
            # Anything else is kept as an object-typed string meta.
            contains_strings = True
            var = StringVariable(str(column))
            metas.append(var)
            Mcols.append(column)
            Mexpr.append(lambda s, _: np.asarray(s, dtype=object))

    # if role isn't explicitly set, try to
    # export dataframes into one contiguous block.
    # for this all columns must be of the same role
    if isinstance(df, OrangeDataFrame) \
            and not role \
            and contains_strings \
            and not force_nominal:
        # Move everything into the meta part: the attribute lists are
        # extended in place and then re-bound as the meta lists, after
        # which the attribute and class lists are reset to new empty lists.
        attrs.extend(class_vars)
        attrs.extend(metas)
        metas = attrs
        Xcols.extend(Ycols)
        Xcols.extend(Mcols)
        Mcols = Xcols
        Xexpr.extend(Yexpr)
        Xexpr.extend(Mexpr)
        Mexpr = Xexpr

        attrs, class_vars = [], []
        Xcols, Ycols = [], []
        Xexpr, Yexpr = [], []

    XYM = []
    for Avars, Acols, Aexpr in zip(
            (attrs, class_vars, metas),
            (Xcols, Ycols, Mcols),
            (Xexpr, Yexpr, Mexpr)):
        if not Acols:
            # NOTE(review): `!=` compares by value, so an empty Ycols or
            # Mcols also equals an empty Xcols and yields an empty
            # (n_rows, 0) array instead of None - confirm this is intended.
            A = None if Acols != Xcols else np.empty((df.shape[0], 0))
            XYM.append(A)
            continue
        if not any(Aexpr):
            # No conversions needed: take the columns directly, as a
            # sparse matrix when every selected column has a sparse dtype.
            Adf = df if all(c in Acols for c in df.columns) else df[Acols]
            if all(isinstance(a, SparseDtype) for a in Adf.dtypes):
                A = csr_matrix(Adf.sparse.to_coo())
            else:
                A = np.asarray(Adf)
            XYM.append(A)
            continue
        # we'll have to copy the table to resolve any expressions
        # TODO eliminate expr (preprocessing for pandas -> table)
        A = np.array([expr(df[col], var) if expr else np.asarray(df[col])
                      for var, col, expr in zip(Avars, Acols, Aexpr)]).T
        XYM.append(A)

    return XYM, Domain(attrs, class_vars, metas)
return any( isinstance(pp, BaseNormalizer) for pp in corpus.used_preprocessor.preprocessors) if __name__ == "__main__": from orangewidget.utils.widgetpreview import WidgetPreview from orangecontrib.text import preprocess corpus = Corpus.from_file("book-excerpts") # corpus.set_title_variable("Text") pp_list = [ preprocess.LowercaseTransformer(), preprocess.StripAccentsTransformer(), preprocess.SnowballStemmer(), ] for p in pp_list: corpus = p(corpus) w = StringVariable("Words") w.attributes["type"] = "words" words = ["house", "doctor", "boy", "way", "Rum"] words = Table( Domain([], metas=[w]), np.empty((len(words), 0)), metas=np.array(words).reshape((-1, 1)), ) WidgetPreview(OWScoreDocuments).run(set_data=corpus, set_words=words)