Пример #1
0
    def read(self):
        """Load a MATLAB file into an Orange table.

        The largest numeric array in the file becomes ``X``. An array whose
        shape is ``(n_columns,)`` or ``(1, n_columns)`` supplies the attribute
        names; without one, columns are named by their index. Every remaining
        array whose length equals the number of rows becomes a string meta
        column, ordered by array name. When the file holds no numeric array,
        the row count is the length shared by the largest number of arrays
        and the table has no attributes.

        Returns:
            Orange.data.Table built from the arrays found in the file.

        Raises:
            IOError: if ``whosmat`` reports nothing for the file.
        """
        if not matlab.whosmat(self.filename):
            raise IOError("Couldn't load matlab file " + self.filename)

        loaded = matlab.loadmat(self.filename, chars_as_strings=True)
        # Keep only the entries that are numpy arrays.
        ml = {name: con for name, con in loaded.items()
              if isinstance(con, np.ndarray)}

        # X is the biggest numeric array (first one wins on ties)
        numeric_names = [name for name, con in ml.items()
                         if issubclass(con.dtype.type, numbers.Number)]
        X = None
        if numeric_names:
            nameX = max(numeric_names, key=lambda name: ml[name].size)
            X = ml.pop(nameX)

        # find an array with compatible shapes
        attributes = []
        if X is not None:
            nameattributes = None
            compatible = [(X.shape[1], ), (1, X.shape[1])]
            for name, con in ml.items():
                if con.shape in compatible:
                    nameattributes = name
                    break
            if nameattributes:
                attributenames = ml.pop(nameattributes).ravel()
            else:
                attributenames = range(X.shape[1])
            # strip because of numpy char array
            attributes = [ContinuousVariable(name=str(a).strip())
                          for a in attributenames]

        # Number of rows that a meta array has to match.
        sizemetas = None
        if X is None:
            counts = defaultdict(list)
            for name, con in ml.items():
                counts[len(con)].append(name)
            if counts:
                sizemetas = max(counts.keys(), key=lambda x: len(counts[x]))
        else:
            sizemetas = len(X)

        metas = []
        if sizemetas:
            metas = [name for name, con in ml.items()
                     if len(con) == sizemetas]

        metaattributes = []
        metadata = []
        for m in sorted(metas):
            f = ml[m]
            metaattributes.append(StringVariable(m))
            # NOTE(review): in-place resize forces a single column; for
            # arrays with more columns this presumably drops the extra
            # elements - confirm this is intended.
            f.resize(sizemetas, 1)
            metadata.append(f)

        # np.hstack raises ValueError on an empty sequence, so a file without
        # any array matching the row count gets no metas instead of failing.
        metadata = np.hstack(metadata) if metadata else None

        domain = Domain(attributes, metas=metaattributes)
        if X is None:
            # sizemetas is None when the file contains no arrays at all.
            X = np.zeros((sizemetas or 0, 0))
        return Orange.data.Table.from_numpy(domain,
                                            X,
                                            Y=None,
                                            metas=metadata)
Пример #2
0
class TestInstance(unittest.TestCase):
    # Tests for constructing, indexing, printing and comparing Instance.

    # Shared fixtures: names of ten features and one class, three meta vars.
    attributes = ["Feature %i" % i for i in range(10)]
    class_vars = ["Class %i" % i for i in range(1)]
    metas = [DiscreteVariable("Meta 1", values="XYZ"),
             ContinuousVariable("Meta 2"),
             StringVariable("Meta 3")]

    def mock_domain(self, with_classes=False, with_metas=False):
        # Return a MagicMock of Domain exposing the shared fixtures;
        # classes and metas are included only on request.
        attributes = self.attributes
        class_vars = self.class_vars if with_classes else []
        metas = self.metas if with_metas else []
        variables = attributes + class_vars
        return MagicMock(Domain,
                         attributes=attributes,
                         class_vars=class_vars,
                         metas=metas,
                         variables=variables)

    def create_domain(self, attributes=(), classes=(), metas=()):
        # Build a real Domain. Strings become ContinuousVariable (attributes
        # and classes) or a five-valued DiscreteVariable (metas); anything
        # else is used as given.
        attr_vars = [ContinuousVariable(name=a) if isinstance(a, str) else a
                     for a in attributes]
        class_vars = [ContinuousVariable(name=c) if isinstance(c, str) else c
                      for c in classes]
        # A list rather than a map object: a map can be consumed only once.
        meta_vars = [DiscreteVariable(name=m,
                                      values=[str(i) for i in range(5)])
                     if isinstance(m, str) else m
                     for m in metas]
        domain = Domain(attr_vars, class_vars, meta_vars)
        return domain

    def test_init_x_no_data(self):
        domain = self.mock_domain()
        inst = Instance(domain)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape, (len(self.attributes), ))
        self.assertEqual(inst._x.shape, (len(self.attributes), ))
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))
        self.assertTrue(all(isnan(x) for x in inst._values))
        self.assertTrue(all(isnan(x) for x in inst._x))

    def test_init_xy_no_data(self):
        domain = self.mock_domain(with_classes=True)
        inst = Instance(domain)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape,
                         (len(self.attributes) + len(self.class_vars), ))
        self.assertEqual(inst._x.shape, (len(self.attributes), ))
        self.assertEqual(inst._y.shape, (len(self.class_vars), ))
        self.assertEqual(inst._metas.shape, (0, ))
        self.assertTrue(all(isnan(x) for x in inst._values))
        self.assertTrue(all(isnan(x) for x in inst._x))
        self.assertTrue(all(isnan(x) for x in inst._y))

    def test_init_xym_no_data(self):
        domain = self.mock_domain(with_classes=True, with_metas=True)
        inst = Instance(domain)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape,
                         (len(self.attributes) + len(self.class_vars), ))
        self.assertEqual(inst._x.shape, (len(self.attributes), ))
        self.assertEqual(inst._y.shape, (len(self.class_vars), ))
        self.assertEqual(inst._metas.shape, (3, ))
        self.assertTrue(all(isnan(x) for x in inst._values))
        self.assertTrue(all(isnan(x) for x in inst._x))
        self.assertTrue(all(isnan(x) for x in inst._y))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", FutureWarning)
            assert_array_equal(inst._metas, np.array([Unknown, Unknown, None]))

    def test_init_x_arr(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")])
        vals = np.array([42, 0])
        inst = Instance(domain, vals)
        assert_array_equal(inst._values, vals)
        assert_array_equal(inst._x, vals)
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))

        domain = self.create_domain()
        inst = Instance(domain, np.empty((0,)))
        self.assertEqual(inst._x.shape, (0, ))
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))

    def test_init_x_list(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")])
        lst = [42, 0]
        vals = np.array(lst)
        # Pass the list itself; the array variant is test_init_x_arr.
        inst = Instance(domain, lst)
        assert_array_equal(inst._values, vals)
        assert_array_equal(inst._x, vals)
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))

        domain = self.create_domain()
        inst = Instance(domain, [])
        self.assertEqual(inst._x.shape, (0, ))
        self.assertEqual(inst._y.shape, (0, ))
        self.assertEqual(inst._metas.shape, (0, ))

    def test_init_xy_arr(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")])
        vals = np.array([42, 0, 1])
        inst = Instance(domain, vals)
        assert_array_equal(inst._values, vals)
        assert_array_equal(inst._x, vals[:2])
        self.assertEqual(inst._y.shape, (1, ))
        self.assertEqual(inst._y[0], 1)
        self.assertEqual(inst._metas.shape, (0, ))

    def test_init_xy_list(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")])
        lst = [42, "M", "C"]
        # Expected numeric encoding of lst: "M" -> 0, "C" -> 2.
        vals = np.array([42, 0, 2])
        # Pass the list itself; the array variant is test_init_xy_arr.
        inst = Instance(domain, lst)
        assert_array_equal(inst._values, vals)
        assert_array_equal(inst._x, vals[:2])
        self.assertEqual(inst._y.shape, (1, ))
        self.assertEqual(inst._y[0], 2)
        self.assertEqual(inst._metas.shape, (0, ))

    def test_init_xym_arr(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = np.array([42, "M", "B", "X", 43, "Foo"], dtype=object)
        inst = Instance(domain, vals)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape, (3, ))
        self.assertEqual(inst._x.shape, (2, ))
        self.assertEqual(inst._y.shape, (1, ))
        self.assertEqual(inst._metas.shape, (3, ))
        assert_array_equal(inst._values, np.array([42, 0, 1]))
        assert_array_equal(inst._x, np.array([42, 0]))
        self.assertEqual(inst._y[0], 1)
        assert_array_equal(inst._metas, np.array([0, 43, "Foo"], dtype=object))

    def test_init_xym_list(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)
        self.assertIsInstance(inst, Instance)
        self.assertIs(inst.domain, domain)
        self.assertEqual(inst._values.shape, (3, ))
        self.assertEqual(inst._x.shape, (2, ))
        self.assertEqual(inst._y.shape, (1, ))
        self.assertEqual(inst._metas.shape, (3, ))
        assert_array_equal(inst._values, np.array([42, 0, 1]))
        assert_array_equal(inst._x, np.array([42, 0]))
        self.assertEqual(inst._y[0], 1)
        assert_array_equal(inst._metas, np.array([0, 43, "Foo"], dtype=object))

    def test_init_inst(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)

        # Same domain: values are copied over unchanged.
        inst2 = Instance(domain, inst)
        assert_array_equal(inst2._values, np.array([42, 0, 1]))
        assert_array_equal(inst2._x, np.array([42, 0]))
        self.assertEqual(inst2._y[0], 1)
        assert_array_equal(inst2._metas, np.array([0, 43, "Foo"], dtype=object))

        # Different domain: variables are reshuffled between attributes and
        # metas; "z" and "w" do not exist in the source and become Unknown.
        domain2 = self.create_domain(["z", domain[1], self.metas[1]],
                                     domain.class_vars,
                                     [self.metas[0], "w", domain[0]])
        inst2 = Instance(domain2, inst)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", FutureWarning)
            assert_array_equal(inst2._values, np.array([Unknown, 0, 43, 1]))
            assert_array_equal(inst2._x, np.array([Unknown, 0, 43]))
            self.assertEqual(inst2._y[0], 1)
            assert_array_equal(inst2._metas, np.array([0, Unknown, 42],
                                                      dtype=object))

    def test_get_item(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)

        # Each value is reachable by position, by name and by variable.
        val = inst[0]
        self.assertIsInstance(val, Value)
        self.assertEqual(inst[0], 42)
        self.assertEqual(inst["x"], 42)
        self.assertEqual(inst[domain[0]], 42)

        val = inst[1]
        self.assertIsInstance(val, Value)
        self.assertEqual(inst[1], "M")
        self.assertEqual(inst["g"], "M")
        self.assertEqual(inst[domain[1]], "M")

        val = inst[2]
        self.assertIsInstance(val, Value)
        self.assertEqual(inst[2], "B")
        self.assertEqual(inst["y"], "B")
        self.assertEqual(inst[domain.class_var], "B")

        # Negative indices address metas.
        val = inst[-2]
        self.assertIsInstance(val, Value)
        self.assertEqual(inst[-2], 43)
        self.assertEqual(inst["Meta 2"], 43)
        self.assertEqual(inst[self.metas[1]], 43)

        with self.assertRaises(ValueError):
            inst["asdf"] = 42
        with self.assertRaises(ValueError):
            inst[ContinuousVariable("asdf")] = 42

    def test_set_item(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)

        inst[0] = 43
        self.assertEqual(inst[0], 43)
        inst["x"] = 44
        self.assertEqual(inst[0], 44)
        inst[domain[0]] = 45
        self.assertEqual(inst[0], 45)

        inst[1] = "F"
        self.assertEqual(inst[1], "F")
        inst["g"] = "M"
        self.assertEqual(inst[1], "M")
        with self.assertRaises(ValueError):
            inst[1] = "N"
        with self.assertRaises(ValueError):
            inst["asdf"] = 42

        inst[2] = "C"
        self.assertEqual(inst[2], "C")
        inst["y"] = "A"
        self.assertEqual(inst[2], "A")
        inst[domain.class_var] = "B"
        self.assertEqual(inst[2], "B")

        inst[-1] = "Y"
        self.assertEqual(inst[-1], "Y")
        inst["Meta 1"] = "Z"
        self.assertEqual(inst[-1], "Z")
        inst[domain.metas[0]] = "X"
        self.assertEqual(inst[-1], "X")

    def test_str(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")])
        inst = Instance(domain, [42, 0])
        self.assertEqual(str(inst), "[42.000, M]")

        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")])
        inst = Instance(domain, [42, "M", "B"])
        self.assertEqual(str(inst), "[42.000, M | B]")

        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        inst = Instance(domain, [42, "M", "B", "X", 43, "Foo"])
        self.assertEqual(str(inst), "[42.000, M | B] {X, 43.000, Foo}")

        domain = self.create_domain([],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        inst = Instance(domain, ["B", "X", 43, "Foo"])
        self.assertEqual(str(inst), "[ | B] {X, 43.000, Foo}")

        domain = self.create_domain([],
                                    [],
                                    self.metas)
        inst = Instance(domain, ["X", 43, "Foo"])
        self.assertEqual(str(inst), "[] {X, 43.000, Foo}")

        # Only the first five of the ten attributes are printed.
        domain = self.create_domain(self.attributes)
        inst = Instance(domain, range(len(self.attributes)))
        self.assertEqual(str(inst), "[0.000, 1.000, 2.000, 3.000, 4.000, ...]")

        for attr in domain:
            attr.number_of_decimals = 0
        self.assertEqual(str(inst), "[0, 1, 2, 3, 4, ...]")

    def test_eq(self):
        domain = self.create_domain(["x", DiscreteVariable("g", values="MF")],
                                    [DiscreteVariable("y", values="ABC")],
                                    self.metas)
        vals = [42, "M", "B", "X", 43, "Foo"]
        inst = Instance(domain, vals)
        inst2 = Instance(domain, vals)
        self.assertTrue(inst == inst2)
        self.assertTrue(inst2 == inst)

        # A difference in any single part makes the instances unequal.
        inst2[0] = 43
        self.assertFalse(inst == inst2)

        inst2[0] = Unknown
        self.assertFalse(inst == inst2)

        inst2 = Instance(domain, vals)
        inst2[2] = "C"
        self.assertFalse(inst == inst2)

        inst2 = Instance(domain, vals)
        inst2[-1] = "Y"
        self.assertFalse(inst == inst2)

        inst2 = Instance(domain, vals)
        inst2[-2] = "33"
        self.assertFalse(inst == inst2)

        inst2 = Instance(domain, vals)
        inst2[-3] = "Bar"
        self.assertFalse(inst == inst2)
Пример #3
0
    def test_title_selection_strategy_title_heading(self):
        """
        When there is a title, heading or filename attribute, select it as
        the default title; title is preferred to heading, heading to filename.
        """
        # (names of the three meta variables, name expected as the title)
        scenarios = (
            (("title", "b", "c"), "title"),
            (("Title", "b", "c"), "Title"),
            # when title and heading present first select title
            (("Title", "Heading", "c"), "Title"),
            (("Heading", "Title", "c"), "Title"),
            (("Heading", "Filename", "c"), "Heading"),
        )
        for meta_names, expected in scenarios:
            domain = Domain([],
                            metas=[StringVariable(n) for n in meta_names])
            # The same texts are used in every scenario; only names change.
            texts = [["a" * 100, "a" * 40, "a" * 40],
                     ["b" * 100, "a" * 40, "b" * 30],
                     ["c" * 100, "a" * 40, "b" * 40]]
            data = Table(domain, np.empty((3, 0)), metas=texts)
            self.send_signal(self.widget.Inputs.data, data)
            self.wait_until_finished()
            self.assertEqual(data.domain[expected],
                             self.widget.title_variable)
            self.check_output(expected)
Пример #4
0
# Pickling test cases for DiscreteVariable. Each (name, factory) pair gives a
# zero-argument callable constructing the variable in a different way.
# NOTE(review): presumably create_pickling_tests turns every pair into one
# test of a generated TestCase named by the first argument - confirm against
# its definition.
PickleDiscreteVariable = create_pickling_tests(
    "PickleDiscreteVariable",
    ("with_name", lambda: DiscreteVariable(name="Feature 0")),
    ("with_int_values",
     lambda: DiscreteVariable(name="Feature 0", values=[1, 2, 3])),
    ("with_str_value",
     lambda: DiscreteVariable(name="Feature 0", values=["F", "M"])),
    ("ordered", lambda: DiscreteVariable(
        name="Feature 0", values=["F", "M"], ordered=True)),
    ("with_base_value", lambda: DiscreteVariable(
        name="Feature 0", values=["F", "M"], base_value=0)))

# Same for StringVariable, which is only constructed by name here.
PickleStringVariable = create_pickling_tests(
    "PickleStringVariable",
    ("with_name", lambda: StringVariable(name="Feature 0")))


@variabletest(DiscreteVariable)
class VariableTestMakeProxy(unittest.TestCase):
    def test_make_proxy_disc(self):
        # A proxy, and a proxy made from that proxy, both report the
        # original variable as master, and all three compare equal.
        original = DiscreteVariable("abc", values="abc", ordered=True)
        proxy = original.make_proxy()
        proxy_of_proxy = proxy.make_proxy()

        for var in (original, proxy, proxy_of_proxy):
            self.assertIs(var.master, original)

        pairs = ((original, proxy),
                 (original, proxy_of_proxy),
                 (proxy, proxy_of_proxy))
        for left, right in pairs:
            self.assertEqual(left, right)
Пример #5
0
 def test_string(self):
     # Run the shared checks on a string variable named "S".
     self._test_common(StringVariable("S"))
Пример #6
0
    return [datetime.datetime(year, 1, 1).timestamp() if not np.isnan(year)
            else np.nan for year in years]


# Time variable variations; the years go through _to_timestamps before being
# stored as floats.
time_full = VarDataPair(
    TimeVariable('time_full'),
    np.array(_to_timestamps(list(range(2000, 2005))), dtype=float),
)
time_missing = VarDataPair(
    TimeVariable('time_missing'),
    np.array(_to_timestamps([2000, np.nan, 2001, 2003, 2004]), dtype=float),
)

# String variable variations
string_full = VarDataPair(
    StringVariable('string_full'),
    np.array(list('abcde'), dtype=object),
)
string_missing = VarDataPair(
    StringVariable('string_missing'),
    np.array([*'abc', StringVariable.Unknown, 'e'], dtype=object),
)


def make_table(attributes, target=None, metas=None):
    """Build an instance of a table given various variables.

    Parameters
    ----------
    attributes : Iterable[Tuple[Variable, np.array]
    target : Optional[Iterable[Tuple[Variable, np.array]]
Пример #7
0
    TimeVariable('time_missing'),
    np.array([0, np.nan, 2, 3, 4], dtype=float),
]
# Five-element columns: every value missing / every value the same.
time_all_missing = [
    TimeVariable('time_all_missing'),
    np.full(5, np.nan, dtype=float),
]
time_same = [
    TimeVariable('time_same'),
    np.full(5, 4, dtype=float),
]
time = [time_full, time_missing, time_all_missing, time_same]

# String variable variations
string_full = [
    StringVariable('string_full'),
    np.array(list('abcde'), dtype=object),
]
string_missing = [
    StringVariable('string_missing'),
    np.array([*'abc', StringVariable.Unknown, 'e'], dtype=object),
]
string_all_missing = [
    StringVariable('string_all_missing'),
    np.array([StringVariable.Unknown for _ in range(5)], dtype=object),
]
string_same = [
    StringVariable('string_same'),
    np.array(['a' for _ in range(5)], dtype=object),
]
string = [string_full, string_missing, string_all_missing, string_same]
Пример #8
0
 def setUp(self) -> None:
     # Lookup over string variable "S": "a" -> 0, "b" -> 1, "" -> NaN.
     # NaN is also the first argument given to DictMissingConst.
     source_var = StringVariable("S")
     mapping = DictMissingConst(np.nan, {"": np.nan, "a": 0, "b": 1})
     self.lookup = LookupMappingTransform(source_var, mapping, dtype=float)
Пример #9
0
"""
input: Corpus preprocessed with Preprocess Text. Tokenizer is set to Sentences.
output: Corpus where sentences are now documents.
requires: Text add-on
"""

import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text.corpus import Corpus

tokens = in_data.tokens

# The title variable is the first meta variable carrying a "title" attribute.
title = next((var for var in in_data.domain.metas
              if "title" in var.attributes), None)
if title is None:
    raise ValueError("input corpus has no meta variable marked as title")

# One variable object is shared by the domain and set_text_features below.
sentences = StringVariable('Sentences')
new_domain = Domain(attributes=[], metas=[sentences, title])

titles = []
content = []

# Every token (a sentence) becomes its own row; the row keeps the title of
# the document it came from.
for i, doc in enumerate(tokens):
    doc_title = in_data[i][title.name].value
    for sentence in doc:
        titles.append(doc_title)
        content.append(sentence)

metas = np.column_stack((content, titles))
out_data = Corpus.from_numpy(domain=new_domain, X=np.empty((len(content), 0)),
                             metas=metas)
out_data.set_text_features([sentences])
out_data.set_title_variable(title)
Пример #10
0
    def from_file(cls, filename):
        """
        Load distance matrix from a file

        The file should preferably be encoded in ascii/utf-8. White space at
        the beginning and end of lines is ignored.

        The first line of the file starts with the matrix dimension. It
        can be followed by a list of flags

        - *axis=<number>*: the axis number
        - *symmetric*: the matrix is symmetric; when reading the element (i, j)
          its value is also assigned to (j, i)
        - *asymmetric*: the matrix is asymmetric
        - *row_labels*: the file contains row labels
        - *col_labels*: the file contains column labels

        By default, matrices are symmetric, have axis 1 and no labels are given.
        Flags *labeled* and *labelled* are obsolete aliases for *row_labels*.

        If the file has column labels, they follow in the second line.
        Row labels appear at the beginning of each row.
        Labels are arbitrary strings that cannot contain newlines and
        tabulators. Labels are stored as instances of `Table` with a single
        meta attribute named "label".

        The remaining lines contain tab-separated numbers, preceded with labels,
        if present. Lines are padded with zeros if necessary. If the matrix is
        symmetric, the file contains the lower triangle; any data above the
        diagonal is ignored.

        Args:
            filename: file name

        Returns:
            an instance of `cls` constructed from the matrix, the row labels,
            the column labels and the axis

        Raises:
            ValueError: if the file is empty, does not begin with the
                dimension, contains an unknown flag, has a mismatching number
                of column labels, too many rows or columns, or an element
                that is not a number
        """
        with open(filename, encoding=detect_encoding(filename)) as fle:
            line = fle.readline()
            if not line:
                raise ValueError("empty file")
            data = line.strip().split()
            # A blank first line yields no tokens at all; report it like a
            # missing dimension instead of failing with IndexError.
            if not data or not data[0].isdigit():
                raise ValueError("distance file must begin with dimension")
            n = int(data.pop(0))
            symmetric = True
            axis = 1
            # None means "no labels in the file"; a list collects them.
            col_labels = row_labels = None
            for flag in data:
                if flag in ("labelled", "labeled", "row_labels"):
                    row_labels = []
                elif flag == "col_labels":
                    col_labels = []
                elif flag == "symmetric":
                    symmetric = True
                elif flag == "asymmetric":
                    symmetric = False
                else:
                    flag_data = flag.split("=")
                    if len(flag_data) == 2:
                        name, value = map(str.strip, flag_data)
                    else:
                        name, value = "", None
                    if name == "axis" and value.isdigit():
                        axis = int(value)
                    else:
                        raise ValueError("invalid flag '{}'".format(flag))
            if col_labels is not None:
                col_labels = [
                    x.strip() for x in fle.readline().strip().split("\t")
                ]
                if len(col_labels) != n:
                    raise ValueError("mismatching number of column labels")

            # Rows and cells missing from the file stay zero.
            matrix = np.zeros((n, n))
            for i, line in enumerate(fle):
                if i >= n:
                    raise ValueError("too many rows")
                line = line.strip().split("\t")
                if row_labels is not None:
                    row_labels.append(line.pop(0).strip())
                if len(line) > n:
                    raise ValueError(
                        "too many columns in matrix row {}".format(
                            "'{}'".format(row_labels[i]) if row_labels else i +
                            1))
                # For symmetric matrices only the lower triangle is read.
                for j, e in enumerate(line[:i + 1 if symmetric else n]):
                    try:
                        matrix[i, j] = float(e)
                    except ValueError as exc:
                        raise ValueError(
                            "invalid element at row {}, column {}".format(
                                "'{}'".format(row_labels[i]) if row_labels else
                                i + 1, "'{}'".format(col_labels[j])
                                if col_labels else j + 1)) from exc
                    if symmetric:
                        matrix[j, i] = matrix[i, j]
        if col_labels:
            col_labels = Table.from_list(
                Domain([], metas=[StringVariable("label")]),
                [[item] for item in col_labels])
        if row_labels:
            row_labels = Table.from_list(
                Domain([], metas=[StringVariable("label")]),
                [[item] for item in row_labels])
        return cls(matrix, row_labels, col_labels, axis)
# FDR threshold for selecting gene sets
FDR = 0.25

# Load the meta columns of the input table into a data frame and keep only
# the gene sets whose FDR does not exceed the threshold.
columns = [column.name for column in in_data.domain.metas]
data = pd.DataFrame(in_data.metas, columns=columns)
data = data.loc[data['FDR'] <= FDR, :]

# Build a table of gene sets (rows) and genes (columns);
# a cell is 1 when the gene belongs to the gene set.
gene_enrichment = pd.DataFrame()
for _, gene_set_data in data.iterrows():
    gene_set = gene_set_data['GO Term Name']
    for gene in gene_set_data['Genes'].split(','):
        gene_enrichment.loc[gene_set, gene] = 1

# Genes that are not in a gene set get 0 instead of NA
gene_enrichment = gene_enrichment.fillna(0)

# Orange table: one continuous column per gene, the gene set name as a meta
domain_columns = [ContinuousVariable(name=col)
                  for col in gene_enrichment.columns]
meta_columns = [StringVariable(name='Gene set')]
out_data = Table.from_numpy(
    domain=Domain(domain_columns, metas=meta_columns),
    X=gene_enrichment.to_numpy(),
    metas=pd.DataFrame(gene_enrichment.index).to_numpy(),
)
Пример #12
0
 def network2tables(self):
     """Build ``self.vertices`` and ``self.edges`` tables from ``self.network``.

     Vertices: when ``network.nodes`` is already a ``Table`` it is reused,
     with a row-number ``id`` attribute prepended if no attribute is named
     ``"id"``. Otherwise ``network.nodes`` is treated as an array of labels
     and turned into a table with an ``id`` attribute (plus ``x``/``y`` when
     ``network.coordinates`` is set) and a ``name`` meta.

     Edges: every stored entry of every edge matrix becomes one row with the
     columns ``source``, ``target``, ``weight`` and ``isDirected``.
     """
     network = self.network

     # ---- vertices data table ----
     nodes = network.nodes
     if isinstance(nodes, Table):
         node_attrs = nodes.domain.attributes
         if len(node_attrs) == 0:
             # no attribute column at all, so the id column is the only one
             id_column = np.array(range(nodes.metas.shape[0]))
             id_column = id_column.reshape(len(id_column), 1)
             id_domain = Domain([ContinuousVariable("id")],
                                nodes.domain.class_vars, nodes.domain.metas)
             self.vertices = Table.from_numpy(
                 id_domain, id_column, nodes.Y, nodes.metas, nodes.W)
             self.Information.inform(
                 "No attribute column of the vertices table, an id column is added.")
         elif any(attr.name == "id" for attr in node_attrs):
             # an id column exists already: use the table unchanged
             self.vertices = nodes
         else:
             # attributes exist but none is called "id": prepend one
             row_numbers = np.array(range(nodes.X.shape[0]))
             values = np.insert(nodes.X, 0, values=row_numbers, axis=1)
             id_domain = Domain(
                 [ContinuousVariable("id")] + list(node_attrs),
                 nodes.domain.class_vars, nodes.domain.metas)
             self.vertices = Table.from_numpy(
                 id_domain, values, nodes.Y, nodes.metas, nodes.W)
             self.Information.inform(
                 "No id column of the vertices table, an id column is added.")
     else:
         # nodes is an array of labels: labels go to a "name" meta column
         labels = nodes.reshape(len(nodes), 1)
         ids = np.array(range(len(labels)))
         name_meta = [StringVariable("name")]
         if network.coordinates is None:
             ids = ids.reshape(len(ids), 1)
             label_domain = Domain([ContinuousVariable("id")], None, name_meta)
             vertices = Table.from_numpy(label_domain, ids, None, labels, None)
         else:
             coords = network.coordinates
             values = np.array([ids, coords[:, 0], coords[:, 1]]).T
             label_domain = Domain(
                 [ContinuousVariable("id"), ContinuousVariable("x"),
                  ContinuousVariable("y")], None, name_meta)
             vertices = Table.from_numpy(label_domain, values, None, labels, None)

         self.vertices = vertices
         self.Information.inform(
             "Label array to vertices table, an id column is added.")

     # ---- edges data table, read row by row from the sparse matrices ----
     sources, targets, weights = [], [], []
     any_directed = 0
     for edge in network.edges:
         adjacency = edge.edges
         if edge.directed:
             any_directed = 1
         for row_index in range(adjacency.shape[0]):
             row = adjacency[row_index].tocoo()
             weights.extend(row.data.tolist())
             sources.extend([row_index] * row.nnz)
             # a single-row slice has row indices 0, so r + c is the column
             targets.extend(r + c for r, c in zip(row.row, row.col))
     directed = np.array([any_directed] * len(sources))
     edge_values = np.array([sources, targets, weights, directed]).T
     edge_domain = Domain(
         [ContinuousVariable("source"), ContinuousVariable("target"),
          ContinuousVariable("weight"), ContinuousVariable("isDirected")],
         None, None)
     self.edges = Table.from_numpy(edge_domain, edge_values, None, None, None)
Пример #13
0
                self.set_selected_words()
            elif len(self.word_list_library) > self.word_list_index and \
                self.word_list_library[self.word_list_index] != self.words:
                self.commit()

    def _save_state(self):
        self.word_list_library = [s.as_dict() for s in self.library_model]
        self.words = self.words_model[:]

    def send_report(self):
        """Report the selected library, input-related settings and the words.

        The "Word variable" and "Update" settings, as well as the input data,
        are reported only when input words are present.
        """
        if self.library_model:
            library = self.library_model[self.word_list_index].name
        else:
            library = "/"
        settings = [("Library", library)]

        if self.__input_words:
            self.report_data("Input Words", self.__input_words)
            settings.append(("Word variable", self.words_var))
            settings.append(
                ("Update", UpdateRules.ITEMS[self.update_rule_index]))

        self.report_items("Settings", settings)
        words_text = ", ".join(self.words_model[:])
        self.report_paragraph("Words", words_text)


if __name__ == "__main__":
    from Orange.widgets.utils.widgetpreview import WidgetPreview

    # Sample table with two string metas; it is only passed to the widget
    # when the alternative call below is enabled.
    input_table = Table.from_list(
        Domain([], metas=[StringVariable("S1"), StringVariable("S2")]),
        [["foo", "A"], ["bar", "B"], ["foobar", "C"]],
    )
    # WidgetPreview(OWWordList).run(set_words=input_table)
    WidgetPreview(OWWordList).run()
    def to_data_table(self,
                      selected_genes: Optional[List[str]] = None) -> Table:
        """ Export the matched genes as a metas-only Orange data table.

        Each gene becomes one row of string metas. When a list of genes
        (Entrez Ids) is given, only genes whose ``gene_id`` (as a string) is
        in that list are exported.

        Parameters
        ----------
        selected_genes: list
            List of Entrez Ids

        Returns
        -------
        Orange.data.Table
            Summary of Gene info in tabular format
        """
        column_names = (
            'Input gene ID',
            ENTREZ_ID,
            'Symbol',
            'Synonyms',
            'Description',
            'Other IDs',
            'Type of gene',
            'Chromosome',
            'Map location',
            'Locus tag',
            'Symbol from nomenclature authority',
            'Full name from nomenclature authority',
            'Nomenclature status',
            'Other designations',
            'Species',
            'Taxonomy ID',
        )
        domain = Domain([], metas=[StringVariable(name) for name in column_names])

        genes: List[Gene] = self.genes
        if selected_genes is not None:
            wanted_ids = set(selected_genes)
            genes = [
                candidate for candidate in self.genes
                if str(candidate.gene_id) in wanted_ids
            ]

        def as_row(gene):
            # Values are listed in the same order as ``column_names``.
            if gene.db_refs:
                db_refs = ', '.join(
                    '{}: {}'.format(key, val)
                    for key, val in gene.db_refs.items())
            else:
                db_refs = ''
            synonyms = ', '.join(gene.synonyms) if gene.synonyms else ''
            return [
                gene.input_identifier,
                gene.gene_id,
                gene.symbol,
                synonyms,
                gene.description,
                db_refs,
                gene.type_of_gene,
                gene.chromosome,
                gene.map_location,
                gene.locus_tag,
                gene.symbol_from_nomenclature_authority,
                gene.full_name_from_nomenclature_authority,
                gene.nomenclature_status,
                gene.other_designations,
                species_name_to_taxid(gene.species),
                gene.tax_id,
            ]

        data_x = [as_row(gene) for gene in genes]

        table = Table(domain, data_x)
        table.name = 'Gene Matcher Results'
        table.attributes[TableAnnotation.tax_id] = self.tax_id
        table.attributes[TableAnnotation.gene_as_attr_name] = False
        table.attributes[TableAnnotation.gene_id_column] = ENTREZ_ID
        return table
Пример #15
0
    def _to_orange_data_table(self,
                              report_genes=True,
                              merge_function=spots_mean,
                              sample_type=None,
                              transpose=False):
        """ Convert parsed GEO format to orange, save by genes or by spots.

        :param report_genes: if true, rows/columns are ``self.genes`` and the
            values of all spots of a gene (``self.gene2spots``) are combined
            with ``merge_function``; otherwise ``self.spots`` are used as is.
        :param merge_function: callable applied to the collection of values
            of one gene's spots for a single sample.
        :param sample_type: passed to ``self._sample_to_class`` (transposed
            layout) or ``self._sample_annotations`` (gene layout); also used
            as the class variable name in the transposed layout.
        :param transpose: if true, samples are rows and genes/spots are
            attributes; otherwise genes/spots are rows and samples are
            attributes.
        :return: the result of ``create_table`` for the constructed domain.
        """
        if transpose:  # samples in rows
            sample2class = self._sample_to_class(sample_type)
            cvalues = sorted(set(sample2class.values()))
            if None in cvalues:
                cvalues.remove(None)

            samp_ann = self._sample_annotations()

            # annotation name -> set of all values seen for it over samples
            ad = defaultdict(set)
            for d in samp_ann.values():
                for n, v in d.items():
                    ad[n].add(v)

            # auto-select sample type if there is only one
            # NOTE(review): sample2class above was already computed with the
            # original sample_type, before this auto-selection - confirm
            # that this order is intended.
            if len(ad) == 1:
                sample_type = list(ad.keys())[0]

            classvar = DiscreteVariable(name=sample_type or "class",
                                        values=cvalues)
            spots = self.genes if report_genes else self.spots
            atts = [ContinuousVariable(name=gene) for gene in spots]

            # every annotation other than the chosen sample type is a meta
            metasvar = [
                DiscreteVariable(name=n, values=sorted(values))
                for n, values in ad.items() if n != sample_type
            ]

            X = []
            Y = []
            metas = []
            for (i, sampleid) in enumerate(self.info["samples"]):
                # value of each gene (merged over its spots) or of each spot
                # for the i-th sample
                vals = [((merge_function([
                    self.gds_data[spot].data[i]
                    for spot in self.gene2spots[gene]
                ])) if report_genes else self.gds_data[gene].data[i])
                        for gene in spots]
                X.append(vals)
                Y.append(sample2class.get(sampleid, None))
                metas.append([
                    samp_ann[sampleid].get(n, None) for n, _ in ad.items()
                    if n != sample_type
                ])

            domain = Domain(atts, classvar, metas=metasvar)
            return create_table(domain, X, Y, metas)
        else:  # genes in rows
            annotations = self._sample_annotations(sample_type)
            atts = [ContinuousVariable(name=ss) for ss in self.info["samples"]]
            # attach each sample's annotations to its attribute variable
            for i, a in enumerate(atts):
                setattr(a, "attributes", annotations[self.info["samples"][i]])

            geneatname = "gene" if report_genes else "spot"
            metasvar = [StringVariable(geneatname)]
            nameval = self.genes if report_genes else self.spots

            X = []
            metas = []
            for g in nameval:
                if report_genes:
                    # merge the spots of gene g sample by sample: map() walks
                    # the spots' data sequences in parallel and hands each
                    # tuple of per-spot values to merge_function
                    X.append(
                        list(
                            map(
                                lambda *x: merge_function(x), *[
                                    self.gds_data[spot].data
                                    for spot in self.gene2spots[g]
                                ])))
                else:
                    X.append(self.gds_data[g].data)
            # one meta per row: the gene (or spot) name
            metas = [[a] for a in nameval]
            domain = Domain(atts, [], metas=metasvar)
            return create_table(domain, X, None, metas)
Пример #16
0
    def to_data_table(self, selected_genes=None):
        """Export the matched genes as a metas-only Orange data table.

        NCBI info is loaded for every exported gene (``load_ncbi_info``) and
        each gene becomes one row of metas, including its match status.

        :param selected_genes: optional collection of NCBI ids (strings);
            when given, only genes whose ``ncbi_id`` (as a string) is in it
            are exported.
        :return: table named 'Gene Matcher Results'. Its ``tax_id``
            annotation is one of the non-``None`` taxonomy ids found among
            the exported genes, or ``None`` when there is none.
        """
        tax_ids = set()
        data_x = []
        metas = [
            StringVariable('Input gene ID'),
            DiscreteVariable('Match result',
                             values=['Matched', 'Match Conflict',
                                     'Unmatched']),
            StringVariable(NCBI_ID),
            StringVariable('Symbol'),
            StringVariable('Synonyms'),
            StringVariable('Description'),
            StringVariable('Other IDs'),
            StringVariable('Type of gene'),
            StringVariable('Chromosome'),
            StringVariable('Map location'),
            StringVariable('Locus tag'),
            StringVariable('Symbol from nomenclature authority'),
            StringVariable('Full name from nomenclature authority'),
            StringVariable('Nomenclature status'),
            StringVariable('Other designations'),
            StringVariable('Taxonomy ID'),
        ]
        domain = Domain([], metas=metas)

        genes = self.genes
        if selected_genes is not None:
            # a set gives constant-time membership tests in the filter below
            selected = set(selected_genes)
            genes = [
                gene for gene in self.genes if str(gene.ncbi_id) in selected
            ]

        for gene in genes:
            gene.load_ncbi_info()
            tax_ids.add(gene.tax_id)
            match_status = self.gene_match_status(gene)

            db_refs = ', '.join(
                '{}: {}'.format(key, val)
                for (key, val) in gene.db_refs.items()) if gene.db_refs else ''
            synonyms = ', '.join(gene.synonyms) if gene.synonyms else ''

            line = [
                gene.input_name, match_status, gene.ncbi_id, gene.symbol,
                synonyms, gene.description, db_refs, gene.type_of_gene,
                gene.chromosome, gene.map_location, gene.locus_tag,
                gene.symbol_from_nomenclature_authority,
                gene.full_name_from_nomenclature_authority,
                gene.nomenclature_status, gene.other_designations, gene.tax_id
            ]

            data_x.append(line)

        # Explicit ``is not None`` test instead of ``filter(None.__ne__, ...)``,
        # which depends on the truthiness of NotImplemented; the default
        # avoids StopIteration when no gene carries a taxonomy id.
        tax_id = next((tid for tid in tax_ids if tid is not None), None)

        table = Table(domain, data_x)
        table.name = 'Gene Matcher Results'
        table.attributes[OrangeTableAnnotations.tax_id] = tax_id
        table.attributes[OrangeTableAnnotations.gene_as_attribute_name] = False
        table.attributes[OrangeTableAnnotations.gene_id_column] = NCBI_ID
        return table
Пример #17
0
    def generateGraph(self, N_changed=False):
        """Rebuild ``self.graph`` from ``self.matrix`` and refresh the info.

        Entries of the matrix that are ``<= self.epsilon`` become edges,
        weighted by ``max(selected) - value``. Nothing is built when there is
        no matrix, or when the edge estimate taken from the histogram exceeds
        200000 (an error is shown instead). Signals are sent in every case.

        :param N_changed: if true, ``self.node_selection`` is reset to
            ``NodeSelection.COMPONENTS`` first.
        """
        self.Error.clear()
        self.Warning.clear()

        if N_changed:
            self.node_selection = NodeSelection.COMPONENTS

        if self.matrix is None:
            if hasattr(self, "infoa"):
                self.infoa.setText("No data loaded.")
            if hasattr(self, "infob"):
                self.infob.setText("")
            if hasattr(self, "infoc"):
                self.infoc.setText("")
            self.pconnected = 0
            self.nedges = 0
            self.graph = None
            self.sendSignals()
            return

        nEdgesEstimate = 2 * sum(
            y for x, y in zip(self.histogram.xData, self.histogram.yData)
            if x <= self.epsilon)

        if nEdgesEstimate > 200000:
            self.graph = None
            self.Error.number_of_edges(nEdgesEstimate)
        else:
            items = None
            matrix = self.matrix
            if matrix is not None and matrix.row_items is not None:
                row_items = self.matrix.row_items
                if isinstance(row_items, Table):
                    if self.matrix.axis == 1:
                        items = row_items
                    else:
                        items = [[v.name] for v in row_items.domain.attributes]
                else:
                    items = [[str(x)] for x in self.matrix.row_items]
            # items stays None when the matrix has no row items; len(None)
            # would raise TypeError, so only check the length of real items
            if items is not None and len(items) != self.matrix.shape[0]:
                self.Warning.invalid_number_of_items()
                items = None
            if items is None:
                items = list(range(self.matrix.shape[0]))
            if not isinstance(items, Table):
                items = Table(Domain([], metas=[StringVariable('label')]),
                              items)

            mask = self.matrix <= self.epsilon
            weights = matrix[mask]
            if weights.size:
                # smaller distance -> larger edge weight
                weights = np.max(weights) - weights
            edges = sp.csr_matrix((weights, mask.nonzero()))
            self.graph = Network(items, edges)

        self.graph_matrix = self.matrix

        if self.graph is None:
            self.pconnected = 0
            self.nedges = 0
        else:
            self.pconnected = self.graph.number_of_nodes()
            self.nedges = self.graph.number_of_edges()

        n_items = self.matrix.shape[0]
        if hasattr(self, "infoa"):
            self.infoa.setText("Data items on input: %d" % n_items)
        if hasattr(self, "infob"):
            # guard against a matrix with no rows (division by zero)
            percent = self.pconnected / float(n_items) * 100 if n_items else 0
            self.infob.setText("Network nodes: %d (%3.1f%%)" %
                               (self.pconnected, percent))
        if hasattr(self, "infoc"):
            self.infoc.setText(
                "Network edges: %d (%.2f edges/node)" %
                (self.nedges, self.nedges /
                 float(self.pconnected) if self.pconnected else 0))

        self.Warning.large_number_of_nodes.clear()
        if self.pconnected > 1000 or self.nedges > 2000:
            self.Warning.large_number_of_nodes()

        self.sendSignals()
        self.histogram.setRegion(0, self.epsilon)
Пример #18
0
)
from Orange.widgets.widget import Output

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.import_documents import ImportDocuments, \
    NoDocumentsException

try:
    from orangecanvas.preview.previewbrowser import TextLabel
except ImportError:
    from Orange.canvas.preview.previewbrowser import TextLabel


# Domain for the "skipped" output: no attributes, two string metas
# ("name" and "path").
# NOTE(review): the original note speaks of skipped *images*, while this module
# imports document-import helpers - confirm what kind of item is listed here.
SKIPPED_DOMAIN = Domain([], metas=[
    StringVariable("name"),
    StringVariable("path")
])


def prettifypath(path):
    """Return `path` with the user's home directory prefix shown as ``~``.

    Paths that do not start with the expanded ``~/`` are returned unchanged.
    The prefix comparison is case sensitive.
    """
    home_prefix = os.path.expanduser("~/")
    if not path.startswith(home_prefix):  # case sensitivity!
        return path
    return os.path.join("~", os.path.relpath(path, home_prefix))


log = logging.getLogger(__name__)


class RuntimeEvent(QEvent):
def read_pajek(path, encoding='UTF-8', project=False, auto_table=False):
    """Reimplemented method for reading Pajek files; written in
    C++ for maximum performance.

    :param path: File or file name to read.
    :type path: string

    :param encoding: Encoding of input text file, default 'UTF-8'. It is
        used when the vertex section is re-read here to build the items
        table.
    :type encoding: string

    :param project: Determines whether the input file is a Pajek project file,
        possibly containing multiple networks and other data. If :obj:`True`,
        a list of networks is returned instead of just a network. Default is
        :obj:`False`. (Not used by the code of this function itself.)
    :type project: boolean.

    :param auto_table: If :obj:`True` and a vertex table could be built
        (labels and, when numeric, x/y coordinates), it is set as the items
        of the network.
    :type auto_table: boolean.

    Return the network (or a list of networks if project=:obj:`True`) of type
    :obj:`Orange.network.Graph` or :obj:`Orange.network.DiGraph`.

    Raises :obj:`PajekBug` if the number of rows in the vertex table differs
    from the number of nodes in the network.


    Examples

    >>> G = orangecontrib.network.nx.path_graph(4)
    >>> orangecontrib.network.readwrite.write_pajek(G, "test.net")
    >>> G = orangecontrib.network.readwrite.read_pajek("test.net")

    To create a Graph instead of a MultiGraph use

    >>> G1 = orangecontrib.network.Graph(G)

    References

    See http://vlado.fmf.uni-lj.si/pub/networks/pajek/doc/draweps.htm
    for format information.
    """
    path = _check_network_dir(path)
    G = _wrap(rwpajek.read_pajek(path))

    # Additionally read values into Table; needed to get G nodes properly sorted
    # (Consult OWNxFile.readDataFile(), orangeom.GraphLayout.readPajek(), and the Pajek format spec)
    import shlex
    import numpy as np
    rows, metas, remapping = [], [], {}
    # honour the caller's encoding instead of a hard-coded one
    with open(path, encoding=encoding) as f:
        for line in f:
            if line.lower().startswith('*vertices'):
                nvertices = int(line.split()[1])
                break
        # Read vertices lines
        for line in f:
            parts = shlex.split(line)[:4]
            if not parts:
                # blank line: nothing to parse; do not reuse values of the
                # previous vertex
                continue
            if len(parts) == 4:
                i, label, x, y = parts
                # The format specification was never set in stone, it seems
                try:
                    x, y = float(x), float(y)
                except ValueError:
                    metas.append((label, x, y))
                else:
                    rows.append((x, y))
                    metas.append((label, ))
            elif len(parts) >= 2:
                # id and label (a lone third field is ignored)
                i, label = parts[:2]
                metas.append((label, ))
            else:
                i = label = parts[0]
            i = int(i) - 1  # -1 because pajek is 1-indexed
            remapping[label] = i
            nvertices -= 1
            if not nvertices:
                break
    from Orange.data import Domain, ContinuousVariable, StringVariable
    # Construct x-y-label table (added in OWNxFile.readDataFile())
    table = None
    attr_vars = [ContinuousVariable('x'),
                 ContinuousVariable('y')] if rows else []
    meta_vars = [
        StringVariable('label ' + str(i))
        for i in range(len(metas[0]) if metas else 0)
    ]
    if rows or metas:
        domain = Domain(attr_vars, metas=meta_vars)
        table = Table.from_numpy(domain,
                                 np.array(rows, dtype=float).reshape(
                                     len(metas),
                                     len(rows[0]) if rows else 0),
                                 metas=np.array(metas, dtype=str))
    if table is not None and auto_table:
        G.set_items(table)
    # Relabel nodes to integers, sorted by appearance
    for node in G.node:
        G.node[node]['label'] = node
    nx.relabel_nodes(G, remapping, copy=False)
    if table is not None and len(table) != G.number_of_nodes():
        raise PajekBug(
            "There is a bug in your version of NetworkX reading Pajek files. "
            "Please update your NetworkX installation.")
    return G
Пример #20
0
    pass


def coordinates(tweet, _, __, dim):
    """Return component `dim` of the tweet's coordinates, or None.

    None is returned when ``tweet.geo`` is empty or has no (non-empty)
    "coordinates" entry. The second and third arguments are ignored.
    """
    geo = tweet.geo
    if not geo:
        return None
    coord = geo.get("coordinates", None)
    if not coord:
        return None
    return coord["coordinates"][dim]


def country_code(tweet, _, places):
    """Return the country code of the tweet's place, or "" if it has none.

    The place id is read from ``tweet.geo["place_id"]`` and looked up in
    `places`. The second argument is ignored.
    """
    geo = tweet.geo
    place_id = geo.get("place_id", None) if geo else None
    if not place_id:
        return ""
    return places[place_id].country_code


tv = TimeVariable("Date")
METAS = [
    (StringVariable("Content"), lambda doc, _, __: doc.text),
    (
        DiscreteVariable("Author"),
        lambda doc, users, _: "@" + users[doc.author_id].username,
    ),
    (tv, lambda doc, _, __: tv.parse(doc.created_at.isoformat())),
    (DiscreteVariable("Language"), lambda doc, _, __: doc.lang),
    (DiscreteVariable("Location"), country_code),
    (
        ContinuousVariable("Number of Likes", number_of_decimals=0),
        lambda doc, _, __: doc.public_metrics["like_count"],
    ),
    (
        ContinuousVariable("Number of Retweets", number_of_decimals=0),
        lambda doc, _, __: doc.public_metrics["retweet_count"],
    ),
Пример #21
0
    def test_match_attr_name(self):
        """Selecting a variable in the data combo may preselect the extra combo.

        Each scenario sets the extra combo, then the data combo, and checks
        which index the extra combo shows afterwards.
        """
        widget = self.widget
        first_row = widget.attr_boxes.rows[0]
        data_combo = first_row.left_combo
        extra_combo = first_row.right_combo

        domainA = Domain(
            [DiscreteVariable("dA1", ("a", "b", "c", "d")),
             DiscreteVariable("dA2", ("aa", "bb")),
             DiscreteVariable("dA3", ("aa", "bb"))],
            DiscreteVariable("cls", ("aaa", "bbb", "ccc")),
            [DiscreteVariable("mA1", ("cc", "dd")),
             StringVariable("mA2")])
        XA = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 0], [3, 1, 0]])
        yA = np.array([0, 1, 2, np.nan])
        metasA = np.array(
            [[0.0, "m1"], [1.0, "m2"], [np.nan, "m3"], [0.0, "m4"]]
        ).astype(object)

        domainB = Domain(
            [DiscreteVariable("dB1", values=("a", "b", "c")),
             ContinuousVariable("dA2")],
            None,
            [StringVariable("cls"),
             DiscreteVariable("dA1", ("m4", "m5"))])
        XB = np.array([[0, 0], [1, 1], [2, np.nan]])
        yB = np.empty((3, 0))
        metasB = np.array([[np.nan, np.nan], [1, 1], [0, 0]]).astype(object)

        dataA = Table(domainA, XA, yA, metasA)
        dataA.name = 'dataA'
        dataA.attributes = 'dataA attributes'
        dataB = Table(domainB, XB, yB, metasB)
        dataB.name = 'dataB'
        dataB.attributes = 'dataB attributes'

        self.send_signal(widget.Inputs.data, dataA)
        self.send_signal(widget.Inputs.extra_data, dataB)

        def choose(combo, index):
            # set the index and emit `activated` with the same index
            combo.setCurrentIndex(index)
            combo.activated.emit(index)

        # (extra combo index, data combo index, expected extra combo index)
        scenarios = (
            # match variable if available and the other combo is Row Index
            (0, 2, 5),
            # match variable if available and the other combo is ID
            (1, 2, 5),
            # don't match variable if other combo is set
            (4, 2, 4),
            # don't match if nothing to match to
            (0, 4, 0),
            # don't match numeric with non-numeric
            (0, 3, 0),
            # allow matching string with discrete
            (0, 5, 4),
        )
        for extra_index, data_index, expected in scenarios:
            choose(extra_combo, extra_index)
            choose(data_combo, data_index)
            self.assertEqual(extra_combo.currentIndex(), expected)
Пример #22
0
 def coefficients(self) -> Table:
     """Return the parameters as a "coef" column with their names as metas."""
     domain = Domain([ContinuousVariable("coef")],
                     metas=[StringVariable("name")])
     # [:, None] turns each 1-d sequence into a single-column 2-d array
     values = self.__parameters[:, None]
     names = np.array(self.__parameters_names)[:, None]
     return Table(domain, values, metas=names)
Пример #23
0
 def test_val(self):
     """Check value conversion and string forms of a StringVariable."""
     var = StringVariable("a")
     # None converts to the empty string ...
     self.assertEqual(var.to_val(None), "")
     # ... and the empty string is shown as "?"
     self.assertEqual(var.str_val(""), "?")
     self.assertEqual(var.str_val(Value(var, "")), "?")
     # repr_val wraps the text in double quotes
     self.assertEqual(var.repr_val(Value(var, "foo")), '"foo"')
Пример #24
0
            annotated = create_annotated_table(self.corpus, self.selection)
        self.Outputs.matching_docs.send(matched)
        self.Outputs.other_docs.send(other)
        self.Outputs.corpus.send(annotated)

    def send_report(self):
        """Report the corpus and, when words are set, the words and the list.

        Nothing is reported when there is no (non-empty) corpus.
        """
        corpus = self.corpus
        if not corpus:
            return
        self.report_data("Corpus", corpus)

        words = self.words
        if words is None:
            return
        self.report_paragraph("Words", ", ".join(words))
        self.report_table(self._list_view, num_format="{:.3f}")

    def copy_to_clipboard(self):
        """Put the text currently selected in the web view on the clipboard."""
        QApplication.clipboard().setText(self._web_view.selectedText())


if __name__ == "__main__":
    # pylint: disable=ungrouped-imports
    from Orange.widgets.utils.widgetpreview import WidgetPreview

    # Words input: a single string meta column marked with type "words"
    words_var_ = StringVariable(WORDS_COLUMN_NAME)
    words_var_.attributes = {"type": "words"}
    words_ = Table.from_list(
        Domain([], metas=[words_var_]),
        [[word] for word in ("human", "graph", "minors", "trees")],
    )
    words_.name = "Words"

    preview_corpus = Corpus.from_file("deerwester")  # deerwester book-excerpts
    WidgetPreview(OWSemanticViewer).run(
        set_corpus=preview_corpus,
        set_words=words_)
    def from_numpy(cls, X, Y=None, metas=None):
        """
        Create a domain corresponding to the given numpy arrays. This method
        is usually invoked from :meth:`Orange.data.Table.from_numpy`.

        All attributes are assumed to be continuous and are named
        "Feature <n>". Target variables are discrete if the only two values
        are 0 and 1; otherwise they are continuous. Discrete
        targets are named "Class <n>" and continuous are named "Target <n>".
        Meta attributes are string variables named "Meta <n>". In every
        group <n> is the 1-based column number, zero-padded to the width of
        the column count; a group with a single column gets the bare name.
        Domain is marked as :attr:`anonymous`, so data from any other domain of
        the same shape can be converted into this one and vice-versa.

        :param `numpy.ndarray` X: 2-dimensional array with data
        :param Y: 1- or 2-dimensional data for target
        :type Y: `numpy.ndarray` or None
        :param `numpy.ndarray` metas: meta attributes
        :type metas: `numpy.ndarray` or None
        :return: a new domain
        :rtype: :class:`Domain`
        :raises ValueError: if `X` is not 2-dimensional or `Y` is neither
            1- nor 2-dimensional
        """
        def get_places(max_index):
            # Number of digits used for numbering; 0 means "no number".
            # Counting the decimal digits is exact, whereas log(n, 10) is
            # undefined for n == 0 (arrays without columns) and rounds down
            # at exact powers of ten such as 1000.
            return 0 if max_index == 1 else len(str(max_index))

        def get_name(base, index, places):
            # `index` is 0-based; the shown number is 1-based
            return base if not places \
                else "{} {:0{}}".format(base, index + 1, places)

        if X.ndim != 2:
            raise ValueError('X must be a 2-dimensional array')
        n_attrs = X.shape[1]
        places = get_places(n_attrs)
        attr_vars = [
            ContinuousVariable(name=get_name("Feature", a, places))
            for a in range(n_attrs)
        ]
        class_vars = []
        if Y is not None:
            if Y.ndim == 1:
                Y = Y.reshape(len(Y), 1)
            elif Y.ndim != 2:
                raise ValueError('Y has invalid shape')
            n_classes = Y.shape[1]
            places = get_places(n_classes)
            for i, values in enumerate(Y.T):
                if set(values) == {0, 1}:
                    name = get_name('Class', i, places)
                    values = ['v1', 'v2']
                    class_vars.append(DiscreteVariable(name, values))
                else:
                    # get_name already converts to 1-based numbering, so the
                    # 0-based column index is passed, as for 'Class' above
                    name = get_name('Target', i, places)
                    class_vars.append(ContinuousVariable(name))
        if metas is not None:
            n_metas = metas.shape[1]
            places = get_places(n_metas)
            meta_vars = [
                StringVariable(get_name("Meta", m, places))
                for m in range(n_metas)
            ]
        else:
            meta_vars = []

        domain = cls(attr_vars, class_vars, meta_vars)
        domain.anonymous = True
        return domain
Пример #26
0
 def test_string_meta(self):
     """Check widget for dataset with only one string meta"""
     meta_column = np.array(["meta"] * 6).reshape(6, 1)
     no_attributes = np.empty((6, 0))
     table = Table(Domain([], metas=[StringVariable("m")]),
                   no_attributes, metas=meta_column)
     self.send_signal(self.widget.Inputs.data, table)
Пример #27
0
 def test_continuous_metas(self):
     """Send iris with its attributes (all but the last) moved to metas,
     together with an extra string meta, and no attributes left."""
     iris_domain = self.iris.domain
     meta_vars = iris_domain.attributes[:-1] + (StringVariable("str"), )
     metas_only_domain = Domain([], iris_domain.class_var, meta_vars)
     table = Table.from_table(metas_only_domain, self.iris)
     self.send_signal(self.widget.Inputs.data, table)
Пример #28
0
    def test_output(self):
        """Check the corpus output while documents are added, edited, removed.

        After every change the output corpus is fetched and its titles,
        documents and metas are compared with the expected editor contents.
        """
        # start with 1 editor
        # NOTE(review): the button found in an editor is presumably its
        # "remove" button; the checks below show one empty document remains
        # after these two clicks - confirm against the widget.
        self.widget.editors[-1].findChild(QPushButton).click()
        self.widget.editors[-1].findChild(QPushButton).click()

        # a single empty document: no attributes, Title/Document metas,
        # "?" as title and text, empty strings in metas
        corpus = self.get_output(self.widget.Outputs.corpus)
        self.assertEqual(0, len(corpus.domain.attributes))
        self.assertTupleEqual(
            (StringVariable("Title"), StringVariable("Document")),
            corpus.domain.metas)
        np.testing.assert_array_equal(["?"], corpus.titles)
        self.assertListEqual(["?"], corpus.documents)
        np.testing.assert_array_equal([["", ""]], corpus.metas)

        # add two more documents and fill in all three editors
        self.add_document_btn.click()
        self.add_document_btn.click()
        editor1, editor2, editor3 = self.widget.editors
        editor1.title_le.setText("Document 1")
        editor2.title_le.setText("Document 2")
        editor3.title_le.setText("Document 3")
        editor1.text_area.setPlainText("Test 1")
        editor2.text_area.setPlainText("Test 2")
        editor3.text_area.setPlainText("Test 3")
        editor1.text_area.editingFinished.emit()
        editor2.text_area.editingFinished.emit()
        editor3.text_area.editingFinished.emit()

        corpus = self.get_output(self.widget.Outputs.corpus)
        np.testing.assert_array_equal(
            ["Document 1", "Document 2", "Document 3"], corpus.titles)
        self.assertListEqual(["Test 1", "Test 2", "Test 3"], corpus.documents)
        np.testing.assert_array_equal(
            [
                ["Document 1", "Test 1"],
                ["Document 2", "Test 2"],
                ["Document 3", "Test 3"],
            ],
            corpus.metas,
        )

        # click the button of the middle editor: document 2 disappears
        editor2.findChild(QPushButton).click()
        corpus = self.get_output(self.widget.Outputs.corpus)
        np.testing.assert_array_equal(["Document 1", "Document 3"],
                                      corpus.titles)
        self.assertListEqual(["Test 1", "Test 3"], corpus.documents)
        np.testing.assert_array_equal(
            [
                ["Document 1", "Test 1"],
                ["Document 3", "Test 3"],
            ],
            corpus.metas,
        )

        # a newly added document is empty ("?" / "")
        self.add_document_btn.click()
        corpus = self.get_output(self.widget.Outputs.corpus)
        np.testing.assert_array_equal(["Document 1", "Document 3", "?"],
                                      corpus.titles)
        self.assertListEqual(["Test 1", "Test 3", "?"], corpus.documents)
        np.testing.assert_array_equal(
            [["Document 1", "Test 1"], ["Document 3", "Test 3"], ["", ""]],
            corpus.metas,
        )

        # click the button of the first editor: document 1 disappears
        self.widget.editors[0].findChild(QPushButton).click()
        corpus = self.get_output(self.widget.Outputs.corpus)
        np.testing.assert_array_equal(["Document 3", "?"], corpus.titles)
        self.assertListEqual(["Test 3", "?"], corpus.documents)
        np.testing.assert_array_equal(
            [["Document 3", "Test 3"], ["", ""]],
            corpus.metas,
        )

        # click the button of the last editor: only document 3 is left
        self.widget.editors[-1].findChild(QPushButton).click()
        corpus = self.get_output(self.widget.Outputs.corpus)
        np.testing.assert_array_equal(["Document 3"], corpus.titles)
        self.assertListEqual(["Test 3"], corpus.documents)
        np.testing.assert_array_equal([["Document 3", "Test 3"]], corpus.metas)
Пример #29
0
def vars_from_df(df, role=None, force_nominal=False):
    """Build data arrays and a ``Domain`` from the columns of a data frame.

    Every column of ``df`` is turned into one variable:

    * a column found in ``df.orange_variables`` reuses a copy of that
      variable (with ``compute_value=None``) and is placed according to the
      effective role (attribute, class, otherwise meta);
    * any other column is classified, in this order, as discrete
      (``_is_discrete``), datetime (``_is_datetime``) or numeric, all of which
      become attributes, or else as a string variable, which becomes a meta.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. If its index neither contains labels starting with
        ``'_o'`` nor is a monotonic integer index, the index is moved into
        the data via ``reset_index()``.
    role : Role, optional
        Role used for columns that carry an original variable. When ``None``
        and ``df`` has an ``orange_role`` attribute, that attribute is used.
    force_nominal : bool
        Forwarded to ``_is_discrete``; when true, the merge of all columns
        into a single meta block is also disabled.

    Returns
    -------
    XYM : list
        Three entries, for attributes, class variables and metas. Each is a
        ``numpy`` array, a ``csr_matrix`` (when all selected columns are
        sparse) or ``None``. An empty group yields ``None`` unless its
        (empty) column list compares equal to the attribute column list, in
        which case an array of shape ``(n_rows, 0)`` is returned.
    domain : Domain
        ``Domain(attrs, class_vars, metas)`` matching the arrays above.
    """
    if role is None and hasattr(df, 'orange_role'):
        _role = df.orange_role
    else:
        _role = role

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not any(str(i).startswith('_o') for i in df.index) \
            and not (df.index.is_integer() and (df.index.is_monotonic_increasing
                                                or df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    # Parallel lists per group: column labels, optional conversion
    # expressions (callables taking (series, variable)), and variables.
    Xcols, Ycols, Mcols = [], [], []
    Xexpr, Yexpr, Mexpr = [], [], []
    attrs, class_vars, metas = [], [], []

    contains_strings = _role == Role.Meta
    for column in df.columns:
        s = df[column]
        if hasattr(df, 'orange_variables') and column in df.orange_variables:
            # Column still carries its original variable: reuse a copy and
            # take the values as they are (no conversion expression).
            original_var = df.orange_variables[column]
            var = original_var.copy(compute_value=None)
            if _role == Role.Attribute:
                Xcols.append(column)
                Xexpr.append(None)
                attrs.append(var)
            elif _role == Role.ClassAttribute:
                Ycols.append(column)
                Yexpr.append(None)
                class_vars.append(var)
            else:  # if role == Role.Meta:
                Mcols.append(column)
                Mexpr.append(None)
                metas.append(var)
        elif _is_discrete(s, force_nominal):
            discrete = s.astype('category').cat
            var = DiscreteVariable(str(column),
                                   discrete.categories.astype(str).tolist())
            attrs.append(var)
            Xcols.append(column)
            # Category codes become the values; -1 (missing) is mapped to NaN.
            Xexpr.append(lambda s, _: np.asarray(
                s.astype('category').cat.codes.replace(-1, np.nan)
            ))
        elif _is_datetime(s):
            var = TimeVariable(str(column))
            # NOTE(review): the converted series is not used afterwards; the
            # expression below re-reads the column from df. Kept as is because
            # the call may still raise or warn on unparsable input.
            s = pd.to_datetime(s, infer_datetime_format=True)
            attrs.append(var)
            Xcols.append(column)
            Xexpr.append(lambda s, v: np.asarray(
                s.astype('str').replace('NaT', np.nan).map(v.parse)
            ))
        elif is_numeric_dtype(s):
            var = ContinuousVariable(
                # set number of decimals to 0 if int else keeps default behaviour
                str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
            )
            attrs.append(var)
            Xcols.append(column)
            Xexpr.append(None)
        else:
            contains_strings = True
            var = StringVariable(str(column))
            metas.append(var)
            Mcols.append(column)
            Mexpr.append(lambda s, _: np.asarray(s, dtype=object))

    # if role isn't explicitly set, try to
    # export dataframes into one contiguous block.
    # for this all columns must be of the same role
    if isinstance(df, OrangeDataFrame) \
            and not role \
            and contains_strings \
            and not force_nominal:
        # Everything is moved into metas, ordered attrs, class_vars, metas.
        # This order may differ from the column order of df.
        attrs.extend(class_vars)
        attrs.extend(metas)
        metas = attrs
        Xcols.extend(Ycols)
        Xcols.extend(Mcols)
        Mcols = Xcols
        Xexpr.extend(Yexpr)
        Xexpr.extend(Mexpr)
        Mexpr = Xexpr

        attrs, class_vars = [], []
        Xcols, Ycols = [], []
        Xexpr, Yexpr = [], []

    XYM = []
    for Avars, Acols, Aexpr in zip(
            (attrs, class_vars, metas),
            (Xcols, Ycols, Mcols),
            (Xexpr, Yexpr, Mexpr)):
        if not Acols:
            A = None if Acols != Xcols else np.empty((df.shape[0], 0))
            XYM.append(A)
            continue
        if not any(Aexpr):
            # Use df directly (no column-selection copy) only when the group
            # is exactly df's columns in df's order. After the merge above the
            # group may hold all columns in a different order, and the array
            # columns must line up with the variables in the domain.
            Adf = df if list(df.columns) == Acols else df[Acols]
            if all(isinstance(a, SparseDtype) for a in Adf.dtypes):
                A = csr_matrix(Adf.sparse.to_coo())
            else:
                A = np.asarray(Adf)
            XYM.append(A)
            continue
        # we'll have to copy the table to resolve any expressions
        # TODO eliminate expr (preprocessing for pandas -> table)
        A = np.array([expr(df[col], var) if expr else np.asarray(df[col])
                      for var, col, expr in zip(Avars, Acols, Aexpr)]).T
        XYM.append(A)

    return XYM, Domain(attrs, class_vars, metas)
Пример #30
0
        return any(
            isinstance(pp, BaseNormalizer)
            for pp in corpus.used_preprocessor.preprocessors)


if __name__ == "__main__":
    from orangewidget.utils.widgetpreview import WidgetPreview

    from orangecontrib.text import preprocess

    # Manual preview: load a sample corpus and push it through a short
    # preprocessing chain before handing it to the widget.
    corpus = Corpus.from_file("book-excerpts")
    # corpus.set_title_variable("Text")

    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    ):
        corpus = step(corpus)

    # Word list input: a table with no attributes and a single string meta
    # column tagged with type "words".
    word_var = StringVariable("Words")
    word_var.attributes["type"] = "words"
    word_list = ["house", "doctor", "boy", "way", "Rum"]
    word_domain = Domain([], metas=[word_var])
    no_attributes = np.empty((len(word_list), 0))
    word_column = np.array(word_list).reshape((-1, 1))
    words = Table(word_domain, no_attributes, metas=word_column)

    WidgetPreview(OWScoreDocuments).run(set_data=corpus, set_words=words)