Пример #1
0
 def test_transform(self):
     """Lookup.transform maps a dense and a sparse column through the table."""
     lookup = Lookup(None, np.array([1, 2, 0, 2]))
     dense = np.array([1, 2, 3, 0, np.nan, 0], dtype=np.float64)
     expected = np.array([2, 0, 2, 1, np.nan, 1], dtype=np.float64)
     # The same expectation must hold for the ndarray and its CSR counterpart.
     for column in (dense, sp.csr_matrix(dense)):
         transformed = lookup.transform(column)
         np.testing.assert_array_equal(transformed, expected)
Пример #2
0
def remove_unused_values(var, data):
    """
    Return a variable without the values of `var` that never occur in `data`.

    `var` itself is returned when the number of distinct finite codes found
    in its column equals the number of its values. Otherwise a new
    DiscreteVariable with the same name is built from the used values only.
    Its `compute_value` is a Lookup on `var` whose table maps every old value
    index to its new index and every dropped index to NaN.

    The base value is translated through the same table; it falls back to -1
    when `var` has no valid base value or when the base value was dropped.
    """
    column_data = Table.from_table(Domain([var]), data)
    array = column_data.X.ravel()
    mask = np.isfinite(array)
    unique = np.array(np.unique(array[mask]), dtype=int)

    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # Old index -> new index; NaN marks the values that were dropped.
    translation_table = np.full(len(var.values), np.nan)
    translation_table[unique] = np.arange(len(used_values))

    base_value = -1
    # Translate only a real index. The previous `0 >= base_value` test let
    # -1 index the last table entry and ignored every positive base value.
    if 0 <= var.base_value < len(var.values):
        base = translation_table[var.base_value]
        if np.isfinite(base):
            base_value = int(base)

    return DiscreteVariable("{}".format(var.name),
                            values=used_values,
                            base_value=base_value,
                            compute_value=Lookup(var, translation_table)
                            )
Пример #3
0
def column_imputer_as_value(variable, table):
    """
    Build a ColumnImputerAsValue that makes missing values of `variable` explicit.

    For a discrete `variable` the codomain is a single new DiscreteVariable
    with the same name and one extra value, "N/A". Its `compute_value` is a
    Lookup with an identity table whose `unknown` is the index of "N/A".

    For a continuous `variable` the codomain is the variable itself plus a
    DiscreteVariable named "<name>_def" with values ("undef", "def") computed
    by `IsDefined(variable)`. The transformers are
    `ReplaceUnknowns(variable, stats.mean)`, with `stats` taken from
    `basic_stats.BasicStats(table, variable)`, and that `IsDefined` instance.

    Raises:
        TypeError: if `variable` is neither discrete nor continuous.
    """
    if variable.is_discrete:
        fmt = "{var.name}"
        value = "N/A"
        n_values = len(variable.values)
        var = Orange.data.DiscreteVariable(
            fmt.format(var=variable),
            # `values` may be a list or a tuple; `tuple + list` raises
            # TypeError, so normalise to a list before appending.
            values=list(variable.values) + [value],
            base_value=variable.base_value,
            compute_value=Lookup(variable,
                                 numpy.arange(n_values, dtype=int),
                                 unknown=n_values))
        codomain = [var]
        transformers = [var.compute_value]
    elif variable.is_continuous:
        fmt = "{var.name}_def"
        var = Orange.data.DiscreteVariable(fmt.format(var=variable),
                                           values=("undef", "def"),
                                           compute_value=IsDefined(variable))
        codomain = [variable, var]
        stats = basic_stats.BasicStats(table, variable)
        transformers = [
            ReplaceUnknowns(variable, stats.mean), var.compute_value
        ]
    else:
        raise TypeError(type(variable))

    return ColumnImputerAsValue(table.domain, Orange.data.Domain(codomain),
                                transformers)
Пример #4
0
 def test_discrete_reorder(self):
     """A mapping that only reorders categories yields the matching Lookup."""
     source = DiscreteVariable("D", values=("2", "3", "1", "0"))
     reorder = CategoriesMapping(
         (("0", "0"), ("1", "1"), ("2", "2"), ("3", "3"))
     )
     result = apply_transform_var(source, [reorder])
     self.assertSequenceEqual(result.values, ["0", "1", "2", "3"])
     actual = result.compute_value
     # Source codes 0..3 ("2", "3", "1", "0") land at positions 2, 3, 1, 0.
     expected = Lookup(source, np.array([2, 3, 1, 0]))
     self._assertLookupEquals(actual, expected)
Пример #5
0
def merge_lookup(A, B):
    """
    Compose two consecutive Lookup transforms into a single one.

    Every finite entry of A's lookup table is used as an index into B's
    lookup table and replaced by the value found there; non-finite entries
    are kept as they are. The result is a Lookup on `A.variable`.
    """
    merged = np.array(A.lookup_table)  # work on a copy, A stays untouched
    finite = np.isfinite(merged)
    b_indices = merged[finite].astype(int)
    merged[finite] = B.lookup_table[b_indices]
    return Lookup(A.variable, merged)
Пример #6
0
 def test_hash_nan(self):
     """
     Hashing the same lookup repeatedly must always give the same value.

     Regression test for a NumPy bug (seen with Python 3.10) that showed up
     when the lookup table contained NaN:
     https://github.com/numpy/numpy/issues/21210
     """
     lookup = Lookup(None, np.array([1, 2, np.nan, 2]))
     first, *others = (hash(lookup) for _ in range(10))
     self.assertTrue(all(other == first for other in others))
Пример #7
0
def apply_transform_discete(var, trs):
    # type: (Orange.data.DiscreteVariable, ...) -> ...
    # pylint: disable=too-many-branches
    """
    Build a new discrete variable by applying the transforms `trs` to `var`.

    Recognised transforms are `Rename` (name), `CategoriesMapping` (category
    mapping), `Annotate` (attributes) and `ChangeOrdered` (ordered flag).
    Other transforms are ignored, and when a kind occurs more than once the
    last occurrence wins.

    A category mapping is a sequence of ``(source, destination)`` pairs. A
    pair whose destination is None adds no destination value, so that source
    code maps to NaN. A pair whose source is None only adds a destination
    value. The base value of `var` is translated through the mapping and
    reset to -1 when it maps to NaN.

    Returns:
        An `Orange.data.DiscreteVariable` whose `compute_value` is a Lookup
        on `var` when a mapping was given and `Identity(var)` otherwise.
    """
    name, annotations = var.name, var.attributes
    base_value = var.base_value
    mapping = None
    ordered = var.ordered
    for tr in trs:
        if isinstance(tr, Rename):
            name = tr.name
        elif isinstance(tr, CategoriesMapping):
            mapping = tr.mapping
        elif isinstance(tr, Annotate):
            annotations = _parse_attributes(tr.annotations)
        elif isinstance(tr, ChangeOrdered):
            ordered = tr.ordered

    source_values = var.values
    if mapping is not None:
        dest_values = [cj for ci, cj in mapping if cj is not None]
    else:
        dest_values = var.values

    def positions(values):
        # Value -> index; duplicates would collapse here, hence the check.
        rval = {c: i for i, c in enumerate(values)}
        assert len(rval) == len(values)
        return rval

    source_codes = positions(source_values)
    dest_codes = positions(dest_values)
    if mapping is not None:
        # Construct a lookup table: source code -> destination code, NaN for
        # dropped categories. The builtin `float` replaces the `np.float`
        # alias, which was removed in NumPy 1.24.
        table = np.full(len(source_values), np.nan, dtype=float)
        for ci, cj in mapping:
            if ci is not None and cj is not None:
                i, j = source_codes[ci], dest_codes[cj]
                table[i] = j

        if base_value != -1:
            # Keep the base value an integer index rather than the
            # np.float64 read back from the table.
            mapped_base = table[base_value]
            base_value = -1 if np.isnan(mapped_base) else int(mapped_base)
        lookup = Lookup(var, table)
    else:
        lookup = Identity(var)
    variable = Orange.data.DiscreteVariable(
        name,
        values=dest_values,
        base_value=base_value,
        compute_value=lookup,
        ordered=ordered,
    )
    variable.attributes.update(annotations)
    return variable
Пример #8
0
def remove_unused_values(var, data):
    """
    Return a variable without the values of `var` that never occur in `data`.

    `var` itself is returned when the number of distinct non-NaN codes in
    its column equals the number of its values. Otherwise a new
    DiscreteVariable with the same name and `sparse` flag is built from the
    used values only. Its `compute_value` is a Lookup on `var` whose table
    maps old value indices to new ones and dropped indices to NaN.
    """
    column = data.get_column_view(var)[0].astype(float)
    unique = nanunique(column).astype(int)
    if len(unique) == len(var.values):
        return var
    used_values = [var.values[i] for i in unique]
    # `np.nan` instead of the `np.NaN` alias, which NumPy 2.0 removed.
    translation_table = np.full(len(var.values), np.nan)
    translation_table[unique] = np.arange(len(used_values))
    return DiscreteVariable(var.name,
                            values=used_values,
                            sparse=var.sparse,
                            compute_value=Lookup(var, translation_table))
Пример #9
0
def sort_var_values(var):
    """
    Return a variable whose values are in sorted order.

    `var` is returned unchanged when its values are already sorted.
    Otherwise a new DiscreteVariable with the same name and the sorted
    values is created. Its `compute_value` is a Lookup on `var` whose table
    holds, for every old value index, the position of that value in the
    sorted list.
    """
    ordered = sorted(var.values)
    if ordered != list(var.values):
        new_positions = [float(ordered.index(value)) for value in var.values]
        return DiscreteVariable(
            var.name,
            values=ordered,
            compute_value=Lookup(var, np.array(new_positions)),
        )
    return var
Пример #10
0
 def test_discrete_merge(self):
     """Mapping several source categories onto one value merges them."""
     source = DiscreteVariable("D", values=("2", "3", "1", "0"))
     merge = CategoriesMapping((
         ("0", "x"),
         ("1", "y"),
         ("2", "x"),
         ("3", "y"),
     ))
     result = apply_transform_var(source, [merge])
     self.assertSequenceEqual(result.values, ["x", "y"])
     actual = result.compute_value
     # "2" and "0" collapse to "x" (0); "3" and "1" collapse to "y" (1).
     expected = Lookup(source, np.array([0, 1, 1, 0]))
     self._assertLookupEquals(actual, expected)
Пример #11
0
 def test_discrete_add_drop(self):
     """Dropped categories map to NaN; an added category only extends values."""
     source = DiscreteVariable("D", values=("2", "3", "1", "0"))
     add_drop = CategoriesMapping((
         ("0", None),
         ("1", "1"),
         ("2", "2"),
         ("3", None),
         (None, "A"),
     ))
     result = apply_transform_var(source, [add_drop])
     self.assertSequenceEqual(result.values, ["1", "2", "A"])
     actual = result.compute_value
     # "3" and "0" are dropped, so their codes translate to NaN.
     expected = Lookup(source, np.array([1, np.nan, 0, np.nan]))
     self._assertLookupEquals(actual, expected)
Пример #12
0
def remove_unused_values(var, data):
    """
    Return a variable without the values of `var` that never occur in `data`.

    `var` itself is returned when the number of distinct non-NaN codes in
    its column equals the number of its values. Otherwise a new
    DiscreteVariable with the same name and `sparse` flag is built from the
    used values only. Its `compute_value` is a Lookup on `var` whose table
    maps old value indices to new ones and dropped indices to NaN.
    """
    column_data = Table.from_table(Domain([var]), data)
    unique = nanunique(column_data.X).astype(int)
    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # `np.nan` instead of the `np.NaN` alias, which NumPy 2.0 removed.
    translation_table = np.full(len(var.values), np.nan)
    translation_table[unique] = np.arange(len(used_values))

    return DiscreteVariable("{}".format(var.name),
                            values=used_values,
                            compute_value=Lookup(var, translation_table),
                            sparse=var.sparse)
Пример #13
0
def remove_unused_values(var, data):
    """
    Return a variable without the values of `var` that never occur in `data`.

    `var` itself is returned when the number of distinct finite codes found
    in its column equals the number of its values. Otherwise a new
    DiscreteVariable named "R_<name>" is built from the used values only.
    Its `compute_value` is a Lookup on `var` whose table maps old value
    indices to new ones and dropped indices to NaN.

    The base value of `var` is translated through the same table and set on
    the new variable only when it is a valid index that was not dropped.
    """
    column_data = Orange.data.Table.from_table(Orange.data.Domain([var]), data)
    array = column_data.X.ravel()
    mask = numpy.isfinite(array)
    unique = numpy.array(numpy.unique(array[mask]), dtype=int)

    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    new_var = Orange.data.DiscreteVariable("R_{}".format(var.name),
                                           values=used_values)
    # Old index -> new index; NaN marks the values that were dropped.
    translation_table = numpy.full(len(var.values), numpy.nan)
    translation_table[unique] = numpy.arange(len(new_var.values))

    # Translate only a real index. The previous `0 >= base_value` test let
    # -1 index the last table entry and ignored every positive base value.
    if 0 <= var.base_value < len(var.values):
        base = translation_table[var.base_value]
        if numpy.isfinite(base):
            new_var.base_value = int(base)

    new_var.compute_value = Lookup(var, translation_table)
    return new_var
Пример #14
0
    def test_lookup(self):
        """Lookup equality and hash react to variable, table and third argument."""
        # Same table and third argument: only the variable differs.
        base = Lookup(self.disc1, np.array([0, 2, 1]), 1)
        twin = Lookup(self.disc1a, np.array([0, 2, 1]), 1)
        other = Lookup(self.disc2, np.array([0, 2, 1]), 1)
        self.assertEqual(base, base)
        self.assertEqual(base, twin)
        self.assertNotEqual(base, other)

        self.assertEqual(hash(base), hash(twin))
        self.assertNotEqual(hash(base), hash(other))

        # A different lookup table makes the transforms differ.
        base = Lookup(self.disc1, np.array([0, 2, 1]), 1)
        changed_table = Lookup(self.disc1a, np.array([1, 2, 0]), 1)
        self.assertNotEqual(base, changed_table)
        self.assertNotEqual(hash(base), hash(changed_table))

        # A different third argument (presumably `unknown`) does as well.
        base = Lookup(self.disc1, np.array([0, 2, 1]), 1)
        changed_third = Lookup(self.disc1a, np.array([0, 2, 1]), 2)
        self.assertNotEqual(base, changed_third)
        self.assertNotEqual(hash(base), hash(changed_third))
Пример #15
0
 def test_transform(self):
     """Lookup.transform maps each code through the table and keeps NaN."""
     lookup = Lookup(None, np.array([1, 2, 0, 2]))
     codes = np.array([1, 2, 3, 0, np.nan, 0], dtype=np.float64)
     actual = lookup.transform(codes)
     expected = np.array([2, 0, 2, 1, np.nan, 1], dtype=np.float64)
     np.testing.assert_array_equal(actual, expected)