Exemplo n.º 1
0
    def test_value_counts_unique_nunique(self):
        """Exercise value_counts, unique and nunique on each ``self.objs`` entry.

        Every object is rebuilt so that its n-th element occurs n + 1 times;
        the three reductions are then compared with the original data.  The
        expected counts are hard-coded as 10..1, i.e. for 10-element inputs.
        """
        for original in self.objs:
            obj = original.copy()
            obj_type = type(obj)
            raw = obj._values

            # drop the name so it cannot leak into the expectations below
            if isinstance(raw, Index):
                raw.name = None

            is_index = isinstance(obj, Index)
            # a boolean index has 2 distinct values at most: nothing to check
            if is_index and obj.is_boolean():
                continue

            # the 'n'th element is repeated n+1 times
            repeats = range(1, len(obj) + 1)
            if is_index:
                expected_index = Index(obj[::-1])
                expected_index.name = None
                obj = obj.repeat(repeats)
                obj.name = "a"
            else:
                expected_index = Index(raw[::-1])
                repeated_index = obj.index.repeat(repeats)
                # take-based repeat
                positions = np.repeat(np.arange(len(obj)), repeats)
                obj = obj_type(raw.take(positions), index=repeated_index, name="a")

            # repeating must not change the dtype
            assert obj.dtype == original.dtype

            expected_counts = Series(
                range(10, 0, -1), index=expected_index, dtype="int64", name="a"
            )
            counts = obj.value_counts()
            tm.assert_series_equal(counts, expected_counts)
            assert counts.index.name is None
            assert counts.name == "a"

            uniques = obj.unique()
            if isinstance(obj, Index):
                assert isinstance(uniques, type(obj))
                tm.assert_index_equal(uniques, original)
                assert uniques.dtype == original.dtype
            elif is_datetime64tz_dtype(obj):
                # datetimetz Series returns array of Timestamp
                assert uniques[0] == original[0]
                assert all(isinstance(item, Timestamp) for item in uniques)

                tm.assert_numpy_array_equal(
                    uniques.astype(object), original._values.astype(object)
                )
            else:
                tm.assert_numpy_array_equal(uniques, original.values)
                assert uniques.dtype == original.dtype

            assert obj.nunique() == len(np.unique(obj.values))
Exemplo n.º 2
0
    def test_index(self) -> None:
        """The backend exposes a default index and, after set_index, the new one."""
        default_index = PandasIndex(PIndex([0, 1, 2], name="index"), [])
        assert_that(self.data_backend.index.equals(default_index), equal_to(True))

        reindexed = self.data_backend.set_index(ExampleStore.ab_index)

        # expected index after switching to the two-column "ab_index"
        ab_values = PIndex([("a", 1), ("b", 2), ("c", 3)])
        ab_values.name = "ab_index"
        ab_index = PandasIndex(ab_values, ["a", "b"])
        assert_that(reindexed.index.equals(ab_index), equal_to(True))
Exemplo n.º 3
0
    def test_intersection_name_preservation(self, index2, keeps_name, sort):
        index1 = Index([1, 2, 3, 4, 5], name="index")
        expected = Index([3, 4, 5])
        result = index1.intersection(index2, sort)

        if keeps_name:
            expected.name = "index"

        assert result.name == expected.name
        tm.assert_index_equal(result, expected)
Exemplo n.º 4
0
    def test_value_counts_unique_nunique(self, index_or_series_obj):
        """Exercise value_counts, unique and nunique on one Index/Series fixture.

        The fixture is rebuilt so that its n-th element occurs n + 1 times and
        the reductions are compared against the original data.  Fixtures that
        already contain duplicates are xfailed.
        """
        original = index_or_series_obj
        obj = original.copy()
        obj_type = type(obj)
        raw = obj._values

        if original.duplicated().any():
            pytest.xfail(
                "The test implementation isn't flexible enough to deal "
                "with duplicated values. This isn't a bug in the "
                "application code, but in the test code.")

        # the 'n'th element is repeated n+1 times
        repeats = range(1, len(obj) + 1)
        if isinstance(obj, Index):
            expected_index = Index(obj[::-1])
            expected_index.name = None
            obj = obj.repeat(repeats)
        else:
            expected_index = Index(raw[::-1])
            repeated_index = obj.index.repeat(repeats)
            # take-based repeat
            positions = np.repeat(np.arange(len(obj)), repeats)
            obj = obj_type(raw.take(positions), index=repeated_index)

        # repeating must not change the dtype
        assert obj.dtype == original.dtype

        expected_counts = Series(
            range(len(original), 0, -1), index=expected_index, dtype="int64"
        )
        counts = obj.value_counts()
        tm.assert_series_equal(counts, expected_counts)
        assert counts.index.name is None

        uniques = obj.unique()
        if isinstance(obj, Index):
            assert isinstance(uniques, type(obj))
            tm.assert_index_equal(uniques, original)
            assert uniques.dtype == original.dtype
        elif is_datetime64tz_dtype(obj):
            # datetimetz Series returns array of Timestamp
            assert uniques[0] == original[0]
            assert all(isinstance(item, Timestamp) for item in uniques)

            tm.assert_numpy_array_equal(
                uniques.astype(object), original._values.astype(object)
            )
        else:
            tm.assert_numpy_array_equal(uniques, original.values)
            assert uniques.dtype == original.dtype

        # dropna=True would break for MultiIndex
        assert obj.nunique(dropna=False) == len(np.unique(obj.values))
Exemplo n.º 5
0
    def test_value_counts_unique_nunique(self):
        """Exercise value_counts, unique and nunique on each ``self.objs`` entry.

        Every object is rebuilt so that its n-th element occurs n + 1 times;
        the three reductions are then compared with the original data.  The
        expected counts are hard-coded as 10..1, i.e. for 10-element inputs.
        """
        for original in self.objs:
            obj = original.copy()
            obj_type = type(obj)
            raw = obj._values

            # drop the name so it cannot leak into the expectations below
            if isinstance(raw, Index):
                raw.name = None

            is_index = isinstance(obj, Index)
            # a boolean index has 2 distinct values at most: nothing to check
            if is_index and obj.is_boolean():
                continue

            # the 'n'th element is repeated n+1 times
            repeats = range(1, len(obj) + 1)
            if is_index:
                expected_index = Index(obj[::-1])
                expected_index.name = None
                obj = obj.repeat(repeats)
                obj.name = 'a'
            else:
                expected_index = Index(raw[::-1])
                repeated_index = obj.index.repeat(repeats)
                # take-based repeat
                positions = np.repeat(np.arange(len(obj)), repeats)
                obj = obj_type(raw.take(positions), index=repeated_index,
                               name='a')

            # repeating must not change the dtype
            assert obj.dtype == original.dtype

            expected_counts = Series(range(10, 0, -1), index=expected_index,
                                     dtype='int64', name='a')
            counts = obj.value_counts()
            tm.assert_series_equal(counts, expected_counts)
            assert counts.index.name is None
            assert counts.name == 'a'

            uniques = obj.unique()
            if isinstance(obj, Index):
                assert isinstance(uniques, obj.__class__)
                tm.assert_index_equal(uniques, original)
            elif is_datetime64tz_dtype(obj):
                # datetimetz Series returns array of Timestamp
                assert uniques[0] == original[0]
                assert all(isinstance(item, Timestamp) for item in uniques)

                tm.assert_numpy_array_equal(
                    uniques.astype(object),
                    original._values.astype(object))
            else:
                tm.assert_numpy_array_equal(uniques, original.values)

            assert obj.nunique() == len(np.unique(obj.values))
Exemplo n.º 6
0
    def test_intersection_monotonic(self, index2, keeps_name, sort):
        index1 = Index([5, 3, 2, 4, 1], name="index")
        expected = Index([5, 3, 4])

        if keeps_name:
            expected.name = "index"

        result = index1.intersection(index2, sort=sort)
        if sort is None:
            expected = expected.sort_values()
        tm.assert_index_equal(result, expected)
Exemplo n.º 7
0
    def _read_index_node(self, node):
        """Rebuild an Index from a stored node.

        Reads the node's data and its ``kind`` attribute, converts the data
        with ``_unconvert_index`` and returns ``(name, index)``; ``name`` is
        None when the node carries no 'name' attribute.
        """
        raw = node[:]
        attrs = node._v_attrs
        kind = attrs.kind

        # the name attribute is optional
        name = attrs.name if 'name' in attrs else None

        index = Index(_unconvert_index(raw, kind))
        index.name = name

        return name, index
Exemplo n.º 8
0
    def _read_index_node(self, node):
        """Rebuild an Index from a stored node.

        Reads the node's data and its ``kind`` attribute, converts the data
        with ``_unconvert_index`` and returns ``(name, index)``; ``name`` is
        None when the node carries no 'name' attribute.
        """
        raw = node[:]
        attrs = node._v_attrs
        kind = attrs.kind

        # the name attribute is optional
        name = attrs.name if 'name' in attrs else None

        index = Index(_unconvert_index(raw, kind))
        index.name = name

        return name, index
Exemplo n.º 9
0
    def _read_index_node(self, node):
        """Rebuild an Index from a stored node.

        Reads the node's data and its ``kind`` attribute, converts the data
        with ``_unconvert_index`` and returns ``(name, index)``.  Any failure
        while reading the ``name`` attribute results in a name of None.
        """
        raw = node[:]
        index_kind = node._v_attrs.kind

        # best effort: a node without a readable name yields an unnamed index
        try:
            index_name = node._v_attrs.name
        except Exception:
            index_name = None

        rebuilt = Index(_unconvert_index(raw, index_kind))
        rebuilt.name = index_name

        return index_name, rebuilt
Exemplo n.º 10
0
    def _read_index_node(self, node):
        """Rebuild an Index from a stored node.

        Reads the node's data and its ``kind`` attribute, converts the data
        with ``_unconvert_index`` and returns ``(name, index)``.  Any failure
        while reading the ``name`` attribute results in a name of None.
        """
        raw = node[:]
        index_kind = node._v_attrs.kind

        # best effort: a node without a readable name yields an unnamed index
        try:
            index_name = node._v_attrs.name
        except Exception:
            index_name = None

        rebuilt = Index(_unconvert_index(raw, index_kind))
        rebuilt.name = index_name

        return index_name, rebuilt
Exemplo n.º 11
0
    def from_pandas_index(cls, index: pd.Index, dim: Hashable):
        """Wrap a pandas index and build its matching index variable.

        An unnamed ``index`` is copied and the copy is named ``dim``; a named
        index is used as is.  Returns ``(cls(index, dim), {name: variable})``
        where ``name`` is the (possibly newly assigned) index name.
        """
        from .variable import IndexVariable

        name = index.name
        if name is None:
            # rename a copy so the caller's index keeps its (missing) name
            name = dim
            index = index.copy()
            index.name = dim

        adapter = PandasIndexingAdapter(index)
        variable = IndexVariable(dim, adapter, fastpath=True)

        return cls(index, dim), {name: variable}
Exemplo n.º 12
0
    def _read_index_node(self, node):
        """Rebuild an Index from a stored node.

        Reads the node's data and its ``kind`` attribute, converts the data
        with ``_unconvert_index`` and returns ``(name, index)``; ``name`` is
        None when the node carries no 'name' attribute.  For the 'date' and
        'datetime' kinds the Index is built with ``dtype=object``.
        """
        raw = node[:]
        attrs = node._v_attrs
        kind = attrs.kind

        # the name attribute is optional
        name = attrs.name if 'name' in attrs else None

        # date-like kinds are explicitly kept as object dtype
        index_kwargs = {'dtype': object} if kind in ('date', 'datetime') else {}
        index = Index(_unconvert_index(raw, kind), **index_kwargs)

        index.name = name

        return name, index
Exemplo n.º 13
0
    def test_constructor_name(self):
        # GH#12288
        orig = RangeIndex(10)
        orig.name = "original"

        copy = RangeIndex(orig)
        copy.name = "copy"

        assert orig.name == "original"
        assert copy.name == "copy"

        new = Index(copy)
        assert new.name == "copy"

        new.name = "new"
        assert orig.name == "original"
        assert copy.name == "copy"
        assert new.name == "new"
Exemplo n.º 14
0
    def test_constructor_name(self):
        # GH12288
        orig = RangeIndex(10)
        orig.name = 'original'

        copy = RangeIndex(orig)
        copy.name = 'copy'

        assert orig.name == 'original'
        assert copy.name == 'copy'

        new = Index(copy)
        assert new.name == 'copy'

        new.name = 'new'
        assert orig.name == 'original'
        assert copy.name == 'copy'
        assert new.name == 'new'
Exemplo n.º 15
0
    def test_constructor_name(self):
        # GH12288
        orig = RangeIndex(10)
        orig.name = 'original'

        copy = RangeIndex(orig)
        copy.name = 'copy'

        self.assertTrue(orig.name, 'original')
        self.assertTrue(copy.name, 'copy')

        new = Index(copy)
        self.assertTrue(new.name, 'copy')

        new.name = 'new'
        self.assertTrue(orig.name, 'original')
        self.assertTrue(new.name, 'copy')
        self.assertTrue(new.name, 'new')
Exemplo n.º 16
0
    def test_constructor_name(self):
        # GH12288
        orig = RangeIndex(10)
        orig.name = 'original'

        copy = RangeIndex(orig)
        copy.name = 'copy'

        self.assertTrue(orig.name, 'original')
        self.assertTrue(copy.name, 'copy')

        new = Index(copy)
        self.assertTrue(new.name, 'copy')

        new.name = 'new'
        self.assertTrue(orig.name, 'original')
        self.assertTrue(new.name, 'copy')
        self.assertTrue(new.name, 'new')
Exemplo n.º 17
0
    def test_constructor_name(self):
        # GH12288
        orig = RangeIndex(10)
        orig.name = 'original'

        copy = RangeIndex(orig)
        copy.name = 'copy'

        assert orig.name == 'original'
        assert copy.name == 'copy'

        new = Index(copy)
        assert new.name == 'copy'

        new.name = 'new'
        assert orig.name == 'original'
        assert copy.name == 'copy'
        assert new.name == 'new'
Exemplo n.º 18
0
    def test_value_counts_unique_nunique_null(self):
        """Check value_counts/unique/nunique when the data holds missing values.

        For each null marker (``np.nan`` and ``None``) and each object in
        ``self.objs`` accepted by ``self._allow_na_ops``, the first two values
        are replaced by a missing value and the n-th element is then repeated
        n + 1 times.  The expected counts and slices below are hard-coded for
        inputs of length 10.
        """

        for null_obj in [np.nan, None]:
            for orig in self.objs:
                o = orig.copy()
                klass = type(o)
                values = o._ndarray_values

                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        v = o.asi8
                        v[0:2] = iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = iNaT
                        values = o._values

                elif needs_i8_conversion(o):
                    values[0:2] = iNaT
                    values = o._shallow_copy(values)
                else:
                    # in-place write; presumably ``values`` shares memory with
                    # ``o`` so that the nulls show up in ``o.repeat`` below
                    # -- TODO confirm
                    values[0:2] = null_obj
                # check values has the same dtype as the original

                assert values.dtype == o.dtype

                # create repeated values, 'n'th element is repeated by n+1
                # times
                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    expected_index = o.copy()
                    expected_index.name = None

                    # attach name to klass
                    o = klass(values.repeat(range(1, len(o) + 1)))
                    o.name = 'a'
                else:
                    if is_datetimetz(o):
                        expected_index = orig._values._shallow_copy(values)
                    else:
                        expected_index = Index(values)
                    expected_index.name = None
                    o = o.repeat(range(1, len(o) + 1))
                    o.name = 'a'

                # check values has the same dtype as the original
                assert o.dtype == orig.dtype
                # check values correctly have NaN
                # elements 0 and 1 are null and occur 1 + 2 times, hence the
                # first three positions of the repeated object
                nanloc = np.zeros(len(o), dtype=np.bool)
                nanloc[:3] = True
                if isinstance(o, Index):
                    tm.assert_numpy_array_equal(pd.isna(o), nanloc)
                else:
                    exp = Series(nanloc, o.index, name='a')
                    tm.assert_series_equal(pd.isna(o), exp)

                # with dropna=False the null bucket (3 occurrences) is listed
                # last; with the default it is absent
                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64', name='a')
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64', name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                assert result_s_na.index.name is None
                assert result_s_na.name == 'a'
                result_s = o.value_counts()
                # NOTE(review): this compares a fresh value_counts() call
                # rather than ``result_s`` computed on the previous line
                tm.assert_series_equal(o.value_counts(), expected_s)
                assert result_s.index.name is None
                assert result_s.name == 'a'

                result = o.unique()
                if isinstance(o, Index):
                    tm.assert_index_equal(result,
                                          Index(values[1:], name='a'))
                elif is_datetimetz(o):
                    # unable to compare NaT / nan
                    vals = values[2:].astype(object).values
                    tm.assert_numpy_array_equal(result[1:], vals)
                    assert result[0] is pd.NaT
                else:
                    tm.assert_numpy_array_equal(result[1:], values[2:])

                    assert pd.isna(result[0])
                    assert result.dtype == orig.dtype

                # 10 inputs, two of them replaced by the same null value
                assert o.nunique() == 8
                assert o.nunique(dropna=False) == 9
Exemplo n.º 19
0
    def test_value_counts_unique_nunique_null(self, null_obj,
                                              index_or_series_obj):
        """Check value_counts/unique/nunique when the data holds missing values.

        The first two values of the fixture are replaced by a missing value
        (``null_obj``, or NaT/iNaT for datetime-like data) and the n-th
        element is then repeated n + 1 times.  Fixtures that do not allow NA
        operations, whose values cannot be changed, or that are MultiIndex
        are skipped; fixtures with duplicated values are xfailed.
        """
        orig = index_or_series_obj
        obj = orig.copy()
        klass = type(obj)
        values = obj._ndarray_values
        num_values = len(orig)

        if not allow_na_ops(obj):
            pytest.skip("type doesn't allow for NA operations")
        elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
            pytest.skip(f"values of {klass} cannot be changed")
        elif isinstance(orig, pd.MultiIndex):
            pytest.skip("MultiIndex doesn't support isna")
        elif orig.duplicated().any():
            pytest.xfail(
                "The test implementation isn't flexible enough to deal "
                "with duplicated values. This isn't a bug in the "
                "application code, but in the test code.")

        # special assign to the numpy array
        if is_datetime64tz_dtype(obj):
            if isinstance(obj, DatetimeIndex):
                v = obj.asi8
                v[0:2] = iNaT
                values = obj._shallow_copy(v)
            else:
                obj = obj.copy()
                obj[0:2] = pd.NaT
                values = obj._values

        elif needs_i8_conversion(obj):
            values[0:2] = iNaT
            values = obj._shallow_copy(values)
        else:
            # in-place write; presumably ``values`` shares memory with ``obj``
            # so that the nulls show up in ``obj.repeat`` below -- TODO confirm
            values[0:2] = null_obj

        # check values has the same dtype as the original
        assert values.dtype == obj.dtype

        # create repeated values, 'n'th element is repeated by n+1
        # times
        if isinstance(obj, (DatetimeIndex, PeriodIndex)):
            expected_index = obj.copy()
            expected_index.name = None

            # attach name to klass
            obj = klass(values.repeat(range(1, len(obj) + 1)))
            obj.name = "a"
        else:
            # NOTE(review): the DatetimeIndex check below looks unreachable,
            # since DatetimeIndex instances are handled by the enclosing
            # ``if`` -- confirm whether a tz-aware check was intended
            if isinstance(obj, DatetimeIndex):
                expected_index = orig._values._shallow_copy(values)
            else:
                expected_index = Index(values)
            expected_index.name = None
            obj = obj.repeat(range(1, len(obj) + 1))
            obj.name = "a"

        # check values has the same dtype as the original
        assert obj.dtype == orig.dtype

        # check values correctly have NaN
        # elements 0 and 1 are null and occur 1 + 2 times, hence the first
        # three positions of the repeated object
        nanloc = np.zeros(len(obj), dtype=np.bool)
        nanloc[:3] = True
        if isinstance(obj, Index):
            tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
        else:
            exp = Series(nanloc, obj.index, name="a")
            tm.assert_series_equal(pd.isna(obj), exp)

        # counts of the non-null elements, most frequent first; the null
        # bucket (3 occurrences) is appended only for the dropna=False case
        expected_data = list(range(num_values, 2, -1))
        expected_data_na = expected_data.copy()
        if expected_data_na:
            expected_data_na.append(3)
        expected_s_na = Series(
            expected_data_na,
            index=expected_index[num_values - 1:0:-1],
            dtype="int64",
            name="a",
        )
        expected_s = Series(
            expected_data,
            index=expected_index[num_values - 1:1:-1],
            dtype="int64",
            name="a",
        )

        result_s_na = obj.value_counts(dropna=False)
        tm.assert_series_equal(result_s_na, expected_s_na)
        assert result_s_na.index.name is None
        assert result_s_na.name == "a"
        result_s = obj.value_counts()
        # NOTE(review): this compares a fresh value_counts() call rather than
        # ``result_s`` computed on the previous line
        tm.assert_series_equal(obj.value_counts(), expected_s)
        assert result_s.index.name is None
        assert result_s.name == "a"

        result = obj.unique()
        if isinstance(obj, Index):
            tm.assert_index_equal(result, Index(values[1:], name="a"))
        elif is_datetime64tz_dtype(obj):
            # unable to compare NaT / nan
            tm.assert_extension_array_equal(result[1:], values[2:])
            assert result[0] is pd.NaT
        elif len(obj) > 0:
            tm.assert_numpy_array_equal(result[1:], values[2:])

            assert pd.isna(result[0])
            assert result.dtype == orig.dtype

        # two inputs were replaced by the same null value
        assert obj.nunique() == max(0, num_values - 2)
        assert obj.nunique(dropna=False) == max(0, num_values - 1)
Exemplo n.º 20
0
def replace_multi_index_level(
    df: "classes.BeliefsDataFrame",
    level: str,
    index: pd.Index,
    intersection: bool = False,
) -> "classes.BeliefsDataFrame":
    """Replace one of the index levels of the multi-indexed DataFrame. Returns a new DataFrame object.

    Neither ``df`` nor ``index`` is modified: the frame is deep-copied and an unnamed ``index`` is renamed on a copy.

    :param df: a BeliefsDataFrame (or just a multi-indexed DataFrame).
    :param level: the name of the index level to replace.
    :param index: the new index. If it has no name, the name of the replaced level is used.
    :param intersection: policy for replacing the index level.
    If intersection is False then simply replace (note that the new index should have the same length as the old index).
    If intersection is True then add indices not contained in the old index and delete indices not contained in the new
    index. New rows have nan columns values and copies of the first row for other index levels (note that the resulting
    index is usually longer and contains values that were both in the old and new index, i.e. the intersection).
    :returns: a copy of df carrying the new MultiIndex, sorted by index.
    :raises ValueError: if intersection is False and the new index differs in length from the old index.
    """
    # Todo: check whether timezone information is copied over correctly

    # Check input
    if intersection is False and len(index) != len(df.index):
        raise ValueError(
            "Cannot simply replace multi-index level with an index of different length than the original. "
            "Use intersection instead?")
    if index.name is None:
        # Rename a copy: assigning index.name in place would rename the caller's object as a side effect
        index = index.rename(level)

    new_index_values = []
    new_index_names = []
    if intersection is True:
        old_level_values = df.index.get_level_values(level)
        contained_in_old = index.isin(old_level_values)
        new_index_not_in_old = index[~contained_in_old]
        contained_in_new = old_level_values.isin(index)
        for i in df.index.names:
            if i == level:  # For the index level that should be replaced
                # Copy old values that the new index contains, and add new values that the old index does not contain
                new_index_values.append(
                    df.index.get_level_values(i)[contained_in_new].append(
                        new_index_not_in_old))
                new_index_names.append(index.name)
            else:  # For the other index levels
                # Copy old values that the new index contains, and add the first value to the new rows
                new_row_values = pd.Index([df.index.get_level_values(i)[0]] *
                                          len(new_index_not_in_old))
                new_index_values.append(
                    df.index.get_level_values(i)[contained_in_new].append(
                        new_row_values))
                new_index_names.append(i)
    else:
        for i in df.index.names:
            if i == level:  # For the index level that should be replaced
                # Replace with new index
                new_index_values.append(index)
                new_index_names.append(index.name)
            else:  # For the other index levels
                # Copy all old values
                new_index_values.append(df.index.get_level_values(i))
                new_index_names.append(i)

    # Construct new MultiIndex
    mux = pd.MultiIndex.from_arrays(new_index_values, names=new_index_names)

    # Work on a deep copy so the caller's frame keeps its original index
    df = df.copy(deep=True)
    # Apply new MultiIndex
    if intersection is True:
        # Reindex such that new rows get nan column values
        df = df.reindex(mux)
    else:
        # Replace the index
        df.index = mux
    return df.sort_index()
Exemplo n.º 21
0
    def test_value_counts_unique_nunique_null(self):
        """Check value_counts/unique/nunique when the data holds missing values.

        For each null marker (``np.nan`` and ``None``) and each object in
        ``self.objs`` accepted by ``self._allow_na_ops``, the first two values
        are replaced by a missing value and the n-th element is then repeated
        n + 1 times.  The expected counts and slices below are hard-coded for
        inputs of length 10.
        """

        for null_obj in [np.nan, None]:
            for orig in self.objs:
                o = orig.copy()
                klass = type(o)
                values = o._ndarray_values

                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        v = o.asi8
                        v[0:2] = iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = iNaT
                        values = o._values

                elif needs_i8_conversion(o):
                    values[0:2] = iNaT
                    values = o._shallow_copy(values)
                else:
                    # in-place write; presumably ``values`` shares memory with
                    # ``o`` so that the nulls show up in ``o.repeat`` below
                    # -- TODO confirm
                    values[0:2] = null_obj
                # check values has the same dtype as the original

                assert values.dtype == o.dtype

                # create repeated values, 'n'th element is repeated by n+1
                # times
                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    expected_index = o.copy()
                    expected_index.name = None

                    # attach name to klass
                    o = klass(values.repeat(range(1, len(o) + 1)))
                    o.name = 'a'
                else:
                    if is_datetimetz(o):
                        expected_index = orig._values._shallow_copy(values)
                    else:
                        expected_index = Index(values)
                    expected_index.name = None
                    o = o.repeat(range(1, len(o) + 1))
                    o.name = 'a'

                # check values has the same dtype as the original
                assert o.dtype == orig.dtype
                # check values correctly have NaN
                # elements 0 and 1 are null and occur 1 + 2 times, hence the
                # first three positions of the repeated object
                nanloc = np.zeros(len(o), dtype=np.bool)
                nanloc[:3] = True
                if isinstance(o, Index):
                    tm.assert_numpy_array_equal(pd.isna(o), nanloc)
                else:
                    exp = Series(nanloc, o.index, name='a')
                    tm.assert_series_equal(pd.isna(o), exp)

                # with dropna=False the null bucket (3 occurrences) is listed
                # last; with the default it is absent
                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64',
                                       name='a')
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64',
                                    name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                assert result_s_na.index.name is None
                assert result_s_na.name == 'a'
                result_s = o.value_counts()
                # NOTE(review): this compares a fresh value_counts() call
                # rather than ``result_s`` computed on the previous line
                tm.assert_series_equal(o.value_counts(), expected_s)
                assert result_s.index.name is None
                assert result_s.name == 'a'

                result = o.unique()
                if isinstance(o, Index):
                    tm.assert_index_equal(result, Index(values[1:], name='a'))
                elif is_datetimetz(o):
                    # unable to compare NaT / nan
                    vals = values[2:].astype(object).values
                    tm.assert_numpy_array_equal(result[1:], vals)
                    assert result[0] is pd.NaT
                else:
                    tm.assert_numpy_array_equal(result[1:], values[2:])

                    assert pd.isna(result[0])
                    assert result.dtype == orig.dtype

                # 10 inputs, two of them replaced by the same null value
                assert o.nunique() == 8
                assert o.nunique(dropna=False) == 9