def test_meta_nonempty_scalar():
    """Check ``meta_nonempty`` on a NumPy float scalar and on a ``pd.Timestamp``."""
    # A float64 scalar must come back as a float64 scalar.
    result = meta_nonempty(np.float64(1.0))
    assert isinstance(result, np.float64)

    # A Timestamp is expected to be handed back as the very same object.
    stamp = pd.Timestamp(2000, 1, 1)
    assert meta_nonempty(stamp) is stamp
def test_meta_nonempty():
    """Check dtypes and first-row values of ``meta_nonempty`` on an empty frame.

    A ten-column frame with mixed dtypes is sliced down to zero rows, passed
    through ``meta_nonempty``, and the first element of every resulting
    column is compared against the value this test expects for that column.
    """
    column_order = list('DCBAHGFEIJ')
    populated = pd.DataFrame({'A': pd.Categorical(['Alice', 'Bob', 'Carol']),
                              'B': list('abc'),
                              'C': 'bar',
                              'D': np.float32(1),
                              'E': np.int32(1),
                              'F': pd.Timestamp('2016-01-01'),
                              'G': pd.date_range('2016-01-01', periods=3,
                                                 tz='America/New_York'),
                              'H': pd.Timedelta('1 hours', 'ms'),
                              'I': np.void(b' '),
                              'J': pd.Categorical([UNKNOWN_CATEGORIES] * 3)},
                             columns=column_order)
    empty = populated.iloc[0:0]
    filled = meta_nonempty(empty)

    # The dtypes of the zero-row frame must survive unchanged.
    assert (filled.dtypes == empty.dtypes).all()

    # Expected value of element 0 for each column.
    expected_first = {
        'A': 'Alice',
        'B': 'foo',
        'C': 'foo',
        'D': np.float32(1),
        'E': np.int32(1),
        'F': pd.Timestamp('1970-01-01 00:00:00'),
        'G': pd.Timestamp('1970-01-01 00:00:00', tz='America/New_York'),
        'H': pd.Timedelta('1', 'ms'),
        'I': 'foo',
        'J': UNKNOWN_CATEGORIES,
    }
    for column, wanted in expected_first.items():
        assert filled[column][0] == wanted

    # The narrow numeric columns must keep their 32-bit width.
    assert filled['D'][0].dtype == 'f4'
    assert filled['E'][0].dtype == 'i4'

    # The Series code path must agree with the DataFrame code path.
    series_result = meta_nonempty(empty['A'])
    assert series_result.dtype == empty['A'].dtype
    assert (filled['A'] == series_result).all()
def test_meta_nonempty():
    """Check dtypes and first-row values of ``meta_nonempty`` on an empty frame.

    An eight-column frame with mixed dtypes is reduced to zero rows and fed
    to ``meta_nonempty``; element 0 of each output column is then compared
    with the value this test expects.
    """
    populated = pd.DataFrame(
        {'A': pd.Categorical(['Alice', 'Bob', 'Carol']),
         'B': list('abc'),
         'C': 'bar',
         'D': 3.0,
         'E': pd.Timestamp('2016-01-01'),
         'F': pd.date_range('2016-01-01', periods=3, tz='America/New_York'),
         'G': pd.Timedelta('1 hours'),
         'H': np.void(b' ')},
        columns=list('DCBAHGFE'))
    empty = populated.iloc[0:0]
    filled = meta_nonempty(empty)

    # Column dtypes must be the same before and after.
    assert (filled.dtypes == empty.dtypes).all()

    # Expected value of element 0, keyed by column label.
    expected_first = {
        'A': 'Alice',
        'B': 'foo',
        'C': 'foo',
        'D': 1.0,
        'E': pd.Timestamp('1970-01-01 00:00:00'),
        'F': pd.Timestamp('1970-01-01 00:00:00', tz='America/New_York'),
        'G': pd.Timedelta('1 days'),
        'H': 'foo',
    }
    for column, wanted in expected_first.items():
        assert filled[column][0] == wanted

    # Calling on a single column must match the frame-level result.
    series_result = meta_nonempty(empty['A'])
    assert series_result.dtype == empty['A'].dtype
    assert (filled['A'] == series_result).all()
def test_meta_nonempty():
    """Check dtypes and first-row values of ``meta_nonempty`` on an empty frame."""
    raw_columns = {
        'A': pd.Categorical(['Alice', 'Bob', 'Carol']),
        'B': list('abc'),
        'C': 'bar',
        'D': np.float32(1),
        'E': np.int32(1),
        'F': pd.Timestamp('2016-01-01'),
        'G': pd.date_range('2016-01-01', periods=3, tz='America/New_York'),
        'H': pd.Timedelta('1 hours', 'ms'),
        'I': np.void(b' '),
        'J': pd.Categorical([UNKNOWN_CATEGORIES] * 3),
    }
    template = pd.DataFrame(raw_columns, columns=list('DCBAHGFEIJ'))

    # Keep only the schema: a zero-row slice of the template.
    schema_only = template.iloc[0:0]
    nonempty = meta_nonempty(schema_only)
    assert (nonempty.dtypes == schema_only.dtypes).all()

    # (column label, expected element 0) pairs.
    first_value_checks = [
        ('A', 'Alice'),
        ('B', 'foo'),
        ('C', 'foo'),
        ('D', np.float32(1)),
        ('E', np.int32(1)),
        ('F', pd.Timestamp('1970-01-01 00:00:00')),
        ('G', pd.Timestamp('1970-01-01 00:00:00', tz='America/New_York')),
        ('H', pd.Timedelta('1', 'ms')),
        ('I', 'foo'),
        ('J', UNKNOWN_CATEGORIES),
    ]
    for label, wanted in first_value_checks:
        assert nonempty[label][0] == wanted

    # Scalar dtypes of the 32-bit numeric columns.
    for label, scalar_dtype in (('D', 'f4'), ('E', 'i4')):
        assert nonempty[label][0].dtype == scalar_dtype

    # Series input must give the same answer as the matching frame column.
    from_series = meta_nonempty(schema_only['A'])
    assert from_series.dtype == schema_only['A'].dtype
    assert (nonempty['A'] == from_series).all()
def test_meta_nonempty():
    """Check dtypes and first-row values of ``meta_nonempty`` on an empty frame."""
    template = pd.DataFrame(
        {
            "A": pd.Categorical(["Alice", "Bob", "Carol"]),
            "B": list("abc"),
            "C": "bar",
            "D": np.float32(1),
            "E": np.int32(1),
            "F": pd.Timestamp("2016-01-01"),
            "G": pd.date_range("2016-01-01", periods=3, tz="America/New_York"),
            "H": pd.Timedelta("1 hours"),
            "I": np.void(b" "),
            "J": pd.Categorical([UNKNOWN_CATEGORIES] * 3),
        },
        columns=list("DCBAHGFEIJ"),
    )
    schema_only = template.iloc[0:0]
    nonempty = meta_nonempty(schema_only)

    # dtypes must be carried over from the zero-row input.
    assert (nonempty.dtypes == schema_only.dtypes).all()

    # Expected element 0 for columns "A".."J", in alphabetical order.
    wanted_values = [
        "Alice",
        "foo",
        "foo",
        np.float32(1),
        np.int32(1),
        pd.Timestamp("1970-01-01 00:00:00"),
        pd.Timestamp("1970-01-01 00:00:00", tz="America/New_York"),
        pd.Timedelta("1"),
        "foo",
        UNKNOWN_CATEGORIES,
    ]
    for label, wanted in zip("ABCDEFGHIJ", wanted_values):
        assert nonempty[label][0] == wanted

    # The 32-bit numeric columns must not be widened.
    assert nonempty["D"][0].dtype == "f4"
    assert nonempty["E"][0].dtype == "i4"

    # Series input must agree with the corresponding frame column.
    from_series = meta_nonempty(schema_only["A"])
    assert from_series.dtype == schema_only["A"].dtype
    assert (nonempty["A"] == from_series).all()
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` preserves the type and metadata of indexes.

    Each section builds one kind of index, runs it through ``meta_nonempty``
    and asserts that the result has the same index class and the same
    name / tz / freq / categories / ordered flag, as applicable.

    Integer and float indexes are built with ``pd.Index(..., dtype=...)``
    rather than ``pd.Int64Index`` / ``pd.Float64Index`` (those classes were
    removed in pandas 2.0), and the ``MultiIndex`` is built with ``codes=``
    rather than the removed ``labels=`` keyword.
    """
    idx = pd.RangeIndex(1, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    # Integer index: compare against the input's own class so the check
    # holds whether pandas represents it as Int64Index or a plain Index.
    idx = pd.Index([1], name='foo', dtype='int64')
    res = meta_nonempty(idx)
    assert type(res) is type(idx)
    assert res.dtype == 'int64'
    assert res.name == idx.name

    idx = pd.Index(['a'], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(['1970-01-01'], freq='d',
                           tz='America/New_York', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(['1970-01-01'], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, 'D')], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(['a'], ['a', 'b'], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    # Categories are the UNKNOWN_CATEGORIES marker, so only the ordered
    # flag and the name are compared here.
    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES], ordered=True,
                              name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Index([1], name='a', dtype='int64'),
              pd.Index([1.0], name='b', dtype='float64')]
    idx = pd.MultiIndex(levels=levels, codes=[[0], [0]], names=['a', 'b'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
def test_meta_nonempty_scalar():
    """Check ``meta_nonempty`` on a float scalar, a Timestamp and a tz dtype."""
    # NumPy float scalar in, NumPy float scalar out.
    result = meta_nonempty(np.float64(1.0))
    assert isinstance(result, np.float64)

    # A Timestamp is expected to be returned as the same object.
    stamp = pd.Timestamp(2000, 1, 1)
    result = meta_nonempty(stamp)
    assert result is stamp

    # DatetimeTZDtype
    tz_dtype = pd.DatetimeTZDtype(tz="UTC")
    result = meta_nonempty(tz_dtype)
    expected = pd.Timestamp(1, tz=tz_dtype.tz, unit=tz_dtype.unit)
    assert result == expected
def transform(self, columns: ColumnNames, df: DataFrameType) -> DataFrameType:
    """Apply the configured group-by aggregations to ``df``.

    If ``self.sort_cols`` is set, ``df`` is first sorted by those columns
    with the index reset. When ``df`` has no rows, the aggregations are run
    on ``meta_nonempty(df)`` instead and a zero-row slice of that result is
    returned, so the output columns are still produced.
    """
    # Sort if necessary
    if self.sort_cols:
        df = df.sort_values(self.sort_cols, ignore_index=True)

    # List aggregations do not work with empty data, so an empty input is
    # replaced by synthetic metadata purely to predict the output columns.
    is_empty = len(df) == 0
    if is_empty:
        working_df = meta_nonempty(df)
    else:
        working_df = df

    # Get "complete" aggregation dicts
    list_agg_spec, conv_agg_spec = _get_agg_dicts(
        self.groupby_cols, self.list_aggs, self.conv_aggs, columns
    )

    # Apply aggregations
    aggregated = _apply_aggs(
        working_df,
        self.groupby_cols,
        list_agg_spec,
        conv_agg_spec,
        name_sep=self.name_sep,
    )

    # For an empty input keep only the schema of the synthetic result.
    return aggregated.iloc[:0] if is_empty else aggregated
def test_meta_nonempty_empty_categories():
    """Check ``meta_nonempty`` on categoricals whose category list is empty.

    For object, float and datetime category dtypes, an empty
    ``CategoricalIndex`` and the Series derived from it are passed through
    ``meta_nonempty``; the result must keep the categories' index class,
    the ``ordered`` flag and the name.

    The datetime case uses ``'M8[ns]'`` because recent pandas versions
    reject the unit-less ``'M8'`` dtype in the ``Index`` constructor.
    """
    for dtype in ['O', 'f8', 'M8[ns]']:
        # Index
        idx = pd.CategoricalIndex([], pd.Index([], dtype=dtype),
                                  ordered=True, name='foo')
        res = meta_nonempty(idx)
        assert type(res) is pd.CategoricalIndex
        assert type(res.categories) is type(idx.categories)
        assert res.ordered == idx.ordered
        assert res.name == idx.name

        # Series
        s = idx.to_series()
        res = meta_nonempty(s)
        assert res.dtype == s.dtype
        assert type(res.cat.categories) is type(s.cat.categories)
        assert res.cat.ordered == s.cat.ordered
        assert res.name == s.name
def _maybe_partial_time_string(self, iindexer):
    """
    Translate ``iindexer`` for partial time string slicing.

    Builds a non-empty index from ``self.obj._meta.index`` via
    ``meta_nonempty`` and passes it, together with ``iindexer``, to the
    module-level ``_maybe_partial_time_string`` helper, whose result is
    returned.
    """
    nonempty_index = meta_nonempty(self.obj._meta.index)
    return _maybe_partial_time_string(nonempty_index, iindexer)
def test_meta_duplicated():
    """Check ``meta_nonempty`` on a frame that has a duplicated column label."""
    no_rows = pd.DataFrame(columns=['A', 'A', 'B'])
    actual = meta_nonempty(no_rows)

    # Two rows labelled 'a' and 'b', every cell holding 'foo'.
    expected_rows = [['foo', 'foo', 'foo'],
                     ['foo', 'foo', 'foo']]
    expected = pd.DataFrame(expected_rows,
                            index=['a', 'b'],
                            columns=['A', 'A', 'B'])
    tm.assert_frame_equal(actual, expected)
def test_meta_nonempty_empty_categories():
    """Check ``meta_nonempty`` on categoricals whose category list is empty."""
    for category_dtype in ("O", "f8", "M8[ns]"):
        # Index
        empty_categories = pd.Index([], dtype=category_dtype)
        cat_index = pd.CategoricalIndex(
            [], empty_categories, ordered=True, name="foo"
        )
        index_result = meta_nonempty(cat_index)
        assert type(index_result) is pd.CategoricalIndex
        assert type(index_result.categories) is type(cat_index.categories)
        assert index_result.ordered == cat_index.ordered
        assert index_result.name == cat_index.name

        # Series
        cat_series = cat_index.to_series()
        series_result = meta_nonempty(cat_series)
        assert series_result.dtype == "category"
        assert cat_series.dtype == "category"
        assert type(series_result.cat.categories) is type(cat_series.cat.categories)
        assert series_result.cat.ordered == cat_series.cat.ordered
        assert series_result.name == cat_series.name
def test_meta_duplicated():
    """Check ``meta_nonempty`` on a frame that has a duplicated column label."""
    actual = meta_nonempty(pd.DataFrame(columns=["A", "A", "B"]))

    # Expected: rows "a" and "b", three columns, every cell equal to "foo".
    expected = pd.DataFrame(
        [["foo"] * 3 for _ in range(2)],
        index=["a", "b"],
        columns=["A", "A", "B"],
    )
    tm.assert_frame_equal(actual, expected)
def test_meta_nonempty_uint64index():
    """Check that ``meta_nonempty`` keeps an unsigned 64-bit integer index intact.

    The index is built with ``pd.Index(..., dtype='uint64')`` instead of
    ``pd.UInt64Index``, which was removed in pandas 2.0. The result is
    compared against the input's own class so the check holds on pandas
    versions both with and without the dedicated class.
    """
    idx = pd.Index([1], name='foo', dtype='uint64')
    res = meta_nonempty(idx)
    assert type(res) is type(idx)
    assert res.dtype == 'uint64'
    assert res.name == idx.name
def test_nonempty_series_nullable_float():
    """Check that ``meta_nonempty`` keeps the nullable ``Float64`` dtype."""
    empty_series = pd.Series([], dtype="Float64")
    result = meta_nonempty(empty_series)
    assert result.dtype == "Float64"
def test_nonempty_series_sparse():
    """Check that ``meta_nonempty`` on a sparse Series records no warnings."""
    sparse_values = pd.array([0, 1], dtype="Sparse")
    sparse_series = pd.Series(sparse_values)

    # Collect whatever warnings are recorded during the call.
    with warnings.catch_warnings(record=True) as caught:
        meta_nonempty(sparse_series)

    assert not caught
def test_meta_nonempty_uint64index():
    """Check that ``meta_nonempty`` keeps an unsigned 64-bit integer index intact."""
    original = pd.Index([1], name="foo", dtype="uint64")
    nonempty = meta_nonempty(original)

    # Same index class, same dtype, same name.
    assert type(nonempty) is type(original)
    assert nonempty.dtype == "uint64"
    assert nonempty.name == original.name
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` preserves the type and metadata of indexes.

    Covers range, integer, object, datetime, period, timedelta and
    categorical indexes, followed by two ``MultiIndex`` layouts. For each,
    the result must have the expected class and matching
    name / dtype / tz / freq / categories / ordered flag, as applicable.
    """
    # RangeIndex
    original = pd.RangeIndex(1, name="foo")
    nonempty = meta_nonempty(original)
    assert type(nonempty) is pd.RangeIndex
    assert nonempty.name == original.name

    # Integer index
    original = pd.Index([1], name="foo", dtype="int")
    nonempty = meta_nonempty(original)
    assert type(nonempty) is type(original)
    assert nonempty.dtype == "int64"
    assert nonempty.name == original.name

    # Object (string) index
    original = pd.Index(["a"], name="foo")
    nonempty = meta_nonempty(original)
    assert type(nonempty) is pd.Index
    assert nonempty.name == original.name

    # Timezone-aware DatetimeIndex with a frequency
    original = pd.DatetimeIndex(
        ["1970-01-01"], freq="d", tz="America/New_York", name="foo"
    )
    nonempty = meta_nonempty(original)
    assert type(nonempty) is pd.DatetimeIndex
    assert nonempty.tz == original.tz
    assert nonempty.freq == original.freq
    assert nonempty.name == original.name

    # PeriodIndex
    original = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo")
    nonempty = meta_nonempty(original)
    assert type(nonempty) is pd.PeriodIndex
    assert nonempty.freq == original.freq
    assert nonempty.name == original.name

    # TimedeltaIndex
    original = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo")
    nonempty = meta_nonempty(original)
    assert type(nonempty) is pd.TimedeltaIndex
    assert nonempty.freq == original.freq
    assert nonempty.name == original.name

    # CategoricalIndex with known categories
    original = pd.CategoricalIndex(
        ["xyx"], ["xyx", "zzz"], ordered=True, name="foo"
    )
    nonempty = meta_nonempty(original)
    assert type(nonempty) is pd.CategoricalIndex
    assert (nonempty.categories == original.categories).all()
    assert nonempty.ordered == original.ordered
    assert nonempty.name == original.name

    # CategoricalIndex whose categories are the UNKNOWN_CATEGORIES marker;
    # only the ordered flag and the name are compared.
    original = pd.CategoricalIndex(
        [], [UNKNOWN_CATEGORIES], ordered=True, name="foo"
    )
    nonempty = meta_nonempty(original)
    assert type(nonempty) is pd.CategoricalIndex
    assert nonempty.ordered == original.ordered
    assert nonempty.name == original.name

    # MultiIndex: first with two plain levels, then with integer,
    # categorical and timedelta levels. Each case is (levels, names).
    multi_cases = [
        (
            [pd.Index([1], name="a"), pd.Index([1.0], name="b")],
            ["a", "b"],
        ),
        (
            [
                pd.Index([1], name="a"),
                pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
                pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
            ],
            ["a", "b", "timedelta"],
        ),
    ]
    for level_list, level_names in multi_cases:
        # One code list per level, each selecting that level's only entry.
        level_codes = [[0] for _ in level_list]
        original = pd.MultiIndex(
            levels=level_list, names=level_names, codes=level_codes
        )
        nonempty = meta_nonempty(original)
        assert type(nonempty) is pd.MultiIndex
        for source_level, result_level in zip(original.levels, nonempty.levels):
            assert type(source_level) is type(result_level)
            assert source_level.name == result_level.name
        assert nonempty.names == original.names
def meta_nonempty_dataframe(df, index=None):
    """Build a ``GeoDataFrame`` from the ``meta_nonempty`` version of ``df``.

    ``df.head(0)`` is wrapped in a plain ``pd.DataFrame``, passed to
    ``meta_nonempty``, and the result is wrapped in ``GeoDataFrame``.
    The ``index`` argument is accepted but not used.
    """
    schema_only = pd.DataFrame(df.head(0))
    nonempty = meta_nonempty(schema_only)
    return GeoDataFrame(nonempty)
def test_nonempty_series_sparse():
    """Check that ``meta_nonempty`` on a sparse Series emits no warnings.

    Warnings are captured with ``warnings.catch_warnings(record=True)``
    rather than ``pytest.warns(None)``, which is deprecated in pytest 7 and
    rejected by pytest 8.
    """
    import warnings

    ser = pd.Series(pd.array([0, 1], dtype="Sparse"))
    with warnings.catch_warnings(record=True) as w:
        # Record every warning, as the pytest recorder did, so that
        # "once"/"ignore" filters cannot hide one.
        warnings.simplefilter("always")
        meta_nonempty(ser)
    assert len(w) == 0