def test_expanding_error(self):
    """Check the errors expected from invalid ``expanding`` usage."""
    type_error_pattern = "psdf_or_psser must be a series or dataframe; however, got:.*int"

    # A negative ``min_periods`` is expected to raise ValueError.
    with self.assertRaisesRegex(ValueError, "min_periods must be >= 0"):
        ps.range(10).expanding(-1)

    # Building ``Expanding`` directly from an int is expected to raise TypeError.
    with self.assertRaisesRegex(TypeError, type_error_pattern):
        Expanding(1, 2)
def test_rolling_error(self):
    """Check the errors expected from invalid ``rolling`` usage."""
    # (expected message pattern, keyword arguments passed to ``rolling``)
    invalid_arguments = (
        ("window must be >= 0", {"window": -1}),
        ("min_periods must be >= 0", {"window": 1, "min_periods": -1}),
    )
    for pattern, rolling_kwargs in invalid_arguments:
        with self.assertRaisesRegex(ValueError, pattern):
            ps.range(10).rolling(**rolling_kwargs)

    # Building ``Rolling`` directly from an int is expected to raise TypeError.
    type_error_pattern = "psdf_or_psser must be a series or dataframe; however, got:.*int"
    with self.assertRaisesRegex(TypeError, type_error_pattern):
        Rolling(1, 2)
def test_html_repr(self):
    """Check ``_repr_html_`` output against pandas around the display limit.

    The test expects no truncation notice for a frame built with
    ``ReprTest.max_display_count``, a notice for one more than that, and
    output identical to pandas once ``display.max_rows`` is set to ``None``.
    """
    notice = "Showing only the first"
    limit = ReprTest.max_display_count

    kdf = ps.range(limit)
    # assertNotIn/assertIn report both operands on failure, unlike assertTrue.
    self.assertNotIn(notice, kdf._repr_html_())
    self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_())

    kdf = ps.range(limit + 1)
    self.assertIn(notice, kdf._repr_html_())

    with option_context("display.max_rows", None):
        kdf = ps.range(limit + 1)
        self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_())
def test_repr_dataframe(self):
    """Check ``repr`` of a DataFrame against pandas around the display limit.

    The test expects no truncation notice for a frame built with
    ``ReprTest.max_display_count``, a notice (preceded by the repr of the
    pandas ``head``) for one more than that, and output identical to pandas
    once ``display.max_rows`` is set to ``None``.
    """
    notice = "Showing only the first"
    limit = ReprTest.max_display_count

    kdf = ps.range(limit)
    # assertNotIn/assertIn report both operands on failure, unlike assertTrue.
    self.assertNotIn(notice, repr(kdf))
    self.assert_eq(repr(kdf), repr(kdf.to_pandas()))

    kdf = ps.range(limit + 1)
    self.assertIn(notice, repr(kdf))
    expected_prefix = repr(kdf.to_pandas().head(limit))
    self.assertTrue(repr(kdf).startswith(expected_prefix))

    with option_context("display.max_rows", None):
        kdf = ps.range(limit + 1)
        self.assert_eq(repr(kdf), repr(kdf.to_pandas()))
def test_resample_error(self):
    """Check the errors expected from unsupported or invalid ``resample`` calls."""
    # A frame built by ``ps.range`` is expected to be rejected outright.
    range_frame = ps.range(10)
    datetime_index_only = "resample currently works only for DatetimeIndex"
    with self.assertRaisesRegex(NotImplementedError, datetime_index_only):
        range_frame.resample("3Y").sum()

    # Single-column frame of ones indexed by datetimes, including ``pd.NaT``.
    timestamps = [
        datetime.datetime(2012, 1, 2),
        datetime.datetime(2012, 5, 3),
        datetime.datetime(2022, 5, 3),
        pd.NaT,
    ]
    ones = np.ones(len(timestamps))
    pdf = pd.DataFrame(ones, index=pd.DatetimeIndex(timestamps), columns=["A"])
    psdf = ps.from_pandas(pdf)

    # (expected message pattern, rule, extra keyword arguments)
    invalid_calls = (
        ("rule code W-SUN is not supported", "3W", {}),
        ("rule offset must be positive", "0Y", {}),
        ("invalid closed: 'middle'", "3Y", {"closed": "middle"}),
        ("invalid label: 'both'", "3Y", {"label": "both"}),
    )
    for pattern, rule, options in invalid_calls:
        with self.assertRaisesRegex(ValueError, pattern):
            psdf.A.resample(rule, **options).sum()
def test_ewm_error(self):
    """Check the errors expected from invalid ``ewm`` usage."""
    # Building ``ExponentialMoving`` directly from an int is expected to raise TypeError.
    type_error_pattern = "psdf_or_psser must be a series or dataframe; however, got:.*int"
    with self.assertRaisesRegex(TypeError, type_error_pattern):
        ExponentialMoving(1, 2)

    frame = ps.range(10)

    # (expected message pattern, keyword arguments passed to ``ewm``)
    invalid_arguments = (
        ("min_periods must be >= 0", dict(min_periods=-1, alpha=0.5)),
        ("com must be >= 0", dict(com=-0.1)),
        ("span must be >= 1", dict(span=0.7)),
        ("halflife must be > 0", dict(halflife=0)),
        ("alpha must be in", dict(alpha=1.7)),
        ("Must pass one of com, span, halflife, or alpha", dict()),
        ("com, span, halflife, and alpha are mutually exclusive", dict(com=0.5, alpha=0.7)),
    )
    for pattern, ewm_kwargs in invalid_arguments:
        with self.assertRaisesRegex(ValueError, pattern):
            frame.ewm(**ewm_kwargs).mean()
def test_resample_error(self):
    """Check the errors expected from unsupported or invalid ``resample`` calls."""
    datetime_index_only = "resample currently works only for DatetimeIndex"
    on_timestamp_only = "`on` currently works only for TimestampType"
    no_aggregation_columns = "No available aggregation columns!"

    # Both a frame built by ``ps.range`` and its ``id`` column are expected
    # to be rejected outright.
    psdf = ps.range(10)
    with self.assertRaisesRegex(NotImplementedError, datetime_index_only):
        psdf.resample("3Y").sum()
    with self.assertRaisesRegex(NotImplementedError, datetime_index_only):
        psdf.id.resample("3Y").sum()

    # Single-column frame of ones indexed by datetimes, including ``pd.NaT``.
    timestamps = [
        datetime.datetime(2012, 1, 2),
        datetime.datetime(2012, 5, 3),
        datetime.datetime(2022, 5, 3),
        pd.NaT,
    ]
    ones = np.ones(len(timestamps))
    pdf = pd.DataFrame(ones, index=pd.DatetimeIndex(timestamps), columns=["A"])
    psdf = ps.from_pandas(pdf)

    # (expected message pattern, rule, extra keyword arguments)
    invalid_calls = (
        ("rule code W-SUN is not supported", "3W", {}),
        ("rule offset must be positive", "0Y", {}),
        ("invalid closed: 'middle'", "3Y", {"closed": "middle"}),
        ("invalid label: 'both'", "3Y", {"label": "both"}),
    )
    for pattern, rule, options in invalid_calls:
        with self.assertRaisesRegex(ValueError, pattern):
            psdf.A.resample(rule, **options).sum()

    # Passing column ``A`` as ``on`` is expected to fail for a Series and a frame.
    with self.assertRaisesRegex(NotImplementedError, on_timestamp_only):
        psdf.A.resample("2D", on=psdf.A).sum()
    with self.assertRaisesRegex(NotImplementedError, on_timestamp_only):
        psdf[["A"]].resample("2D", on=psdf.A).sum()

    # A string column and an empty column selection are both expected to
    # leave nothing to aggregate.
    psdf["B"] = ["a", "b", "c", "d"]
    with self.assertRaisesRegex(ValueError, no_aggregation_columns):
        psdf.B.resample("2D").sum()
    with self.assertRaisesRegex(ValueError, no_aggregation_columns):
        psdf[[]].resample("2D").sum()
def test_interpolate_error(self):
    """Check the errors expected from invalid ``interpolate`` arguments."""
    frame = ps.range(10)

    # A method other than 'linear' is expected to raise NotImplementedError.
    unsupported_method = "interpolate currently works only for method='linear'"
    with self.assertRaisesRegex(NotImplementedError, unsupported_method):
        frame.interpolate(method="quadratic")

    # ``limit=0`` is expected to raise ValueError.
    with self.assertRaisesRegex(ValueError, "limit must be > 0"):
        frame.interpolate(limit=0)
def test_repr_indexes(self):
    """Check ``repr`` of indexes against pandas around the display limit.

    The same expectations are checked for the index of a ``ps.range`` frame
    and for a ``ps.MultiIndex``: no truncation notice at
    ``ReprTest.max_display_count``, a notice (preceded by the repr of the
    truncated pandas index) for one more than that, and output identical to
    pandas once ``display.max_rows`` is set to ``None``.
    """
    notice = "Showing only the first"
    limit = ReprTest.max_display_count

    def make_multi_index(size):
        # Two-level index with entries (0, 0), (100, 1), (200, 2), ...
        return ps.MultiIndex.from_tuples([(100 * i, i) for i in range(size)])

    # Index of a frame built by ``ps.range``.
    kidx = ps.range(limit).index
    # assertNotIn/assertIn report both operands on failure, unlike assertTrue.
    self.assertNotIn(notice, repr(kidx))
    self.assert_eq(repr(kidx), repr(kidx.to_pandas()))

    kidx = ps.range(limit + 1).index
    self.assertIn(notice, repr(kidx))
    expected_prefix = repr(kidx.to_pandas().to_series().head(limit).index)
    self.assertTrue(repr(kidx).startswith(expected_prefix))

    with option_context("display.max_rows", None):
        kidx = ps.range(limit + 1).index
        self.assert_eq(repr(kidx), repr(kidx.to_pandas()))

    # MultiIndex built from tuples.
    kidx = make_multi_index(limit)
    self.assertNotIn(notice, repr(kidx))
    self.assert_eq(repr(kidx), repr(kidx.to_pandas()))

    kidx = make_multi_index(limit + 1)
    self.assertIn(notice, repr(kidx))
    expected_prefix = repr(kidx.to_pandas().to_frame().head(limit).index)
    self.assertTrue(repr(kidx).startswith(expected_prefix))

    with option_context("display.max_rows", None):
        kidx = make_multi_index(limit + 1)
        self.assert_eq(repr(kidx), repr(kidx.to_pandas()))
def test_repr_series(self):
    """Check ``repr`` of Series against pandas around the display limit.

    The same sequence of assertions runs for three kinds of Series: the
    ``id`` column of a ``ps.range`` frame, that column after ``rename()``,
    and (only when the pyspark version is at least 2.4) a ``MultiIndex``
    converted with ``to_series()``.
    """
    notice = "Showing only the first"
    limit = ReprTest.max_display_count

    def check_repr(make_series):
        # ``make_series(size)`` builds the Series under test for the given size.
        kser = make_series(limit)
        # assertNotIn/assertIn report both operands on failure, unlike assertTrue.
        self.assertNotIn(notice, repr(kser))
        self.assert_eq(repr(kser), repr(kser.to_pandas()))

        kser = make_series(limit + 1)
        self.assertIn(notice, repr(kser))
        expected_prefix = repr(kser.to_pandas().head(limit))
        self.assertTrue(repr(kser).startswith(expected_prefix))

        with option_context("display.max_rows", None):
            kser = make_series(limit + 1)
            self.assert_eq(repr(kser), repr(kser.to_pandas()))

    check_repr(lambda size: ps.range(size).id)
    check_repr(lambda size: ps.range(size).id.rename())

    if LooseVersion(pyspark.__version__) >= LooseVersion("2.4"):
        check_repr(
            lambda size: ps.MultiIndex.from_tuples(
                [(100 * i, i) for i in range(size)]
            ).to_series()
        )
def test_frame_apply_negative(self):
    """Check that ``spark.apply`` rejects a function that returns an int."""
    expected_pattern = "The output of the function.* pyspark.sql.DataFrame.*int"
    with self.assertRaisesRegex(ValueError, expected_pattern):
        # The callable returns an int rather than a Spark DataFrame.
        ps.range(10).spark.apply(lambda scol: 1)
def test_expanding_repr(self):
    """Check the ``repr`` of an expanding window created with ``min_periods`` 5."""
    expanding_window = ps.range(10).expanding(5)
    self.assertEqual(repr(expanding_window), "Expanding [min_periods=5]")
def psdf2(self):
    """Return the frame produced by ``ps.range(1002)``."""
    frame = ps.range(1002)
    return frame