def test_apply_with_new_dataframe(self):
    """Check groupby-apply when the applied function returns a new frame.

    The Koalas result is compared with the pandas result twice: first on
    a five-row frame, then on a 1500-row frame while
    'compute.shortcut_limit' is set to 1000.
    """
    timestamps = [0.0, 0.5, 1.0, 0.0, 0.5]
    car_ids = ["A", "A", "A", "B", "B"]

    pdf = pd.DataFrame({"timestamp": timestamps, "car_id": car_ids})
    kdf = koalas.DataFrame(pdf)
    actual = (
        kdf.groupby("car_id")
        .apply(lambda _: pd.DataFrame({"column": [0.0]}))
        .sort_index()
    )
    expected = (
        pdf.groupby("car_id")
        .apply(lambda _: pd.DataFrame({"column": [0.0]}))
        .sort_index()
    )
    self.assert_eq(actual, expected)

    set_option("compute.shortcut_limit", 1000)
    try:
        # With 1000+ records only the schema is inferred.
        pdf = pd.DataFrame(
            {"timestamp": timestamps * 300, "car_id": car_ids * 300}
        )
        kdf = koalas.DataFrame(pdf)
        actual = (
            kdf.groupby("car_id")
            .apply(lambda _: pd.DataFrame({"column": [0.0]}))
            .sort_index()
        )
        expected = (
            pdf.groupby("car_id")
            .apply(lambda _: pd.DataFrame({"column": [0.0]}))
            .sort_index()
        )
        self.assert_eq(actual, expected)
    finally:
        # Always restore the option so later tests see the default.
        reset_option("compute.shortcut_limit")
def test_axis_on_dataframe(self):
    """Compare row-wise (axis=1) statistics between Koalas and pandas."""
    # The frame is intentionally large: data smaller than
    # 'compute.shortcut_limit' takes a shortcut that uses the collected
    # pandas dataframe directly, so the limit is pinned to 1000 here.
    set_option("compute.shortcut_limit", 1000)
    try:
        pdf = pd.DataFrame(
            {
                "A": [1, -2, 3, -4, 5] * 300,
                "B": [1.0, -2, 3, -4, 5] * 300,
                "C": [-6.0, -7, -8, -9, 10] * 300,
                "D": [True, False, True, False, False] * 300,
            }
        )
        kdf = ks.from_pandas(pdf)

        # Same statistics, in the same order, as the one-per-line form.
        statistics = (
            "count",
            "var",
            "std",
            "max",
            "min",
            "sum",
            "kurtosis",
            "skew",
            "mean",
        )
        for stat in statistics:
            self.assert_eq(
                getattr(kdf, stat)(axis=1), getattr(pdf, stat)(axis=1)
            )
    finally:
        reset_option("compute.shortcut_limit")
def test_sampled_plot_with_ratio(self):
    """Check that SampledPlot keeps about half the rows at ratio 0.5.

    'plotting.sample_ratio' is set to 0.5 for the test and put back to
    DataFramePlotTest.sample_ratio_default afterwards.
    """
    row_count = 2500
    set_option("plotting.sample_ratio", 0.5)
    try:
        random_values = np.random.rand(row_count, 4)
        pdf = pd.DataFrame(random_values, columns=["a", "b", "c", "d"])
        kdf = koalas.from_pandas(pdf)
        sampled = SampledPlot().get_sampled(kdf)
        # Rounded to one decimal so the sampled size only has to be close.
        observed_ratio = round(len(sampled) / row_count, 1)
        self.assertEqual(observed_ratio, 0.5)
    finally:
        set_option(
            "plotting.sample_ratio", DataFramePlotTest.sample_ratio_default
        )
def test_transform(self):
    """Compare groupby-transform results between Koalas and pandas.

    The assertions run on a six-row frame and then, with
    'compute.shortcut_limit' set to 1000, on an 1800-row frame. Each
    round repeats the comparison after switching both frames to
    multi-index columns.
    """
    base = {
        "a": [1, 2, 3, 4, 5, 6],
        "b": [1, 1, 2, 3, 5, 8],
        "c": [1, 4, 9, 16, 25, 36],
    }

    def check_flat_columns(kdf, pdf):
        # Single key, list of keys, and a column selected after grouping.
        self.assert_eq(
            kdf.groupby("b").transform(lambda x: x + 1).sort_index(),
            pdf.groupby("b").transform(lambda x: x + 1).sort_index(),
        )
        self.assert_eq(
            kdf.groupby(["a", "b"]).transform(lambda x: x * x).sort_index(),
            pdf.groupby(["a", "b"]).transform(lambda x: x * x).sort_index(),
        )
        self.assert_eq(
            kdf.groupby(["b"])["a"].transform(lambda x: x).sort_index(),
            pdf.groupby(["b"])["a"].transform(lambda x: x).sort_index(),
        )

    def check_multi_index_columns(kdf, pdf):
        # Relabel both frames in place, pandas first, then compare.
        columns = pd.MultiIndex.from_tuples(
            [("x", "a"), ("x", "b"), ("y", "c")]
        )
        pdf.columns = columns
        kdf.columns = columns
        self.assert_eq(
            kdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(),
            pdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(),
        )
        self.assert_eq(
            kdf.groupby([("x", "a"), ("x", "b")])
            .transform(lambda x: x * x)
            .sort_index(),
            pdf.groupby([("x", "a"), ("x", "b")])
            .transform(lambda x: x * x)
            .sort_index(),
        )

    pdf = pd.DataFrame(base, columns=["a", "b", "c"])
    kdf = ks.from_pandas(pdf)
    check_flat_columns(kdf, pdf)
    check_multi_index_columns(kdf, pdf)

    set_option("compute.shortcut_limit", 1000)
    try:
        pdf = pd.DataFrame(
            {name: values * 300 for name, values in base.items()},
            columns=["a", "b", "c"],
        )
        kdf = ks.from_pandas(pdf)
        check_flat_columns(kdf, pdf)

        # A non-callable argument must be rejected with a TypeError.
        with self.assertRaisesRegex(
            TypeError, "<class 'int'> object is not callable"
        ):
            kdf.groupby("b").transform(1)

        check_multi_index_columns(kdf, pdf)
    finally:
        reset_option("compute.shortcut_limit")
def test_html_repr(self):
    """Check the HTML repr at and just past the display row limit.

    At exactly ReprTests.max_display_count rows the output must carry no
    truncation note and must equal the pandas HTML repr. With one more
    row the note must appear. With 'display.max_rows' set to None the
    output must equal the pandas HTML repr again.
    """
    limit = ReprTests.max_display_count
    truncation_note = "Showing only the first"

    kdf = ks.range(limit)
    # assertNotIn/assertIn print both operands on failure, unlike a bare
    # assertTrue on a membership expression.
    self.assertNotIn(truncation_note, kdf._repr_html_())
    self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_())

    kdf = ks.range(limit + 1)
    self.assertIn(truncation_note, kdf._repr_html_())

    set_option("display.max_rows", None)
    try:
        kdf = ks.range(limit + 1)
        self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_())
    finally:
        # Put back the class-level limit rather than the library default.
        set_option("display.max_rows", ReprTests.max_display_count)
def test_repr_series(self):
    """Check repr() of a Series at and just past the display row limit.

    At exactly ReprTests.max_display_count rows the repr must carry no
    truncation note and must equal the pandas repr. With one more row
    the note must appear. With 'display.max_rows' set to None the repr
    must equal the pandas repr again.
    """
    limit = ReprTests.max_display_count
    truncation_note = "Showing only the first"

    kser = ks.range(limit).id
    # assertNotIn/assertIn print both operands on failure, unlike a bare
    # assertTrue on a membership expression.
    self.assertNotIn(truncation_note, repr(kser))
    self.assert_eq(repr(kser), repr(kser.to_pandas()))

    kser = ks.range(limit + 1).id
    self.assertIn(truncation_note, repr(kser))

    set_option("display.max_rows", None)
    try:
        kser = ks.range(limit + 1).id
        self.assert_eq(repr(kser), repr(kser.to_pandas()))
    finally:
        # Put back the class-level limit rather than the library default.
        set_option("display.max_rows", ReprTests.max_display_count)
def test_repr_dataframe(self):
    """Check repr() of a DataFrame at and just past the display row limit.

    At exactly ReprTests.max_display_count rows the repr must carry no
    truncation note and must equal the pandas repr. With one more row
    the note must appear. With 'display.max_rows' set to None the repr
    must equal the pandas repr again.
    """
    limit = ReprTests.max_display_count
    truncation_note = "Showing only the first"

    kdf = ks.range(limit)
    # assertNotIn/assertIn print both operands on failure, unlike a bare
    # assertTrue on a membership expression.
    self.assertNotIn(truncation_note, repr(kdf))
    self.assert_eq(repr(kdf), repr(kdf.to_pandas()))

    kdf = ks.range(limit + 1)
    self.assertIn(truncation_note, repr(kdf))

    set_option("display.max_rows", None)
    try:
        kdf = ks.range(limit + 1)
        self.assert_eq(repr(kdf), repr(kdf.to_pandas()))
    finally:
        # Put back the class-level limit rather than the library default.
        set_option("display.max_rows", ReprTests.max_display_count)
def test_apply(self):
    """Compare groupby-apply results between Koalas and pandas.

    The assertions run on a six-row frame and then, with
    'compute.shortcut_limit' set to 1000, on an 1800-row frame. The
    second round also checks that a non-callable argument raises
    TypeError.
    """
    base = {
        "a": [1, 2, 3, 4, 5, 6],
        "b": [1, 1, 2, 3, 5, 8],
        "c": [1, 4, 9, 16, 25, 36],
    }

    def compare(kdf, pdf):
        # Single key, list of keys, and a column selected after grouping.
        self.assert_eq(
            kdf.groupby("b").apply(lambda x: x + 1).sort_index(),
            pdf.groupby("b").apply(lambda x: x + 1).sort_index(),
        )
        self.assert_eq(
            kdf.groupby(["a", "b"]).apply(lambda x: x * x).sort_index(),
            pdf.groupby(["a", "b"]).apply(lambda x: x * x).sort_index(),
        )
        self.assert_eq(
            kdf.groupby(["b"])["a"].apply(lambda x: x).sort_index(),
            pdf.groupby(["b"])["a"].apply(lambda x: x).sort_index(),
        )

    pdf = pd.DataFrame(base, columns=["a", "b", "c"])
    kdf = koalas.DataFrame(pdf)
    compare(kdf, pdf)

    # Data smaller than 'compute.shortcut_limit' takes a shortcut that
    # uses the collected pandas dataframe directly, so the limit is set
    # to 1000 explicitly for the larger frame below.
    set_option("compute.shortcut_limit", 1000)
    try:
        pdf = pd.DataFrame(
            {name: values * 300 for name, values in base.items()},
            columns=["a", "b", "c"],
        )
        kdf = koalas.DataFrame(pdf)
        compare(kdf, pdf)

        with self.assertRaisesRegex(
            TypeError, "<class 'int'> object is not callable"
        ):
            kdf.groupby("b").apply(1)
    finally:
        reset_option("compute.shortcut_limit")
def test_repr_indexes(self):
    """Check repr() of an Index at and just past the display row limit.

    At exactly ReprTests.max_display_count rows the repr must carry no
    truncation note and must equal the pandas repr. With one more row
    the note must appear. With 'display.max_rows' set to None the repr
    must equal the pandas repr again.
    """
    limit = ReprTests.max_display_count
    truncation_note = "Showing only the first"

    kidx = ks.range(limit).index
    # assertNotIn/assertIn print both operands on failure, unlike a bare
    # assertTrue on a membership expression.
    self.assertNotIn(truncation_note, repr(kidx))
    self.assert_eq(repr(kidx), repr(kidx.to_pandas()))

    kidx = ks.range(limit + 1).index
    self.assertIn(truncation_note, repr(kidx))

    set_option("display.max_rows", None)
    try:
        kidx = ks.range(limit + 1).index
        self.assert_eq(repr(kidx), repr(kidx.to_pandas()))
    finally:
        # Put back the class-level limit rather than the library default.
        set_option("display.max_rows", ReprTests.max_display_count)
def test_transform(self):
    """Compare groupby-transform results between Koalas and pandas.

    The assertions run on a six-row frame and then, with
    'compute.shortcut_limit' set to 1000, on an 1800-row frame. The
    second round also checks that a non-callable argument raises
    TypeError.
    """
    column_order = ["a", "b", "c"]
    small = {
        "a": [1, 2, 3, 4, 5, 6],
        "b": [1, 1, 2, 3, 5, 8],
        "c": [1, 4, 9, 16, 25, 36],
    }

    def run_comparisons(kdf, pdf):
        # Whole-frame transforms: (grouping keys, function) pairs.
        cases = (
            ("b", lambda x: x + 1),
            (["a", "b"], lambda x: x * x),
        )
        for keys, func in cases:
            actual = kdf.groupby(keys).transform(func).sort_index()
            expected = pdf.groupby(keys).transform(func).sort_index()
            self.assert_eq(actual, expected)

        # Transform on a single column selected after grouping.
        actual = kdf.groupby(["b"])["a"].transform(lambda x: x).sort_index()
        expected = pdf.groupby(["b"])["a"].transform(lambda x: x).sort_index()
        self.assert_eq(actual, expected)

    pdf = pd.DataFrame(small, columns=column_order)
    kdf = koalas.DataFrame(pdf)
    run_comparisons(kdf, pdf)

    set_option("compute.shortcut_limit", 1000)
    try:
        large = {name: values * 300 for name, values in small.items()}
        pdf = pd.DataFrame(large, columns=column_order)
        kdf = koalas.DataFrame(pdf)
        run_comparisons(kdf, pdf)

        with self.assertRaisesRegex(
            TypeError, "<class 'int'> object is not callable"
        ):
            kdf.groupby("b").transform(1)
    finally:
        reset_option("compute.shortcut_limit")
def setUpClass(cls):
    """Run the parent class setup, then configure matplotlib plotting.

    Sets the plotting backend to matplotlib, the plotting row cap to
    2000 and the plotting sample ratio to None.
    """
    super().setUpClass()
    backend = "matplotlib"
    # NOTE(review): the flattened original does not show where the `if`
    # body ends; only the pandas-side call is assumed to be guarded by the
    # version check -- confirm against the real file.
    pandas_at_least_0_25 = LooseVersion(pd.__version__) >= LooseVersion("0.25")
    if pandas_at_least_0_25:
        pd.set_option("plotting.backend", backend)
    set_option("plotting.backend", backend)
    set_option("plotting.max_rows", 2000)
    set_option("plotting.sample_ratio", None)
def setUpClass(cls):
    """Run the parent class setup, then configure plotly plotting.

    Sets the plotting backend to plotly on both the pandas side and via
    set_option, the plotting row cap to 1000 and the plotting sample
    ratio to None.
    """
    super().setUpClass()
    backend = "plotly"
    pd.set_option("plotting.backend", backend)
    set_option("plotting.backend", backend)
    set_option("plotting.max_rows", 1000)
    set_option("plotting.sample_ratio", None)
def setUpClass(cls):
    """Run the parent class setup, then cap plotting at 2000 rows."""
    super(DataFramePlotTest, cls).setUpClass()
    max_rows = 2000
    set_option("plotting.max_rows", max_rows)
# Load the model stored under this run's "runs:/<run_id>/model" URI.
model = load_model("runs:/{run_id}/model".format(run_id=run_info.run_uuid))

# Prediction and score
df = ks.DataFrame(X_test)
df["prediction"] = model.predict(df)

# `start` is not assigned in this excerpt; presumably it is set earlier in
# the script -- confirm.
stop = datetime.now()
# total_seconds() covers the whole interval; timedelta.seconds alone drops
# the days component and would under-report a run longer than 24 hours.
print("Temps préparation et inférence (ML) : ", int((stop - start).total_seconds()), "s")

# %%
##### 7th change: the score therefore has to be recomputed by hand
from databricks.koalas.config import set_option, reset_option

set_option("compute.ops_on_diff_frames", True)

# Score: the coefficient R^2 is defined as (1 - u/v), where u is the residual
# sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of
# squares ((y_true - y_true.mean()) ** 2).sum()
reel = ks.Series(y_test).to_frame().rename(columns={0: 'Reel'})
result = ks.concat([df, reel], axis=1)
result['square_diff_true_pred'] = (result['Reel'] - result['prediction'])**2
u = result['square_diff_true_pred'].sum()
v = ((result['Reel'] - result['Reel'].mean())**2).sum()
score = (1 - u / v)
print(f"score: {score}")

# %% [markdown]
# ## Training and inference with Pipeline
def setUpClass(cls):
    """Run the parent class setup, then turn 'compute.ops_on_diff_frames' off."""
    super(OpsOnDiffFramesDisabledTest, cls).setUpClass()
    allow_ops_on_diff_frames = False
    set_option("compute.ops_on_diff_frames", allow_ops_on_diff_frames)
def setUpClass(cls):
    """Run the parent class setup, then use the 'distributed' default index."""
    super(DistributedDefaultIndexTest, cls).setUpClass()
    index_type = "distributed"
    set_option("compute.default_index_type", index_type)
def setUpClass(cls):
    """Run the parent class setup, then use the 'distributed-sequence' index."""
    super(DistributedOneByOneDefaultIndexTest, cls).setUpClass()
    index_type = "distributed-sequence"
    set_option("compute.default_index_type", index_type)
def setUpClass(cls):
    """Run the parent class setup, then set 'display.max_rows'.

    The value comes from ReprTest.max_display_count.
    """
    super().setUpClass()
    row_limit = ReprTest.max_display_count
    set_option("display.max_rows", row_limit)
def setUpClass(cls):
    """Run the parent class setup, then turn 'compute.ops_on_diff_frames' on."""
    super().setUpClass()
    allow_ops_on_diff_frames = True
    set_option("compute.ops_on_diff_frames", allow_ops_on_diff_frames)
def setUpClass(cls):
    """Run the parent class setup, then apply the plotting options.

    Sets the plotting row cap to 2000 and the sample ratio to None.
    """
    super(DataFramePlotTest, cls).setUpClass()
    plotting_options = (
        ("plotting.max_rows", 2000),
        ("plotting.sample_ratio", None),
    )
    for option_name, option_value in plotting_options:
        set_option(option_name, option_value)
def setUpClass(cls):
    """Run the parent class setup, then cap plotting at 1000 rows."""
    super(SeriesPlotTest, cls).setUpClass()
    max_rows = 1000
    set_option("plotting.max_rows", max_rows)
def setUpClass(cls):
    """Run the parent class setup, then apply the plotting options.

    Sets the plotting row cap to 2000 and the sample ratio to None.
    """
    super().setUpClass()
    plotting_options = (
        ("plotting.max_rows", 2000),
        ("plotting.sample_ratio", None),
    )
    for option_name, option_value in plotting_options:
        set_option(option_name, option_value)
def setUpClass(cls):
    """Run the parent class setup, then cap plotting at 1000 rows."""
    super().setUpClass()
    max_rows = 1000
    set_option("plotting.max_rows", max_rows)
def setUpClass(cls):
    """Run the parent class setup, then set 'display.max_rows'.

    The value comes from ReprTests.max_display_count.
    """
    # unittest does not chain class fixtures automatically; without this
    # call whatever the base class prepares in its own setUpClass is
    # skipped.
    super().setUpClass()
    set_option("display.max_rows", ReprTests.max_display_count)
def setUpClass(cls):
    """Run the parent class setup, then turn 'compute.ops_on_diff_frames' on."""
    super(OpsOnDiffFramesGroupByTest, cls).setUpClass()
    allow_ops_on_diff_frames = True
    set_option("compute.ops_on_diff_frames", allow_ops_on_diff_frames)
def setUpClass(cls):
    """Run the parent class setup, then use the 'sequence' default index."""
    super(OneByOneDefaultIndexTest, cls).setUpClass()
    index_type = "sequence"
    set_option("compute.default_index_type", index_type)