def test_convert_to_spark(spark_session):
    spark_df = spark_session.range(0, 10)
    converted, is_spark_df = utils.convert_to_spark(spark_df)
    assert is_spark_df
    assert spark_df is converted

    koalas_df = ks.range(0, 10)
    converted, is_spark_df = utils.convert_to_spark(koalas_df)
    assert not is_spark_df
    assert isinstance(converted, pyspark.sql.DataFrame)
    assert converted.count() == 10

    # an unsupported input type should raise with a descriptive message
    other_df = "df"
    error_msg = (f"The type: {type(other_df)} is not supported, only support " +
                 "pyspark.sql.DataFrame and databricks.koalas.DataFrame")
    with pytest.raises(Exception) as exinfo:
        utils.convert_to_spark(other_df)
    assert str(exinfo.value) == error_msg
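# A minimal sketch of what the convert_to_spark helper exercised above could look
# like, written only to illustrate the contract the test asserts: pass a Spark
# DataFrame through unchanged, convert a koalas DataFrame via to_spark(), and
# raise for anything else. The name and body are assumptions for illustration,
# not the project's actual implementation; it relies on the module's existing
# pyspark and ks (databricks.koalas) imports.
def convert_to_spark_sketch(df):
    """Return (spark_df, was_already_spark) for a Spark or koalas DataFrame."""
    if isinstance(df, pyspark.sql.DataFrame):
        # already a Spark DataFrame: return it as-is
        return df, True
    if isinstance(df, ks.DataFrame):
        # koalas exposes to_spark() to get the underlying Spark DataFrame
        return df.to_spark(), False
    raise Exception(f"The type: {type(df)} is not supported, only support " +
                    "pyspark.sql.DataFrame and databricks.koalas.DataFrame")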
def save_to_ray(df: Union[DataFrame, 'koalas.DataFrame'], num_shards: int) -> PandasDataset:
    """
    Save the pyspark.sql.DataFrame or koalas.DataFrame to the Ray object store
    and return a SharedDataset which can be fed into the 'Estimator' for
    distributed model training.
    :param df: either a pyspark.sql.DataFrame or a koalas.DataFrame
    :param num_shards: the number of shards to split the data into when stored
    :return: a PandasDataset
    """
    with _spark_context_lock:
        global _global_spark_context
        if _global_spark_context is None:
            raise Exception("You should init the Spark context first.")
        # convert a koalas.DataFrame to a pyspark.sql.DataFrame if necessary
        df, _ = convert_to_spark(df)
        return _global_spark_context._get_or_create_spark_cluster(
        ).save_to_ray(df, num_shards)
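# A hypothetical usage sketch for save_to_ray. The helper name, the way the Spark
# session is obtained, and the shard count are assumptions for illustration; only
# the save_to_ray(df, num_shards) call mirrors the function defined above.
def _example_save_to_ray(spark_session):
    # build a small Spark DataFrame and persist it into the Ray object store
    df = spark_session.range(0, 1000)
    dataset = save_to_ray(df, num_shards=4)
    # the returned dataset can then be handed to an Estimator for training
    return dataset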
def _check_and_convert(self, df):
    # normalize either a Spark or koalas DataFrame to a Spark DataFrame
    train_df, _ = convert_to_spark(df)
    return train_df