Example #1
# Assumed imports; `utils` stands for the project module that provides
# convert_to_spark and df_type_check, and `spark_session` is a pytest fixture.
import pytest
import pyspark.sql
import databricks.koalas as ks


def test_convert_to_spark(spark_session):
    # A pyspark.sql.DataFrame should be passed through unchanged.
    spark_df = spark_session.range(0, 10)
    converted, is_spark_df = utils.convert_to_spark(spark_df)
    assert is_spark_df
    assert spark_df is converted

    # A koalas DataFrame should be converted to a pyspark.sql.DataFrame.
    koalas_df = ks.range(0, 10)
    converted, is_spark_df = utils.convert_to_spark(koalas_df)
    assert not is_spark_df
    assert isinstance(converted, pyspark.sql.DataFrame)
    assert converted.count() == 10

    # Any other type should be rejected with a descriptive error.
    other_df = "df"
    error_msg = (f"The type: {type(other_df)} is not supported, only support " +
                 "pyspark.sql.DataFrame and databricks.koalas.DataFrame")
    with pytest.raises(Exception) as exinfo:
        utils.df_type_check(other_df)
    assert str(exinfo.value) == error_msg
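
Judging from what the test asserts, convert_to_spark and df_type_check could look roughly like the sketch below. This is a reconstruction from the test's behavior only, not the project's actual implementation:

import pyspark.sql
import databricks.koalas as ks


def df_type_check(df):
    # Reject anything that is neither a Spark nor a koalas DataFrame.
    if not isinstance(df, (pyspark.sql.DataFrame, ks.DataFrame)):
        raise Exception(f"The type: {type(df)} is not supported, only support "
                        "pyspark.sql.DataFrame and databricks.koalas.DataFrame")


def convert_to_spark(df):
    # Normalize the input to a pyspark.sql.DataFrame; the boolean flag
    # records whether the input was already a Spark DataFrame.
    df_type_check(df)
    if isinstance(df, pyspark.sql.DataFrame):
        return df, True
    return df.to_spark(), False  # koalas.DataFrame -> pyspark.sql.DataFrame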
Example #2
# Assumed imports; PandasDataset, _spark_context_lock, _global_spark_context
# and convert_to_spark come from the surrounding project module.
from typing import Union

from pyspark.sql import DataFrame


def save_to_ray(df: Union[DataFrame, 'koalas.DataFrame'],
                num_shards: int) -> PandasDataset:
    """
    Save the pyspark.sql.DataFrame or koalas.DataFrame to the Ray object store
    and return a PandasDataset, which can be fed into an 'Estimator' for
    distributed model training.
    :param df: either a pyspark.sql.DataFrame or a koalas.DataFrame
    :param num_shards: the number of shards to split the data into when stored
    :return: a PandasDataset
    """
    with _spark_context_lock:
        global _global_spark_context
        if _global_spark_context is None:
            raise Exception("You should initialize the Spark context first.")
        # Convert a koalas DataFrame to a Spark SQL DataFrame if needed.
        df, _ = convert_to_spark(df)
        spark_cluster = _global_spark_context._get_or_create_spark_cluster()
        return spark_cluster.save_to_ray(df, num_shards)
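
A possible call site, as a sketch: init_spark and its parameters are assumptions for illustration, since the listing does not show how the Spark context is created in this project.

# Hypothetical usage sketch; init_spark and its parameters are assumed,
# not shown in the listing.
spark = init_spark(app_name="save_to_ray_demo", num_executors=2,
                   executor_cores=2, executor_memory="2GB")
df = spark.range(0, 1000)

# Persist the DataFrame in the Ray object store as four pandas shards.
dataset = save_to_ray(df, num_shards=4)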
Example #3
def _check_and_convert(self, df):
    # Accept either a Spark or koalas DataFrame and normalize it to a
    # pyspark.sql.DataFrame, discarding the is_spark_df flag.
    train_df, _ = convert_to_spark(df)
    return train_df