def to_petastorm(df):
    """Convert the rows of ``df`` with the Petastorm row converter if needed.

    When ``util._has_vector_column(df)`` is false, ``df`` is returned
    untouched. Otherwise every row of ``df.rdd`` is mapped through the
    function built by ``util.to_petastorm_fn`` for the ``"features"`` and
    ``"y"`` columns, and the mapped RDD is turned back into a DataFrame.

    Args:
        df: DataFrame to convert (must expose ``.rdd``).

    Returns:
        The original ``df`` or the re-built, converted DataFrame.
    """
    if not util._has_vector_column(df):
        return df

    # No column metadata is supplied to the converter factory here.
    # NOTE(review): presumably the factory falls back to a default format
    # when metadata is None -- confirm against util.to_petastorm_fn.
    row_converter = util.to_petastorm_fn(["features", "y"], None)
    return df.rdd.map(row_converter).toDF()
def test_prepare_data(self):
    """Check that preparing array-format features leaves them unchanged.

    Builds the XOR DataFrame, verifies its ``features`` column metadata
    reports the ``constants.ARRAY`` intermediate format, converts the rows
    with the Petastorm row converter, and asserts that the function returned
    by ``remote._prepare_data_fn`` yields a result equal to the input tensor
    for the ``features`` column.
    """
    with spark_session('test_prepare_data') as spark:
        df = create_xor_data(spark)

        schema_cols = ['features', 'y']
        metadata = util._get_metadata(df)
        assert metadata['features']['intermediate_format'] == constants.ARRAY

        # Named distinctly so it cannot shadow a module-level helper called
        # ``to_petastorm``.
        row_converter = util.to_petastorm_fn(schema_cols, metadata)
        rows = df.rdd.map(row_converter).toDF().collect()

        prepare_data = remote._prepare_data_fn(metadata)

        # Iterate the collected rows directly: the map above is one-to-one,
        # so a separate df.count() job is not needed to bound the loop.
        features = torch.tensor([row.features for row in rows])
        features_prepared = prepare_data('features', features)
        assert np.array_equal(features_prepared, features)