def dataframe_loader(_context, config):
    spark_read = _context.resources.pyspark.spark_session.read
    # The config is a one-key dict: the key selects the reader, the value holds its options.
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        return spark_read.csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return spark_read.parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return spark_read.json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "jdbc":
        return spark_read.jdbc(**file_options)
    elif file_type == "orc":
        return spark_read.orc(path, **dict_without_keys(file_options, "path"))
    elif file_type == "table":
        return spark_read.table(**file_options)
    elif file_type == "text":
        return spark_read.text(path, **dict_without_keys(file_options, "path"))
    elif file_type == "other":
        # "other" defers to the generic reader; the format is carried in file_options.
        return spark_read.load(**file_options)
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type)
        )
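# A hedged sketch of the config shape the loader above consumes: a run_config
# fragment selecting the "csv" branch for a solid input. The solid and input
# names ("make_df", "input_df") are illustrative, not taken from the real suite;
# "header" and "inferSchema" are standard pyspark DataFrameReader.csv kwargs.
EXAMPLE_RUN_CONFIG = {
    "solids": {
        "make_df": {
            "inputs": {
                "input_df": {
                    "csv": {"path": "/tmp/num.csv", "header": True, "inferSchema": True}
                }
            }
        }
    }
}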
def test_dataframe_outputs(file_type, read, other):
    df = create_pyspark_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DagsterPySparkDataFrame, name="df")])
    def return_df(_):
        return df

    with get_temp_dir() as temp_path:
        # Spark refuses to write to a directory that already exists (the default
        # save mode is "error"), so remove the temp dir before the first write.
        shutil.rmtree(temp_path)
        options = {"path": temp_path}
        if other:
            options["format"] = file_type
            file_type = "other"

        result = execute_solid(
            return_df,
            run_config={"solids": {"return_df": {"outputs": [{"df": {file_type: options}}]}}},
        )
        assert result.success
        actual = read(options["path"], **dict_without_keys(options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())

        # Write again with extra writer options; "overwrite" makes the second
        # materialization to the same path legal.
        result = execute_solid(
            return_df,
            run_config={
                "solids": {
                    "return_df": {
                        "outputs": [
                            {
                                "df": {
                                    file_type: dict(
                                        {"mode": "overwrite", "compression": "gzip"}, **options
                                    )
                                }
                            }
                        ]
                    }
                }
            },
        )
        assert result.success
        actual = read(options["path"], **dict_without_keys(options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())
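# The (file_type, read, other) arguments above come from pytest parametrization in
# the surrounding test module. A minimal sketch of what such a table could look
# like, assuming an ambient SparkSession; the exact rows in the real suite may
# differ, and these imports would normally live at the top of the file.
import pytest
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

parametrize_file_types = pytest.mark.parametrize(
    "file_type, read, other",
    [
        ("csv", spark.read.csv, False),
        ("parquet", spark.read.parquet, False),
        ("json", spark.read.json, False),
    ],
)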
def dataframe_loader(_context, config):
    # The config is a one-key dict: the key selects the reader, the value holds its options.
    file_type, file_options = list(config.items())[0]

    if file_type == "csv":
        path = file_options["path"]
        return pd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return pd.read_parquet(file_options["path"])
    elif file_type == "table":
        # "table" is read as tab-separated values.
        return pd.read_csv(file_options["path"], sep="\t")
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type)
        )
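# Hedged usage sketch: note that only the "csv" branch above forwards extra
# options to pandas, so a custom separator works for "csv" but would be dropped
# for "parquet" or "table". The path is illustrative and the context argument is
# unused, so None suffices for a direct call.
def _load_semicolon_csv_example():
    return dataframe_loader(None, {"csv": {"path": "num.csv", "sep": ";"}})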
def dataframe_materializer(_context, config, pandas_df):
    check.inst_param(pandas_df, "pandas_df", pd.DataFrame)
    file_type, file_options = list(config.items())[0]

    if file_type == "csv":
        path = file_options["path"]
        pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        pandas_df.to_parquet(file_options["path"])
    elif file_type == "table":
        # "table" is written as tab-separated values, mirroring the loader above.
        pandas_df.to_csv(file_options["path"], sep="\t", index=False)
    elif file_type == "pickle":
        pandas_df.to_pickle(file_options["path"])
    else:
        check.failed("Unsupported file_type {file_type}".format(file_type=file_type))

    return AssetMaterialization.file(file_options["path"])
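# A hedged round-trip sketch (not part of the library): materialize a small frame
# with the function above, then read it back with the pandas loader. "out.csv" is
# an illustrative path; both helpers ignore their context argument.
def _round_trip_csv_example():
    df = pd.DataFrame({"num1": [1, 2], "num2": [3, 4]})
    dataframe_materializer(None, {"csv": {"path": "out.csv"}}, df)
    loaded = dataframe_loader(None, {"csv": {"path": "out.csv"}})
    assert loaded.equals(df)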
def test_dataframe_inputs(file_type, read, other):
    @solid(input_defs=[InputDefinition(dagster_type=DagsterPySparkDataFrame, name="input_df")])
    def return_df(_, input_df):
        return input_df

    options = {"path": file_relative_path(__file__, "num.{file_type}".format(file_type=file_type))}
    if other:
        options["format"] = file_type
        file_type = "other"

    result = execute_solid(
        return_df,
        mode_def=ModeDefinition(resource_defs={"pyspark": pyspark_resource}),
        run_config={"solids": {"return_df": {"inputs": {"input_df": {file_type: options}}}}},
    )
    assert result.success
    actual = read(options["path"], **dict_without_keys(options, "path"))
    assert sorted(result.output_value().collect()) == sorted(actual.collect())