def test_composite_config_driven_materialization():
    @lambda_solid
    def one():
        return 1

    @composite_solid
    def wrap_one():
        return one()

    @pipeline
    def composite_config_driven_materialization_pipeline():
        wrap_one()

    with get_temp_dir() as write_directory:
        write_location = os.path.join(write_directory, 'wrap_one.json')
        execute_pipeline(
            composite_config_driven_materialization_pipeline,
            run_config={
                'solids': {
                    'wrap_one': {
                        'outputs': [{'result': {'json': {'path': write_location}}}]
                    }
                }
            },
        )

        assert os.path.exists(write_location)
def test_dataframe_outputs(file_type, read, kwargs):
    df = create_dask_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DataFrame, name='output_df')])
    def return_df(_):
        return df

    with get_temp_dir() as temp_path:
        # get_temp_dir yields an existing directory; remove it so the writer can
        # create the output path itself
        shutil.rmtree(temp_path)
        result = execute_solid(
            return_df,
            run_config={
                'solids': {
                    'return_df': {
                        'outputs': [{'output_df': {file_type: {'path': temp_path, **kwargs}}}]
                    }
                }
            },
        )
        assert result.success

        actual = read(f"{temp_path}/*")
        assert assert_eq(actual, df)
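# The pytest.mark.parametrize decorator that supplies file_type/read/kwargs is not part
# of this excerpt. Below is a minimal sketch of the cases it might carry, assuming
# dask.dataframe readers; the concrete file types and writer kwargs are illustrative
# assumptions, not the repo's actual values. It would be applied to the test above as
# @pytest.mark.parametrize("file_type,read,kwargs", DASK_OUTPUT_CASES).
import dask.dataframe as dd

DASK_OUTPUT_CASES = [
    ("csv", dd.read_csv, {"index": False}),  # writer kwargs passed via run_config (assumed)
    ("parquet", dd.read_parquet, {}),
    ("json", dd.read_json, {}),
]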
def test_missing_resources():
    # cache_file_from_s3 also requires an 's3' resource; providing only 'file_cache'
    # should raise at definition time
    with pytest.raises(DagsterInvalidDefinitionError):
        with get_temp_dir() as temp_dir:
            execute_solid_with_resources(
                cache_file_from_s3,
                resources={'file_cache': fs_file_cache},
                environment_dict={
                    'solids': {
                        'cache_file_from_s3': {
                            'inputs': {'bucket_data': {'bucket': 'some-bucket', 'key': 'some-key'}}
                        }
                    },
                    'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
                },
            )
def test_composite_config_driven_materialization(composition_decorator):
    @lambda_solid
    def one():
        return 1

    @composition_decorator(output_defs=[OutputDefinition()])
    def wrap_one():
        return one()

    @pipeline
    def composite_config_driven_materialization_pipeline():
        wrap_one()

    with get_temp_dir() as write_directory:
        write_location = os.path.join(write_directory, "wrap_one.json")
        execute_pipeline(
            composite_config_driven_materialization_pipeline,
            run_config={
                "solids": {
                    "wrap_one": {"outputs": [{"result": {"json": {"path": write_location}}}]}
                }
            },
        )

        assert os.path.exists(write_location)
def test_fs_file_cache_write_binary_data():
    with get_temp_dir() as temp_dir:
        file_store = FSFileCache(temp_dir)
        assert not file_store.has_file_object('foo')
        assert file_store.write_binary_data('foo', 'bar'.encode())
        file_handle = file_store.get_file_handle('foo')
        assert isinstance(file_handle, LocalFileHandle)
        assert file_handle.path_desc == os.path.join(temp_dir, 'foo')
def test_fs_file_cache_write_data():
    bytes_object = io.BytesIO('bar'.encode())

    with get_temp_dir() as temp_dir:
        file_cache = FSFileCache(temp_dir)
        assert not file_cache.has_file_object('foo')
        assert file_cache.write_file_object('foo', bytes_object)
        file_handle = file_cache.get_file_handle('foo')
        assert isinstance(file_handle, LocalFileHandle)
        assert file_handle.path_desc == os.path.join(temp_dir, 'foo')
def test_dataframe_outputs(file_type, read, other):
    df = create_pyspark_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DagsterPySparkDataFrame, name="df")])
    def return_df(_):
        return df

    with get_temp_dir() as temp_path:
        shutil.rmtree(temp_path)
        options = {"path": temp_path}
        if other:
            options["format"] = file_type
            file_type = "other"

        result = execute_solid(
            return_df,
            run_config={
                "solids": {"return_df": {"outputs": [{"df": {file_type: options}}]}}
            },
        )
        assert result.success
        actual = read(options["path"], **dict_without_keys(options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())

        result = execute_solid(
            return_df,
            run_config={
                "solids": {
                    "return_df": {
                        "outputs": [
                            {
                                "df": {
                                    file_type: dict(
                                        {"mode": "overwrite", "compression": "gzip"}, **options
                                    )
                                }
                            }
                        ]
                    }
                }
            },
        )
        assert result.success
        actual = read(options["path"], **dict_without_keys(options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())
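# dict_without_keys is called above but not defined in this excerpt. A minimal sketch
# consistent with how it is used (return a copy of the dict with the named keys
# dropped); treat this as an assumption rather than the repo's actual helper:
def dict_without_keys(ddict, *keys):
    key_set = set(keys)
    return {key: value for key, value in ddict.items() if key not in key_set}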
def test_basic_file_manager_copy_handle_to_local_temp():
    foo_data = 'foo'.encode()
    with get_temp_dir() as temp_dir:
        with get_temp_file_handle_with_data(foo_data) as foo_handle:
            with local_file_manager(temp_dir) as manager:
                local_temp = manager.copy_handle_to_local_temp(foo_handle)
                assert local_temp != foo_handle.path
                with open(local_temp, 'rb') as ff:
                    assert ff.read() == foo_data
def test_dataframe_outputs(file_type, read):
    df = create_pyspark_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DagsterPySparkDataFrame, name='df')])
    def return_df(_):
        return df

    with get_temp_dir() as temp_path:
        shutil.rmtree(temp_path)
        result = execute_solid(
            return_df,
            run_config={
                'solids': {
                    'return_df': {'outputs': [{'df': {file_type: {'path': temp_path}}}]}
                }
            },
        )
        assert result.success
        actual = read(temp_path)
        assert sorted(df.collect()) == sorted(actual.collect())

        result = execute_solid(
            return_df,
            run_config={
                'solids': {
                    'return_df': {
                        'outputs': [
                            {
                                'df': {
                                    file_type: {
                                        'path': temp_path,
                                        'mode': 'overwrite',
                                        'compression': 'gzip',
                                    }
                                }
                            }
                        ]
                    }
                }
            },
        )
        assert result.success
        actual = read(temp_path)
        assert sorted(df.collect()) == sorted(actual.collect())
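# As with the dask variant above, the parametrization supplying file_type/read is not
# shown in this excerpt. A minimal sketch, assuming a shared SparkSession and the
# built-in DataFrameReader methods; the specific formats and reader lambdas are
# illustrative assumptions. It would be applied to the test above as
# @pytest.mark.parametrize("file_type,read", PYSPARK_OUTPUT_CASES).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

PYSPARK_OUTPUT_CASES = [
    ("csv", lambda path: spark.read.csv(path)),
    ("parquet", lambda path: spark.read.parquet(path)),
    ("json", lambda path: spark.read.json(path)),
]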
def test_cache_file_from_s3_basic():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        pipeline_result = execute_solid_with_resources(
            cache_file_from_s3,
            resources={
                'file_cache': fs_file_cache,
                's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session)),
            },
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'bucket_data': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert pipeline_result.success

        solid_result = pipeline_result.result_for_solid('cache_file_from_s3')
        assert solid_result.success

        expectation_results = solid_result.expectation_results_during_compute
        assert len(expectation_results) == 1
        expectation_result = expectation_results[0]
        assert expectation_result.success
        assert expectation_result.label == 'file_handle_exists'
        path_in_metadata = expectation_result.metadata_entries[0].entry_data.path
        assert isinstance(path_in_metadata, str)
        assert os.path.exists(path_in_metadata)

        assert isinstance(solid_result.result_value(), LocalFileHandle)
        assert 'some-key' in solid_result.result_value().path_desc
def test_cache_file_from_s3_specify_target_key():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        pipeline_result = execute_solid_with_resources(
            cache_file_from_s3,
            resources={
                'file_cache': fs_file_cache,
                's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session)),
            },
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'bucket_data': {'bucket': 'some-bucket', 'key': 'some-key'}},
                        'config': {'file_key': 'specified-file-key'},
                    }
                },
                'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert pipeline_result.success
        solid_result = pipeline_result.result_for_solid('cache_file_from_s3')
        assert solid_result.success
        assert isinstance(solid_result.result_value(), LocalFileHandle)
        assert 'specified-file-key' in solid_result.result_value().path_desc
def test_dataframe_outputs(file_type, read, kwargs):
    df = create_dask_df()

    @solid(output_defs=[OutputDefinition(dagster_type=DataFrame, name="output_df")])
    def return_df(_):
        return df

    # https://github.com/dagster-io/dagster/issues/2872
    with pytest.warns(
        UserWarning,
        match=re.escape(
            "Specifying {key}: is deprecated. Use to:{key}: instead.".format(key=file_type)
        ),
    ):
        with get_temp_dir() as temp_path:
            shutil.rmtree(temp_path)
            result = execute_solid(
                return_df,
                run_config={
                    "solids": {
                        "return_df": {
                            "outputs": [{"output_df": {file_type: {"path": temp_path, **kwargs}}}]
                        }
                    }
                },
            )
            assert result.success
            actual = read(f"{temp_path}/*")
            assert assert_eq(actual, df)
def test_empty_file_cache():
    with get_temp_dir() as temp_dir:
        file_cache = FSFileCache(temp_dir)
        assert not file_cache.has_file_object('kjdfkd')
def test_cache_file_from_s3_overwrite():
    with get_temp_dir() as temp_dir:
        s3_session_one = mock.MagicMock()
        pipeline_result_one = execute_solid_with_resources(
            cache_file_from_s3,
            resources={
                'file_cache': fs_file_cache,
                's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session_one)),
            },
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'bucket_data': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {
                    'file_cache': {'config': {'target_folder': temp_dir, 'overwrite': True}}
                },
            },
        )

        assert pipeline_result_one.success
        # assert the download occurred
        assert s3_session_one.download_file.call_count == 1

        s3_session_two = mock.MagicMock()
        pipeline_result_two = execute_solid_with_resources(
            cache_file_from_s3,
            resources={
                'file_cache': fs_file_cache,
                's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session_two)),
            },
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'bucket_data': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {
                    'file_cache': {'config': {'target_folder': temp_dir, 'overwrite': True}}
                },
            },
        )

        assert pipeline_result_two.success
        # assert the download did not occur because the file is already cached
        assert s3_session_two.download_file.call_count == 0