# "branch":"dev"}}, }, "resources": { "metadata_root": { "config": { "env": "METADATA" } }, }, } @dg.pipeline( mode_defs=[ dg.ModeDefinition(resource_defs={ "pandas_csv": df_csv_io_manager, "metadata_root": csv_root_input, }) ], preset_defs=[ dg.PresetDefinition( "default", run_config=preset, mode="default", ) ], ) def images_pipeline(): files = file_picker() to_tag = file_dispatcher(files=files) images_df = create_images_df(files=files) ok = update_metadata(df=images_df)
metadata_new["SSID"] = metadata_new["SSID"].astype( np.float).astype("Int32") return metadata_new.set_index("Source ID") ################ PIPELINE ################## @dg.pipeline( mode_defs=[ dg.ModeDefinition( name="default", resource_defs={ "pandas_csv": df_csv_io_manager, "cumulus_root": csv_root_input, "jstor_root": xls_root_input, "wikidata_root": csv_root_input, "portals_root": csv_root_input, "camera_root": geojson_root_input, "images_root": csv_root_input, }, ) ], preset_defs=[ dg.PresetDefinition( "default", run_config=preset, mode="default", ) ], ) def metadata_pipeline():
# "config":{ # "commit":"API Portals", # "branch":"dev"}}, }, "resources": {"metadata_root": {"config": {"env": "METADATA"}}}, } ################ PIPELINE ################## @dg.pipeline( mode_defs=[ dg.ModeDefinition( name="default", resource_defs={ "pandas_csv": df_csv_io_manager, "metadata_root": csv_root_input, }, ), ], preset_defs=[ dg.PresetDefinition( "default", run_config=preset, mode="default", ), dg.PresetDefinition( "preset_omeka", run_config=preset_omeka, solid_selection=["query_omeka", "omeka_dataframe", "update_metadata","push_new_data"], mode="default",
default = { "resources": { "metadata_root": {"config": {"env": "METADATA"}}, "mapping_root": {"config": {"env": "MAPPING"}}, "iiif_manager": {"config": {"s3_bucket": "imaginerio-images"}}, }, } @dg.pipeline( mode_defs=[ dg.ModeDefinition( name="prod", resource_defs={ "metadata_root": csv_root_input, "mapping_root": csv_root_input, "iiif_manager": s3_io_manager, "s3": s3_resource, }, ), dg.ModeDefinition( name="test", resource_defs={ "metadata_root": csv_root_input, "mapping_root": csv_root_input, "iiif_manager": json_local_io_manager, }, ), ], preset_defs=[ dg.PresetDefinition(
}, # "push_new_data": {"config": "dashboard"}, }, } @dg.pipeline( mode_defs=[ dg.ModeDefinition( name="default", resource_defs={ "pandas_csv": df_csv_io_manager, "metadata_root": csv_root_input, "smapshot_root": csv_root_input, "cumulus_root": csv_root_input, "wikidata_root": csv_root_input, "camera_root": geojson_root_input, "images_root": csv_root_input, "omeka_root": csv_root_input, "mapping_root": csv_root_input, "portals_root": csv_root_input, }, ) ], preset_defs=[ dg.PresetDefinition( "default", run_config=preset, mode="default", ), dg.PresetDefinition(
mode_local = dagster.ModeDefinition(
    name='local',
    resource_defs={
        'database': resources.postgres_database,
        'datalake': resources.datalake,
        'pyspark_step_launcher': no_step_launcher,
        'pyspark': dagster_pyspark.pyspark_resource.configured({'spark_conf': {
            'spark.submit.pyFiles': dagster.file_relative_path(
                __file__, '../../../../../packages/articles_aggregator-0.0.0-py3-none-any.whl'),
            # 'spark.archives': dagster.file_relative_path(
            #     __file__, '../../../../../packages/pyspark_conda_env.tar.gz#environment'),
            'spark.jars.packages': 'io.delta:delta-core_2.12:0.8.0',
            'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension',
            'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.delta.catalog.DeltaCatalog',
            'spark.sql.adaptive.enabled': 'true',
            'spark.sql.execution.arrow.pyspark.enabled': 'true',
            'spark.sql.execution.arrow.maxRecordsPerBatch': 1000,  # because of text preprocessing job.
            # 'spark.default.parallelism': 8,
            'spark.jars': r'C:\Users\Tim\Programs\spark\gcs-connector-hadoop3-latest.jar',
            'spark.hadoop.fs.gs.impl': 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem',
            'spark.hadoop.fs.AbstractFileSystem.gs.impl': 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS',
            'spark.hadoop.google.cloud.auth.service.account.enable': 'true',
            'spark.hadoop.google.cloud.auth.service.account.json.keyfile': dagster.file_relative_path(
                __file__, '../../../../../configs/gcs_keyfile.json'),
            # 'spark.executor.instances': 1,
            # 'spark.executor.cores': 2,
            # 'spark.executor.memory': '1g',
            'spark.driver.memory': '8g',
        }}),
    }
)
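
# Hypothetical wiring of the mode above (the pipeline name is a placeholder,
# not from the original file): any solid requiring the 'pyspark' resource
# receives a SparkSession built from the spark_conf baked in via .configured().
@dagster.pipeline(mode_defs=[mode_local])
def aggregate_articles_pipeline():
    ...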
# "branch":"dev" # }}, }, "resources": { "metadata_root": {"config": {"env": "METADATA"}}, "cumulus_root": {"config": {"env": "CUMULUS"}}}, } @dg.pipeline( mode_defs=[ dg.ModeDefinition( name="default", resource_defs={ "geojson": geojson_io_manager, "pandas_csv": df_csv_io_manager, "metadata_root": csv_root_input, "cumulus_root": csv_root_input } ) ], preset_defs=[ dg.PresetDefinition( "default", run_config=preset, mode="default", ) ], ) def camera_pipeline():