Example #1
        #         "branch":"dev"}},
    },
    "resources": {
        "metadata_root": {
            "config": {
                "env": "METADATA"
            }
        },
    },
}


@dg.pipeline(
    mode_defs=[
        dg.ModeDefinition(resource_defs={
            "pandas_csv": df_csv_io_manager,
            "metadata_root": csv_root_input,
        })
    ],
    preset_defs=[
        dg.PresetDefinition(
            "default",
            run_config=preset,
            mode="default",
        )
    ],
)
def images_pipeline():
    files = file_picker()
    to_tag = file_dispatcher(files=files)
    images_df = create_images_df(files=files)
    ok = update_metadata(df=images_df)
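
# Usage sketch (assumption, not part of the original example): launching
# images_pipeline with its "default" preset via the legacy Dagster API.
if __name__ == "__main__":
    result = dg.execute_pipeline(images_pipeline, preset="default")
    assert result.success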
Example #2
    metadata_new["SSID"] = metadata_new["SSID"].astype(
        np.float).astype("Int32")
    return metadata_new.set_index("Source ID")


################   PIPELINE   ##################


@dg.pipeline(
    mode_defs=[
        dg.ModeDefinition(
            name="default",
            resource_defs={
                "pandas_csv": df_csv_io_manager,
                "cumulus_root": csv_root_input,
                "jstor_root": xls_root_input,
                "wikidata_root": csv_root_input,
                "portals_root": csv_root_input,
                "camera_root": geojson_root_input,
                "images_root": csv_root_input,
            },
        )
    ],
    preset_defs=[
        dg.PresetDefinition(
            "default",
            run_config=preset,
            mode="default",
        )
    ],
)
def metadata_pipeline():
Example #3
        #     "config":{
        #         "commit":"API Portals",
        #         "branch":"dev"}},
    },
    "resources": {"metadata_root": {"config": {"env": "METADATA"}}},
}

################   PIPELINE   ##################


@dg.pipeline(
    mode_defs=[
        dg.ModeDefinition(
            name="default",
            resource_defs={
                "pandas_csv": df_csv_io_manager,
                "metadata_root": csv_root_input,
            },
        ),
    ],
    preset_defs=[
        dg.PresetDefinition(
            "default",
            run_config=preset,
            mode="default",
        ),
        dg.PresetDefinition(
            "preset_omeka",
            run_config=preset_omeka,
            solid_selection=["query_omeka", "omeka_dataframe", "update_metadata","push_new_data"],
            mode="default",
Example #4
default = {
    "resources": {
        "metadata_root": {"config": {"env": "METADATA"}},
        "mapping_root": {"config": {"env": "MAPPING"}},
        "iiif_manager": {"config": {"s3_bucket": "imaginerio-images"}},
    },
}


@dg.pipeline(
    mode_defs=[
        dg.ModeDefinition(
            name="prod",
            resource_defs={
                "metadata_root": csv_root_input,
                "mapping_root": csv_root_input,
                "iiif_manager": s3_io_manager,
                "s3": s3_resource,
            },
        ),
        dg.ModeDefinition(
            name="test",
            resource_defs={
                "metadata_root": csv_root_input,
                "mapping_root": csv_root_input,
                "iiif_manager": json_local_io_manager,
            },
        ),
    ],
    preset_defs=[
        dg.PresetDefinition(
Example #5
        },
        # "push_new_data": {"config": "dashboard"},
    },
}


@dg.pipeline(
    mode_defs=[
        dg.ModeDefinition(
            name="default",
            resource_defs={
                "pandas_csv": df_csv_io_manager,
                "metadata_root": csv_root_input,
                "smapshot_root": csv_root_input,
                "cumulus_root": csv_root_input,
                "wikidata_root": csv_root_input,
                "camera_root": geojson_root_input,
                "images_root": csv_root_input,
                "omeka_root": csv_root_input,
                "mapping_root": csv_root_input,
                "portals_root": csv_root_input,
            },
        )
    ],
    preset_defs=[
        dg.PresetDefinition(
            "default",
            run_config=preset,
            mode="default",
        ),
        dg.PresetDefinition(
Example #6
mode_local = dagster.ModeDefinition(
    name='local',
    resource_defs={
        'database': resources.postgres_database,
        'datalake': resources.datalake,
        'pyspark_step_launcher': no_step_launcher,
        'pyspark': dagster_pyspark.pyspark_resource.configured({'spark_conf': {
            'spark.submit.pyFiles': dagster.file_relative_path(
                __file__, '../../../../../packages/articles_aggregator-0.0.0-py3-none-any.whl'),
            # 'spark.archives': dagster.file_relative_path(
            #     __file__, '../../../../../packages/pyspark_conda_env.tar.gz#environment'),
            'spark.jars.packages': 'io.delta:delta-core_2.12:0.8.0',
            'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension',
            'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.delta.catalog.DeltaCatalog',
            'spark.sql.adaptive.enabled': 'true',
            'spark.sql.execution.arrow.pyspark.enabled': 'true',
            'spark.sql.execution.arrow.maxRecordsPerBatch': 1000,  # because of text preprocessing job.
            # 'spark.default.parallelism': 8,
            'spark.jars': r'C:\Users\Tim\Programs\spark\gcs-connector-hadoop3-latest.jar',
            'spark.hadoop.fs.gs.impl': 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem',
            'spark.hadoop.fs.AbstractFileSystem.gs.impl': 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS',
            'spark.hadoop.google.cloud.auth.service.account.enable': 'true',
            'spark.hadoop.google.cloud.auth.service.account.json.keyfile':
                dagster.file_relative_path(__file__, '../../../../../configs/gcs_keyfile.json'),
            # 'spark.executor.instances': 1,
            # 'spark.executor.cores': 2,
            # 'spark.executor.memory': '1g',
            'spark.driver.memory': '8g',
        }}),
    }
)
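
# Minimal sketch (assumption, not shown in the snippet): a ModeDefinition only takes
# effect once attached to a pipeline via mode_defs. The pipeline name below is
# hypothetical.
@dagster.pipeline(mode_defs=[mode_local])
def articles_pipeline():
    ...  # solids would be composed here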
Example #7
        #         "branch":"dev"
        #     }},
    },
    "resources": {
        "metadata_root": {"config": {"env": "METADATA"}},
        "cumulus_root": {"config": {"env": "CUMULUS"}}},
}


@dg.pipeline(
    mode_defs=[
        dg.ModeDefinition(
            name="default",
            resource_defs={
                "geojson": geojson_io_manager,
                "pandas_csv": df_csv_io_manager,
                "metadata_root": csv_root_input,
                "cumulus_root": csv_root_input
            }
        )
    ],
    preset_defs=[
        dg.PresetDefinition(
            "default",
            run_config=preset,
            mode="default",
        )
    ],
)
def camera_pipeline():