from .common_bucket_s3_pickle_io_manager import common_bucket_s3_pickle_io_manager from .parquet_io_manager import ( local_partitioned_parquet_io_manager, s3_partitioned_parquet_io_manager, ) from .snowflake_io_manager import snowflake_io_manager configured_pyspark = pyspark_resource.configured({ "spark_conf": { "spark.jars.packages": ",".join([ "net.snowflake:snowflake-jdbc:3.8.0", "net.snowflake:spark-snowflake_2.12:2.8.2-spark_3.0", "com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.7", ]), "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem", "spark.hadoop.fs.s3.awsAccessKeyId": os.getenv("AWS_ACCESS_KEY_ID", ""), "spark.hadoop.fs.s3.awsSecretAccessKey": os.getenv("AWS_SECRET_ACCESS_KEY", ""), "spark.hadoop.fs.s3.buffer.dir": "/tmp", } }) snowflake_io_manager_prod = snowflake_io_manager.configured( {"database": "DEMO_DB_ASSETS"}) RESOURCES_PROD = { "s3_bucket": ResourceDefinition.hardcoded_resource("hackernews-elementl-prod"),
def count_people(people: DataFrame) -> int:
    """Return the number of rows in ``people``."""
    row_total = people.count()
    return row_total


# Resources for executing on EMR: steps go through the EMR PySpark step
# launcher, and intermediate values are pickled to S3.
emr_resource_defs = {
    "pyspark_step_launcher": emr_pyspark_step_launcher.configured(
        {
            # Cluster id is given in the ``{"env": ...}`` config form, naming
            # the EMR_CLUSTER_ID environment variable.
            "cluster_id": {"env": "EMR_CLUSTER_ID"},
            # Directory that contains this file.
            "local_pipeline_package_path": str(Path(__file__).parent),
            "deploy_local_pipeline_package": True,
            "region_name": "us-west-1",
            "staging_bucket": "my_staging_bucket",
            "wait_for_logs": True,
        }
    ),
    "pyspark": pyspark_resource.configured(
        {"spark_conf": {"spark.executor.memory": "2g"}}
    ),
    "s3": s3_resource,
    # Uses the same bucket name as the step launcher's staging bucket.
    "io_manager": s3_pickle_io_manager.configured(
        {"s3_bucket": "my_staging_bucket", "s3_prefix": "simple-pyspark"}
    ),
}

# Resources for local execution: no external step launcher and a Spark
# default parallelism of 1.
local_resource_defs = {
    "pyspark_step_launcher": no_step_launcher,
    "pyspark": pyspark_resource.configured(
        {"spark_conf": {"spark.default.parallelism": 1}}
    ),
}


@graph
def count_people_over_50():
    """Chain ``make_people`` -> ``filter_over_50`` -> ``count_people``."""
    everyone = make_people()
    over_50 = filter_over_50(everyone)
    count_people(over_50)
"warehouse_io_manager": fs_io_manager, "pyspark": pyspark_resource, "hn_client": hn_api_subsample_client.configured({"sample_rate": 10}), } PROD_RESOURCES = { "io_manager": s3_pickle_io_manager.configured({"s3_bucket": "hackernews-elementl-prod"}), "s3": s3_resource, "partition_start": ResourceDefinition.string_resource(), "partition_end": ResourceDefinition.string_resource(), "parquet_io_manager": partitioned_parquet_io_manager.configured( {"base_path": "s3://hackernews-elementl-prod"} ), "warehouse_io_manager": time_partitioned_snowflake_io_manager_prod, "pyspark": pyspark_resource.configured(S3_SPARK_CONF), "hn_client": hn_api_subsample_client.configured({"sample_rate": 10}), } download_pipeline_properties = { "description": "#### Owners:\n" "[email protected], [email protected]\n " "#### About\n" "This pipeline downloads all items from the HN API for a given day, " "splits the items into stories and comment types using Spark, and uploads filtered items to " "the corresponding stories or comments Snowflake table", "tags": { "dagster-k8s/config": { "container_config": { "resources": { "requests": {"cpu": "500m", "memory": "2Gi"},