Example #1
def bq_pipeline():
    bq_solid_for_queries([
        # Toy example query
        "SELECT 1 AS field1, 2 AS field2;",
        # Test access of public BQ historical dataset (only processes ~2MB here)
        # pylint: disable=line-too-long
        """SELECT *
        FROM `weathersource-com.pub_weather_data_samples.sample_weather_history_anomaly_us_zipcode_daily`
        ORDER BY postal_code ASC, date_valid_std ASC
        LIMIT 1""",
    ]).alias("bq_query_solid")()
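Pipelines like this one need a BigQuery client resource bound under the "bigquery" key before they can run. A minimal execution sketch, assuming the legacy dagster / dagster_gcp APIs these snippets use (the project id is a placeholder):

from dagster import ModeDefinition, execute_pipeline, pipeline
from dagster_gcp import bigquery_resource, bq_solid_for_queries

# The bq solids require a resource named "bigquery", so the pipeline's mode
# must provide one.
@pipeline(mode_defs=[ModeDefinition(resource_defs={"bigquery": bigquery_resource})])
def toy_bq_pipeline():
    bq_solid_for_queries(["SELECT 1 AS field1, 2 AS field2"]).alias("bq_query_solid")()

# The resource is configured through run config; "my-gcp-project" is a placeholder.
result = execute_pipeline(
    toy_bq_pipeline,
    {"resources": {"bigquery": {"config": {"project": "my-gcp-project"}}}},
)
assert result.success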
Example #2
def explore_visits_by_hour(start):
    # file_relative_path resolves the .sql file relative to this module, not
    # the current working directory.
    with open(file_relative_path(__file__, 'sql/explore_visits_by_hour.sql'), 'r') as f:
        query = f.read()

    return bq_solid_for_queries([query]).alias('explore_visits_by_hour_internal')(start=start)
Example #3
def test_pd_df_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    test_df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    create_solid = bq_create_dataset.alias('create_solid')
    load_solid = import_df_to_bq.alias('load_solid')
    query_solid = bq_solid_for_queries(['SELECT num1, num2 FROM %s' % table]).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(
        input_defs=[InputDefinition('success', Nothing)], output_defs=[OutputDefinition(DataFrame)]
    )
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    config = {
        'solids': {
            'create_solid': {'config': {'dataset': dataset, 'exists_ok': True}},
            'load_solid': {'config': {'destination': table}},
            'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_df(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').output_value()
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {'pyarrow': None, 'fastparquet': None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            result = execute_pipeline(bq_pipeline, config)
        assert (
            'loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet'
            ' to be installed' in str(exc_info.value.user_exception)
        )
        cleanup_config = {
            'solids': {'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}}}
        }

        @pipeline(mode_defs=bq_modes())
        def cleanup():
            delete_solid()

        assert execute_pipeline(cleanup, cleanup_config).success

    assert not dataset_exists(dataset)
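The `mock.patch.dict(sys.modules, ...)` block above is a general trick worth noting: setting a module's `sys.modules` entry to `None` makes any subsequent import of it raise `ImportError`, which simulates the optional dependency being absent. A self-contained sketch:

import sys
from unittest import mock

# While the patch is active, importing a blocked module fails even if it is
# installed, because Python refuses to import a sys.modules entry set to None.
with mock.patch.dict(sys.modules, {"pyarrow": None, "fastparquet": None}):
    try:
        import pyarrow  # noqa: F401
    except ImportError:
        print("pyarrow unavailable, as the test expects")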
Example #4
def my_gcp_dataops_pipeline():
    sql_process = bq_solid_for_queries([cfg.sql_query["QUERY"]]).alias(
        "export_bq_table_gcs"
    )(
        dbt_cli_run.configured(
            {"project-dir": cfg.dbt_config["DBT_PROJECT_DIR"]}, name="run_bq_dbt"
        )()
    )
    spark_process = delete_dataproc_cluster(
        data_proc_spark_operator(create_dataproc_cluster(sql_process))
    )
    jupyter_process = dm.define_dagstermill_solid(
        "view_data_matplot",
        script_relative_path("jupyter/view_data.ipynb"),
        input_defs=[InputDefinition("start", Nothing)],
    )(download_file(spark_process))
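The `dbt_cli_run.configured(...)` call above pre-binds config and returns a renamed copy of the solid, so the run config no longer has to supply those values. A minimal sketch of dagster's `configured` API, using a hypothetical solid:

from dagster import solid

@solid(config_schema={"greeting": str})
def greet(context):
    context.log.info(context.solid_config["greeting"])

# Returns a new solid named "greet_hello" with the config baked in.
greet_hello = greet.configured({"greeting": "hello"}, name="greet_hello")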
Example #5
def test_gcs_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    create_solid = bq_create_dataset.alias('create_solid')
    query_solid = bq_solid_for_queries(
        [
            'SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1'
            % table
        ]
    ).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(
        input_defs=[InputDefinition('success', Nothing)], output_defs=[OutputDefinition(List[Path])]
    )
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        return ["gs://cloud-samples-data/bigquery/us-states/us-states.csv"]

    config = {
        'solids': {
            'create_solid': {'config': {'dataset': dataset, 'exists_ok': True}},
            'import_gcs_paths_to_bq': {
                'config': {
                    'destination': table,
                    'load_job_config': {
                        'autodetect': True,
                        'skip_leading_rows': 1,
                        'source_format': 'CSV',
                        'write_disposition': 'WRITE_TRUNCATE',
                    },
                }
            },
            'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(import_gcs_paths_to_bq(return_gcs_uri(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').output_value()
    assert values[0].to_dict() == {'string_field_0': {0: 'Alabama'}, 'string_field_1': {0: 'AL'}}

    assert not dataset_exists(dataset)
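The `load_job_config` block in this run config mirrors the option names of `google.cloud.bigquery.LoadJobConfig`, which the solid forwards to the load job. For reference, a sketch of the equivalent direct client call (the destination table is a placeholder):

from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    skip_leading_rows=1,
    source_format=bigquery.SourceFormat.CSV,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
load_job = client.load_table_from_uri(
    "gs://cloud-samples-data/bigquery/us-states/us-states.csv",
    "my_dataset.df",  # placeholder destination table
    job_config=job_config,
)
load_job.result()  # block until the load finishes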
Example #6
def test_config_pipeline():
    bq_solid_for_queries(['SELECT 1']).alias('test')()  # pylint: disable=no-value-for-parameter
Example #7
def _test():
    bq_solid_for_queries(["SELECT 1"])()
    bq_solid_for_queries(["SELECT *"])()
Example #8
def _bq_sql_solid(start):
    return bq_solid_for_queries([sql]).alias('bq_sql_internal')(start=start)
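The `start=start` keyword in examples like this one threads a `Nothing`-typed input through the composed solid, so it executes only after its upstream dependency while receiving no data from it. A minimal sketch of that ordering pattern with hypothetical solids, assuming the same legacy dagster API:

from dagster import InputDefinition, Nothing, OutputDefinition, pipeline, solid

@solid(output_defs=[OutputDefinition(Nothing)])
def prepare(_context):
    pass  # e.g. create the dataset the query will read

@solid(input_defs=[InputDefinition("start", Nothing)])
def run_query(_context):
    return "query result"

@pipeline
def ordered():
    # `start` carries no data; it only forces run_query to wait for prepare.
    run_query(start=prepare())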