def bq_pipeline():
    bq_solid_for_queries(
        [
            # Toy example query
            "SELECT 1 AS field1, 2 AS field2;",
            # Test access of public BQ historical dataset (only processes ~2MB here)
            # pylint: disable=line-too-long
            """SELECT * FROM `weathersource-com.pub_weather_data_samples.sample_weather_history_anomaly_us_zipcode_daily` ORDER BY postal_code ASC, date_valid_std ASC LIMIT 1""",
        ]
    ).alias("bq_query_solid")()

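# --- Hedged sketch (not part of the original snippets) ------------------------
# Shows how a pipeline body like `bq_pipeline` above is typically decorated and
# executed. `pipeline`, `execute_pipeline`, and `bq_modes()` are taken from the
# test snippets further down; the mode is assumed to supply the BigQuery
# resource, and GCP credentials (e.g. GOOGLE_APPLICATION_CREDENTIALS) are
# assumed to be available in the environment.
@pipeline(mode_defs=bq_modes())
def example_bq_query_pipeline():
    # Single toy query; no upstream dependency, so the solid is invoked with no args.
    bq_solid_for_queries(["SELECT 1 AS field1, 2 AS field2;"]).alias("example_query")()


def run_example_bq_query_pipeline():
    result = execute_pipeline(example_bq_query_pipeline)
    assert result.success
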
def explore_visits_by_hour(start):
    with open(file_relative_path(__file__, 'sql/explore_visits_by_hour.sql'), 'r') as f:
        query = f.read()

    return bq_solid_for_queries([query]).alias('explore_visits_by_hour_internal')(start=start)

def test_pd_df_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    test_df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    create_solid = bq_create_dataset.alias('create_solid')
    load_solid = import_df_to_bq.alias('load_solid')
    query_solid = bq_solid_for_queries(['SELECT num1, num2 FROM %s' % table]).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(
        input_defs=[InputDefinition('success', Nothing)], output_defs=[OutputDefinition(DataFrame)]
    )
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    config = {
        'solids': {
            'create_solid': {'config': {'dataset': dataset, 'exists_ok': True}},
            'load_solid': {'config': {'destination': table}},
            'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_df(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').output_value()
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {'pyarrow': None, 'fastparquet': None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            result = execute_pipeline(bq_pipeline, config)
        assert (
            'loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet'
            ' to be installed' in str(exc_info.value.user_exception)
        )

    cleanup_config = {
        'solids': {'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}}}
    }

    @pipeline(mode_defs=bq_modes())
    def cleanup():
        delete_solid()

    assert execute_pipeline(cleanup, cleanup_config).success
    assert not dataset_exists(dataset)

def test_pd_df_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    test_df = pd.DataFrame({"num1": [1, 3], "num2": [2, 4]})

    create_solid = bq_create_dataset.alias("create_solid")
    load_solid = import_df_to_bq.alias("load_solid")
    query_solid = bq_solid_for_queries(["SELECT num1, num2 FROM %s" % table]).alias("query_solid")
    delete_solid = bq_delete_dataset.alias("delete_solid")

    @solid(
        input_defs=[InputDefinition("success", Nothing)], output_defs=[OutputDefinition(DataFrame)]
    )
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    config = {
        "solids": {
            "create_solid": {"config": {"dataset": dataset, "exists_ok": True}},
            "load_solid": {"config": {"destination": table}},
            "delete_solid": {"config": {"dataset": dataset, "delete_contents": True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_df(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid("query_solid").output_value()
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {"pyarrow": None, "fastparquet": None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            result = execute_pipeline(bq_pipeline, config)
        assert (
            "loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet"
            " to be installed" in str(exc_info.value.user_exception)
        )

    cleanup_config = {
        "solids": {"delete_solid": {"config": {"dataset": dataset, "delete_contents": True}}}
    }

    @pipeline(mode_defs=bq_modes())
    def cleanup():
        delete_solid()

    assert execute_pipeline(cleanup, cleanup_config).success
    assert not dataset_exists(dataset)

def my_gcp_dataops_pipeline():
    sql_process = bq_solid_for_queries([cfg.sql_query["QUERY"]]).alias("export_bq_table_gcs")(
        dbt_cli_run.configured(
            {"project-dir": cfg.dbt_config["DBT_PROJECT_DIR"]}, name="run_bq_dbt"
        )()
    )
    spark_process = delete_dataproc_cluster(
        data_proc_spark_operator(create_dataproc_cluster(sql_process))
    )
    jupyter_process = dm.define_dagstermill_solid(
        "view_data_matplot",
        script_relative_path("jupyter/view_data.ipynb"),
        input_defs=[InputDefinition("start", Nothing)],
    )(download_file(spark_process))

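# Hedged sketch (hypothetical, not from the original source): the shape of the
# `cfg` module that `my_gcp_dataops_pipeline` reads from, limited to the two
# keys it actually references. The query string and dbt project path below are
# placeholders, not values from the original project.
# cfg.py
sql_query = {"QUERY": "SELECT * FROM `my-project.my_dataset.my_table`"}  # placeholder query
dbt_config = {"DBT_PROJECT_DIR": "path/to/dbt_project"}  # placeholder path
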
def test_gcs_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    create_solid = bq_create_dataset.alias('create_solid')
    query_solid = bq_solid_for_queries(
        [
            'SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1'
            % table
        ]
    ).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(
        input_defs=[InputDefinition('success', Nothing)], output_defs=[OutputDefinition(List[Path])]
    )
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        return ["gs://cloud-samples-data/bigquery/us-states/us-states.csv"]

    config = {
        'solids': {
            'create_solid': {'config': {'dataset': dataset, 'exists_ok': True}},
            'import_gcs_paths_to_bq': {
                'config': {
                    'destination': table,
                    'load_job_config': {
                        'autodetect': True,
                        'skip_leading_rows': 1,
                        'source_format': 'CSV',
                        'write_disposition': 'WRITE_TRUNCATE',
                    },
                }
            },
            'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(import_gcs_paths_to_bq(return_gcs_uri(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').output_value()
    assert values[0].to_dict() == {'string_field_0': {0: 'Alabama'}, 'string_field_1': {0: 'AL'}}

    assert not dataset_exists(dataset)

def test_gcs_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    create_solid = bq_create_dataset.alias("create_solid")
    query_solid = bq_solid_for_queries(
        [
            "SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1"
            % table
        ]
    ).alias("query_solid")
    delete_solid = bq_delete_dataset.alias("delete_solid")

    @solid(
        input_defs=[InputDefinition("success", Nothing)], output_defs=[OutputDefinition(List[str])]
    )
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        return ["gs://cloud-samples-data/bigquery/us-states/us-states.csv"]

    config = {
        "solids": {
            "create_solid": {"config": {"dataset": dataset, "exists_ok": True}},
            "import_gcs_paths_to_bq": {
                "config": {
                    "destination": table,
                    "load_job_config": {
                        "autodetect": True,
                        "skip_leading_rows": 1,
                        "source_format": "CSV",
                        "write_disposition": "WRITE_TRUNCATE",
                    },
                }
            },
            "delete_solid": {"config": {"dataset": dataset, "delete_contents": True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(import_gcs_paths_to_bq(return_gcs_uri(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid("query_solid").output_value()
    assert values[0].to_dict() == {"string_field_0": {0: "Alabama"}, "string_field_1": {0: "AL"}}

    assert not dataset_exists(dataset)

def test_config_pipeline():
    bq_solid_for_queries(['SELECT 1']).alias('test')()  # pylint: disable=no-value-for-parameter

def _test():
    bq_solid_for_queries(["SELECT 1"])()
    bq_solid_for_queries(["SELECT *"])()

def test_config_pipeline():
    bq_solid_for_queries(["SELECT 1"]).alias("test")()

def _test():
    bq_solid_for_queries(['SELECT 1'])()
    bq_solid_for_queries(['SELECT *'])()

def test_config_pipeline():
    bq_solid_for_queries(['SELECT 1']).alias('test')()

def _bq_sql_solid(start):
    return bq_solid_for_queries([sql]).alias('bq_sql_internal')(start=start)

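# Hedged usage sketch (assumed, not from the original source): `_bq_sql_solid`
# forwards a `start` dependency, so it can be sequenced after an upstream solid
# inside a pipeline. `prepare_source_tables` is a hypothetical upstream solid
# with a Nothing output, and `sql` is assumed to be defined at module scope.
@pipeline(mode_defs=bq_modes())
def example_sequenced_bq_pipeline():
    _bq_sql_solid(start=prepare_source_tables())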