def bq_test():
    bq_op_for_queries(
        [
            # Toy example query
            "SELECT 1 AS field1, 2 AS field2;",
            # Test access of public BQ historical dataset (only processes ~2MB here)
            # pylint: disable=line-too-long
            """SELECT * FROM `weathersource-com.pub_weather_data_samples.sample_weather_history_anomaly_us_zipcode_daily` ORDER BY postal_code ASC, date_valid_std ASC LIMIT 1""",
        ]
    ).alias("bq_query_op")()
def test_gcs_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    # Aliased ops for the create -> load -> query -> delete pipeline
    create_op = bq_create_dataset.alias("create_op")
    query_op = bq_op_for_queries(
        [
            "SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1"
            % table
        ]
    ).alias("query_op")
    delete_op = bq_delete_dataset.alias("delete_op")

    @op(
        input_defs=[InputDefinition("success", Nothing)],
        output_defs=[OutputDefinition(List[str])],
    )
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        # Public sample CSV used as the load source
        return ["gs://cloud-samples-data/bigquery/us-states/us-states.csv"]

    @job(resource_defs={"bigquery": bigquery_resource})
    def bq_from_gcs():
        delete_op(query_op(import_gcs_paths_to_bq(return_gcs_uri(create_op()))))

    result = bq_from_gcs.execute_in_process(
        run_config={
            "ops": {
                "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
                "import_gcs_paths_to_bq": {
                    "config": {
                        "destination": table,
                        "load_job_config": {
                            "autodetect": True,
                            "skip_leading_rows": 1,
                            "source_format": "CSV",
                            "write_disposition": "WRITE_TRUNCATE",
                        },
                    }
                },
                "delete_op": {"config": {"dataset": dataset, "delete_contents": True}},
            }
        }
    )
    assert result.success

    values = result.output_for_node("query_op")
    assert values[0].to_dict() == {
        "string_field_0": {0: "Alabama"},
        "string_field_1": {0: "AL"},
    }

    assert not dataset_exists(dataset)
def test_pd_df_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    test_df = pd.DataFrame({"num1": [1, 3], "num2": [2, 4]})

    create_op = bq_create_dataset.alias("create_op")
    load_op = import_df_to_bq.alias("load_op")
    query_op = bq_op_for_queries(["SELECT num1, num2 FROM %s" % table]).alias("query_op")
    delete_op = bq_delete_dataset.alias("delete_op")

    @op(
        input_defs=[InputDefinition("success", Nothing)],
        output_defs=[OutputDefinition(DataFrame)],
    )
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    # Round-trip: load the DataFrame into BQ, query it back, then delete the dataset
    @job(resource_defs={"bigquery": bigquery_resource})
    def bq_circle_of_life():
        delete_op(query_op(load_op(return_df(create_op()))))

    result = bq_circle_of_life.execute_in_process(
        run_config={
            "ops": {
                "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
                "load_op": {"config": {"destination": table}},
                "delete_op": {"config": {"dataset": dataset, "delete_contents": True}},
            }
        }
    )
    assert result.success

    values = result.output_for_node("query_op")
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {"pyarrow": None, "fastparquet": None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            bq_circle_of_life.execute_in_process(
                run_config={
                    "ops": {
                        "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
                        "load_op": {"config": {"destination": table}},
                        "delete_op": {
                            "config": {"dataset": dataset, "delete_contents": True}
                        },
                    }
                }
            )
        assert (
            "loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet"
            " to be installed" in str(exc_info.value.user_exception)
        )

    # Clean up the dataset left behind by the failed run above
    @job(resource_defs={"bigquery": bigquery_resource})
    def cleanup_bq():
        delete_op()

    result = cleanup_bq.execute_in_process(
        run_config={
            "ops": {"delete_op": {"config": {"dataset": dataset, "delete_contents": True}}}
        }
    )
    assert result.success

    assert not dataset_exists(dataset)
def test_config():
    bq_op_for_queries(["SELECT 1"]).alias("test")()