# Imports assumed by this section (a sketch based on the pre-0.8 dagster
# solid/pipeline API that this code targets; adjust to match the rest of the
# module if it already imports these):
import sys
from unittest import mock

import pandas as pd
import pytest

from dagster import (
    DagsterExecutionStepExecutionError,
    InputDefinition,
    List,
    Nothing,
    OutputDefinition,
    Path,
    String,
    execute_pipeline,
    pipeline,
    solid,
)
from dagster_gcp import (
    BigQueryLoadSource,
    bq_create_dataset,
    bq_delete_dataset,
    bq_load_solid_for_source,
    bq_solid_for_queries,
)
from dagster_pandas import DataFrame


def bq_load_events(source_uris: List[String]):
    # Alias the GCS load solid so it appears under a stable name in the
    # pipeline graph; intended to be invoked inside a composition body.
    return bq_load_solid_for_source(BigQueryLoadSource.GCS).alias(
        'bq_load_events_internal'
    )(source_uris)
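
# The tests below rely on three module-level helpers (get_dataset,
# dataset_exists, bq_modes) that are not shown in this section. A minimal
# sketch of plausible implementations follows; the names come from the tests
# themselves, but these bodies are assumptions, not the module's actual code.
import uuid

from google.api_core.exceptions import NotFound
from google.cloud import bigquery

from dagster import ModeDefinition
from dagster_gcp import bigquery_resource


def get_dataset():
    # Assumed: yields a unique dataset name per test run, e.g. test_ds_83791a53.
    return 'test_ds_' + uuid.uuid4().hex[:8]


def dataset_exists(name):
    # Assumed: checks whether a BigQuery dataset exists, letting tests verify
    # that they cleaned up after themselves.
    client = bigquery.Client()
    try:
        client.get_dataset(name)
        return True
    except NotFound:
        return False


def bq_modes():
    # Assumed: a single mode providing dagster_gcp's bigquery_resource under
    # the 'bigquery' resource key required by the BigQuery solids.
    return [ModeDefinition(resource_defs={'bigquery': bigquery_resource})]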

def test_pd_df_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    test_df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    create_solid = bq_create_dataset.alias('create_solid')
    load_solid = bq_load_solid_for_source(BigQueryLoadSource.DataFrame).alias('load_solid')
    query_solid = bq_solid_for_queries(['SELECT num1, num2 FROM %s' % table]).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(
        input_defs=[InputDefinition('success', Nothing)],
        output_defs=[OutputDefinition(DataFrame)],
    )
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    config = {
        'solids': {
            'create_solid': {'config': {'dataset': dataset, 'exists_ok': True}},
            'load_solid': {'config': {'destination': table}},
            'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_df(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').result_value()
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {'pyarrow': None, 'fastparquet': None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            result = execute_pipeline(bq_pipeline, config)
        assert (
            'loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet'
            ' to be installed' in str(exc_info.value.user_exception)
        )

    cleanup_config = {
        'solids': {'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}}}
    }

    @pipeline(mode_defs=bq_modes())
    def cleanup():
        delete_solid()

    assert execute_pipeline(cleanup, cleanup_config).success
    assert not dataset_exists(dataset)

def test_gcs_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    create_solid = bq_create_dataset.alias('create_solid')
    load_solid = bq_load_solid_for_source(BigQueryLoadSource.GCS).alias('load_solid')
    query_solid = bq_solid_for_queries(
        [
            'SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1'
            % table
        ]
    ).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(
        input_defs=[InputDefinition('success', Nothing)],
        output_defs=[OutputDefinition(List[Path])],
    )
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        return ["gs://cloud-samples-data/bigquery/us-states/us-states.csv"]

    config = {
        'solids': {
            'create_solid': {'config': {'dataset': dataset, 'exists_ok': True}},
            'load_solid': {
                'config': {
                    'destination': table,
                    'load_job_config': {
                        'autodetect': True,
                        'skip_leading_rows': 1,
                        'source_format': 'CSV',
                        'write_disposition': 'WRITE_TRUNCATE',
                    },
                }
            },
            'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_gcs_uri(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').result_value()
    assert values[0].to_dict() == {
        'string_field_0': {0: 'Alabama'},
        'string_field_1': {0: 'AL'},
    }

    assert not dataset_exists(dataset)
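
# For reference, the 'load_job_config' section in the config above mirrors
# fields of google.cloud.bigquery.LoadJobConfig. A rough sketch of the
# equivalent direct client call (an assumed mapping for illustration, not
# code from this module):
def _equivalent_direct_load(destination_table, source_uri):
    from google.cloud import bigquery

    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.skip_leading_rows = 1
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    # Kick off the load job and block until it completes.
    return client.load_table_from_uri(
        source_uri, destination_table, job_config=job_config
    ).result()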

def _gcs_to_bigquery_solid(source_uris: List[String]):
    # Same pattern as bq_load_events above: alias the GCS load solid and
    # invoke it on the given URIs from within a composition body.
    return bq_load_solid_for_source(BigQueryLoadSource.GCS).alias(
        'gcs_to_bigquery_solid_internal'
    )(source_uris)
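
# A minimal usage sketch for the helper above. The solid and pipeline names
# (extract_uris, gcs_events_pipeline) and the bucket path are illustrative
# assumptions, not part of this module. Like bq_load_events, the helper is
# meant to be called inside a composition function, where invoking it wires
# the aliased load solid into the graph:
@solid(output_defs=[OutputDefinition(List[Path])])
def extract_uris(_context):
    # Assumed source location; replace with real GCS URIs.
    return ['gs://my-bucket/events/2019-01-01/*.csv']


@pipeline(mode_defs=bq_modes())
def gcs_events_pipeline():
    _gcs_to_bigquery_solid(extract_uris())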