def test_get_bq_extract_operator_definition(self, mock_table):
    mock_table.return_value = bq.Table(
        'foo_project.foo_dataset.foo_table',
        context=PipelineTest._create_context())
    task_id = 'foo'
    task_details = {}
    task_details['type'] = 'BigQueryToCloudStorage'
    task_details['table'] = 'foo_project.foo_dataset.foo_table'
    task_details['path'] = 'foo_path'
    task_details['format'] = 'csv'
    task_details['delimiter'] = '$'
    task_details['header'] = False
    task_details['compress'] = True

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    self.assertEqual(
        operator_def,
        """foo = BigQueryToCloudStorageOperator(task_id='foo_id', compression=\"\"\"GZIP\"\"\", destination_cloud_storage_uris=['foo_path'], export_format=\"\"\"CSV\"\"\", field_delimiter=\"\"\"$\"\"\", print_header=False, source_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", dag=dag)
""")  # noqa

    task_details['format'] = 'json'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    self.assertEqual(
        operator_def,
        """foo = BigQueryToCloudStorageOperator(task_id='foo_id', compression=\"\"\"GZIP\"\"\", destination_cloud_storage_uris=['foo_path'], export_format=\"\"\"NEWLINE_DELIMITED_JSON\"\"\", field_delimiter=\"\"\"$\"\"\", print_header=False, source_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", dag=dag)
""")  # noqa
def test_get_bq_extract_operator_definition(self, mock_table):
    mock_table.return_value = google.datalab.bigquery.Table(
        'foo_project.foo_dataset.foo_table',
        context=PipelineTest._create_context())
    task_id = 'foo'
    task_details = {}
    task_details['type'] = 'bq.extract'
    task_details['table'] = 'foo_project.foo_dataset.foo_table'
    task_details['path'] = 'foo_path'
    task_details['format'] = 'csv'
    task_details['delimiter'] = '$'
    task_details['header'] = False
    task_details['compress'] = True

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        ("foo = BigQueryToCloudStorageOperator(task_id='foo_id', "
         "compression='GZIP', destination_cloud_storage_uris='"
         "[foo_path]', export_format='CSV', field_delimiter='$', "
         "print_header=False, source_project_dataset_table='"
         "foo_project.foo_dataset.foo_table', dag=dag)\n"))

    task_details['format'] = 'json'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        ("foo = BigQueryToCloudStorageOperator(task_id='foo_id', "
         "compression='GZIP', destination_cloud_storage_uris='"
         "[foo_path]', export_format='NEWLINE_DELIMITED_JSON', "
         "field_delimiter='$', print_header=False, "
         "source_project_dataset_table='"
         "foo_project.foo_dataset.foo_table', dag=dag)\n"))
def test_get_bq_load_operator_definition(self, mock_table):
    mock_table.return_value = bq.Table(
        'foo_project.foo_dataset.foo_table',
        context=PipelineTest._create_context())
    task_id = 'foo'
    task_details = {}
    task_details['type'] = 'GoogleCloudStorageToBigQuery'
    task_details['table'] = 'foo_project.foo_dataset.foo_table'
    task_details['path'] = 'gs://foo_bucket/foo_file.csv'
    task_details['format'] = 'csv'
    task_details['delimiter'] = '$'
    task_details['skip'] = False

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    self.assertEqual(
        operator_def,
        """foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id', bucket=\"\"\"foo_bucket\"\"\", destination_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", export_format=\"\"\"CSV\"\"\", field_delimiter=\"\"\"$\"\"\", skip_leading_rows=False, source_objects=\"\"\"foo_file.csv\"\"\", dag=dag)
""")  # noqa

    task_details['format'] = 'json'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    self.assertEqual(
        operator_def,
        """foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id', bucket=\"\"\"foo_bucket\"\"\", destination_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", export_format=\"\"\"NEWLINE_DELIMITED_JSON\"\"\", field_delimiter=\"\"\"$\"\"\", skip_leading_rows=False, source_objects=\"\"\"foo_file.csv\"\"\", dag=dag)
""")  # noqa
def test_get_bq_load_operator_definition(self, mock_table):
    mock_table.return_value = google.datalab.bigquery.Table(
        'foo_project.foo_dataset.foo_table',
        context=PipelineTest._create_context())
    task_id = 'foo'
    task_details = {}
    task_details['type'] = 'bq.load'
    task_details['table'] = 'foo_project.foo_dataset.foo_table'
    task_details['path'] = 'gs://foo_bucket/foo_file.csv'
    task_details['format'] = 'csv'
    task_details['delimiter'] = '$'
    task_details['skip'] = False

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        ("foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id',"
         " bucket='foo_bucket', destination_project_dataset_table="
         "'foo_project.foo_dataset.foo_table', export_format='CSV', "
         "field_delimiter='$', skip_leading_rows=False, "
         "source_objects='foo_file.csv', dag=dag)\n"))

    task_details['format'] = 'json'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        ("foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id',"
         " bucket='foo_bucket', destination_project_dataset_table="
         "'foo_project.foo_dataset.foo_table', "
         "export_format='NEWLINE_DELIMITED_JSON', "
         "field_delimiter='$', skip_leading_rows=False, "
         "source_objects='foo_file.csv', dag=dag)\n"))
def test_get_bash_operator_definition_with_templates(self):
    task_id = 'print_pdt_date'
    task_details = {}
    task_details['type'] = 'Bash'
    task_details['output_encoding'] = 'utf-8'
    task_details['bash_command'] = 'date_%(_ds)s'

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    self.assertEqual(
        operator_def,
        """print_pdt_date = BashOperator(task_id='print_pdt_date_id', bash_command=\"\"\"date_{{ ds }}\"\"\", output_encoding=\"\"\"utf-8\"\"\", dag=dag)
""")  # noqa

    # Airflow macros should get replaced in templated fields
    task_details['bash_command'] = 'date_%(_ds)s'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    self.assertEqual(
        operator_def,
        """print_pdt_date = BashOperator(task_id='print_pdt_date_id', bash_command=\"\"\"date_{{ ds }}\"\"\", output_encoding=\"\"\"utf-8\"\"\", dag=dag)
""")  # noqa

    # Airflow macros should not get replaced in non-templated fields
    task_details['bash_command'] = 'date'
    task_details['output_encoding'] = 'foo_%(_ds)s'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    self.assertEqual(
        operator_def,
        """print_pdt_date = BashOperator(task_id='print_pdt_date_id', bash_command=\"\"\"date\"\"\", output_encoding=\"\"\"foo_%(_ds)s\"\"\", dag=dag)
""")  # noqa

    # User-defined modifiers should get replaced in templated fields
    task_details['bash_command'] = 'date_%(foo_key)s'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details,
        [{'name': 'foo_key', 'value': 'foo_value', 'type': 'STRING'}])
    self.assertEqual(
        operator_def,
        """print_pdt_date = BashOperator(task_id='print_pdt_date_id', bash_command=\"\"\"date_foo_value\"\"\", output_encoding=\"\"\"foo_%(_ds)s\"\"\", dag=dag)
""")  # noqa

    # User-defined modifiers should take precedence over the built-in
    # airflow macros
    task_details['bash_command'] = 'date_%(_ds)s'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details,
        [{'name': '_ds', 'value': 'user_value', 'type': 'STRING'}])
    self.assertEqual(
        operator_def,
        """print_pdt_date = BashOperator(task_id='print_pdt_date_id', bash_command=\"\"\"date_user_value\"\"\", output_encoding=\"\"\"foo_%(_ds)s\"\"\", dag=dag)
""")  # noqa
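# --- Illustrative sketch (not part of the test suite) -----------------------
# The assertions above imply that templated fields are expanded with Python
# %-style dict formatting: built-in Airflow macros (e.g. '_ds' -> '{{ ds }}')
# are merged with any user-defined modifiers, and the user-defined values win
# on a name collision. The names _AIRFLOW_MACROS and substitute_macros below
# are assumptions for illustration; the real logic lives in pipeline.py.
_AIRFLOW_MACROS = {'_ds': '{{ ds }}'}  # assumed subset of the real table


def substitute_macros(field_value, modifiers=None):
    mapping = dict(_AIRFLOW_MACROS)
    # User-defined modifiers override built-ins, matching the last assertion
    # above ('date_%(_ds)s' -> 'date_user_value').
    mapping.update({m['name']: m['value'] for m in (modifiers or [])})
    return field_value % mapping


assert substitute_macros('date_%(_ds)s') == 'date_{{ ds }}'
assert substitute_macros(
    'date_%(_ds)s',
    [{'name': '_ds', 'value': 'user_value', 'type': 'STRING'}]) == 'date_user_value'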
def test_get_pydatalab_bq_load_operator_definition(self):
    task_id = 'bq_pipeline_load_task'
    task_details = {}
    task_details['type'] = 'pydatalab.bq.load'
    task_details['delimiter'] = ','
    task_details['format'] = 'csv'
    task_details['mode'] = 'create'
    task_details['path'] = 'test/path'
    task_details['quote'] = '"'
    schema = [
        {
            'mode': 'NULLABLE',
            'type': 'int64',
            'description': 'description1',
            'name': 'col1',
        },
        {
            'mode': 'required',
            'type': 'STRING',
            'description': 'description1',
            'name': 'col2',
        }
    ]
    task_details['schema'] = schema
    task_details['skip'] = 0
    task_details['strict'] = True
    task_details['table'] = 'project.test.table'

    actual = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    pattern = re.compile(
        r"""bq_pipeline_load_task = LoadOperator\(task_id='bq_pipeline_load_task_id', delimiter=',', format='csv', mode='create', path='test/path', quote='"', schema=(.*), skip=0, strict=True, table='project.test.table', dag=dag\)""")  # noqa
    # group(1) has the string that follows the "schema=", i.e. the list of
    # dicts.
    self.assertEqual(pattern.match(actual).group(1), str(schema))
def test_write_to_gcs(self, mock_client_get_bucket, mock_blob_class,
                      mock_client):
    mock_client_get_bucket.return_value = mock.Mock(
        spec=google.cloud.storage.Bucket)
    mock_blob = mock_blob_class.return_value

    dag_dict = yaml.load(PipelineTest._test_pipeline_yaml_spec)
    test_pipeline = pipeline.Pipeline('foo_pipeline', dag_dict)
    test_pipeline.write_to_gcs()
    mock_blob.upload_from_string.assert_called_with(
        test_pipeline._get_airflow_spec())
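# --- Illustrative sketch (not part of the test suite) -----------------------
# A minimal sketch of what write_to_gcs plausibly does, based only on the
# mocked calls above: render the Airflow spec and upload it as a string. The
# bucket and object names are not visible in this test, so the parameters
# below are placeholders, not the real values.
def write_to_gcs_sketch(airflow_spec, bucket_name, object_name):
    import google.cloud.storage
    client = google.cloud.storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob = google.cloud.storage.Blob(object_name, bucket)
    # This is the call the test asserts on.
    blob.upload_from_string(airflow_spec)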
def test_get_templated_bash_operator_definition(self):
    task_id = 'foo_task'
    task_details = {}
    task_details['type'] = 'Bash'
    task_details['bash_command'] = 'echo {{ ds }}'

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        """foo_task = BashOperator(task_id='foo_task_id', bash_command='echo {{ ds }}', dag=dag)
""")  # noqa
def test_get_bash_operator_definition(self):
    task_id = 'print_pdt_date'
    task_details = {}
    task_details['type'] = 'Bash'
    task_details['bash_command'] = 'date'

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        "print_pdt_date = BashOperator(task_id='print_pdt_date_id', "
        "bash_command='date', dag=dag)\n")
def test_get_unknown_operator_definition(self):
    task_id = 'id'
    task_details = {}
    task_details['type'] = 'Unknown'
    task_details['foo'] = 'bar'
    task_details['bar_typed'] = False

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        "id = UnknownOperator(task_id='id_id', "
        "bar_typed=False, foo='bar', dag=dag)\n")
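# --- Illustrative sketch (not part of the test suite) -----------------------
# The expected strings above list keyword arguments in alphabetical order
# (bar_typed before foo), suggesting the parameter string is built from the
# sorted task details minus the 'type' key. format_operator_params is a
# hypothetical name for illustration only.
def format_operator_params(task_details):
    params = {k: v for k, v in task_details.items() if k != 'type'}
    return ', '.join(
        '{}={!r}'.format(k, v) for k, v in sorted(params.items()))


assert format_operator_params(
    {'type': 'Unknown', 'foo': 'bar',
     'bar_typed': False}) == "bar_typed=False, foo='bar'"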
def test_get_templated_bq_definition(self):
    task_id = 'foo_task'
    task_details = {}
    task_details['type'] = 'BigQuery'
    task_details['query'] = google.datalab.bigquery.Query(
        'SELECT * FROM `cloud-datalab-samples.httplogs.logs_%(ds)s`')

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        """foo_task = BigQueryOperator(task_id='foo_task_id', bql='SELECT * FROM `cloud-datalab-samples.httplogs.logs_{{ ds_nodash }}`', use_legacy_sql=False, dag=dag)
""")  # noqa
def test_get_bq_execute_operator_definition(self, mock_table):
    mock_table.return_value = bq.Table(
        'foo_project.foo_dataset.foo_table',
        context=PipelineTest._create_context())
    task_id = 'foo'
    task_details = {}
    task_details['type'] = 'BigQuery'
    task_details['query'] = google.datalab.bigquery.Query(
        'SELECT * FROM publicdata.samples.wikipedia LIMIT 5')

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        "foo = BigQueryOperator(task_id='foo_id', bql='SELECT * FROM publicdata.samples.wikipedia LIMIT 5', use_legacy_sql=False, dag=dag)\n")  # noqa
def test_get_bq_operator_definition(self):
    task_id = 'query_wikipedia'
    task_details = {}
    task_details['type'] = 'bq'
    task_details['query'] = google.datalab.bigquery.Query(
        'SELECT * FROM publicdata.samples.wikipedia LIMIT 5')

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    self.assertEqual(
        operator_def,
        "query_wikipedia = BigQueryOperator(task_id='query_wikipedia_id', "
        "bql='SELECT * FROM publicdata.samples.wikipedia LIMIT 5', "
        "use_legacy_sql=False, dag=dag)\n")
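# Note on the generated code above: 'bql' is the query parameter name used by
# older Airflow BigQueryOperator releases; newer Airflow versions deprecated
# it in favor of 'sql'. These tests target the older parameter name.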
def test_get_pydatalab_bq_execute_operator_definition(self):
    task_id = 'bq_pipeline_execute_task'
    task_details = {}
    task_details['type'] = 'pydatalab.bq.execute'
    task_details['large'] = True
    task_details['mode'] = 'create'
    task_details['sql'] = 'foo_query'
    task_details['table'] = 'project.test.table'

    actual = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    expected = """bq_pipeline_execute_task = ExecuteOperator(task_id='bq_pipeline_execute_task_id', large=True, mode=\"\"\"create\"\"\", sql=\"\"\"foo_query\"\"\", table=\"\"\"project.test.table\"\"\", dag=dag)
"""  # noqa
    self.assertEqual(actual, expected)
def test_execute_operator_definition(self, mock_get_notebook_item,
                                     mock_query_execute):
    mock_get_notebook_item.return_value = google.datalab.bigquery.Query(
        'test_sql')
    task_id = 'foo'
    task_details = {}
    task_details['type'] = 'pydatalab.bq.execute'
    task_details['sql'] = 'test_sql'
    task_details['mode'] = 'create'

    actual = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    expected = """foo = ExecuteOperator(task_id='foo_id', mode='create', sql='test_sql', dag=dag)\n"""  # noqa
    self.assertEqual(actual, expected)
def test_get_airflow_spec_with_default_schedule(self):
    dag_dict = yaml.load(PipelineTest._test_pipeline_yaml_spec)
    # We delete the schedule spec to test with defaults
    del dag_dict['schedule']

    test_pipeline = pipeline.Pipeline('foo_name', dag_dict)
    actual = test_pipeline.get_airflow_spec()
    self.assertIn('import datetime', actual)
    self.assertIn("'email': ['*****@*****.**', '*****@*****.**']", actual)
    self.assertIn("schedule_interval='@once'", actual)
    self.assertIn('current_timestamp_id', actual)
    self.assertIn('tomorrows_timestamp_id', actual)
    self.assertIn('tomorrows_timestamp.set_upstream(current_timestamp)',
                  actual)
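# --- Illustrative sketch (not part of the test suite) -----------------------
# Consistent with the "@once" assertion above, a plausible fallback when the
# 'schedule' block is absent from the YAML spec. The key names and the helper
# name are assumptions; the real default lives in pipeline.py.
def resolve_schedule_interval(dag_dict):
    # Fall back to Airflow's '@once' when no schedule is specified.
    return (dag_dict.get('schedule') or {}).get('interval', '@once')


assert resolve_schedule_interval({}) == '@once'
assert resolve_schedule_interval(
    {'schedule': {'interval': '0-59 * * * *'}}) == '0-59 * * * *'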
def test_get_pydatalab_bq_extract_operator_definition(self):
    task_id = 'bq_pipeline_extract_task'
    task_details = {}
    task_details['type'] = 'pydatalab.bq.extract'
    task_details['billing'] = 'foo'
    task_details['compress'] = True
    task_details['delimiter'] = ','
    task_details['format'] = 'csv'
    task_details['header'] = True
    task_details['path'] = 'test/path'

    actual = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    expected = """bq_pipeline_extract_task = ExtractOperator(task_id='bq_pipeline_extract_task_id', billing='foo', compress=True, delimiter=',', format='csv', header=True, path='test/path', dag=dag)
"""  # noqa
    self.assertEqual(actual, expected)
def test_get_bq_execute_operator_definition(self, mock_table):
    mock_table.return_value = bq.Table(
        'foo_project.foo_dataset.foo_table',
        context=PipelineTest._create_context())
    task_id = 'foo'
    task_details = {}
    task_details['type'] = 'BigQuery'
    # Adding newlines to the query to mimic actual usage of %%bq query ...
    task_details['query'] = google.datalab.bigquery.Query("""SELECT *
FROM publicdata.samples.wikipedia
LIMIT 5""")

    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    self.assertEqual(
        operator_def,
        """foo = BigQueryOperator(task_id='foo_id', bql=\"\"\"SELECT *\nFROM publicdata.samples.wikipedia\nLIMIT 5\"\"\", use_legacy_sql=False, dag=dag)
""")  # noqa
def test_py_bq(self):
    env = {}
    env['foo_query'] = google.datalab.bigquery.Query(
        'INSERT INTO rajivpb_demo.the_datetime_table (the_datetime) VALUES (CURRENT_DATETIME())')  # noqa
    airflow_dag = pipeline.Pipeline(PipelineTest._dag_spec,
                                    'demo_bq_dag_during_demo', env)
    expected_py = """
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_table_delete_operator import BigQueryTableDeleteOperator
from airflow.contrib.operators.bigquery_to_bigquery import BigQueryToBigQueryOperator
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from datetime import timedelta
from pytz import timezone

default_args = {
    'owner': 'Datalab',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'start_date': datetime.datetime.strptime('2009-05-05T22:28:15', '%Y-%m-%dT%H:%M:%S').replace(tzinfo=timezone('UTC')),
    'end_date': datetime.datetime.strptime('2009-05-06T22:28:15', '%Y-%m-%dT%H:%M:%S').replace(tzinfo=timezone('UTC')),
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(dag_id='demo_bq_dag_during_demo', schedule_interval='0-59 * * * *', default_args=default_args)
current_timestamp = BigQueryOperator(task_id='current_timestamp_id', bql='INSERT INTO rajivpb_demo.the_datetime_table (the_datetime) VALUES (CURRENT_DATETIME())', use_legacy_sql=False, dag=dag)
tomorrows_timestamp = BigQueryOperator(task_id='tomorrows_timestamp_id', bql='INSERT INTO rajivpb_demo.the_datetime_table (the_datetime) VALUES (CURRENT_DATETIME())', use_legacy_sql=False, dag=dag)
tomorrows_timestamp.set_upstream(current_timestamp)
"""  # noqa
    self.assertEqual(airflow_dag.py, expected_py)
def test_get_pydatalab_bq_load_operator_definition(self):
    task_id = 'bq_pipeline_load_task'
    task_details = {}
    task_details['type'] = 'pydatalab.bq.load'
    task_details['delimiter'] = ','
    task_details['format'] = 'csv'
    task_details['mode'] = 'create'
    task_details['path'] = 'test/path'
    task_details['quote'] = '"'
    schema = [{
        'mode': 'NULLABLE',
        'type': 'int64',
        'description': 'description1',
        'name': 'col1',
    }, {
        'mode': 'required',
        'type': 'STRING',
        'description': 'description1',
        'name': 'col2',
    }]
    task_details['schema'] = schema
    task_details['skip'] = 0
    task_details['strict'] = True
    task_details['table'] = 'project.test.table'

    actual = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details, None)
    pattern = re.compile(
        r"""bq_pipeline_load_task = LoadOperator\(task_id='bq_pipeline_load_task_id', delimiter=\"\"\",\"\"\", format=\"\"\"csv\"\"\", mode=\"\"\"create\"\"\", path=\"\"\"test/path\"\"\", quote=\"\"\""\"\"\", schema=(.*), skip=0, strict=True, table=\"\"\"project.test.table\"\"\", dag=dag\)""")  # noqa
    # group(1) has the string that follows the "schema=", i.e. the list of
    # dicts. Since we're comparing string literals of dicts that have the
    # items re-ordered, we just sort the string. This is a loose check.
    sorted_string_of_actual_schema = ''.join(
        sorted(pattern.match(actual).group(1)))
    sorted_string_of_expected_schema = ''.join(sorted(str(schema)))
    self.assertEqual(sorted_string_of_actual_schema,
                     sorted_string_of_expected_schema)
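# --- Illustrative note (not part of the test suite) --------------------------
# Why the loose check above: before Python 3.7, dict iteration order was not
# guaranteed, so str(schema) could list keys in any order. Sorting the
# characters of both strings makes the comparison order-insensitive, at the
# cost of being a weaker check. A small demonstration:
a = str({'name': 'col1', 'type': 'int64'})
b = str({'type': 'int64', 'name': 'col1'})
# The rendered strings may differ in key order, but their sorted characters
# are identical.
assert ''.join(sorted(a)) == ''.join(sorted(b))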
def test_get_dag_definition(self):
    test_pipeline = pipeline.Pipeline('', 'foo')
    self.assertEqual(
        test_pipeline._get_dag_definition('bar'),
        "dag = DAG(dag_id='foo', schedule_interval='bar', "
        "default_args=default_args)\n\n")
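# --- Illustrative sketch (not part of the test suite) -----------------------
# Inferred only from the assertion above: _get_dag_definition plausibly
# interpolates the pipeline name and schedule interval into a DAG constructor
# line. This is a sketch under that assumption, not the real implementation.
def _get_dag_definition_sketch(name, schedule_interval):
    return ("dag = DAG(dag_id='{}', schedule_interval='{}', "
            "default_args=default_args)\n\n").format(name, schedule_interval)


assert _get_dag_definition_sketch('foo', 'bar') == (
    "dag = DAG(dag_id='foo', schedule_interval='bar', "
    "default_args=default_args)\n\n")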