Example #1
    def test_get_bq_extract_operator_definition(self, mock_table):
        mock_table.return_value = bq.Table(
            'foo_project.foo_dataset.foo_table',
            context=PipelineTest._create_context())
        task_id = 'foo'
        task_details = {}
        task_details['type'] = 'BigQueryToCloudStorage'
        task_details['table'] = 'foo_project.foo_dataset.foo_table'
        task_details['path'] = 'foo_path'
        task_details['format'] = 'csv'
        task_details['delimiter'] = '$'
        task_details['header'] = False
        task_details['compress'] = True
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = BigQueryToCloudStorageOperator(task_id='foo_id', compression=\"\"\"GZIP\"\"\", destination_cloud_storage_uris=[\'foo_path\'], export_format=\"\"\"CSV\"\"\", field_delimiter=\"\"\"$\"\"\", print_header=False, source_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", dag=dag)
""")  # noqa

        task_details['format'] = 'json'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = BigQueryToCloudStorageOperator(task_id='foo_id', compression=\"\"\"GZIP\"\"\", destination_cloud_storage_uris=[\'foo_path\'], export_format=\"\"\"NEWLINE_DELIMITED_JSON\"\"\", field_delimiter=\"\"\"$\"\"\", print_header=False, source_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", dag=dag)
""")  # noqa
Example #2
    def test_get_bq_extract_operator_definition(self, mock_table):
        mock_table.return_value = google.datalab.bigquery.Table(
            'foo_project.foo_dataset.foo_table',
            context=PipelineTest._create_context())
        task_id = 'foo'
        task_details = {}
        task_details['type'] = 'bq.extract'
        task_details['table'] = 'foo_project.foo_dataset.foo_table'
        task_details['path'] = 'foo_path'
        task_details['format'] = 'csv'
        task_details['delimiter'] = '$'
        task_details['header'] = False
        task_details['compress'] = True
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details)
        self.assertEqual(
            operator_def,
            ('foo = BigQueryToCloudStorageOperator(task_id=\'foo_id\', '
             'compression=\'GZIP\', destination_cloud_storage_uris=\''
             '[foo_path]\', export_format=\'CSV\', field_delimiter=\'$\', '
             'print_header=False, source_project_dataset_table=\''
             'foo_project.foo_dataset.foo_table\', dag=dag)\n'))

        task_details['format'] = 'json'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details)
        self.assertEqual(
            operator_def,
            ('foo = BigQueryToCloudStorageOperator(task_id=\'foo_id\', '
             'compression=\'GZIP\', destination_cloud_storage_uris=\''
             '[foo_path]\', export_format=\'NEWLINE_DELIMITED_JSON\', '
             'field_delimiter=\'$\', print_header=False, '
             'source_project_dataset_table=\''
             'foo_project.foo_dataset.foo_table\', dag=dag)\n'))
Example #3
    def test_get_bq_load_operator_definition(self, mock_table):
        mock_table.return_value = bq.Table(
            'foo_project.foo_dataset.foo_table',
            context=PipelineTest._create_context())
        task_id = 'foo'
        task_details = {}
        task_details['type'] = 'GoogleCloudStorageToBigQuery'
        task_details['table'] = 'foo_project.foo_dataset.foo_table'
        task_details['path'] = 'gs://foo_bucket/foo_file.csv'
        task_details['format'] = 'csv'
        task_details['delimiter'] = '$'
        task_details['skip'] = False
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id', bucket=\"\"\"foo_bucket\"\"\", destination_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", export_format=\"\"\"CSV\"\"\", field_delimiter=\"\"\"$\"\"\", skip_leading_rows=False, source_objects=\"\"\"foo_file.csv\"\"\", dag=dag)
""")  # noqa

        task_details['format'] = 'json'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id', bucket=\"\"\"foo_bucket\"\"\", destination_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", export_format=\"\"\"NEWLINE_DELIMITED_JSON\"\"\", field_delimiter=\"\"\"$\"\"\", skip_leading_rows=False, source_objects=\"\"\"foo_file.csv\"\"\", dag=dag)
""")  # noqa
Example #4
    def test_get_bq_load_operator_definition(self, mock_table):
        mock_table.return_value = google.datalab.bigquery.Table(
            'foo_project.foo_dataset.foo_table',
            context=PipelineTest._create_context())
        task_id = 'foo'
        task_details = {}
        task_details['type'] = 'bq.load'
        task_details['table'] = 'foo_project.foo_dataset.foo_table'
        task_details['path'] = 'gs://foo_bucket/foo_file.csv'
        task_details['format'] = 'csv'
        task_details['delimiter'] = '$'
        task_details['skip'] = False
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details)
        self.assertEqual(
            operator_def,
            ('foo = GoogleCloudStorageToBigQueryOperator(task_id=\'foo_id\','
             ' bucket=\'foo_bucket\', destination_project_dataset_table='
             '\'foo_project.foo_dataset.foo_table\', export_format=\'CSV\', '
             'field_delimiter=\'$\', skip_leading_rows=False, '
             'source_objects=\'foo_file.csv\', dag=dag)\n'))

        task_details['format'] = 'json'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details)
        self.assertEqual(
            operator_def,
            ('foo = GoogleCloudStorageToBigQueryOperator(task_id=\'foo_id\','
             ' bucket=\'foo_bucket\', destination_project_dataset_table='
             '\'foo_project.foo_dataset.foo_table\', '
             'export_format=\'NEWLINE_DELIMITED_JSON\', '
             'field_delimiter=\'$\', skip_leading_rows=False, '
             'source_objects=\'foo_file.csv\', dag=dag)\n'))
Example #5
    def test_get_bash_operator_definition_with_templates(self):
        task_id = 'print_pdt_date'
        task_details = {}
        task_details['type'] = 'Bash'
        task_details['output_encoding'] = 'utf-8'
        task_details['bash_command'] = 'date_%(_ds)s'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """print_pdt_date = BashOperator(task_id=\'print_pdt_date_id\', bash_command=\"\"\"date_{{ ds }}\"\"\", output_encoding=\"\"\"utf-8\"\"\", dag=dag)
""")  # noqa

        # Airflow macros should get replaced in templated fields
        task_details['bash_command'] = 'date_%(_ds)s'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """print_pdt_date = BashOperator(task_id=\'print_pdt_date_id\', bash_command=\"\"\"date_{{ ds }}\"\"\", output_encoding=\"\"\"utf-8\"\"\", dag=dag)
""")  # noqa

        # Airflow macros should not get replaced in non-templated fields
        task_details['bash_command'] = 'date'
        task_details['output_encoding'] = 'foo_%(_ds)s'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """print_pdt_date = BashOperator(task_id=\'print_pdt_date_id\', bash_command=\"\"\"date\"\"\", output_encoding=\"\"\"foo_%(_ds)s\"\"\", dag=dag)
""")  # noqa

        # User-defined modifiers should get replaced in templated fields
        task_details['bash_command'] = 'date_%(foo_key)s'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, [{
                'name': 'foo_key',
                'value': 'foo_value',
                'type': 'STRING'
            }])
        self.assertEqual(
            operator_def,
            """print_pdt_date = BashOperator(task_id=\'print_pdt_date_id\', bash_command=\"\"\"date_foo_value\"\"\", output_encoding=\"\"\"foo_%(_ds)s\"\"\", dag=dag)
""")  # noqa

        # User-defined modifiers should take precedence over the built-in airflow macros
        task_details['bash_command'] = 'date_%(_ds)s'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, [{
                'name': '_ds',
                'value': 'user_value',
                'type': 'STRING'
            }])
        self.assertEqual(
            operator_def,
            """print_pdt_date = BashOperator(task_id=\'print_pdt_date_id\', bash_command=\"\"\"date_user_value\"\"\", output_encoding=\"\"\"foo_%(_ds)s\"\"\", dag=dag)
""")  # noqa
Example #6
  def test_get_pydatalab_bq_load_operator_definition(self):
    task_id = 'bq_pipeline_load_task'
    task_details = {}
    task_details['type'] = 'pydatalab.bq.load'
    task_details['delimiter'] = ','
    task_details['format'] = 'csv'
    task_details['mode'] = 'create'
    task_details['path'] = 'test/path'
    task_details['quote'] = '"'
    schema = [
      {
        'mode': 'NULLABLE',
        'type': 'int64',
        'description': 'description1',
        'name': 'col1',
      },
      {
        'mode': 'required',
        'type': 'STRING',
        'description': 'description1',
        'name': 'col2',
      }
    ]
    task_details['schema'] = schema
    task_details['skip'] = 0
    task_details['strict'] = True
    task_details['table'] = 'project.test.table'

    actual = pipeline.Pipeline(None, None)._get_operator_definition(
        task_id, task_details)
    pattern = re.compile("""bq_pipeline_load_task = LoadOperator\(task_id='bq_pipeline_load_task_id', delimiter=',', format='csv', mode='create', path='test/path', quote='"', schema=(.*), skip=0, strict=True, table='project.test.table', dag=dag\)""")  # noqa

    # group(1) has the string that follows the "schema=", i.e. the list of dicts.
    self.assertEqual(pattern.match(actual).group(1), str(schema))
Example #7
 def test_write_to_gcs(self, mock_client_get_bucket, mock_blob_class, mock_client):
   mock_client_get_bucket.return_value = mock.Mock(spec=google.cloud.storage.Bucket)
   mock_blob = mock_blob_class.return_value
   dag_dict = yaml.load(PipelineTest._test_pipeline_yaml_spec)
   test_pipeline = pipeline.Pipeline('foo_pipeline', dag_dict)
   test_pipeline.write_to_gcs()
   mock_blob.upload_from_string.assert_called_with(test_pipeline._get_airflow_spec())
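
For context, the mocks in the example above stand in for a plain google.cloud.storage call chain. A hedged sketch of what the production path presumably does (the bucket and blob naming here is made up; only the final upload_from_string call is confirmed by the assertion above):

import google.cloud.storage

def write_spec_to_gcs(bucket_name, blob_name, airflow_spec):
    # Build a client, resolve the bucket, and upload the rendered Airflow spec as a string.
    client = google.cloud.storage.Client()
    bucket = client.get_bucket(bucket_name)
    bucket.blob(blob_name).upload_from_string(airflow_spec)
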
Example #8
  def test_get_templated_bash_operator_definition(self):
    task_id = 'foo_task'
    task_details = {}
    task_details['type'] = 'Bash'
    task_details['bash_command'] = 'echo {{ ds }}'
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(task_id, task_details)
    self.assertEqual(
      operator_def,
      """foo_task = BashOperator(task_id='foo_task_id', bash_command='echo {{ ds }}', dag=dag)
""")  # noqa
Example #9
 def test_get_bash_operator_definition(self):
   task_id = 'print_pdt_date'
   task_details = {}
   task_details['type'] = 'Bash'
   task_details['bash_command'] = 'date'
   operator_def = pipeline.Pipeline(None, None)._get_operator_definition(task_id, task_details)
   self.assertEqual(
       operator_def,
       'print_pdt_date = BashOperator(task_id=\'print_pdt_date_id\', '
       'bash_command=\'date\', dag=dag)\n')
Example #10
 def test_get_unknown_operator_definition(self):
   task_id = 'id'
   task_details = {}
   task_details['type'] = 'Unknown'
   task_details['foo'] = 'bar'
   task_details['bar_typed'] = False
   operator_def = pipeline.Pipeline(None, None)._get_operator_definition(task_id, task_details)
   self.assertEqual(operator_def,
                    'id = UnknownOperator(''task_id=\'id_id\', ' +
                    'bar_typed=False, foo=\'bar\', dag=dag)\n')
Example #11
  def test_get_templated_bq_definition(self):
    task_id = 'foo_task'
    task_details = {}
    task_details['type'] = 'BigQuery'
    task_details['query'] = google.datalab.bigquery.Query(
      'SELECT * FROM `cloud-datalab-samples.httplogs.logs_%(_ds_nodash)s`')
    operator_def = pipeline.Pipeline(None, None)._get_operator_definition(task_id, task_details)
    self.assertEqual(
      operator_def,
      """foo_task = BigQueryOperator(task_id='foo_task_id', bql='SELECT * FROM `cloud-datalab-samples.httplogs.logs_{{ ds_nodash }}`', use_legacy_sql=False, dag=dag)
""")  # noqa
Example #12
 def test_get_bq_execute_operator_definition(self, mock_table):
   mock_table.return_value = bq.Table(
       'foo_project.foo_dataset.foo_table',
       context=PipelineTest._create_context())
   task_id = 'foo'
   task_details = {}
   task_details['type'] = 'BigQuery'
   task_details['query'] = google.datalab.bigquery.Query(
     'SELECT * FROM publicdata.samples.wikipedia LIMIT 5')
   operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
       task_id, task_details)
   self.assertEqual(operator_def, "foo = BigQueryOperator(task_id='foo_id', bql='SELECT * FROM publicdata.samples.wikipedia LIMIT 5', use_legacy_sql=False, dag=dag)\n")  # noqa
Example #13
 def test_get_bq_operator_definition(self):
     task_id = 'query_wikipedia'
     task_details = {}
     task_details['type'] = 'bq'
     task_details['query'] = google.datalab.bigquery.Query(
         'SELECT * FROM publicdata.samples.wikipedia LIMIT 5')
     operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
         task_id, task_details)
     self.assertEqual(
         operator_def,
         'query_wikipedia = BigQueryOperator(task_id=\'query_wikipedia_id\', '
         'bql=\'SELECT * FROM publicdata.samples.wikipedia LIMIT 5\', '
         'use_legacy_sql=False, dag=dag)\n')
Example #14
    def test_get_pydatalab_bq_execute_operator_definition(self):
        task_id = 'bq_pipeline_execute_task'
        task_details = {}
        task_details['type'] = 'pydatalab.bq.execute'
        task_details['large'] = True
        task_details['mode'] = 'create'
        task_details['sql'] = 'foo_query'
        task_details['table'] = 'project.test.table'
        actual = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        expected = """bq_pipeline_execute_task = ExecuteOperator(task_id='bq_pipeline_execute_task_id', large=True, mode=\"\"\"create\"\"\", sql=\"\"\"foo_query\"\"\", table=\"\"\"project.test.table\"\"\", dag=dag)
"""  # noqa
        self.assertEqual(actual, expected)
Example #15
    def test_execute_operator_definition(self, mock_get_notebook_item,
                                         mock_query_execute):
        mock_get_notebook_item.return_value = google.datalab.bigquery.Query(
            'test_sql')
        task_id = 'foo'
        task_details = {}
        task_details['type'] = 'pydatalab.bq.execute'
        task_details['sql'] = 'test_sql'
        task_details['mode'] = 'create'

        actual = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details)
        expected = """foo = ExecuteOperator(task_id='foo_id', mode='create', sql='test_sql', dag=dag)\n"""  # noqa
        self.assertEqual(actual, expected)
Example #16
    def test_get_airflow_spec_with_default_schedule(self):
        dag_dict = yaml.load(PipelineTest._test_pipeline_yaml_spec)
        # We delete the schedule spec to test with defaults
        del dag_dict['schedule']

        test_pipeline = pipeline.Pipeline('foo_name', dag_dict)
        actual = test_pipeline.get_airflow_spec()
        self.assertIn('import datetime', actual)
        self.assertIn("'email': ['*****@*****.**', '*****@*****.**']", actual)
        self.assertIn("schedule_interval='@once'", actual)
        self.assertIn('current_timestamp_id', actual)
        self.assertIn('tomorrows_timestamp_id', actual)
        self.assertIn('tomorrows_timestamp.set_upstream(current_timestamp)',
                      actual)
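
Note that this example (like Example #7) parses the pipeline spec with a bare yaml.load(...). On PyYAML 5.1 and later that call warns unless a Loader is given, and PyYAML 6 requires one; when reproducing these tests on a newer PyYAML, the drop-in safe equivalent is:

dag_dict = yaml.safe_load(PipelineTest._test_pipeline_yaml_spec)
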
Example #17
  def test_get_pydatalab_bq_extract_operator_definition(self):
    task_id = 'bq_pipeline_extract_task'
    task_details = {}
    task_details['type'] = 'pydatalab.bq.extract'
    task_details['billing'] = 'foo'
    task_details['compress'] = True
    task_details['delimiter'] = ','
    task_details['format'] = 'csv'
    task_details['header'] = True
    task_details['path'] = 'test/path'

    actual = pipeline.Pipeline(None, None)._get_operator_definition(task_id, task_details)
    expected = """bq_pipeline_extract_task = ExtractOperator(task_id='bq_pipeline_extract_task_id', billing='foo', compress=True, delimiter=',', format='csv', header=True, path='test/path', dag=dag)
"""  # noqa
    self.assertEqual(actual, expected)
Example #18
    def test_get_bq_execute_operator_definition(self, mock_table):
        mock_table.return_value = bq.Table(
            'foo_project.foo_dataset.foo_table',
            context=PipelineTest._create_context())
        task_id = 'foo'
        task_details = {}
        task_details['type'] = 'BigQuery'

        # Adding newlines to the query to mimic actual usage of %%bq query ...
        task_details['query'] = google.datalab.bigquery.Query("""SELECT *
FROM publicdata.samples.wikipedia
LIMIT 5""")
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = BigQueryOperator(task_id='foo_id', bql=\"\"\"SELECT *\nFROM publicdata.samples.wikipedia\nLIMIT 5\"\"\", use_legacy_sql=False, dag=dag)
""")  # noqa
Example #19
    def test_py_bq(self):
        env = {}
        env['foo_query'] = google.datalab.bigquery.Query(
            'INSERT INTO rajivpb_demo.the_datetime_table (the_datetime) VALUES (CURRENT_DATETIME())'
        )

        airflow_dag = pipeline.Pipeline(PipelineTest._dag_spec,
                                        'demo_bq_dag_during_demo', env)
        expected_py = """
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_table_delete_operator import BigQueryTableDeleteOperator
from airflow.contrib.operators.bigquery_to_bigquery import BigQueryToBigQueryOperator
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from datetime import timedelta
from pytz import timezone

default_args = {
    'owner': 'Datalab',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'start_date': datetime.datetime.strptime('2009-05-05T22:28:15', '%Y-%m-%dT%H:%M:%S').replace(tzinfo=timezone('UTC')),
    'end_date': datetime.datetime.strptime('2009-05-06T22:28:15', '%Y-%m-%dT%H:%M:%S').replace(tzinfo=timezone('UTC')),
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(dag_id='demo_bq_dag_during_demo', schedule_interval='0-59 * * * *', default_args=default_args)

current_timestamp = BigQueryOperator(task_id='current_timestamp_id', bql='INSERT INTO rajivpb_demo.the_datetime_table (the_datetime) VALUES (CURRENT_DATETIME())', use_legacy_sql=False, dag=dag)
tomorrows_timestamp = BigQueryOperator(task_id='tomorrows_timestamp_id', bql='INSERT INTO rajivpb_demo.the_datetime_table (the_datetime) VALUES (CURRENT_DATETIME())', use_legacy_sql=False, dag=dag)
tomorrows_timestamp.set_upstream(current_timestamp)
""" # noqa
        self.assertEqual(airflow_dag.py, expected_py)
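
Because the whole generated module is compared as a single string, a cheap additional guard (not part of the original test) is to confirm that the rendered DAG source is at least syntactically valid Python:

import ast
ast.parse(airflow_dag.py)  # raises SyntaxError if the generated module is malformed
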
Example #20
    def test_get_pydatalab_bq_load_operator_definition(self):
        task_id = 'bq_pipeline_load_task'
        task_details = {}
        task_details['type'] = 'pydatalab.bq.load'
        task_details['delimiter'] = ','
        task_details['format'] = 'csv'
        task_details['mode'] = 'create'
        task_details['path'] = 'test/path'
        task_details['quote'] = '"'
        schema = [{
            'mode': 'NULLABLE',
            'type': 'int64',
            'description': 'description1',
            'name': 'col1',
        }, {
            'mode': 'required',
            'type': 'STRING',
            'description': 'description1',
            'name': 'col2',
        }]
        task_details['schema'] = schema
        task_details['skip'] = 0
        task_details['strict'] = True
        task_details['table'] = 'project.test.table'

        actual = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        pattern = re.compile(
            """bq_pipeline_load_task = LoadOperator\(task_id='bq_pipeline_load_task_id', delimiter=\"\"\",\"\"\", format=\"\"\"csv\"\"\", mode=\"\"\"create\"\"\", path=\"\"\"test/path\"\"\", quote=\"\"\""\"\"\", schema=(.*), skip=0, strict=True, table=\"\"\"project.test.table\"\"\", dag=dag\)"""
        )  # noqa
        # group(1) has the string that follows the "schema=", i.e. the list of dicts.
        # Since we're comparing string literals of dicts that have the items re-ordered, we just sort
        # the string. This is a loose check.
        sorted_string_of_actual_schema = ''.join(
            sorted(pattern.match(actual).group(1)))
        sorted_string_of_expected_schema = ''.join(sorted(str(schema)))
        self.assertEqual(sorted_string_of_actual_schema,
                         sorted_string_of_expected_schema)
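
The sorted-character comparison above (like the regex capture in Example #6) is deliberately loose: str() of a dict renders keys in whatever order the dict happens to hold them, which is not guaranteed before Python 3.7, so the expected operator string cannot hard-code the schema literal. A tiny standalone illustration of what the sort tolerates:

# Two dicts with the same items but different key order stringify differently...
a = str([{'name': 'col1', 'type': 'int64'}])
b = str([{'type': 'int64', 'name': 'col1'}])
assert a != b
# ...yet their sorted character sequences match, which is all the test checks.
assert ''.join(sorted(a)) == ''.join(sorted(b))
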
Example #21
 def test_get_dag_definition(self):
     test_pipeline = pipeline.Pipeline('', 'foo')
     self.assertEqual(
         test_pipeline._get_dag_definition('bar'),
         'dag = DAG(dag_id=\'foo\', schedule_interval=\'bar\', '
         'default_args=default_args)\n\n')