def dag_preprocess_tables(
        dag_id,
        schedule_interval,
        start_date,
        target_project_id,
        target_dataset_id,
        table_config,
        table_partition,
):
    dag = DAG(dag_id=dag_id,
              schedule_interval=schedule_interval,
              start_date=start_date)

    for table in table_config:
        start_check_tables_task = DummyOperator(
            task_id='%s-%s' % ("start_check_tables_task", table["name"]),
            dag=dag)

        check_if_table_exist = BranchPythonOperator(
            task_id='%s-%s' % (table["name"], "check_if_table_exist"),
            python_callable=if_tbl_exists,
            op_kwargs={
                'dataset': target_dataset_id,
                'project': target_project_id,
                'table_name': table["name"]
            },
            dag=dag)

        table_exists = DummyOperator(
            task_id='%s-%s' % (table["name"], "table_exists"), dag=dag)

        table_does_not_exist = DummyOperator(
            task_id='%s-%s' % (table["name"], "table_does_not_exist"), dag=dag)

        # [start create equipped_item_reference if not exists]
        create_if_not_exists = BigQueryCreateEmptyTableOperator(
            task_id='%s-%s' % (table["name"], "create_if_not_exists"),
            project_id=target_project_id,
            dataset_id=target_dataset_id,
            table_id=table["name"],
            gcs_schema_object=table["schema_gcs_location"],
            time_partitioning=table_partition,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            dag=dag)

        end_check_tables_task = DummyOperator(
            task_id='%s-%s' % ("end_check_tables_task", table["name"]),
            trigger_rule='none_failed_or_skipped',
            dag=dag)

        start_check_tables_task >> check_if_table_exist >> [
            table_does_not_exist, table_exists
        ]
        table_does_not_exist >> create_if_not_exists >> end_check_tables_task
        table_exists >> end_check_tables_task

    return dag
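# The BranchPythonOperator above calls an if_tbl_exists helper that is not part of
# this snippet. A minimal sketch of what it could look like (an assumption, not the
# original helper): it probes BigQuery with the client library and returns the
# task_id matching the '<table>-table_exists' / '<table>-table_does_not_exist'
# naming scheme used above.
from google.cloud import bigquery
from google.cloud.exceptions import NotFound


def if_tbl_exists(dataset, project, table_name, **kwargs):
    client = bigquery.Client(project=project)
    try:
        client.get_table('%s.%s.%s' % (project, dataset, table_name))
        return '%s-%s' % (table_name, "table_exists")
    except NotFound:
        return '%s-%s' % (table_name, "table_does_not_exist")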
def test_execute(self, mock_hook):
    operator = BigQueryCreateEmptyTableOperator(task_id=TASK_ID,
                                                dataset_id=TEST_DATASET,
                                                project_id=TEST_PROJECT_ID,
                                                table_id=TEST_TABLE_ID)

    operator.execute(None)
    mock_hook.return_value \
        .get_conn() \
        .cursor() \
        .create_empty_table \
        .assert_called_once_with(
            dataset_id=TEST_DATASET,
            project_id=TEST_PROJECT_ID,
            table_id=TEST_TABLE_ID,
            schema_fields=None,
            time_partitioning={}
        )
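# The names above are assumed to come from the surrounding test module; a minimal
# sketch of that scaffolding (placeholder values, contrib-era import path assumed)
# shows how the hook is patched so execute() never touches a real project:
import unittest
from unittest import mock

TASK_ID = 'test-bq-create-empty-table-operator'
TEST_DATASET = 'test-dataset'
TEST_PROJECT_ID = 'test-project'
TEST_TABLE_ID = 'test-table-id'


class TestBigQueryCreateEmptyTableOperator(unittest.TestCase):

    @mock.patch('airflow.contrib.operators.bigquery_operator.BigQueryHook')
    def test_execute(self, mock_hook):
        ...  # body as shown above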
}

dag = DAG(
    dag_id='create_and_insert_table',
    default_args=default_args,
    description='DAG to create empty table, dataset provided',
    schedule_interval=timedelta(days=1),
    catchup=False,
)

create_table_task = BigQueryCreateEmptyTableOperator(
    project_id="dark-furnace-298806",
    dataset_id="shtest",
    task_id="create_empty_table",
    table_id="new_service_table",
    bigquery_conn_id="bigquery_default",
    google_cloud_storage_conn_id="google_cloud_default",
    dag=dag
)

insert_task = BigQueryOperator(
    task_id="insert_into_table",
    bql='''
        select * from bigquery-public-data.austin_311.311_service_requests limit 5
    ''',
    use_legacy_sql=False,
    destination_dataset_table="dark-furnace-298806:shtest.new_service_table",
task_id="create-dataset", dataset_id=DATASET_NAME) create_dataset_with_location = BigQueryCreateEmptyDatasetOperator( task_id="create_dataset_with_location", dataset_id=LOCATION_DATASET_NAME, location=BQ_LOCATION) create_table = BigQueryCreateEmptyTableOperator( task_id="create-table", dataset_id=DATASET_NAME, table_id="test_table", schema_fields=[ { "name": "emp_name", "type": "STRING", "mode": "REQUIRED" }, { "name": "salary", "type": "INTEGER", "mode": "NULLABLE" }, ], ) create_table_with_location = BigQueryCreateEmptyTableOperator( task_id="create_table_with_location", dataset_id=LOCATION_DATASET_NAME, table_id="test_table", schema_fields=[ {
# Create table
# CreateTable = BigQueryCreateEmptyTableOperator(
#     task_id='BigQueryCreateEmptyTableOperator_task',
#     dataset_id='ODS',
#     table_id='Employees',
#     project_id='internal-gcp-project',
#     gcs_schema_object='gs://schema-bucket/employee_schema.json',
#     bigquery_conn_id='airflow-service-account',
#     google_cloud_storage_conn_id='airflow-service-account'
# )

CreateTable = BigQueryCreateEmptyTableOperator(
    task_id='BigQueryCreateEmptyTableOperator_task',
    dataset_id='Covid',
    table_id='GoogleTrend_test',
    project_id='covidproject-278521',
    gcs_schema_object='gs://testcovidlinh/googletrend_schema.json',
    # schema_fields=googletrend_schema,
    bigquery_conn_id=gcp_conn_id,
    google_cloud_storage_conn_id=gcp_conn_id)

# Loading data from GCS to BigQuery
gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id='GCS_to_BigQuery',
    dag=dag,
    bucket='testcovidlinh',
    source_objects=source_objects,
    # schema_object = "/tmp/covidStatSchema.json",
    schema_object='googletrend_schema.json',
    # schema_fields = googletrend_schema,
    source_format='CSV',
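# Note the two schema parameters in play above: BigQueryCreateEmptyTableOperator's
# gcs_schema_object expects a full gs://bucket/object URI, while
# GoogleCloudStorageToBigQueryOperator's schema_object is an object path resolved
# against its bucket argument, hence the two different spellings of the same
# googletrend_schema.json file.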
# task2 create a new covid_19 partition table.
task2 = BigQueryCreateEmptyTableOperator(
    task_id='bq_create_new_talble',
    dag=dag,
    dataset_id=BQ_DATASET,
    table_id=BQ_TABLE,
    project_id=BQ_PROJECT,
    bigquery_conn_id=BQ_CONN_ID,
    schema_fields=[{
        "name": "DateStr",
        "type": "DATE",
        "mode": "REQUIRED"
    }, {
        "name": "State",
        "type": "STRING",
        "mode": "REQUIRED"
    }, {
        "name": "Count",
        "type": "INTEGER",
        "mode": "REQUIRED"
    }, {
        "name": "Status",
        "type": "STRING",
        "mode": "REQUIRED"
    }],
    time_partitioning={
        "type": "DAY",
        "field": "DateStr",
        "expiration_ms": "5184000000"
    }  # partition expiration 60 days
)
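# expiration_ms sanity check: 60 days * 24 h * 3600 s * 1000 ms = 5,184,000,000 ms,
# matching the string above. A typical downstream load (an assumption, not part of
# this snippet) would then write each run into its day partition via the $YYYYMMDD
# decorator on the templated destination_dataset_table:
task3 = BigQueryOperator(
    task_id='bq_write_to_partition',
    dag=dag,
    bigquery_conn_id=BQ_CONN_ID,
    use_legacy_sql=False,
    sql='SELECT ...',  # placeholder query producing DateStr/State/Count/Status
    destination_dataset_table=BQ_PROJECT + '.' + BQ_DATASET + '.' + BQ_TABLE + '${{ ds_nodash }}',
    write_disposition='WRITE_TRUNCATE')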
dag = DAG('incremental_ingestion',
          catchup=False,
          default_args=default_args,
          schedule_interval=dt.timedelta(days=1))

load_config = PythonOperator(task_id='load_config',
                             provide_context=True,
                             python_callable=fetch_config,
                             dag=dag)

bq_create_staging = BigQueryCreateEmptyTableOperator(
    task_id='bq_create_staging',
    project_id='{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["project_id"]}}',
    dataset_id='{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["dataset_id"]}}',
    table_id='{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["staging_table_id"]}}',
    gcs_schema_object='{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["schema_file"]}}',
    dag=dag)

stage_data = DataflowTemplateOperator(
    task_id='stage_data',
    template='{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["template"] }}',
    dataflow_default_options={
        'project': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["project"] }}',
        'region': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["region"] }}',
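# fetch_config (the load_config callable) is assumed to stash this run's settings in
# an Airflow Variable named "config-<run_id>", which is what the
# var.json|attr("config-{}".format(run_id)) templates above resolve. A minimal
# sketch with placeholder values:
import json

from airflow.models import Variable


def fetch_config(**context):
    config = {
        'bigquery': {
            'project_id': 'my-project',
            'dataset_id': 'staging',
            'staging_table_id': 'events_staging',
            'schema_file': 'gs://my-bucket/schemas/events.json',
        },
        'dataflow': {
            'template': 'gs://my-bucket/templates/ingest_template',
            'options': {'project': 'my-project', 'region': 'us-central1'},
        },
    }
    Variable.set('config-{}'.format(context['run_id']), json.dumps(config))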
    task_id='create_football_matches_dataset',
    project_id=project_id,
    dataset_id=dataset_id,
    bigquery_conn_id=gcp_conn,
    dag=dag
)

create_games_table = BigQueryCreateEmptyTableOperator(
    task_id="create_games_table",
    project_id=project_id,
    dataset_id=dataset_id,
    bigquery_conn_id=gcp_conn,
    table_id="games",
    schema_fields=[{"name": "date", "type": "TIMESTAMP", "mode": "REQUIRED"},
                   {"name": "team_1", "type": "STRING", "mode": "REQUIRED"},
                   {"name": "team_2", "type": "STRING", "mode": "REQUIRED"},
                   {"name": "team_1_score", "type": "INTEGER", "mode": "REQUIRED"},
                   {"name": "team_2_score", "type": "INTEGER", "mode": "REQUIRED"},
                   {"name": "home_team", "type": "STRING", "mode": "NULLABLE"},
                   {"name": "tournament", "type": "STRING", "mode": "NULLABLE"},
                   {"name": "country", "type": "STRING", "mode": "NULLABLE"},
                   {"name": "city", "type": "STRING", "mode": "NULLABLE"},
                   ],
    dag=dag
)

etl_games_job_code_path = 'gs://' + gcs_football_bucket + '/spark_jobs/etl_games_to_bigquery.py'

submit_etl_games_spark_job = DataProcPySparkOperator(
    task_id='submit_etl_games_spark_job',
    main=etl_games_job_code_path,
    cluster_name=cluster_name,
    job_name='etl_games_to_bigquery',
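# The task ordering is not shown in this snippet; assuming the dataset operator at
# the top is bound to a variable such as create_dataset, a natural wiring would be:
# create_dataset >> create_games_table >> submit_etl_games_spark_job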