def test_to_airflow_duplicate_dependencies(self, tmp_path):
    query_file = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "moz-fx-data-test-project"
        / "test"
        / "non_incremental_query_v1"
        / "query.sql"
    )

    query_file2 = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "moz-fx-data-test-project"
        / "test"
        / "no_metadata_query_v1"
        / "query.sql"
    )

    metadata = Metadata(
        "test",
        "test",
        ["*****@*****.**"],
        {},
        {
            "dag_name": "bqetl_test_dag",
            "depends_on_past": True,
            "depends_on": [{"dag_name": "external", "task_id": "task1"}],
        },
    )

    tasks = [
        Task.of_query(query_file, metadata),
        Task.of_query(query_file2, metadata),
    ]

    default_args = {
        "owner": "*****@*****.**",
        "start_date": "2020-01-01",
    }
    dags = DagCollection.from_dict(
        {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": default_args,
            }
        }
    ).with_tasks(tasks)

    dags.to_airflow_dags(tmp_path)
    result = (tmp_path / "bqetl_test_dag.py").read_text().strip()
    expected = (
        (TEST_DIR / "data" / "dags" / "test_dag_duplicate_dependencies")
        .read_text()
        .strip()
    )

    assert result == expected
def test_date_partition_parameter(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )

    assert Task(
        dag_name="bqetl_test_dag",
        query_file=str(query_file),
        owner="*****@*****.**",
        date_partition_parameter=NewType("Ignore", None),
    )

    assert Task(
        dag_name="bqetl_test_dag",
        query_file=str(query_file),
        owner="*****@*****.**",
        date_partition_parameter=None,
    )

    assert Task(
        dag_name="bqetl_test_dag",
        query_file=str(query_file),
        owner="*****@*****.**",
        date_partition_parameter="import_date",
    )
def test_email_validation(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )

    with pytest.raises(ValueError):
        assert Task(
            dag_name="bqetl_test_dag",
            query_file=str(query_file),
            owner="*****@*****.**",
            email=["*****@*****.**", "invalid", "*****@*****.**"],
        )

    assert Task(
        dag_name="bqetl_test_dag",
        query_file=str(query_file),
        owner="*****@*****.**",
        email=[],
    )

    assert Task(
        dag_name="bqetl_test_dag",
        query_file=str(query_file),
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
    )
def test_add_tasks(self): dag = Dag("bqetl_test_dag", "daily", self.default_args) query_file = (TEST_DIR / "data" / "test_sql" / "moz-fx-data-test-project" / "test" / "incremental_query_v1" / "query.sql") scheduling = { "dag_name": "bqetl_test_dag", "default_args": { "owner": "*****@*****.**" }, "task_name": "custom_task_name", } metadata = Metadata("test", "test", ["*****@*****.**"], {}, scheduling) task1 = Task.of_query(query_file) task2 = Task.of_query(query_file, metadata) assert dag.tasks == [] dag.add_tasks([task1, task2]) assert len(dag.tasks) == 2
def test_start_date_validation(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )

    with pytest.raises(ValueError):
        assert Task(
            dag_name="bqetl_test_dag",
            query_file=str(query_file),
            owner="*****@*****.**",
            start_date="March 12th 2020",
        )

    with pytest.raises(ValueError):
        assert Task(
            dag_name="bqetl_test_dag",
            query_file=str(query_file),
            owner="*****@*****.**",
            start_date="2020-13-12",
        )

    assert Task(
        dag_name="bqetl_test_dag",
        query_file=str(query_file),
        owner="*****@*****.**",
        start_date="2020-01-01",
    )
def test_task_get_nested_view_dependencies(
    self, tmp_path, bigquery_client, project_id, temporary_dataset
):
    query_file_path = tmp_path / "sql" / temporary_dataset / "query_v1"
    os.makedirs(query_file_path)
    query_file = query_file_path / "query.sql"
    query_file.write_text(
        f"SELECT * FROM {project_id}.{temporary_dataset}.table1_v1 "
        f"UNION ALL SELECT * FROM {project_id}.{temporary_dataset}.test_view"
    )

    schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
    table = bigquery.Table(
        f"{project_id}.{temporary_dataset}.table1_v1", schema=schema
    )
    bigquery_client.create_table(table)
    table = bigquery.Table(
        f"{project_id}.{temporary_dataset}.table2_v1", schema=schema
    )
    bigquery_client.create_table(table)

    view = bigquery.Table(f"{project_id}.{temporary_dataset}.test_view2")
    view.view_query = f"SELECT * FROM {project_id}.{temporary_dataset}.table2_v1"
    bigquery_client.create_table(view)

    view = bigquery.Table(f"{project_id}.{temporary_dataset}.test_view")
    view.view_query = f"SELECT * FROM {project_id}.{temporary_dataset}.test_view2"
    bigquery_client.create_table(view)

    metadata = Metadata(
        "test", "test", ["*****@*****.**"], {}, self.default_scheduling
    )
    task = Task.of_query(query_file, metadata)

    table_task1 = Task.of_query(
        tmp_path / "sql" / temporary_dataset / "table1_v1" / "query.sql", metadata
    )
    table_task2 = Task.of_query(
        tmp_path / "sql" / temporary_dataset / "table2_v1" / "query.sql", metadata
    )

    dags = DagCollection.from_dict(
        {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
    ).with_tasks([task, table_task1, table_task2])

    task.with_dependencies(bigquery_client, dags)
    result = task.dependencies

    tables = [f"{t.dataset}__{t.table}__{t.version}" for t in result]

    assert f"{temporary_dataset}__table1__v1" in tables
    assert f"{temporary_dataset}__table2__v1" in tables
def test_unscheduled_task(self, tmp_path):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )
    metadata = Metadata("test", "test", [], {}, {})

    with pytest.raises(UnscheduledTask):
        Task.of_query(query_file, metadata)
def test_no_dag_name(self): query_file = (TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql") metadata = Metadata("test", "test", [], {}, self.default_scheduling) with pytest.raises(TaskParseException): Task.of_query(query_file, metadata)
def test_task_get_view_dependencies(
    self, tmp_path, bigquery_client, project_id, temporary_dataset
):
    query_file_path = tmp_path / "sql" / temporary_dataset / "query_v1"
    os.makedirs(query_file_path)
    query_file = query_file_path / "query.sql"
    query_file.write_text(
        f"SELECT * FROM {project_id}.{temporary_dataset}.table1_v1 "
        f"UNION ALL SELECT * FROM {project_id}.{temporary_dataset}.test_view"
    )

    schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
    table = bigquery.Table(
        f"{project_id}.{temporary_dataset}.table1_v1", schema=schema
    )
    bigquery_client.create_table(table)
    table = bigquery.Table(
        f"{project_id}.{temporary_dataset}.table2_v1", schema=schema
    )
    bigquery_client.create_table(table)

    view = bigquery.Table(f"{project_id}.{temporary_dataset}.test_view")
    view.view_query = f"SELECT * FROM {project_id}.{temporary_dataset}.table2_v1"
    bigquery_client.create_table(view)

    metadata = Metadata(
        "test",
        "test",
        {},
        {"dag_name": "test_dag", "depends_on_past": True, "param": "test_param"},
    )

    task = Task(query_file, metadata)
    table_task1 = Task(
        tmp_path / "sql" / temporary_dataset / "table1_v1" / "query.sql", metadata
    )
    table_task2 = Task(
        tmp_path / "sql" / temporary_dataset / "table2_v1" / "query.sql", metadata
    )

    dags = DagCollection.from_dict(
        {"test_dag": {"schedule_interval": "daily", "default_args": {}}}
    ).with_tasks([task, table_task1, table_task2])

    result = task.get_dependencies(bigquery_client, dags)

    tables = [f"{t.dataset}__{t.table}__{t.version}" for t in result]

    assert f"{temporary_dataset}__table1__v1" in tables
    assert f"{temporary_dataset}__table2__v1" in tables
def test_task_get_nested_view_dependencies(self, tmp_path):
    query_file_path = tmp_path / "test-project" / "test" / "query_v1"
    os.makedirs(query_file_path)
    query_file = query_file_path / "query.sql"
    query_file.write_text(
        "SELECT * FROM `test-project`.test.table1_v1 "
        "UNION ALL SELECT * FROM `test-project`.test.test_view"
    )

    view_file_path = tmp_path / "test-project" / "test" / "test_view"
    os.makedirs(view_file_path)
    view_file = view_file_path / "view.sql"
    view_file.write_text(
        "CREATE OR REPLACE VIEW `test-project`.test.test_view "
        "AS SELECT * FROM `test-project`.test.test_view2"
    )

    view2_file_path = tmp_path / "test-project" / "test" / "test_view2"
    os.makedirs(view2_file_path)
    view2_file = view2_file_path / "view.sql"
    view2_file.write_text(
        "CREATE OR REPLACE VIEW `test-project`.test.test_view2 "
        "AS SELECT * FROM `test-project`.test.table2_v1"
    )

    metadata = Metadata(
        "test", "test", ["*****@*****.**"], {}, self.default_scheduling
    )

    task = Task.of_query(query_file, metadata)
    table_task1 = Task.of_query(
        tmp_path / "test-project" / "test" / "table1_v1" / "query.sql",
        metadata,
    )
    table_task2 = Task.of_query(
        tmp_path / "test-project" / "test" / "table2_v1" / "query.sql",
        metadata,
    )

    dags = DagCollection.from_dict(
        {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
    ).with_tasks([task, table_task1, table_task2])

    task.with_dependencies(dags)
    result = task.dependencies

    tables = [t.task_id for t in result]

    assert "test__table1__v1" in tables
    assert "test__table2__v1" in tables
def get_dags(project_id, dags_config):
    """Return all configured DAGs including associated tasks."""
    tasks = []
    dag_collection = DagCollection.from_file(dags_config)

    for project_dir in project_dirs(project_id):
        # parse metadata.yaml to retrieve scheduling information
        if os.path.isdir(project_dir):
            for root, dirs, files in os.walk(project_dir):
                try:
                    if QUERY_FILE in files:
                        query_file = os.path.join(root, QUERY_FILE)
                        task = Task.of_query(query_file, dag_collection=dag_collection)
                    elif QUERY_PART_FILE in files:
                        # multipart query
                        query_file = os.path.join(root, QUERY_PART_FILE)
                        task = Task.of_multipart_query(
                            query_file, dag_collection=dag_collection
                        )
                    elif SCRIPT_FILE in files:
                        query_file = os.path.join(root, SCRIPT_FILE)
                        task = Task.of_script(query_file, dag_collection=dag_collection)
                    elif PYTHON_SCRIPT_FILE in files:
                        query_file = os.path.join(root, PYTHON_SCRIPT_FILE)
                        task = Task.of_python_script(
                            query_file, dag_collection=dag_collection
                        )
                    else:
                        continue
                except FileNotFoundError:
                    # query has no metadata.yaml file; skip
                    pass
                except UnscheduledTask:
                    # most tasks lack scheduling information for now
                    pass
                except Exception:
                    # on any other error, report the query that failed before exiting
                    logging.error(f"Error processing task for query {query_file}")
                    raise
                else:
                    tasks.append(task)
        else:
            logging.error(
                f"Invalid project_dir: {project_dir}, project_dir must be a directory "
                "with structure <sql>/<project>/<dataset>/<table>/metadata.yaml."
            )

    return dag_collection.with_tasks(tasks)
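# A minimal usage sketch for get_dags, assuming it is called from some entry
# point that renders DAG files; generate_dags, output_dir, and the "dags.yaml"
# path below are hypothetical and for illustration only. get_dags returns a
# DagCollection, whose to_airflow_dags writes one Python file per configured
# DAG (the same call exercised by the to_airflow tests above).
def generate_dags(output_dir, project_id="moz-fx-data-test-project"):
    # Collect every scheduled task found under the project's query
    # directories, then render the Airflow DAG files into output_dir.
    dag_collection = get_dags(project_id, "dags.yaml")
    dag_collection.to_airflow_dags(output_dir)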
def test_multipart_task_get_dependencies(
    self, tmp_path, bigquery_client, project_id, temporary_dataset
):
    query_file_path = tmp_path / project_id / temporary_dataset / "query_v1"
    os.makedirs(query_file_path)

    query_file_part1 = query_file_path / "part1.sql"
    query_file_part1.write_text(
        f"SELECT * FROM {project_id}.{temporary_dataset}.table1_v1"
    )
    query_file_part2 = query_file_path / "part2.sql"
    query_file_part2.write_text(
        f"SELECT * FROM {project_id}.{temporary_dataset}.table2_v1"
    )

    schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
    table = bigquery.Table(
        f"{project_id}.{temporary_dataset}.table1_v1", schema=schema
    )
    bigquery_client.create_table(table)
    table = bigquery.Table(
        f"{project_id}.{temporary_dataset}.table2_v1", schema=schema
    )
    bigquery_client.create_table(table)

    metadata = Metadata(
        "test", "test", ["*****@*****.**"], {}, self.default_scheduling
    )
    task = Task.of_multipart_query(query_file_part1, metadata)

    table_task1 = Task.of_query(
        tmp_path / project_id / temporary_dataset / "table1_v1" / "query.sql",
        metadata,
    )
    table_task2 = Task.of_query(
        tmp_path / project_id / temporary_dataset / "table2_v1" / "query.sql",
        metadata,
    )

    dags = DagCollection.from_dict(
        {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
    ).with_tasks([task, table_task1, table_task2])

    task.with_dependencies(dags)
    result = task.dependencies

    tables = [t.task_id for t in result]

    assert f"{temporary_dataset}__table1__v1" in tables
    assert f"{temporary_dataset}__table2__v1" in tables
def test_validate_task_name(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / (("a" * 63) + "_v1") / "query.sql"
    )

    scheduling = {
        "dag_name": "bqetl_test_dag",
        "default_args": {"owner": "*****@*****.**"},
    }

    metadata = Metadata("test", "test", ["*****@*****.**"], {}, scheduling)

    with pytest.raises(ValueError):
        Task.of_query(query_file, metadata)
def test_owner_validation(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )

    with pytest.raises(ValueError):
        assert Task(
            dag_name="bqetl_test_dag", query_file=str(query_file), owner="invalid"
        )

    assert Task(
        dag_name="bqetl_test_dag",
        query_file=str(query_file),
        owner="*****@*****.**",
    )
def test_dags_with_tasks(self): query_file = (TEST_DIR / "data" / "test_sql" / "moz-fx-data-test-project" / "test" / "incremental_query_v1" / "query.sql") metadata = Metadata( "test", "test", ["*****@*****.**"], {}, { "dag_name": "bqetl_test_dag", "depends_on_past": True }, ) tasks = [Task.of_query(query_file, metadata)] dags = DagCollection.from_dict({ "bqetl_test_dag": { "schedule_interval": "daily", "default_args": self.default_args, } }).with_tasks(tasks) assert len(dags.dags) == 1 dag = dags.dag_by_name("bqetl_test_dag") assert len(dag.tasks) == 1 assert dag.tasks[0].dag_name == "bqetl_test_dag"
def get_dags(sql_dir, dags_config):
    """Return all configured DAGs including associated tasks."""
    tasks = []

    # parse metadata.yaml to retrieve scheduling information
    if os.path.isdir(sql_dir):
        for root, dirs, files in os.walk(sql_dir):
            if QUERY_FILE in files:
                query_file = os.path.join(root, QUERY_FILE)

                try:
                    task = Task.of_query(query_file)
                    tasks.append(task)
                except FileNotFoundError:
                    # query has no metadata.yaml file; skip
                    pass
                except UnscheduledTask:
                    # most tasks lack scheduling information for now
                    pass
    else:
        logging.error(
            f"Invalid sql_dir: {sql_dir}, sql_dir must be a directory with "
            "structure /<dataset>/<table>/metadata.yaml."
        )

    return DagCollection.from_file(dags_config).with_tasks(tasks)
def test_dags_with_tasks(self): query_file = (TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql") metadata = Metadata( "test", "test", {}, { "dag_name": "test_dag", "depends_on_past": True, "param": "test_param" }, ) tasks = [Task(query_file, metadata)] dags = DagCollection.from_dict({ "test_dag": { "schedule_interval": "daily", "default_args": {} } }).with_tasks(tasks) assert len(dags.dags) == 1 dag = dags.dag_by_name("test_dag") assert len(dag.tasks) == 1 assert dag.tasks[0].dag_name == "test_dag"
def test_task_for_table(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )

    metadata = Metadata(
        "test",
        "test",
        ["*****@*****.**"],
        {},
        {"dag_name": "bqetl_test_dag", "depends_on_past": True},
    )

    tasks = [Task.of_query(query_file, metadata)]

    dags = DagCollection.from_dict(
        {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": self.default_args,
            }
        }
    ).with_tasks(tasks)

    task = dags.task_for_table("test", "incremental_query_v1")

    assert task
    assert task.dag_name == "bqetl_test_dag"
def test_dags_with_invalid_tasks(self):
    with pytest.raises(InvalidDag):
        query_file = (
            TEST_DIR
            / "data"
            / "test_sql"
            / "test"
            / "incremental_query_v1"
            / "query.sql"
        )

        metadata = Metadata(
            "test",
            "test",
            ["*****@*****.**"],
            {},
            {
                "dag_name": "bqetl_non_existing_dag",
                "depends_on_past": True,
                "param": "test_param",
            },
        )

        tasks = [Task.of_query(query_file, metadata)]

        DagCollection.from_dict(
            {
                "bqetl_test_dag": {
                    "schedule_interval": "daily",
                    "default_args": self.default_args,
                }
            }
        ).with_tasks(tasks)
def test_public_json_dag_to_airflow(self, tmp_path):
    query_file = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "test"
        / "non_incremental_query_v1"
        / "query.sql"
    )

    tasks = [Task.of_query(query_file)]

    default_args = {
        "depends_on_past": False,
        "owner": "*****@*****.**",
        "email": ["*****@*****.**"],
        "start_date": "2020-01-01",
        "retry_delay": "1h",
    }
    dags = DagCollection.from_dict(
        {
            "bqetl_public_data_json": {
                "schedule_interval": "daily",
                "default_args": default_args,
            },
            "bqetl_core": {
                "schedule_interval": "daily",
                "default_args": default_args,
            },
        }
    ).with_tasks(tasks)

    dags.to_airflow_dags(tmp_path)
    result = (tmp_path / "bqetl_public_data_json.py").read_text().strip()
    expected_dag = (
        (TEST_DIR / "data" / "dags" / "test_public_data_json_dag").read_text().strip()
    )

    assert result == expected_dag
def test_public_data_json_dag_add_task(self):
    public_json_dag = PublicDataJsonDag(
        "bqetl_public_data_json_dag", "daily", self.default_args
    )
    assert len(public_json_dag.tasks) == 0

    query_file = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "moz-fx-data-test-project"
        / "test"
        / "incremental_query_v1"
        / "query.sql"
    )
    task = Task.of_query(query_file)
    dag = Dag(
        "bqetl_events",
        "0 1 * * *",
        DagDefaultArgs("*****@*****.**", "2020-01-01"),
        [task],
    )

    public_json_dag.add_export_tasks([task], DagCollection([dag]))

    assert len(public_json_dag.tasks) == 1
    assert (
        public_json_dag.tasks[0].task_name
        == "export_public_data_json_test__incremental_query__v1"
    )
    assert public_json_dag.tasks[0].dag_name == "bqetl_public_data_json_dag"
    assert len(public_json_dag.tasks[0].dependencies) == 1
def test_task_depends_on(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )

    scheduling = {
        "dag_name": "bqetl_test_dag",
        "default_args": {"owner": "*****@*****.**"},
        "depends_on": [
            {"dag_name": "external_dag", "task_id": "external_task"},
            {
                "dag_name": "external_dag2",
                "task_id": "external_task2",
                "execution_delta": "15m",
            },
        ],
    }

    metadata = Metadata("test", "test", ["*****@*****.**"], {}, scheduling)
    task = Task.of_query(query_file, metadata)

    assert task.dag_name == "bqetl_test_dag"
    assert len(task.depends_on) == 2
    assert task.depends_on[0].dag_name == "external_dag"
    assert task.depends_on[0].task_id == "external_task"
    assert task.depends_on[1].dag_name == "external_dag2"
    assert task.depends_on[1].task_id == "external_task2"
    assert task.depends_on[1].execution_delta == "15m"
def test_no_dag_name(self): query_file = (TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql") metadata = Metadata("test", "test", {}, {"foo": "bar"}) with pytest.raises(TaskParseException): Task(query_file, metadata)
def test_validate_custom_task_name(self):
    query_file = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "moz-fx-data-test-project"
        / "test"
        / "incremental_query_v1"
        / "query.sql"
    )

    # a task_name longer than 62 characters is rejected
    scheduling = {
        "dag_name": "bqetl_test_dag",
        "default_args": {"owner": "*****@*****.**"},
        "task_name": "a" * 63,
    }
    metadata = Metadata("test", "test", ["*****@*****.**"], {}, scheduling)

    with pytest.raises(ValueError):
        Task.of_query(query_file, metadata)

    # an empty task_name is rejected
    scheduling = {
        "dag_name": "bqetl_test_dag",
        "default_args": {"owner": "*****@*****.**"},
        "task_name": "",
    }
    metadata = Metadata("test", "test", ["*****@*****.**"], {}, scheduling)

    with pytest.raises(ValueError):
        Task.of_query(query_file, metadata)

    # 62 characters is accepted
    scheduling = {
        "dag_name": "bqetl_test_dag",
        "default_args": {"owner": "*****@*****.**"},
        "task_name": "a" * 62,
    }
    metadata = Metadata("test", "test", ["*****@*****.**"], {}, scheduling)

    task = Task.of_query(query_file, metadata)
    assert task.task_name == "a" * 62
def test_add_tasks(self): dag = Dag("test_dag", "daily", {}) query_file = ( TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql" ) tasks = [Task.of_query(query_file), Task.of_query(query_file)] assert dag.tasks == [] dag.add_tasks(tasks) assert len(dag.tasks) == 2
def test_multipart_task_get_dependencies(self, tmp_path):
    query_file_path = tmp_path / "test-project" / "test" / "query_v1"
    os.makedirs(query_file_path)

    query_file_part1 = query_file_path / "part1.sql"
    query_file_part1.write_text("SELECT * FROM `test-project`.test.table1_v1")
    query_file_part2 = query_file_path / "part2.sql"
    query_file_part2.write_text("SELECT * FROM `test-project`.test.table2_v1")

    metadata = Metadata(
        "test", "test", ["*****@*****.**"], {}, self.default_scheduling
    )
    task = Task.of_multipart_query(query_file_part1, metadata)

    table_task1 = Task.of_query(
        tmp_path / "test-project" / "test" / "table1_v1" / "query.sql",
        metadata,
    )
    table_task2 = Task.of_query(
        tmp_path / "test-project" / "test" / "table2_v1" / "query.sql",
        metadata,
    )

    dags = DagCollection.from_dict(
        {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
    ).with_tasks([task, table_task1, table_task2])

    task.with_dependencies(dags)
    result = task.dependencies

    tables = [t.task_id for t in result]

    assert "test__table1__v1" in tables
    assert "test__table2__v1" in tables
def test_task_instantiation(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )
    metadata = Metadata(
        "test", "test", ["*****@*****.**"], {}, self.default_scheduling
    )

    task = Task.of_query(query_file, metadata)
    assert task.dag_name == "bqetl_test_dag"
    assert task.depends_on_past is False
    assert task.task_name == "test__incremental_query__v1"
    assert task.public_json is False
def test_task_get_dependencies_none(self, tmp_path, bigquery_client): query_file_path = tmp_path / "sql" / "test" / "query_v1" os.makedirs(query_file_path) query_file = query_file_path / "query.sql" query_file.write_text("SELECT 123423") metadata = Metadata( "test", "test", {}, { "dag_name": "test_dag", "depends_on_past": True, "param": "test_param" }, ) task = Task(query_file, metadata) dags = DagCollection.from_dict({}) assert task.get_dependencies(bigquery_client, dags) == []
def test_of_query(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "incremental_query_v1" / "query.sql"
    )

    task = Task.of_query(query_file)
    assert task.query_file == str(query_file)
    assert task.dataset == "test"
    assert task.table == "incremental_query"
    assert task.version == "v1"
    assert task.task_name == "test__incremental_query__v1"
    assert task.dag_name == "bqetl_events"
    assert task.args == {"depends_on_past": False}
def test_task_get_dependencies_none(self, tmp_path, bigquery_client): query_file_path = tmp_path / "sql" / "test" / "query_v1" os.makedirs(query_file_path) query_file = query_file_path / "query.sql" query_file.write_text("SELECT 123423") metadata = Metadata("test", "test", ["*****@*****.**"], {}, self.default_scheduling) task = Task.of_query(query_file, metadata) dags = DagCollection.from_dict({}) task.with_dependencies(bigquery_client, dags) assert task.dependencies == []