def get_dags(project_id, dags_config):
    """Return all configured DAGs including associated tasks."""
    tasks = []
    dag_collection = DagCollection.from_file(dags_config)

    for project_dir in project_dirs(project_id):
        # parse metadata.yaml to retrieve scheduling information
        if os.path.isdir(project_dir):
            for root, dirs, files in os.walk(project_dir):
                try:
                    if QUERY_FILE in files:
                        query_file = os.path.join(root, QUERY_FILE)
                        task = Task.of_query(query_file, dag_collection=dag_collection)
                    elif QUERY_PART_FILE in files:
                        # multipart query
                        query_file = os.path.join(root, QUERY_PART_FILE)
                        task = Task.of_multipart_query(
                            query_file, dag_collection=dag_collection
                        )
                    elif SCRIPT_FILE in files:
                        query_file = os.path.join(root, SCRIPT_FILE)
                        task = Task.of_script(query_file, dag_collection=dag_collection)
                    elif PYTHON_SCRIPT_FILE in files:
                        query_file = os.path.join(root, PYTHON_SCRIPT_FILE)
                        task = Task.of_python_script(
                            query_file, dag_collection=dag_collection
                        )
                    else:
                        continue
                except FileNotFoundError:
                    # query has no metadata.yaml file; skip
                    pass
                except UnscheduledTask:
                    # logging.debug(f"No scheduling information for {query_file}.")
                    # most tasks lack scheduling information for now
                    pass
                except Exception as e:
                    # in the case that there was some other error, report the
                    # query that failed before exiting
                    logging.error(f"Error processing task for query {query_file}")
                    raise e
                else:
                    tasks.append(task)
        else:
            logging.error(
                """
                Invalid project_dir: {}, project_dir must be a directory with
                structure <sql>/<project>/<dataset>/<table>/metadata.yaml.
                """.format(project_dir)
            )

    return dag_collection.with_tasks(tasks)
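# Usage sketch (illustrative only, not from the source): "my-gcp-project" and
# "dags.yaml" are assumed placeholders for the GCP project id and the DAGs
# configuration file read by DagCollection.from_file; the real values depend
# on the <sql>/<project>/<dataset>/<table>/metadata.yaml layout described above.
dag_collection = get_dags("my-gcp-project", "dags.yaml")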
def test_multipart_task_get_dependencies(
    self, tmp_path, bigquery_client, project_id, temporary_dataset
):
    query_file_path = tmp_path / project_id / temporary_dataset / "query_v1"
    os.makedirs(query_file_path)

    query_file_part1 = query_file_path / "part1.sql"
    query_file_part1.write_text(
        f"SELECT * FROM {project_id}.{temporary_dataset}.table1_v1"
    )

    query_file_part2 = query_file_path / "part2.sql"
    query_file_part2.write_text(
        f"SELECT * FROM {project_id}.{temporary_dataset}.table2_v1"
    )

    schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
    table = bigquery.Table(
        f"{project_id}.{temporary_dataset}.table1_v1", schema=schema
    )
    bigquery_client.create_table(table)
    table = bigquery.Table(
        f"{project_id}.{temporary_dataset}.table2_v1", schema=schema
    )
    bigquery_client.create_table(table)

    metadata = Metadata(
        "test", "test", ["*****@*****.**"], {}, self.default_scheduling
    )

    task = Task.of_multipart_query(query_file_part1, metadata)

    table_task1 = Task.of_query(
        tmp_path / project_id / temporary_dataset / "table1_v1" / "query.sql",
        metadata,
    )
    table_task2 = Task.of_query(
        tmp_path / project_id / temporary_dataset / "table2_v1" / "query.sql",
        metadata,
    )

    dags = DagCollection.from_dict(
        {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
    ).with_tasks([task, table_task1, table_task2])

    task.with_dependencies(dags)
    result = task.dependencies

    tables = [t.task_id for t in result]

    assert f"{temporary_dataset}__table1__v1" in tables
    assert f"{temporary_dataset}__table2__v1" in tables
def test_of_multipart_query(self):
    query_file = (
        TEST_DIR / "data" / "test_sql" / "test" / "multipart_query_v1" / "part1.sql"
    )
    task = Task.of_multipart_query(query_file)

    assert task.query_file == str(query_file)
    assert task.dataset == "test"
    assert task.table == "multipart_query"
    assert task.version == "v1"
    assert task.task_name == "test__multipart_query__v1"
    assert task.dag_name == "bqetl_core"
    assert task.depends_on_past is False
    assert task.multipart
    assert task.sql_file_path == os.path.dirname(query_file)
def test_multipart_task_get_dependencies(self, tmp_path):
    query_file_path = tmp_path / "test-project" / "test" / "query_v1"
    os.makedirs(query_file_path)

    query_file_part1 = query_file_path / "part1.sql"
    query_file_part1.write_text("SELECT * FROM `test-project`.test.table1_v1")

    query_file_part2 = query_file_path / "part2.sql"
    query_file_part2.write_text("SELECT * FROM `test-project`.test.table2_v1")

    metadata = Metadata(
        "test", "test", ["*****@*****.**"], {}, self.default_scheduling
    )

    task = Task.of_multipart_query(query_file_part1, metadata)

    table_task1 = Task.of_query(
        tmp_path / "test-project" / "test" / "table1_v1" / "query.sql",
        metadata,
    )
    table_task2 = Task.of_query(
        tmp_path / "test-project" / "test" / "table2_v1" / "query.sql",
        metadata,
    )

    dags = DagCollection.from_dict(
        {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
    ).with_tasks([task, table_task1, table_task2])

    task.with_dependencies(dags)
    result = task.dependencies

    tables = [t.task_id for t in result]

    assert "test__table1__v1" in tables
    assert "test__table2__v1" in tables
def get_dags(sql_dir, dags_config):
    """Return all configured DAGs including associated tasks."""
    tasks = []

    # parse metadata.yaml to retrieve scheduling information
    if os.path.isdir(sql_dir):
        for root, dirs, files in os.walk(sql_dir):
            if QUERY_FILE in files:
                query_file = os.path.join(root, QUERY_FILE)
                try:
                    task = Task.of_query(query_file)
                    tasks.append(task)
                except FileNotFoundError:
                    # query has no metadata.yaml file; skip
                    pass
                except UnscheduledTask:
                    # logging.debug(f"No scheduling information for {query_file}.")
                    # most tasks lack scheduling information for now
                    pass
            elif QUERY_PART_FILE in files:
                # multipart query
                query_file = os.path.join(root, QUERY_PART_FILE)
                try:
                    task = Task.of_multipart_query(query_file)
                    tasks.append(task)
                except FileNotFoundError:
                    # query has no metadata.yaml file; skip
                    pass
                except UnscheduledTask:
                    pass
    else:
        logging.error(
            """
            Invalid sql_dir: {}, sql_dir must be a directory with
            structure /<dataset>/<table>/metadata.yaml.
            """.format(sql_dir)
        )

    return DagCollection.from_file(dags_config).with_tasks(tasks)