示例#1
0
def get_dags(project_id, dags_config):
    """Return all configured DAGs including associated tasks."""
    dag_collection = DagCollection.from_file(dags_config)
    tasks = []

    # trigger file name -> Task factory, checked in priority order
    task_factories = (
        (QUERY_FILE, Task.of_query),
        (QUERY_PART_FILE, Task.of_multipart_query),  # multipart query
        (SCRIPT_FILE, Task.of_script),
        (PYTHON_SCRIPT_FILE, Task.of_python_script),
    )

    for project_dir in project_dirs(project_id):
        if not os.path.isdir(project_dir):
            logging.error("""
                Invalid project_dir: {}, project_dir must be a directory with
                structure <sql>/<project>/<dataset>/<table>/metadata.yaml.
                """.format(project_dir))
            continue

        # parse metadata.yaml to retrieve scheduling information
        for root, dirs, files in os.walk(project_dir):
            make_task = None
            for file_name, factory in task_factories:
                if file_name in files:
                    query_file = os.path.join(root, file_name)
                    make_task = factory
                    break
            if make_task is None:
                continue

            try:
                task = make_task(query_file, dag_collection=dag_collection)
            except FileNotFoundError:
                # query has no metadata.yaml file; skip
                pass
            except UnscheduledTask:
                # most tasks lack scheduling information for now
                pass
            except Exception as e:
                # in the case that there was some other error, report the query
                # that failed before exiting
                logging.error(
                    f"Error processing task for query {query_file}")
                raise e
            else:
                tasks.append(task)

    return dag_collection.with_tasks(tasks)
示例#2
0
    def test_multipart_task_get_dependencies(self, tmp_path, bigquery_client,
                                             project_id, temporary_dataset):
        """Dependencies of a multipart query resolve to tasks for the
        tables referenced in its part files."""
        multipart_dir = tmp_path / project_id / temporary_dataset / "query_v1"
        os.makedirs(multipart_dir)

        # each part references a different upstream table
        part1 = multipart_dir / "part1.sql"
        part1.write_text(
            f"SELECT * FROM {project_id}.{temporary_dataset}.table1_v1")
        part2 = multipart_dir / "part2.sql"
        part2.write_text(
            f"SELECT * FROM {project_id}.{temporary_dataset}.table2_v1")

        # create both upstream tables so dependency resolution can find them
        schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
        bigquery_client.create_table(bigquery.Table(
            f"{project_id}.{temporary_dataset}.table1_v1", schema=schema))
        bigquery_client.create_table(bigquery.Table(
            f"{project_id}.{temporary_dataset}.table2_v1", schema=schema))

        metadata = Metadata("test", "test", ["*****@*****.**"], {},
                            self.default_scheduling)

        task = Task.of_multipart_query(part1, metadata)
        upstream_tasks = [
            Task.of_query(
                tmp_path / project_id / temporary_dataset / table_dir /
                "query.sql",
                metadata,
            )
            for table_dir in ("table1_v1", "table2_v1")
        ]

        dags = DagCollection.from_dict({
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }).with_tasks([task] + upstream_tasks)

        task.with_dependencies(dags)
        dependency_ids = [dep.task_id for dep in task.dependencies]

        assert f"{temporary_dataset}__table1__v1" in dependency_ids
        assert f"{temporary_dataset}__table2__v1" in dependency_ids
示例#3
0
    def test_of_multipart_query(self):
        """A part file of a multipart query yields a correctly parsed Task."""
        part_file = (TEST_DIR / "data" / "test_sql" / "test" /
                     "multipart_query_v1" / "part1.sql")

        task = Task.of_multipart_query(part_file)

        # task identity is derived from the directory path of the part file
        assert task.query_file == str(part_file)
        assert task.dataset == "test"
        assert task.table == "multipart_query"
        assert task.version == "v1"
        assert task.task_name == "test__multipart_query__v1"
        assert task.dag_name == "bqetl_core"
        assert task.depends_on_past is False
        assert task.multipart
        assert task.sql_file_path == os.path.dirname(part_file)
示例#4
0
    def test_multipart_task_get_dependencies(self, tmp_path):
        """Dependencies of a multipart query resolve to tasks for the
        tables referenced in its part files."""
        part_dir = tmp_path / "test-project" / "test" / "query_v1"
        os.makedirs(part_dir)

        # each part references a different upstream table
        part1 = part_dir / "part1.sql"
        part1.write_text(
            "SELECT * FROM `test-project`.test.table1_v1")
        part2 = part_dir / "part2.sql"
        part2.write_text(
            "SELECT * FROM `test-project`.test.table2_v1")

        metadata = Metadata("test", "test", ["*****@*****.**"], {},
                            self.default_scheduling)

        task = Task.of_multipart_query(part1, metadata)
        upstream_tasks = [
            Task.of_query(
                tmp_path / "test-project" / "test" / table_dir / "query.sql",
                metadata,
            )
            for table_dir in ("table1_v1", "table2_v1")
        ]

        dags = DagCollection.from_dict({
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }).with_tasks([task] + upstream_tasks)

        task.with_dependencies(dags)
        dependency_ids = [dep.task_id for dep in task.dependencies]

        assert "test__table1__v1" in dependency_ids
        assert "test__table2__v1" in dependency_ids
示例#5
0
def get_dags(sql_dir, dags_config):
    """Return all configured DAGs including associated tasks.

    Walks ``sql_dir`` looking for query files, builds a task for each
    scheduled query, and attaches the tasks to the DAGs defined in
    ``dags_config``. Queries without a metadata.yaml file or without
    scheduling information are skipped.
    """
    tasks = []

    if os.path.isdir(sql_dir):
        for root, dirs, files in os.walk(sql_dir):
            # select the task factory matching the files in this directory;
            # single-file queries take precedence over multipart queries
            if QUERY_FILE in files:
                query_file = os.path.join(root, QUERY_FILE)
                task_factory = Task.of_query
            elif QUERY_PART_FILE in files:
                # multipart query
                query_file = os.path.join(root, QUERY_PART_FILE)
                task_factory = Task.of_multipart_query
            else:
                continue

            # parse metadata.yaml to retrieve scheduling information;
            # a single handler replaces the previously duplicated
            # try/except blocks for the two query types
            try:
                tasks.append(task_factory(query_file))
            except FileNotFoundError:
                # query has no metadata.yaml file; skip
                pass
            except UnscheduledTask:
                # logging.debug(
                #     f"No scheduling information for {query_file}."
                # )
                #
                # most tasks lack scheduling information for now
                pass
    else:
        logging.error("""
            Invalid sql_dir: {}, sql_dir must be a directory with
            structure /<dataset>/<table>/metadata.yaml.
            """.format(sql_dir))

    return DagCollection.from_file(dags_config).with_tasks(tasks)