Example #1
def test_normalize_column_name(input_str: str, destination_type: str,
                               expected: str, expected_in_jinja: str):
    t = DestinationType.from_string(destination_type)
    assert DestinationNameTransformer(t).normalize_column_name(
        input_str, in_jinja=False) == expected
    assert DestinationNameTransformer(t).normalize_column_name(
        input_str, in_jinja=True) == expected_in_jinja
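
These test functions are clearly driven by a @pytest.mark.parametrize decorator that the excerpts omit. A minimal sketch of what such a parametrization could look like; the concrete cases below are illustrative placeholders, not values from the original test suite:

import pytest

# Hypothetical cases only; the real suite defines its own parameters and fixtures.
@pytest.mark.parametrize(
    "input_str, destination_type, expected, expected_in_jinja",
    [
        ("identifier_name", "Postgres", "identifier_name", "'identifier_name'"),
    ],
)
def test_normalize_column_name(input_str: str, destination_type: str,
                               expected: str, expected_in_jinja: str):
    ...  # body as in Example #1 above
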
Example #2
def test_normalize_name(input_str: str, destination_type: str, expected: str,
                        expected_column: str):
    t = DestinationType.from_string(destination_type)
    assert DestinationNameTransformer(t).normalize_schema_name(
        input_str) == expected
    assert DestinationNameTransformer(t).normalize_table_name(
        input_str) == expected
    assert DestinationNameTransformer(t).normalize_column_name(
        input_str) == expected_column
Example #3
    def process_catalog(self) -> None:
        destination_type = DestinationType.from_string(self.config["integration_type"])
        schema = self.config["schema"]
        output = self.config["output_path"]
        json_col = self.config["json_column"]
        processor = CatalogProcessor(output_directory=output, destination_type=destination_type)
        for catalog_file in self.config["catalog"]:
            print(f"Processing {catalog_file}...")
            processor.process(catalog_file=catalog_file, json_column_name=json_col, default_schema=schema)
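
process_catalog above only reads a handful of keys from self.config. A hedged sketch of the shape that configuration might take; the values are illustrative placeholders, not taken from the original project:

# Illustrative configuration only; actual values depend on the deployment.
config = {
    "integration_type": "postgres",              # parsed with DestinationType.from_string
    "schema": "public",                          # default schema passed to processor.process
    "output_path": "/tmp/normalization_output",  # directory for the generated dbt models
    "json_column": "_airbyte_data",              # column holding the raw JSON records
    "catalog": ["destination_catalog.json"],     # one or more catalog files to iterate over
}
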
Example #4
def test_destination_failure_over_limits(integration_type: str,
                                         column_count: int,
                                         expected_exception_message: str,
                                         setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    if destination_type.value not in dbt_test_utils.get_test_targets():
        pytest.skip(
            f"Destinations {destination_type} is not in NORMALIZATION_TEST_TARGET env variable"
        )
    run_test(destination_type, column_count, expected_exception_message)
Example #5
    def get_test_targets() -> List[str]:
        """
        Returns a list of destinations to run tests on.

        If the environment variable NORMALIZATION_TEST_TARGET is set to a comma-separated list of destination names,
        then tests are run only against that subset of destinations.
        Otherwise, tests are run against all destinations.
        """
        if os.getenv(NORMALIZATION_TEST_TARGET):
            target_str = os.getenv(NORMALIZATION_TEST_TARGET)
            return [d.value for d in {DestinationType.from_string(s.strip()) for s in target_str.split(",")}]
        else:
            return [d.value for d in DestinationType]
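
A short usage sketch of the environment-variable filter described in the docstring. It assumes the module-level constant NORMALIZATION_TEST_TARGET holds the variable name (e.g. "NORMALIZATION_TEST_TARGET"), that DestinationType values are lowercase destination names, and that dbt_test_utils is the helper object seen in Example #4; treat all three as assumptions:

import os

# Hypothetical illustration: restrict the test targets to two destinations.
os.environ["NORMALIZATION_TEST_TARGET"] = "postgres, snowflake"
print(dbt_test_utils.get_test_targets())  # e.g. ["postgres", "snowflake"] (built from a set, so order may vary)

# With the variable unset, every member of DestinationType is returned.
del os.environ["NORMALIZATION_TEST_TARGET"]
print(dbt_test_utils.get_test_targets())
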
Example #6
def test_normalization(integration_type: str, test_resource_name: str, setup_test_path):
    print("Testing normalization")
    destination_type = DestinationType.from_string(integration_type)
    # Create the test folder with dbt project and appropriate destination settings to run integration tests from
    test_root_dir = setup_test_dir(integration_type, test_resource_name)
    destination_config = generate_profile_yaml_file(destination_type, test_root_dir)
    # Use destination connector to create _airbyte_raw_* tables to use as input for the test
    assert setup_input_raw_data(integration_type, test_resource_name, test_root_dir, destination_config)
    # Normalization step
    generate_dbt_models(destination_type, test_resource_name, test_root_dir)
    dbt_run(test_root_dir)
    # Run checks on the test results
    dbt_test(destination_type, test_resource_name, test_root_dir)
    check_outputs(destination_type, test_resource_name, test_root_dir)
Example #7
def test_needs_quote(input_str: str, destination_type: str, expected: bool):
    name_transformer = DestinationNameTransformer(
        DestinationType.from_string(destination_type))
    assert name_transformer.needs_quotes(input_str) == expected
Example #8
def test_stream_processor_tables_naming(integration_type: str,
                                        catalog_file: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = set()

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            target_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for table in stream_processor.local_registry:
            found_sql_output = False
            for sql_output in stream_processor.sql_outputs:
                if re.match(r".*/" + table + ".sql", sql_output) is not None:
                    found_sql_output = True
                    break
            assert found_sql_output
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    if os.path.exists(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
    ):
        expected_top_level = set(
            read_json(
                f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
            )["tables"])
    else:
        expected_top_level = set(
            read_json(f"resources/{catalog_file}_expected_top_level.json")
            ["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_top_level = {
                table.upper()
                for table in expected_top_level
            }
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_top_level = {
                table.lower()
                for table in expected_top_level
            }
    assert tables_registry == expected_top_level

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    if os.path.exists(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
    ):
        expected_nested = set(
            read_json(
                f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
            )["tables"])
    else:
        expected_nested = set(
            read_json(f"resources/{catalog_file}_expected_nested.json")
            ["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_nested = {table.upper() for table in expected_nested}
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_nested = {table.lower() for table in expected_nested}
    assert (tables_registry - expected_top_level) == expected_nested
Example #9
def test_stream_processor_tables_naming(integration_type: str,
                                        catalog_file: str, setup_test_path):
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            default_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for schema in stream_processor.local_registry:
            for table in stream_processor.local_registry[schema]:
                found_sql_output = False
                for sql_output in stream_processor.sql_outputs:
                    file_name = f"{schema}_{table}"
                    if len(
                            file_name
                    ) > stream_processor.name_transformer.get_name_max_length(
                    ):
                        file_name = stream_processor.name_transformer.truncate_identifier_name(
                            input_name=file_name)

                    if re.match(r".*/" + file_name + ".sql",
                                sql_output) is not None:
                        found_sql_output = True
                        break
                assert found_sql_output
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    if os.path.exists(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
    ):
        expected_top_level = set(
            read_json(
                f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
            )["tables"])
    else:
        expected_top_level = set(
            read_json(f"resources/{catalog_file}_expected_top_level.json")
            ["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_top_level = {
                table.upper()
                for table in expected_top_level
            }
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_top_level = {
                table.lower()
                for table in expected_top_level
            }

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    if os.path.exists(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
    ):
        expected_nested = set(
            read_json(
                f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
            )["tables"])
    else:
        expected_nested = set(
            read_json(f"resources/{catalog_file}_expected_nested.json")
            ["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            expected_nested = {table.upper() for table in expected_nested}
        elif DestinationType.REDSHIFT.value == destination_type.value:
            expected_nested = {table.lower() for table in expected_nested}

    # TODO(davin): Instead of unwrapping all tables, rewrite this test so tables are compared based on schema.
    all_tables = set()
    for schema in tables_registry:
        for tables in tables_registry[schema]:
            all_tables.add(tables)

    assert (all_tables - expected_top_level) == expected_nested
Example #10
def test_stream_processor_tables_naming(integration_type: str,
                                        catalog_file: str):
    """
    For a given catalog.json and destination, multiple cases can occur where naming becomes tricky
    (especially since some destinations, like Postgres, have a very low identifier length limit of 64 characters).

    In case of nested objects/arrays in a stream, names can grow into very long identifiers.
    Tests here are built using resource files as follows:
    - `<name of source or test types>_catalog.json`:
        the input catalog.json, typically what a source would provide.
        For example, the Hubspot, Stripe and Facebook catalog.json files contain some level of nesting.
        (here, nested_catalog.json is a smaller sample of streams/properties extracted from the Facebook catalog)
    - `<name of source or test types>_expected_top_level.json`:
        list of expected table names for the top level stream names
    - `<name of source or test types>_expected_nested.json`:
        list of expected table names for nested objects, extracted into their own separate tables

    For the expected json files, it is possible to specialize the file for a certain destination.
    So if, for example, the resources folder contains these two expected files:
        - edge_cases_catalog_expected_top_level.json
        - edge_cases_catalog_expected_top_level_postgres.json
    then the test will use edge_cases_catalog_expected_top_level.json, except for the Postgres
    destination, where the expected table names come from edge_cases_catalog_expected_top_level_postgres.json.

    The content of the expected_*.json files is the serialization of stream_processor.tables_registry
    (a mapping per schema to all tables in that schema, mapping to the final file name)
    """
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}

    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
            catalog=catalog,
            json_column_name="'json_column_name_test'",
            default_schema="schema_test",
            name_transformer=DestinationNameTransformer(destination_type),
            destination_type=destination_type,
            tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        tables_registry = add_table_to_registry(tables_registry,
                                                stream_processor)
        if nested_processors and len(nested_processors) > 0:
            substreams += nested_processors

    apply_function = None
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json"
    ):
        expected_top_level = read_json(
            f"resources/{catalog_file}_expected_top_level_{integration_type.lower()}.json",
            apply_function)
    else:
        expected_top_level = read_json(
            f"resources/{catalog_file}_expected_top_level.json",
            apply_function)

    assert tables_registry == expected_top_level

    # process substreams
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            tables_registry = add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    apply_function = None
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json"
    ):
        expected_nested = read_json(
            f"resources/{catalog_file}_expected_nested_{integration_type.lower()}.json",
            apply_function)
    else:
        expected_nested = read_json(
            f"resources/{catalog_file}_expected_nested.json", apply_function)

    # remove expected top level tables from tables_registry
    for schema in expected_top_level:
        for table in expected_top_level[schema]:
            del tables_registry[schema][table]
        if len(tables_registry[schema]) == 0:
            del tables_registry[schema]
    assert tables_registry == expected_nested
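
The docstring of Example #10 describes the expected_*.json files as a serialization of tables_registry: a mapping from schema to the tables in that schema, each mapping to its final file name. A hedged sketch of such a file, with invented identifiers used purely for illustration:

# Hypothetical content of an expected_top_level.json file
# (schema -> table name -> final .sql file name; all names invented).
expected_top_level = {
    "schema_test": {
        "users": "users",
        "users_addresses": "users_addresses",
    }
}
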
Example #11
def test_destination_failure_over_limits(integration_type: str,
                                         column_count: int,
                                         expected_exception_message: str,
                                         setup_test_path):
    run_test(DestinationType.from_string(integration_type), column_count,
             expected_exception_message)