def test_resolve_names(destination_type: DestinationType, catalog_file: str):
    """
    For a given catalog.json and destination, multiple cases can occur where naming becomes tricky.
    (especially since some destination like postgres have a very low limit to identifiers length of 64 characters)

    In case of nested objects/arrays in a stream, names can drag on to very long names.
    Tests are built here using resources files as follow:
    - `<name of source or test types>_catalog.json`:
        input catalog.json, typically as what source would provide.
        For example Hubspot, Stripe and Facebook catalog.json contains some level of nesting.
        (here, nested_catalog.json is an extracted smaller sample of stream/properties from the facebook catalog)
    - `<name of source or test types>_expected_names.json`:
        list of expected table names

    For the expected json files, it is possible to specialize the file to a certain destination.
    So if for example, the resources folder contains these two expected files:
        - edge_cases_catalog_expected_names.json
        - edge_cases_catalog_expected_postgres_names.json
    Then the test will be using the first edge_cases_catalog_expected_names.json except for
    Postgres destination where the expected table names will come from edge_cases_catalog_expected_postgres_names.json

    The content of the expected_*.json files are the serialization of the stream_processor.tables_registry.registry
    """
    integration_type = destination_type.value
    tables_registry = TableNameRegistry(destination_type)

    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    stream_processors = CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    )
    for stream_processor in stream_processors:
        # Check properties
        if not stream_processor.properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")
        stream_processor.collect_table_names()
    for conflict in tables_registry.resolve_names():
        print(
            f"WARN: Resolving conflict: {conflict.schema}.{conflict.table_name_conflict} "
            f"from '{'.'.join(conflict.json_path)}' into {conflict.table_name_resolved}"
        )
    apply_function = identity
    if DestinationType.SNOWFLAKE.value == destination_type.value:
        apply_function = str.upper
    elif DestinationType.REDSHIFT.value == destination_type.value:
        apply_function = str.lower
    if os.path.exists(f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json"):
        expected_names = read_json(f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json", apply_function)
    else:
        expected_names = read_json(f"resources/{catalog_file}_expected_names.json", apply_function)

    assert tables_registry.to_dict(apply_function) == expected_names
def test_primary_key(
    primary_key: List[List[str]],
    column_type: str,
    expecting_exception: bool,
    expected_primary_keys: List[str],
    expected_final_primary_key_string: str,
):
    stream_processor = StreamProcessor.create(
        stream_name="test_primary_key",
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        default_schema="default_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=[],
        primary_key=primary_key,
        json_column_name="json_column_name",
        properties={key: {"type": column_type} for key in expected_primary_keys},
        tables_registry=TableNameRegistry(DestinationType.POSTGRES),
        from_table="",
    )
    try:
        assert (", ".join(
            stream_processor.get_primary_key_partition(
                column_names=stream_processor.extract_column_names())) ==
                expected_final_primary_key_string)
    except ValueError as e:
        if not expecting_exception:
            raise e
    def process(self, catalog_file: str, json_column_name: str,
                default_schema: str):
        """
        This method first parse and build models to handle top-level streams.
        In a second loop will go over the substreams that were nested in a breadth-first traversal manner.

        @param catalog_file input AirbyteCatalog file in JSON Schema describing the structure of the raw data
        @param json_column_name is the column name containing the JSON Blob with the raw data
        @param default_schema is the final schema where to output the final transformed data to
        """
        tables_registry: TableNameRegistry = TableNameRegistry(
            self.destination_type)
        schema_to_source_tables: Dict[str, Set[str]] = {}
        catalog = read_json(catalog_file)
        # print(json.dumps(catalog, separators=(",", ":")))
        substreams = []
        stream_processors = self.build_stream_processor(
            catalog=catalog,
            json_column_name=json_column_name,
            default_schema=default_schema,
            name_transformer=self.name_transformer,
            destination_type=self.destination_type,
            tables_registry=tables_registry,
        )
        for stream_processor in stream_processors:
            stream_processor.collect_table_names()
        for conflict in tables_registry.resolve_names():
            print(
                f"WARN: Resolving conflict: {conflict.schema}.{conflict.table_name_conflict} "
                f"from '{'.'.join(conflict.json_path)}' into {conflict.table_name_resolved}"
            )
        for stream_processor in stream_processors:
            # MySQL table names need to be manually truncated, because it does not do it automatically
            truncate = self.destination_type == DestinationType.MYSQL
            raw_table_name = self.name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_processor.stream_name}",
                truncate=truncate)
            add_table_to_sources(schema_to_source_tables,
                                 stream_processor.schema, raw_table_name)

            nested_processors = stream_processor.process()
            if nested_processors:
                substreams += nested_processors
            for file in stream_processor.sql_outputs:
                output_sql_file(os.path.join(self.output_directory, file),
                                stream_processor.sql_outputs[file])
        self.write_yaml_sources_file(schema_to_source_tables)
        self.process_substreams(substreams, tables_registry)
def test_get_simple_table_name(json_path: List[str], expected_postgres: str, expected_bigquery: str):
    """
    Checks how to generate a simple and easy to understand name from a json path
    """
    postgres_registry = TableNameRegistry(DestinationType.POSTGRES)
    actual_postgres_name = postgres_registry.get_simple_table_name(json_path)
    assert actual_postgres_name == expected_postgres
    assert len(actual_postgres_name) <= 43  # explicitly check for our max postgres length in case tests are changed in the future

    bigquery_registry = TableNameRegistry(DestinationType.BIGQUERY)
    actual_bigquery_name = bigquery_registry.get_simple_table_name(json_path)
    assert actual_bigquery_name == expected_bigquery
def test_cursor_field(cursor_field: List[str], expecting_exception: bool,
                      expected_cursor_field: str):
    stream_processor = StreamProcessor.create(
        stream_name="test_cursor_field",
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=cursor_field,
        primary_key=[],
        json_column_name="json_column_name",
        properties=dict(),
        tables_registry=TableNameRegistry(DestinationType.POSTGRES),
        from_table="",
    )
    try:
        cursor_field = stream_processor.get_cursor_field(
            column_names={expected_cursor_field: (expected_cursor_field, "random")}
        )
        assert cursor_field == expected_cursor_field
    except ValueError as e:
        if not expecting_exception:
            raise e