Example #1
def test_nested_generate_new_table_name(stream_name: str,
                                        is_intermediate: bool, suffix: str,
                                        expected: str,
                                        expected_final_name: str):
    stream_processor = StreamProcessor.create(
        stream_name=stream_name,
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.full_refresh,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=[],
        primary_key=[],
        json_column_name="json_column_name",
        properties=[],
        tables_registry=dict(),
        from_table="",
    )
    nested_stream_processor = StreamProcessor.create_from_parent(
        parent=stream_processor,
        child_name="child_stream",
        json_column_name="json_column_name",
        properties=[],
        is_nested_array=False,
        from_table="",
    )
    assert nested_stream_processor.generate_new_table_name(
        is_intermediate=is_intermediate, suffix=suffix) == expected
    assert nested_stream_processor.final_table_name == expected_final_name
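Note: a test like this is normally driven by a pytest parametrize decorator that supplies the arguments. Below is a minimal sketch of what that could look like; the parameter values are illustrative assumptions, not taken from the real test suite.

import pytest

@pytest.mark.parametrize(
    "stream_name, is_intermediate, suffix, expected, expected_final_name",
    [
        # Illustrative cases only; the real expectations depend on the naming
        # rules StreamProcessor applies for the target destination.
        ("some_stream", False, "", "some_stream_child_stream", "some_stream_child_stream"),
        ("some_stream", True, "ab1", "some_stream_child_stream_ab1", "some_stream_child_stream"),
    ],
)
def test_nested_generate_new_table_name(stream_name, is_intermediate, suffix, expected, expected_final_name):
    ...  # body as shown in the example above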
Example #2
def test_primary_key(
    primary_key: List[List[str]],
    column_type: str,
    expecting_exception: bool,
    expected_primary_keys: List[str],
    expected_final_primary_key_string: str,
):
    stream_processor = StreamProcessor.create(
        stream_name="test_primary_key",
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        default_schema="default_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=[],
        primary_key=primary_key,
        json_column_name="json_column_name",
        properties={
            key: {
                "type": column_type
            }
            for key in expected_primary_keys
        },
        tables_registry=TableNameRegistry(DestinationType.POSTGRES),
        from_table="",
    )
    try:
        assert (", ".join(
            stream_processor.get_primary_key_partition(
                column_names=stream_processor.extract_column_names())) ==
                expected_final_primary_key_string)
    except ValueError as e:
        if not expecting_exception:
            raise e
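Note that primary_key is a list of column paths, so each key column is itself a list. A few illustrative shapes are sketched below; these values are hypothetical and not taken from the example above.

# Single-column key: one path with a single element.
primary_key = [["id"]]
# Composite key on two columns.
primary_key = [["id"], ["email"]]
# Key nested inside a JSON object: a path with more than one element.
primary_key = [["metadata", "tenant_id"]]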
Example #3
def test_collisions_generate_new_table_name(stream_name: str,
                                            is_intermediate: bool, suffix: str,
                                            expected: str,
                                            expected_final_name: str):
    # Fill test_registry with stream names as if they were already in use, so that collisions occur.
    test_registry = dict()
    test_registry["schema_name"] = set()
    test_registry["schema_name"].add("stream_name")
    test_registry["schema_name"].add("stream_name_suffix")
    test_registry["raw_schema"] = set()
    test_registry["raw_schema"].add("stream_name_suffix")
    stream_processor = StreamProcessor.create(
        stream_name=stream_name,
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.full_refresh,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=[],
        primary_key=[],
        json_column_name="json_column_name",
        properties=[],
        tables_registry=test_registry,
        from_table="",
    )
    assert stream_processor.generate_new_table_name(
        is_intermediate=is_intermediate, suffix=suffix) == expected
    assert stream_processor.final_table_name == expected_final_name
Example #4
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        default_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: TableNameRegistry,
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

            # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
            # Any modifications need to be reflected there and vice versa.
            schema = default_schema
            if "namespace" in stream_config:
                schema = stream_config["namespace"]

            schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
            stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

            from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
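For reference, here is a minimal configured-catalog dictionary of the shape this function reads. The field names follow the Airbyte configured catalog; the concrete stream and column names are made up for illustration.

# Hypothetical catalog illustrating the fields build_stream_processor looks up.
catalog = {
    "streams": [
        {
            "stream": {
                "name": "users",                      # -> stream_name
                "namespace": "public",                # optional; overrides default_schema
                "json_schema": {
                    "properties": {                   # -> properties
                        "id": {"type": "integer"},
                        "updated_at": {"type": "string"},
                    }
                },
            },
            "sync_mode": "incremental",               # read by get_source_sync_mode
            "destination_sync_mode": "append_dedup",  # read by get_destination_sync_mode
            "cursor_field": ["updated_at"],           # required for incremental / dedup syncs
            "primary_key": [["id"]],                  # required for append_dedup
        }
    ]
}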
Example #5
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        target_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: Set[str],
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(
                catalog, "streams",
                "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")
            schema_name = name_transformer.normalize_schema_name(target_schema)
            raw_schema_name = name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}"
            )
            raw_table_name = name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(
                get_field(stream_config, "json_schema", message), "properties",
                message)

            from_table = "source('{}', '{}')".format(schema_name,
                                                     raw_table_name)

            # Check properties
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
Example #6
def test_cursor_field(cursor_field: List[str], expecting_exception: bool,
                      expected_cursor_field: str):
    stream_processor = StreamProcessor.create(
        stream_name="test_cursor_field",
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=cursor_field,
        primary_key=[],
        json_column_name="json_column_name",
        properties=dict(),
        tables_registry=TableNameRegistry(DestinationType.POSTGRES),
        from_table="",
    )
    try:
        assert (stream_processor.get_cursor_field(column_names={
            expected_cursor_field: (expected_cursor_field, "random")
        }) == expected_cursor_field)
    except ValueError as e:
        if not expecting_exception:
            raise e
Example #7
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        target_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: Set[str],
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(
                catalog, "streams",
                "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")
            schema_name = name_transformer.normalize_schema_name(target_schema)
            raw_schema_name = name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}"
            )
            raw_table_name = name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            source_sync_mode = get_source_sync_mode(configured_stream,
                                                    stream_name)
            destination_sync_mode = get_destination_sync_mode(
                configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                    # DestinationSyncMode.upsert_dedup.value,
                    DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(
                    configured_stream, "cursor_field",
                    f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                    # DestinationSyncMode.upsert_dedup.value,
                    DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(
                    configured_stream, "primary_key",
                    f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(
                get_field(stream_config, "json_schema", message), "properties",
                message)

            from_table = "source('{}', '{}')".format(schema_name,
                                                     raw_table_name)

            # Check properties
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
Example #8
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        default_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: TableNameRegistry,
    ) -> List[StreamProcessor]:
        result = []
        for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

            # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
            # Any modifications need to be reflected there and vice versa.
            schema = default_schema
            if "namespace" in stream_config:
                schema = stream_config["namespace"]

            schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            if destination_type == DestinationType.ORACLE:
                quote_in_parenthesis = re.compile(r"quote\((.*)\)")
                raw_schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
                if not quote_in_parenthesis.findall(json_column_name):
                    json_column_name = name_transformer.normalize_column_name(json_column_name, in_jinja=True)
            else:
                column_inside_single_quote = re.compile(r"\'(.*)\'")
                raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
                if not column_inside_single_quote.findall(json_column_name):
                    json_column_name = f"'{json_column_name}'"

            stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            # MySQL table names need to be truncated manually, because MySQL does not do it automatically
            truncate = destination_type == DestinationType.MYSQL
            raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=truncate)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

            from_table = dbt_macro.Source(schema_name, raw_table_name)

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                default_schema=default_schema,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=json_column_name,
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result
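A small standalone demonstration of the two regular expressions used above to decide whether json_column_name is already quoted (expected output shown in the comments):

import re

quote_in_parenthesis = re.compile(r"quote\((.*)\)")       # Oracle branch: dbt quote() macro
column_inside_single_quote = re.compile(r"\'(.*)\'")      # other destinations: plain single quotes

print(quote_in_parenthesis.findall("quote('airbyte_data')"))   # ["'airbyte_data'"] -> already wrapped
print(quote_in_parenthesis.findall("airbyte_data"))            # [] -> will be normalized for Oracle
print(column_inside_single_quote.findall("'airbyte_data'"))    # ['airbyte_data'] -> already quoted
print(column_inside_single_quote.findall("airbyte_data"))      # [] -> gets wrapped in single quotes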
Example #9
    def process(self, catalog_file: str, json_column_name: str,
                target_schema: str):
        """
        This method first parse and build models to handle top-level streams.
        In a second loop will go over the substreams that were nested in a breadth-first traversal manner.

        @param catalog_file input AirbyteCatalog file in JSON Schema describing the structure of the raw data
        @param json_column_name is the column name containing the JSON Blob with the raw data
        @param target_schema is the final schema where to output the final transformed data to
        """
        # Registry of all tables in all schemas
        tables_registry: Set[str] = set()
        # Registry of source tables in each schemas
        schema_to_source_tables: Dict[str, Set[str]] = {}

        catalog = read_json(catalog_file)
        # print(json.dumps(catalog, separators=(",", ":")))
        substreams = []
        for configured_stream in get_field(
                catalog, "streams",
                "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")
            schema_name = self.name_transformer.normalize_schema_name(
                target_schema)
            raw_schema_name = self.name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}"
            )
            raw_table_name = self.name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(
                get_field(stream_config, "json_schema", message), "properties",
                message)

            from_table = "source('{}', '{}')".format(schema_name,
                                                     raw_table_name)

            # Check properties
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            add_table_to_sources(schema_to_source_tables, schema_name,
                                 raw_table_name)

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                integration_type=self.integration_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            nested_processors = stream_processor.process()
            add_table_to_registry(tables_registry, stream_processor)
            if nested_processors and len(nested_processors) > 0:
                substreams += nested_processors
            for file in stream_processor.sql_outputs:
                output_sql_file(os.path.join(self.output_directory, file),
                                stream_processor.sql_outputs[file])
        self.write_yaml_sources_file(schema_to_source_tables)
        self.process_substreams(substreams, tables_registry)
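A hypothetical invocation of this method is sketched below. The class name, constructor arguments, file path and schema names are assumptions; "_airbyte_data" is the column name Airbyte conventionally uses for the raw JSON blob.

# Sketch only: assumes the enclosing class provides integration_type,
# name_transformer and output_directory through its constructor.
processor = CatalogProcessor(output_directory="models/generated", integration_type=DestinationType.POSTGRES)
processor.process(
    catalog_file="destination_catalog.json",  # AirbyteCatalog in JSON form
    json_column_name="_airbyte_data",         # column holding the raw JSON blob
    target_schema="public",                   # schema where the final transformed tables land
)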