示例#1
0
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        default_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: TableNameRegistry,
    ) -> List[StreamProcessor]:
        """Build one StreamProcessor per configured stream found in the catalog.

        The destination schema is the stream's namespace when present,
        otherwise `default_schema`; raw tables live in an `_airbyte_`-prefixed
        schema alongside it.
        """
        processors = []
        streams = get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog")
        for configured_stream in streams:
            stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

            # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
            # Any modifications need to be reflected there and vice versa.
            schema = stream_config["namespace"] if "namespace" in stream_config else default_schema

            schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
            stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)

            # Sync modes that require deduplication (and therefore a primary key).
            dedup_modes = [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in dedup_modes:
                cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in dedup_modes:
                primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            json_schema = get_field(stream_config, "json_schema", message)
            properties = get_field(json_schema, "properties", message)

            from_table = f"source('{schema_name}', '{raw_table_name}')"

            processors.append(
                StreamProcessor.create(
                    stream_name=stream_name,
                    destination_type=destination_type,
                    raw_schema=raw_schema_name,
                    schema=schema_name,
                    source_sync_mode=source_sync_mode,
                    destination_sync_mode=destination_sync_mode,
                    cursor_field=cursor_field,
                    primary_key=primary_key,
                    json_column_name=f"'{json_column_name}'",
                    properties=properties,
                    tables_registry=tables_registry,
                    from_table=from_table,
                )
            )
        return processors
示例#2
0
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        target_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: Set[str],
    ) -> List[StreamProcessor]:
        """Create a StreamProcessor for every stream declared in the catalog.

        Raw tables are read from an `_airbyte_`-prefixed copy of the target
        schema. Raises EOFError when a stream declares no properties.
        """
        processors = []
        streams = get_field(
            catalog, "streams",
            "Invalid Catalog: 'streams' is not defined in Catalog")
        for configured_stream in streams:
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")

            schema_name = name_transformer.normalize_schema_name(target_schema)
            raw_schema_name = name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            raw_table_name = name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            json_schema = get_field(stream_config, "json_schema", message)
            properties = get_field(json_schema, "properties", message)
            # Nothing to normalize without at least one property.
            # NOTE(review): EOFError is an unusual choice here, but it is part
            # of the exception contract callers may rely on — kept as-is.
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            from_table = f"source('{schema_name}', '{raw_table_name}')"

            processors.append(StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            ))
        return processors
示例#3
0
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        target_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: Set[str],
    ) -> List[StreamProcessor]:
        """Create a StreamProcessor for every stream declared in the catalog.

        Resolves schema/table names, sync modes, cursor field and primary key
        for each stream. Raises EOFError when a stream declares no properties.
        """
        processors = []
        streams = get_field(
            catalog, "streams",
            "Invalid Catalog: 'streams' is not defined in Catalog")
        for configured_stream in streams:
            stream_config = get_field(
                configured_stream, "stream",
                "Invalid Stream: 'stream' is not defined in Catalog streams")

            schema_name = name_transformer.normalize_schema_name(target_schema)
            raw_schema_name = name_transformer.normalize_schema_name(
                f"_airbyte_{target_schema}", truncate=False)
            stream_name = get_field(
                stream_config, "name",
                f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            raw_table_name = name_transformer.normalize_table_name(
                f"_airbyte_raw_{stream_name}", truncate=False)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)

            # Sync modes that require deduplication (and therefore a primary key).
            dedup_modes = [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]
            cursor_field = []
            primary_key = []
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in dedup_modes:
                cursor_field = get_field(
                    configured_stream, "cursor_field",
                    f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in dedup_modes:
                primary_key = get_field(
                    configured_stream, "primary_key",
                    f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            json_schema = get_field(stream_config, "json_schema", message)
            properties = get_field(json_schema, "properties", message)
            # Nothing to normalize without at least one property.
            if not properties:
                raise EOFError(
                    "Invalid Catalog: Unexpected empty properties in catalog")

            from_table = f"source('{schema_name}', '{raw_table_name}')"

            processors.append(StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=f"'{json_column_name}'",
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            ))
        return processors
示例#4
0
    def build_stream_processor(
        catalog: Dict,
        json_column_name: str,
        default_schema: str,
        name_transformer: DestinationNameTransformer,
        destination_type: DestinationType,
        tables_registry: TableNameRegistry,
    ) -> List[StreamProcessor]:
        """Build one StreamProcessor per configured stream found in the catalog.

        The destination schema is the stream's namespace when present,
        otherwise `default_schema`. Handles destination-specific naming quirks
        (Oracle uses a single schema for raw and final tables; MySQL table
        names are truncated manually).

        Raises whatever `get_field` raises when required catalog fields
        ('streams', 'stream', 'name', 'json_schema'.'properties', and — for
        incremental/dedup modes — 'cursor_field'/'primary_key') are missing.
        """
        result = []
        # Hoisted out of the loop: these patterns do not depend on the stream,
        # so compiling them once avoids repeated per-iteration work.
        quote_in_parenthesis = re.compile(r"quote\((.*)\)")
        column_inside_single_quote = re.compile(r"\'(.*)\'")
        for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
            stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

            # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
            # Any modifications need to be reflected there and vice versa.
            schema = default_schema
            if "namespace" in stream_config:
                schema = stream_config["namespace"]

            schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            if destination_type == DestinationType.ORACLE:
                # Oracle: raw tables share the final schema, and the JSON
                # column is normalized unless it is already a quote(...) macro.
                # (The findall guard also makes the rewrite idempotent across
                # loop iterations, since json_column_name is reused.)
                raw_schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
                if not quote_in_parenthesis.findall(json_column_name):
                    json_column_name = name_transformer.normalize_column_name(json_column_name, in_jinja=True)
            else:
                # Other destinations: raw tables live in an "_airbyte_"-prefixed
                # schema, and the JSON column is single-quoted unless it
                # already is.
                raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
                if not column_inside_single_quote.findall(json_column_name):
                    json_column_name = f"'{json_column_name}'"

            stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
            # MySQL table names need to be manually truncated, because it does not do it automatically
            truncate = destination_type == DestinationType.MYSQL
            raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=truncate)

            source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
            destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
            cursor_field = []
            primary_key = []
            # Incremental sources and dedup destinations need a cursor field;
            # only dedup destinations need a primary key.
            if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value,
            ]:
                cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
            if destination_sync_mode.value in [
                # DestinationSyncMode.upsert_dedup.value,
                DestinationSyncMode.append_dedup.value
            ]:
                primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

            message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
            properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

            from_table = dbt_macro.Source(schema_name, raw_table_name)

            stream_processor = StreamProcessor.create(
                stream_name=stream_name,
                destination_type=destination_type,
                raw_schema=raw_schema_name,
                default_schema=default_schema,
                schema=schema_name,
                source_sync_mode=source_sync_mode,
                destination_sync_mode=destination_sync_mode,
                cursor_field=cursor_field,
                primary_key=primary_key,
                json_column_name=json_column_name,
                properties=properties,
                tables_registry=tables_registry,
                from_table=from_table,
            )
            result.append(stream_processor)
        return result