def test_nested_generate_new_table_name(stream_name: str, is_intermediate: bool, suffix: str, expected: str, expected_final_name: str):
    stream_processor = StreamProcessor.create(
        stream_name=stream_name,
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.full_refresh,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=[],
        primary_key=[],
        json_column_name="json_column_name",
        properties=[],
        tables_registry=dict(),
        from_table="",
    )
    nested_stream_processor = StreamProcessor.create_from_parent(
        parent=stream_processor,
        child_name="child_stream",
        json_column_name="json_column_name",
        properties=[],
        is_nested_array=False,
        from_table="",
    )
    assert nested_stream_processor.generate_new_table_name(is_intermediate=is_intermediate, suffix=suffix) == expected
    assert nested_stream_processor.final_table_name == expected_final_name
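# A hedged sketch (not from the original suite) of building a child processor for a
# nested JSON *array* property: is_nested_array=True is assumed to mean that each
# array element is unnested into its own row rather than read as a single object.
# The child_name and properties below are illustrative placeholders.
def example_nested_array_child(parent: StreamProcessor) -> StreamProcessor:
    return StreamProcessor.create_from_parent(
        parent=parent,
        child_name="children",  # hypothetical nested array property
        json_column_name="json_column_name",
        properties={"name": {"type": "string"}},  # illustrative child schema
        is_nested_array=True,
        from_table="",
    )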
def test_primary_key(
    primary_key: List[List[str]],
    column_type: str,
    expecting_exception: bool,
    expected_primary_keys: List[str],
    expected_final_primary_key_string: str,
):
    stream_processor = StreamProcessor.create(
        stream_name="test_primary_key",
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        default_schema="default_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=[],
        primary_key=primary_key,
        json_column_name="json_column_name",
        properties={key: {"type": column_type} for key in expected_primary_keys},
        tables_registry=TableNameRegistry(DestinationType.POSTGRES),
        from_table="",
    )
    try:
        assert (
            ", ".join(stream_processor.get_primary_key_partition(column_names=stream_processor.extract_column_names()))
            == expected_final_primary_key_string
        )
    except ValueError as e:
        if not expecting_exception:
            raise e
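# A hedged parametrize sketch showing how the test above could be driven with
# pytest. The rows are assumptions about the expected behavior (a single-column
# key and a composite key), not fixtures copied from the original suite.
import pytest

@pytest.mark.parametrize(
    "primary_key, column_type, expecting_exception, expected_primary_keys, expected_final_primary_key_string",
    [
        ([["id"]], "string", False, ["id"], "id"),
        ([["id"], ["email"]], "string", False, ["id", "email"], "id, email"),
    ],
)
def test_primary_key_examples(primary_key, column_type, expecting_exception, expected_primary_keys, expected_final_primary_key_string):
    # Delegate to the test body above; kept separate so the sample rows stay
    # clearly marked as illustrative.
    test_primary_key(primary_key, column_type, expecting_exception, expected_primary_keys, expected_final_primary_key_string)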
def test_collisions_generate_new_table_name(stream_name: str, is_intermediate: bool, suffix: str, expected: str, expected_final_name: str):
    # Pre-fill test_registry with the same stream names as if they were already in use,
    # so that name collisions occur...
    test_registry = dict()
    test_registry["schema_name"] = set()
    test_registry["schema_name"].add("stream_name")
    test_registry["schema_name"].add("stream_name_suffix")
    test_registry["raw_schema"] = set()
    test_registry["raw_schema"].add("stream_name_suffix")
    stream_processor = StreamProcessor.create(
        stream_name=stream_name,
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.full_refresh,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=[],
        primary_key=[],
        json_column_name="json_column_name",
        properties=[],
        tables_registry=test_registry,
        from_table="",
    )
    assert stream_processor.generate_new_table_name(is_intermediate=is_intermediate, suffix=suffix) == expected
    assert stream_processor.final_table_name == expected_final_name
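# The same registry can be built more compactly with dict/set literals; this
# equivalent helper is only an illustrative refactoring of the inline setup
# above, not part of the original suite.
def make_colliding_registry() -> Dict[str, Set[str]]:
    return {
        "schema_name": {"stream_name", "stream_name_suffix"},
        "raw_schema": {"stream_name_suffix"},
    }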
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    default_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: TableNameRegistry,
) -> List[StreamProcessor]:
    result = []
    for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

        # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
        # Any modifications need to be reflected there and vice versa.
        schema = default_schema
        if "namespace" in stream_config:
            schema = stream_config["namespace"]

        schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
        cursor_field = []
        primary_key = []
        if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]:
            cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
        if destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value
        ]:
            primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

        from_table = "source('{}', '{}')".format(schema_name, raw_table_name)
        stream_processor = StreamProcessor.create(
            stream_name=stream_name,
            destination_type=destination_type,
            raw_schema=raw_schema_name,
            schema=schema_name,
            source_sync_mode=source_sync_mode,
            destination_sync_mode=destination_sync_mode,
            cursor_field=cursor_field,
            primary_key=primary_key,
            json_column_name=f"'{json_column_name}'",
            properties=properties,
            tables_registry=tables_registry,
            from_table=from_table,
        )
        result.append(stream_processor)
    return result
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    target_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: Set[str],
) -> List[StreamProcessor]:
    result = []
    for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")
        schema_name = name_transformer.normalize_schema_name(target_schema)
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{target_schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)
        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)
        from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

        # Check properties
        if not properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")

        stream_processor = StreamProcessor.create(
            stream_name=stream_name,
            destination_type=destination_type,
            raw_schema=raw_schema_name,
            schema=schema_name,
            json_column_name=f"'{json_column_name}'",
            properties=properties,
            tables_registry=tables_registry,
            from_table=from_table,
        )
        result.append(stream_processor)
    return result
def test_cursor_field(cursor_field: List[str], expecting_exception: bool, expected_cursor_field: str):
    stream_processor = StreamProcessor.create(
        stream_name="test_cursor_field",
        destination_type=DestinationType.POSTGRES,
        raw_schema="raw_schema",
        schema="schema_name",
        source_sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.append_dedup,
        cursor_field=cursor_field,
        primary_key=[],
        json_column_name="json_column_name",
        properties=dict(),
        tables_registry=TableNameRegistry(DestinationType.POSTGRES),
        from_table="",
    )
    try:
        assert (
            stream_processor.get_cursor_field(column_names={expected_cursor_field: (expected_cursor_field, "random")})
            == expected_cursor_field
        )
    except ValueError as e:
        if not expecting_exception:
            raise e
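# A hedged parametrize sketch for the cursor-field test above; the single row is
# an assumption (a one-element cursor path resolving to its own column name), not
# a fixture from the original suite.
@pytest.mark.parametrize(
    "cursor_field, expecting_exception, expected_cursor_field",
    [
        (["updated_at"], False, "updated_at"),
    ],
)
def test_cursor_field_examples(cursor_field, expecting_exception, expected_cursor_field):
    # Delegate to the test body above with illustrative values.
    test_cursor_field(cursor_field, expecting_exception, expected_cursor_field)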
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    target_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: Set[str],
) -> List[StreamProcessor]:
    result = []
    for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")
        schema_name = name_transformer.normalize_schema_name(target_schema)
        raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{target_schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)

        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
        cursor_field = []
        primary_key = []
        if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]:
            cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
        if destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value
        ]:
            primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)
        from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

        # Check properties
        if not properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")

        stream_processor = StreamProcessor.create(
            stream_name=stream_name,
            destination_type=destination_type,
            raw_schema=raw_schema_name,
            schema=schema_name,
            source_sync_mode=source_sync_mode,
            destination_sync_mode=destination_sync_mode,
            cursor_field=cursor_field,
            primary_key=primary_key,
            json_column_name=f"'{json_column_name}'",
            properties=properties,
            tables_registry=tables_registry,
            from_table=from_table,
        )
        result.append(stream_processor)
    return result
def build_stream_processor(
    catalog: Dict,
    json_column_name: str,
    default_schema: str,
    name_transformer: DestinationNameTransformer,
    destination_type: DestinationType,
    tables_registry: TableNameRegistry,
) -> List[StreamProcessor]:
    result = []
    for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")

        # The logic here matches the logic in JdbcBufferedConsumerFactory.java.
        # Any modifications need to be reflected there and vice versa.
        schema = default_schema
        if "namespace" in stream_config:
            schema = stream_config["namespace"]

        schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
        if destination_type == DestinationType.ORACLE:
            quote_in_parenthesis = re.compile(r"quote\((.*)\)")
            raw_schema_name = name_transformer.normalize_schema_name(schema, truncate=False)
            if not quote_in_parenthesis.findall(json_column_name):
                json_column_name = name_transformer.normalize_column_name(json_column_name, in_jinja=True)
        else:
            column_inside_single_quote = re.compile(r"\'(.*)\'")
            raw_schema_name = name_transformer.normalize_schema_name(f"_airbyte_{schema}", truncate=False)
            if not column_inside_single_quote.findall(json_column_name):
                json_column_name = f"'{json_column_name}'"

        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        # MySQL table names need to be truncated manually because MySQL does not do it automatically.
        truncate = destination_type == DestinationType.MYSQL
        raw_table_name = name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=truncate)

        source_sync_mode = get_source_sync_mode(configured_stream, stream_name)
        destination_sync_mode = get_destination_sync_mode(configured_stream, stream_name)
        cursor_field = []
        primary_key = []
        if source_sync_mode.value == SyncMode.incremental.value or destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value,
        ]:
            cursor_field = get_field(configured_stream, "cursor_field", f"Undefined cursor field for stream {stream_name}")
        if destination_sync_mode.value in [
            # DestinationSyncMode.upsert_dedup.value,
            DestinationSyncMode.append_dedup.value
        ]:
            primary_key = get_field(configured_stream, "primary_key", f"Undefined primary key for stream {stream_name}")

        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)

        from_table = dbt_macro.Source(schema_name, raw_table_name)
        stream_processor = StreamProcessor.create(
            stream_name=stream_name,
            destination_type=destination_type,
            raw_schema=raw_schema_name,
            default_schema=default_schema,
            schema=schema_name,
            source_sync_mode=source_sync_mode,
            destination_sync_mode=destination_sync_mode,
            cursor_field=cursor_field,
            primary_key=primary_key,
            json_column_name=json_column_name,
            properties=properties,
            tables_registry=tables_registry,
            from_table=from_table,
        )
        result.append(stream_processor)
    return result
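# A hedged usage sketch of build_stream_processor, assuming the surrounding
# module's helpers (read_json, TableNameRegistry, DestinationNameTransformer)
# are importable here. The catalog path and default schema are placeholders;
# "_airbyte_data" is the conventional Airbyte raw JSON column name.
def example_build_processors() -> List[StreamProcessor]:
    catalog = read_json("destination_catalog.json")  # placeholder path
    return build_stream_processor(
        catalog=catalog,
        json_column_name="_airbyte_data",
        default_schema="public",  # placeholder default schema
        name_transformer=DestinationNameTransformer(DestinationType.POSTGRES),
        destination_type=DestinationType.POSTGRES,
        tables_registry=TableNameRegistry(DestinationType.POSTGRES),
    )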
def process(self, catalog_file: str, json_column_name: str, target_schema: str):
    """
    This method first parses the catalog and builds the models that handle top-level streams.
    A second loop then goes over the substreams that were nested inside them, in a breadth-first traversal manner.

    @param catalog_file input AirbyteCatalog file in JSON Schema describing the structure of the raw data
    @param json_column_name is the column name containing the JSON Blob with the raw data
    @param target_schema is the final schema where to output the final transformed data to
    """
    # Registry of all tables in all schemas
    tables_registry: Set[str] = set()
    # Registry of source tables in each schema
    schema_to_source_tables: Dict[str, Set[str]] = {}

    catalog = read_json(catalog_file)
    substreams = []
    for configured_stream in get_field(catalog, "streams", "Invalid Catalog: 'streams' is not defined in Catalog"):
        stream_config = get_field(configured_stream, "stream", "Invalid Stream: 'stream' is not defined in Catalog streams")
        schema_name = self.name_transformer.normalize_schema_name(target_schema)
        raw_schema_name = self.name_transformer.normalize_schema_name(f"_airbyte_{target_schema}", truncate=False)
        stream_name = get_field(stream_config, "name", f"Invalid Stream: 'name' is not defined in stream: {str(stream_config)}")
        raw_table_name = self.name_transformer.normalize_table_name(f"_airbyte_raw_{stream_name}", truncate=False)
        message = f"'json_schema'.'properties' are not defined for stream {stream_name}"
        properties = get_field(get_field(stream_config, "json_schema", message), "properties", message)
        from_table = "source('{}', '{}')".format(schema_name, raw_table_name)

        # Check properties
        if not properties:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")

        add_table_to_sources(schema_to_source_tables, schema_name, raw_table_name)

        stream_processor = StreamProcessor.create(
            stream_name=stream_name,
            integration_type=self.integration_type,
            raw_schema=raw_schema_name,
            schema=schema_name,
            json_column_name=f"'{json_column_name}'",
            properties=properties,
            tables_registry=tables_registry,
            from_table=from_table,
        )
        nested_processors = stream_processor.process()
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors:
            substreams += nested_processors
        for file in stream_processor.sql_outputs:
            output_sql_file(os.path.join(self.output_directory, file), stream_processor.sql_outputs[file])
    self.write_yaml_sources_file(schema_to_source_tables)
    self.process_substreams(substreams, tables_registry)
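# A minimal sketch of the add_table_to_sources helper referenced above, assuming
# it only groups raw table names by schema for write_yaml_sources_file; the real
# helper may do more (e.g. normalization or duplicate checks).
def add_table_to_sources(schema_to_source_tables: Dict[str, Set[str]], schema_name: str, table_name: str) -> None:
    if schema_name not in schema_to_source_tables:
        schema_to_source_tables[schema_name] = set()
    # Record the raw source table under its schema for the YAML sources file.
    schema_to_source_tables[schema_name].add(table_name)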