def generate_dbt_models(destination_type: DestinationType, test_resource_name: str, test_root_dir: str):
    """
    Run the normalization step that generates dbt model files from a test resource's catalog.

    The catalog is read from `resources/<test_resource_name>/data_input/catalog.json` and the
    `<test_root_dir>/models/generated` directory is handed to the CatalogProcessor as its first
    argument. The raw JSON column is named `_airbyte_data` and the schema comes from
    `dbt_test_utils.target_schema`.
    """
    generated_models_dir = os.path.join(test_root_dir, "models", "generated")
    input_catalog = os.path.join("resources", test_resource_name, "data_input", "catalog.json")

    processor = CatalogProcessor(generated_models_dir, destination_type)
    processor.process(input_catalog, "_airbyte_data", dbt_test_utils.target_schema)
def process_catalog(self) -> None:
    """
    Process every catalog file listed in the configuration.

    Reads `integration_type`, `schema`, `output_path`, `json_column` and `catalog` from
    `self.config`, builds a single CatalogProcessor and feeds it each catalog file in turn,
    printing a progress line before each one.
    """
    settings = self.config
    # Keys are read in this order on purpose so that the first missing key reported stays the same.
    target_destination = DestinationType.from_string(settings["integration_type"])
    default_schema = settings["schema"]
    output_directory = settings["output_path"]
    json_column_name = settings["json_column"]

    catalog_processor = CatalogProcessor(
        output_directory=output_directory,
        destination_type=target_destination,
    )
    for catalog_path in settings["catalog"]:
        print(f"Processing {catalog_path}...")
        catalog_processor.process(
            catalog_file=catalog_path,
            json_column_name=json_column_name,
            default_schema=default_schema,
        )
def generate_dbt_models(destination_type: DestinationType, test_root_dir: str, column_count: int):
    """
    Generate dbt model files for a synthetic one-stream catalog with `column_count` columns.

    Any previous `<test_root_dir>/models/generated` directory is removed first. A catalog with a
    single incremental/overwrite stream is then written to `<test_root_dir>/catalog.json` and
    passed to the CatalogProcessor. With `column_count == 1` the stream has one integer
    `_airbyte_id` property; otherwise it gets `column_count` randomly named string properties.
    """
    generated_models_dir = os.path.join(test_root_dir, "models", "generated")
    shutil.rmtree(generated_models_dir, ignore_errors=True)
    processor = CatalogProcessor(generated_models_dir, destination_type)

    # The stream name is drawn before the column names, as in the original call order.
    stream_name = dbt_test_utils.generate_random_string(f"stream_with_{column_count}_columns")
    if column_count == 1:
        properties = {"_airbyte_id": {"type": "integer"}}
    else:
        properties = {dbt_test_utils.random_string(5): {"type": "string"} for _ in range(column_count)}

    catalog_config = {
        "streams": [
            {
                "stream": {
                    "name": stream_name,
                    "json_schema": {
                        "type": ["null", "object"],
                        "properties": properties,
                    },
                    "supported_sync_modes": ["incremental"],
                    "source_defined_cursor": True,
                    "default_cursor_field": [],
                },
                "sync_mode": "incremental",
                "cursor_field": [],
                "destination_sync_mode": "overwrite",
            }
        ]
    }

    catalog_path = os.path.join(test_root_dir, "catalog.json")
    with open(catalog_path, "w") as catalog_fh:
        json.dump(catalog_config, catalog_fh)

    processor.process(catalog_path, "_airbyte_data", dbt_test_utils.target_schema)
def test_resolve_names(destination_type: DestinationType, catalog_file: str):
    """
    Check that table names resolved for a catalog match the expected names for a destination.

    Naming gets tricky when streams contain nested objects/arrays, because generated names can
    grow very long while some destinations (postgres, for example) only allow short identifiers.

    Resource files used by this test:
    - `<catalog_file>.json`: the input catalog, typically what a source would provide
      (Hubspot, Stripe and Facebook catalogs contain some level of nesting).
    - `<catalog_file>_expected_names.json`: the expected table names.
    - `<catalog_file>_expected_<destination>_names.json`: optional destination-specific
      expectations; when this file exists it is used instead of the generic one.

    The expected files hold the serialization of the tables registry (`tables_registry.to_dict`).
    """
    integration_type = destination_type.value
    tables_registry = TableNameRegistry(destination_type)
    catalog = read_json(f"resources/{catalog_file}.json")

    # process top level
    top_level_processors = CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    )
    for processor in top_level_processors:
        # Check properties
        if processor.properties:
            processor.collect_table_names()
        else:
            raise EOFError("Invalid Catalog: Unexpected empty properties in catalog")

    for conflict in tables_registry.resolve_names():
        print(
            f"WARN: Resolving conflict: {conflict.schema}.{conflict.table_name_conflict} "
            f"from '{'.'.join(conflict.json_path)}' into {conflict.table_name_resolved}"
        )

    # Snowflake expectations are compared upper-cased, Redshift ones lower-cased.
    apply_function = identity
    if integration_type == DestinationType.SNOWFLAKE.value:
        apply_function = str.upper
    elif integration_type == DestinationType.REDSHIFT.value:
        apply_function = str.lower

    # Prefer the destination-specific expectation file when one is present.
    expected_path = f"resources/{catalog_file}_expected_{integration_type.lower()}_names.json"
    if not os.path.exists(expected_path):
        expected_path = f"resources/{catalog_file}_expected_names.json"
    expected_names = read_json(expected_path, apply_function)

    assert tables_registry.to_dict(apply_function) == expected_names
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str, setup_test_path):
    """
    Check the table names produced for top-level streams and for their nested sub-streams.

    For every top-level stream processor, each table in its local registry must have a matching
    `<table>.sql` entry in the processor's `sql_outputs`. The accumulated registry is then
    compared with the expected top-level tables, and after all nested processors have run,
    the remaining tables are compared with the expected nested tables.

    Expected tables are read from `resources/<catalog_file>_expected_<level>.json` (key
    `"tables"`), unless a destination-specific
    `resources/<catalog_file>_expected_<level>_<integration_type>.json` exists. Names are
    upper-cased for Snowflake and lower-cased for Redshift before comparing.
    """
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = set()
    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    def load_expected_tables(level: str) -> set:
        """Load the expected table names for `level`, preferring a destination-specific file."""
        path = f"resources/{catalog_file}_expected_{level}_{integration_type.lower()}.json"
        if not os.path.exists(path):
            path = f"resources/{catalog_file}_expected_{level}.json"
        expected = set(read_json(path)["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            return {table.upper() for table in expected}
        if DestinationType.REDSHIFT.value == destination_type.value:
            return {table.lower() for table in expected}
        return expected

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        target_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        for table in stream_processor.local_registry:
            # Escape the table name and the dot so the pattern only matches the literal
            # "<table>.sql" file name (an unescaped "." would match any character).
            sql_file_pattern = re.compile(r".*/" + re.escape(table) + r"\.sql")
            assert any(
                sql_file_pattern.match(sql_output) for sql_output in stream_processor.sql_outputs
            ), f"no SQL output found for table {table}"
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors:
            substreams += nested_processors

    expected_top_level = load_expected_tables("top_level")
    assert tables_registry == expected_top_level

    # process substreams, one nesting level at a time
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    expected_nested = load_expected_tables("nested")
    assert (tables_registry - expected_top_level) == expected_nested
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str, setup_test_path):
    """
    Check the table names produced for top-level streams and for their nested sub-streams.

    The registry maps each schema to its tables. For every top-level stream processor, each
    `<schema>_<table>` file name (truncated through the processor's name transformer when it
    exceeds the maximum name length) must have a matching `.sql` entry in `sql_outputs`.
    After all nested processors have run, every table in the registry that is not an expected
    top-level table must be an expected nested table.

    Expected tables are read from `resources/<catalog_file>_expected_<level>.json` (key
    `"tables"`), unless a destination-specific
    `resources/<catalog_file>_expected_<level>_<integration_type>.json` exists. Names are
    upper-cased for Snowflake and lower-cased for Redshift before comparing.
    """
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}
    substreams = []
    catalog = read_json(f"resources/{catalog_file}.json")

    def load_expected_tables(level: str) -> set:
        """Load the expected table names for `level`, preferring a destination-specific file."""
        path = f"resources/{catalog_file}_expected_{level}_{integration_type.lower()}.json"
        if not os.path.exists(path):
            path = f"resources/{catalog_file}_expected_{level}.json"
        expected = set(read_json(path)["tables"])
        if DestinationType.SNOWFLAKE.value == destination_type.value:
            return {table.upper() for table in expected}
        if DestinationType.REDSHIFT.value == destination_type.value:
            return {table.lower() for table in expected}
        return expected

    # process top level
    for stream_processor in CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    ):
        nested_processors = stream_processor.process()
        name_transformer = stream_processor.name_transformer
        for schema in stream_processor.local_registry:
            for table in stream_processor.local_registry[schema]:
                # The file name does not depend on the SQL output being inspected, so it is
                # computed once per table instead of once per output.
                file_name = f"{schema}_{table}"
                if len(file_name) > name_transformer.get_name_max_length():
                    file_name = name_transformer.truncate_identifier_name(input_name=file_name)
                # Escape the file name and the dot so the pattern only matches the literal
                # "<file_name>.sql" (an unescaped "." would match any character).
                sql_file_pattern = re.compile(r".*/" + re.escape(file_name) + r"\.sql")
                assert any(
                    sql_file_pattern.match(sql_output) for sql_output in stream_processor.sql_outputs
                ), f"no SQL output found for {file_name}"
        add_table_to_registry(tables_registry, stream_processor)
        if nested_processors:
            substreams += nested_processors

    expected_top_level = load_expected_tables("top_level")

    # process substreams, one nesting level at a time
    while substreams:
        children = substreams
        substreams = []
        for substream in children:
            substream.tables_registry = tables_registry
            nested_processors = substream.process()
            add_table_to_registry(tables_registry, substream)
            if nested_processors:
                substreams += nested_processors

    expected_nested = load_expected_tables("nested")

    # TODO(davin): Instead of unwrapping all tables, rewrite this test so tables are compared based on schema.
    all_tables = {table for schema in tables_registry for table in tables_registry[schema]}
    assert (all_tables - expected_top_level) == expected_nested
def test_stream_processor_tables_naming(integration_type: str, catalog_file: str):
    """
    Check the table names generated for top-level streams and for nested sub-streams.

    Naming gets tricky when streams contain nested objects/arrays, because generated names can
    grow very long while some destinations (postgres, for example) only allow short identifiers.

    Resource files used by this test:
    - `<catalog_file>.json`: the input catalog, typically what a source would provide
      (Hubspot, Stripe and Facebook catalogs contain some level of nesting).
    - `<catalog_file>_expected_top_level.json`: expected tables for the top-level streams.
    - `<catalog_file>_expected_nested.json`: expected tables for nested objects that are
      extracted into their own separate tables.

    Either expected file can be specialized for one destination by appending the lower-cased
    integration type, e.g. `edge_cases_catalog_expected_top_level_postgres.json`; when that file
    exists it is used instead of the generic one.

    The expected files hold the serialization of the tables registry: a mapping per schema to
    all tables in that schema, mapping to the final filename.
    """
    destination_type = DestinationType.from_string(integration_type)
    tables_registry = {}
    catalog = read_json(f"resources/{catalog_file}.json")

    # Snowflake expectations are read upper-cased, Redshift ones lower-cased.
    destination_value = destination_type.value
    if destination_value == DestinationType.SNOWFLAKE.value:
        apply_function = str.upper
    elif destination_value == DestinationType.REDSHIFT.value:
        apply_function = str.lower
    else:
        apply_function = None

    def load_expected(level):
        """Read the expected registry for `level`, preferring a destination-specific file."""
        specialized = f"resources/{catalog_file}_expected_{level}_{integration_type.lower()}.json"
        if os.path.exists(specialized):
            return read_json(specialized, apply_function)
        return read_json(f"resources/{catalog_file}_expected_{level}.json", apply_function)

    # process top level
    pending = []
    top_level_processors = CatalogProcessor.build_stream_processor(
        catalog=catalog,
        json_column_name="'json_column_name_test'",
        default_schema="schema_test",
        name_transformer=DestinationNameTransformer(destination_type),
        destination_type=destination_type,
        tables_registry=tables_registry,
    )
    for processor in top_level_processors:
        children = processor.process()
        tables_registry = add_table_to_registry(tables_registry, processor)
        if children:
            pending.extend(children)

    expected_top_level = load_expected("top_level")
    assert tables_registry == expected_top_level

    # process substreams, one nesting level at a time
    while pending:
        next_level = []
        for substream in pending:
            substream.tables_registry = tables_registry
            children = substream.process()
            tables_registry = add_table_to_registry(tables_registry, substream)
            if children:
                next_level.extend(children)
        pending = next_level

    expected_nested = load_expected("nested")

    # remove expected top level tables from tables_registry
    for schema in expected_top_level:
        for table in expected_top_level[schema]:
            del tables_registry[schema][table]
        if not tables_registry[schema]:
            del tables_registry[schema]

    assert tables_registry == expected_nested