def get_primary_key_from_path(self, column_names: Dict[str, Tuple[str, str]], path: List[str]) -> str:
    """
    Return the SQL expression to use as primary key for the given property path.

    Only single-element (non-nested) paths are supported; nested or empty paths
    raise a ValueError.
    """
    if not path:
        raise ValueError(f"No path specified for stream {self.stream_name}")
    if len(path) != 1:
        raise ValueError(f"Unsupported nested path {'.'.join(path)} for stream {self.stream_name}")
    field = path[0]
    if is_airbyte_column(field):
        # using an airbyte generated column
        return f"cast({field} as {jinja_call('dbt_utils.type_string()')})"
    # properties without a 'type' entry are treated as 'object'
    property_type = self.properties[field].get("type", "object")
    if is_number(property_type) or is_object(property_type):
        # some destinations don't handle float columns (or complex types) as primary keys, turn them to string
        return f"cast({column_names[field][0]} as {jinja_call('dbt_utils.type_string()')})"
    return column_names[field][0]
def extract_column_names(self) -> Dict[str, Tuple[str, str]]:
    """
    Generate a mapping of JSON properties to normalized SQL Column names, handling collisions and avoid duplicate names

    The mapped value to a field property is a tuple where:
     - the first value is the normalized "raw" column name
     - the second value is the normalized quoted column name to be used in jinja context
    """
    json_fields = [prop for prop in self.properties.keys() if not is_airbyte_column(prop)]
    mapping: Dict[str, Tuple[str, str]] = {}
    used_names = set()
    for prop in json_fields:
        raw_name = self.name_transformer.normalize_column_name(prop, in_jinja=False)
        quoted_name = self.name_transformer.normalize_column_name(prop, in_jinja=True)
        if raw_name in used_names:
            # TODO handle column name duplicates or collisions deterministically in this stream
            for suffix in range(1, 1000):
                raw_name = self.name_transformer.normalize_column_name(f"{prop}_{suffix}", in_jinja=False)
                quoted_name = self.name_transformer.normalize_column_name(f"{prop}_{suffix}", in_jinja=True)
                if raw_name not in used_names:
                    break
        used_names.add(raw_name)
        mapping[prop] = (raw_name, quoted_name)
    return mapping
def get_cursor_field(self, column_names: Dict[str, Tuple[str, str]]) -> str:
    """
    Return the column name to use as the incremental cursor for this stream.

    Falls back to the airbyte emission timestamp when no cursor field is
    configured; nested (multi-element) cursor fields are not supported.
    """
    if not self.cursor_field:
        return "_airbyte_emitted_at"
    if len(self.cursor_field) > 1:
        raise ValueError(f"Unsupported nested cursor field {'.'.join(self.cursor_field)} for stream {self.stream_name}")
    cursor = self.cursor_field[0]
    if is_airbyte_column(cursor):
        # using an airbyte generated column
        return cursor
    return column_names[cursor][0]
def find_children_streams(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> List["StreamProcessor"]:
    """
    For each complex type properties, generate a new child StreamProcessor that produce separate child pipelines.
    The current stream/table is used as the parent from which to extract data from.

    :param from_table: name of the parent table the child pipelines read from
    :param column_names: mapping of JSON property -> (raw column name, quoted jinja column name)
    :return: one StreamProcessor per nested object/array property discovered
    """
    properties = self.properties
    children: List[StreamProcessor] = []
    for field in properties.keys():
        children_properties = None
        if is_airbyte_column(field):
            # airbyte bookkeeping columns never spawn child streams
            pass
        elif is_combining_node(properties[field]):
            # TODO: merge properties of all combinations
            pass
        elif "type" not in properties[field] or is_object(properties[field]["type"]):
            # properties without 'type' field are treated like properties with 'type' = 'object'
            children_properties = find_properties_object([], field, properties[field])
            is_nested_array = False
            json_column_name = column_names[field][1]
        elif is_array(properties[field]["type"]) and "items" in properties[field]:
            # nested array: each item is unnested into its own child row
            quoted_field = column_names[field][1]
            children_properties = find_properties_object([], field, properties[field]["items"])
            is_nested_array = True
            json_column_name = f"unnested_column_value({quoted_field})"
        if children_properties:
            # NOTE: json_column_name / is_nested_array are only assigned by the branches
            # that also set children_properties, so they are always bound here.
            for child_key in children_properties:
                stream_processor = StreamProcessor.create_from_parent(
                    parent=self,
                    child_name=field,
                    json_column_name=json_column_name,
                    properties=children_properties[child_key],
                    is_nested_array=is_nested_array,
                    from_table=from_table,
                )
                children.append(stream_processor)
    return children
def extract_column_names(self) -> Dict[str, Tuple[str, str]]:
    """
    Generate a mapping of JSON properties to normalized SQL Column names, handling collisions and avoid duplicate names

    The mapped value to a field property is a tuple where:
     - the first value is the normalized "raw" column name
     - the second value is the normalized quoted column name to be used in jinja context
    """
    fields = []
    for field in self.properties.keys():
        if not is_airbyte_column(field):
            fields.append(field)
        if self.destination_sync_mode.value == DestinationSyncMode.append_dedup.value:
            # When deduping, some airbyte columns could be used as special cursor or primary key columns
            # FIX: test list membership against cursor_field itself, not a substring match on its
            # first element (`field in self.cursor_field[0]`), which wrongly matched partial names
            # and raised IndexError when no cursor field was configured.
            single_column_primary_keys = [pk[0] for pk in self.primary_key if len(pk) == 1]
            if field in self.cursor_field or field in single_column_primary_keys:
                if field not in fields:
                    fields.append(field)
    result = {}
    field_names = set()
    for field in fields:
        field_name = self.name_transformer.normalize_column_name(field, in_jinja=False)
        jinja_name = self.name_transformer.normalize_column_name(field, in_jinja=True)
        if field_name in field_names:
            # TODO handle column name duplicates or collisions deterministically in this stream
            for i in range(1, 1000):
                field_name = self.name_transformer.normalize_column_name(f"{field}_{i}", in_jinja=False)
                jinja_name = self.name_transformer.normalize_column_name(f"{field}_{i}", in_jinja=True)
                if field_name not in field_names:
                    break
        field_names.add(field_name)
        result[field] = (field_name, jinja_name)
    return result