Example #1
    def get_primary_key_from_path(self, column_names: Dict[str, Tuple[str, str]], path: List[str]) -> str:
        if path and len(path) == 1:
            field = path[0]
            if not is_airbyte_column(field):
                if "type" in self.properties[field]:
                    property_type = self.properties[field]["type"]
                else:
                    property_type = "object"
                if is_number(property_type) or is_object(property_type):
                    # some destinations don't handle float (or complex type) columns as primary keys, so cast them to string
                    return f"cast({column_names[field][0]} as {jinja_call('dbt_utils.type_string()')})"
                else:
                    return column_names[field][0]
            else:
                # using an airbyte generated column
                return f"cast({field} as {jinja_call('dbt_utils.type_string()')})"
        elif path:
            raise ValueError(f"Unsupported nested path {'.'.join(path)} for stream {self.stream_name}")
        else:
            raise ValueError(f"No path specified for stream {self.stream_name}")
Example #2
    def extract_column_names(self) -> Dict[str, Tuple[str, str]]:
        """
        Generate a mapping of JSON properties to normalized SQL column names, handling collisions and avoiding duplicate names

        The value mapped to a field property is a tuple where:
         - the first value is the normalized "raw" column name
         - the second value is the normalized quoted column name to be used in jinja context
        """
        fields = []
        for field in self.properties.keys():
            if not is_airbyte_column(field):
                fields.append(field)
        result = {}
        field_names = set()
        for field in fields:
            field_name = self.name_transformer.normalize_column_name(
                field, in_jinja=False)
            jinja_name = self.name_transformer.normalize_column_name(
                field, in_jinja=True)
            if field_name in field_names:
                # TODO handle column name duplicates or collisions deterministically in this stream
                for i in range(1, 1000):
                    field_name = self.name_transformer.normalize_column_name(
                        f"{field}_{i}", in_jinja=False)
                    jinja_name = self.name_transformer.normalize_column_name(
                        f"{field}_{i}", in_jinja=True)
                    if field_name not in field_names:
                        break
            field_names.add(field_name)
            result[field] = (field_name, jinja_name)
        return result
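The suffix-probing loop is the core of the collision handling. The standalone sketch below reproduces it with a trivial lower-casing normalize function in place of the real name_transformer; it is illustrative only, not the actual DestinationNameTransformer behavior.

    from typing import Dict, List, Tuple

    def normalize(name: str) -> str:
        # stand-in for normalize_column_name: real normalizers also strip
        # special characters, truncate, and apply destination-specific casing
        return name.lower()

    def extract_column_names(properties: List[str]) -> Dict[str, Tuple[str, str]]:
        result, seen = {}, set()
        for field in properties:
            candidate = normalize(field)
            if candidate in seen:
                # probe suffixed variants until a free name is found
                for i in range(1, 1000):
                    candidate = normalize(f"{field}_{i}")
                    if candidate not in seen:
                        break
            seen.add(candidate)
            result[field] = (candidate, f"'{candidate}'")
        return result

    # "Name" and "name" normalize to the same column, so the second gets a suffix:
    print(extract_column_names(["Name", "name"]))
    # -> {'Name': ('name', "'name'"), 'name': ('name_1', "'name_1'")}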
Example #3
    def get_cursor_field(self, column_names: Dict[str, Tuple[str, str]]) -> str:
        if not self.cursor_field:
            return "_airbyte_emitted_at"
        elif len(self.cursor_field) == 1:
            if not is_airbyte_column(self.cursor_field[0]):
                return column_names[self.cursor_field[0]][0]
            else:
                # using an airbyte generated column
                return self.cursor_field[0]
        else:
            raise ValueError(f"Unsupported nested cursor field {'.'.join(self.cursor_field)} for stream {self.stream_name}")
Example #4
    def find_children_streams(self, from_table: str, column_names: Dict[str, Tuple[str, str]]) -> List["StreamProcessor"]:
        """
        For each complex-type property, generate a new child StreamProcessor that produces a separate child pipeline.
        The current stream/table is used as the parent from which to extract data.
        """
        properties = self.properties
        children: List[StreamProcessor] = []
        for field in properties.keys():
            children_properties = None
            if is_airbyte_column(field):
                pass
            elif is_combining_node(properties[field]):
                # TODO: merge properties of all combinations
                pass
            elif "type" not in properties[field] or is_object(properties[field]["type"]):
                # properties without a 'type' field are treated like properties with 'type' = 'object'
                children_properties = find_properties_object([], field, properties[field])
                is_nested_array = False
                json_column_name = column_names[field][1]
            elif is_array(properties[field]["type"]) and "items" in properties[field]:
                quoted_field = column_names[field][1]
                children_properties = find_properties_object([], field, properties[field]["items"])
                is_nested_array = True
                json_column_name = f"unnested_column_value({quoted_field})"
            if children_properties:
                for child_key in children_properties:
                    stream_processor = StreamProcessor.create_from_parent(
                        parent=self,
                        child_name=field,
                        json_column_name=json_column_name,
                        properties=children_properties[child_key],
                        is_nested_array=is_nested_array,
                        from_table=from_table,
                    )
                    children.append(stream_processor)
        return children
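The two nested branches differ only in how the child reads its JSON data: an object child reads the parent column directly, while an array child reads each unnested element. Here is a small sketch of that choice, with simplified is_object / is_array checks standing in for the real helpers and made-up 'address' / 'orders' fields.

    def is_object(property_type) -> bool:
        types = property_type if isinstance(property_type, list) else [property_type]
        return "object" in types

    def is_array(property_type) -> bool:
        types = property_type if isinstance(property_type, list) else [property_type]
        return "array" in types

    def pick_json_column(field_schema: dict, quoted_field: str):
        # properties without a 'type' are treated as objects
        prop_type = field_schema.get("type", "object")
        if is_object(prop_type):
            # object child: read the parent's column directly
            return quoted_field, False
        if is_array(prop_type) and "items" in field_schema:
            # array child: read each unnested element of the column
            return f"unnested_column_value({quoted_field})", True
        return None, False

    print(pick_json_column({"type": "object"}, "'address'"))
    # -> ("'address'", False)
    print(pick_json_column({"type": "array", "items": {"type": "object"}}, "'orders'"))
    # -> ("unnested_column_value('orders')", True)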
Example #5
    def extract_column_names(self) -> Dict[str, Tuple[str, str]]:
        """
        Generate a mapping of JSON properties to normalized SQL column names, handling collisions and avoiding duplicate names

        The value mapped to a field property is a tuple where:
         - the first value is the normalized "raw" column name
         - the second value is the normalized quoted column name to be used in jinja context
        """
        fields = []
        for field in self.properties.keys():
            if not is_airbyte_column(field):
                fields.append(field)
            elif self.destination_sync_mode.value == DestinationSyncMode.append_dedup.value:
                # When deduping, some airbyte columns could be used as cursor or primary key columns
                if field in self.cursor_field or field in [f[0] for f in self.primary_key if len(f) == 1]:
                    fields.append(field)
        result = {}
        field_names = set()
        for field in fields:
            field_name = self.name_transformer.normalize_column_name(
                field, in_jinja=False)
            jinja_name = self.name_transformer.normalize_column_name(
                field, in_jinja=True)
            if field_name in field_names:
                # TODO handle column name duplicates or collisions deterministically in this stream
                for i in range(1, 1000):
                    field_name = self.name_transformer.normalize_column_name(
                        f"{field}_{i}", in_jinja=False)
                    jinja_name = self.name_transformer.normalize_column_name(
                        f"{field}_{i}", in_jinja=True)
                    if field_name not in field_names:
                        break
            field_names.add(field_name)
            result[field] = (field_name, jinja_name)
        return result
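The only difference from Example #2 is the dedup-mode branch that re-admits airbyte columns when they serve as the cursor or a single-column primary key. Below is a condensed, self-contained sketch of just that filter, with the stream state stubbed as plain arguments rather than the real DestinationSyncMode machinery.

    def is_airbyte_column(name: str) -> bool:
        return name.startswith("_airbyte_")

    def select_fields(properties, dedup, cursor_field, primary_key):
        single_column_keys = [f[0] for f in primary_key if len(f) == 1]
        fields = []
        for field in properties:
            if not is_airbyte_column(field):
                fields.append(field)
            elif dedup and (field in cursor_field or field in single_column_keys):
                # airbyte column kept because it acts as cursor or primary key
                fields.append(field)
        return fields

    props = ["id", "_airbyte_emitted_at", "_airbyte_ab_id"]
    print(select_fields(props, dedup=False, cursor_field=[], primary_key=[["id"]]))
    # -> ['id']
    print(select_fields(props, dedup=True, cursor_field=["_airbyte_emitted_at"], primary_key=[["id"]]))
    # -> ['id', '_airbyte_emitted_at']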