Пример #1
0
def get_table_name(name_transformer: DestinationNameTransformer, parent: str,
                   child: str, suffix: str, json_path: List[str]) -> str:
    """
    Build a table name shaped like `<parent>_<json path hash>_<child><suffix>`.

    - A non-empty suffix that does not already start with "_" gets a leading "_".
    - Without a parent, the result is `<child><suffix>` passed through
      `name_transformer.truncate_identifier_name`.
    - With a parent, the parent is shortened first, but never below
      min(MINIMUM_PARENT_LENGTH, its own normalized length). Only if that is still not
      enough is the child handed to `truncate_identifier_name` with the remaining room.
    """
    # two characters are set aside for the underscores placed around the hash
    budget = name_transformer.get_name_max_length() - 2
    path_hash = hash_json_path(json_path)

    if suffix and not suffix.startswith("_"):
        tail = f"_{suffix}"
    else:
        tail = suffix

    if parent:
        head = name_transformer.normalize_table_name(parent, False, False)
    else:
        head = parent
    body = name_transformer.normalize_table_name(child, False, False)
    kept_parent = min(MINIMUM_PARENT_LENGTH, len(head))

    # no parent: only the child and the suffix make up the name
    if not parent:
        return name_transformer.truncate_identifier_name(f"{body}{tail}")

    # combined length of every part except the parent
    rest = len(body) + len(path_hash) + len(tail)

    # everything fits as is
    if len(head) + rest < budget:
        return f"{head}_{path_hash}_{body}{tail}"

    # shortening the parent alone is enough; give it all the room that is left
    if rest < budget - kept_parent:
        return f"{head[:budget - rest]}_{path_hash}_{body}{tail}"

    # parent goes down to its minimum length and the child is truncated as well
    child_room = budget - kept_parent - len(path_hash) - len(tail)
    short_body = name_transformer.truncate_identifier_name(body, child_room)
    return f"{head[:kept_parent]}_{path_hash}_{short_body}{tail}"
Пример #2
0
def get_nested_hashed_table_name(name_transformer: DestinationNameTransformer, schema: str, json_path: List[str], child: str) -> str:
    """
    Build the name of a nested table as `<parent>_<json path hash>_<child>`.

    The pieces are derived as follows:
    - parent: every element of json_path except the last one, joined with "_" and normalized
    - json path hash: `hash_json_path` applied to the schema followed by the whole json_path
    - child: the normalized child name

    When the full name is too long, the parent is shortened first, but never below
    min(MINIMUM_PARENT_LENGTH, its own normalized length). If that is still not enough,
    the child is passed to `name_transformer.truncate_identifier_name` with the remaining room.

    Raises a RuntimeError when json_path does not yield a parent.
    """
    parent = "_".join(json_path[:-1])
    limit = name_transformer.get_name_max_length()
    path_hash = hash_json_path([schema] + json_path)

    if parent:
        head = name_transformer.normalize_table_name(parent, False, False)
    else:
        head = parent
    leaf = name_transformer.normalize_table_name(child, False, False)
    kept_parent = min(MINIMUM_PARENT_LENGTH, len(head))

    # a nested table always has a parent
    if not parent:
        raise RuntimeError("There is no nested table names without parents")

    # the hash, the child and the two joining underscores are present in every variant
    fixed = len(path_hash) + len(leaf) + 2

    # everything fits as is
    if len(head) + fixed < limit:
        return f"{head}_{path_hash}_{leaf}"

    # shortening the parent alone is enough; it keeps more than its minimum length
    if kept_parent + fixed < limit:
        return f"{head[:limit - fixed]}_{path_hash}_{leaf}"

    # parent goes down to its minimum length and the child is truncated as well
    child_room = limit - len(path_hash) - 2 - kept_parent
    short_leaf = name_transformer.truncate_identifier_name(leaf, child_room)
    return f"{head[:kept_parent]}_{path_hash}_{short_leaf}"
Пример #3
0
def get_table_name(name_transformer: DestinationNameTransformer, parent: str,
                   child: str, suffix: str, json_path: List[str]) -> str:
    """
    Table names in the normalization code base combine several pieces of information:
    - parent table: where a table was extracted from (in case of nesting)
    - child table: in case of nesting, the field name, or else the original stream name
    - extra suffix: normalization runs in several transformation steps, each of which may
      generate its own tables, so a suffix tells the steps of a pipeline apart
    - json path: the parent and nested field names leading to the table being built
      (only its hash ends up in the name)

    As much of this as fits is kept in the name, laid out as
    `<parent>_<json path hash>_<child><suffix>`, so that users can (somehow) recognize
    what data a table holds. The parent is shortened before the child is.
    """
    # reserve two characters for the underscores surrounding the hash
    available = name_transformer.get_name_max_length() - 2
    digest = hash_json_path(json_path)
    needs_separator = bool(suffix) and not suffix.startswith("_")
    clean_suffix = f"_{suffix}" if needs_separator else suffix
    clean_parent = name_transformer.normalize_table_name(parent, False, False) if parent else parent
    clean_child = name_transformer.normalize_table_name(child, False, False)
    parent_floor = min(MINIMUM_PARENT_LENGTH, len(clean_parent))

    def assemble(parent_part: str, child_part: str) -> str:
        # common layout of every name that has a parent
        return f"{parent_part}_{digest}_{child_part}{clean_suffix}"

    # without a parent, the name is just the child and the suffix
    if not parent:
        return name_transformer.truncate_identifier_name(f"{clean_child}{clean_suffix}")

    used_without_parent = len(clean_child) + len(digest) + len(clean_suffix)

    # nothing needs to be truncated
    if len(clean_parent) + used_without_parent < available:
        return assemble(clean_parent, clean_child)

    # only the parent needs to be truncated
    if used_without_parent < available - parent_floor:
        return assemble(clean_parent[:available - used_without_parent], clean_child)

    # cut the parent to its minimum length, then truncate the child too
    child_budget = available - parent_floor - len(digest) - len(clean_suffix)
    shortened_child = name_transformer.truncate_identifier_name(clean_child, child_budget)
    return assemble(clean_parent[:parent_floor], shortened_child)
Пример #4
0
def test_truncate_identifier(input_str: str, expected: str):
    """
    Verify how identifiers get truncated (in the middle) by a Postgres name transformer.

    For example, both of these strings are too long for the postgres 64 limit:
    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`

    In this instance both end up as `Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`,
    which can potentially cause a collision in table names.

    Dealing with such collisions is the job of the `stream_processor`, not of
    `destination_name_transformer`.
    """
    postgres_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
    actual = postgres_transformer.truncate_identifier_name(input_str)
    assert actual == expected
def test_truncate_identifier(input_str: str, expected: str):
    """Check that a Postgres name transformer truncates `input_str` into `expected`."""
    transformer = DestinationNameTransformer(DestinationType.POSTGRES)
    # report the lengths before and after truncation
    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
    truncated = transformer.truncate_identifier_name(input_str)
    assert truncated == expected