def get_relations_from_database(self, database: str) -> List[Relation]:
        relations_sql = f"""
                                 SELECT
                                    m.table_schema AS schema,
                                    m.table_name AS relation,
                                    m.table_type AS materialization,
                                    c.column_name AS attribute,
                                    c.ordinal_position AS ordinal,
                                    c.data_type AS data_type
                                 FROM
                                    {database}.INFORMATION_SCHEMA.TABLES m
                                 INNER JOIN
                                    {database}.INFORMATION_SCHEMA.COLUMNS c
                                 ON
                                    c.table_schema = m.table_schema
                                 AND
                                    c.table_name = m.table_name
                                 WHERE
                                    m.table_schema <> 'INFORMATION_SCHEMA'
                              """

        logger.debug(
            f'Collecting detailed relations from database {database}...')
        relations_frame = self._safe_query(relations_sql)
        unique_relations = (relations_frame['schema'] + '.' +
                            relations_frame['relation']).unique().tolist()
        logger.debug(
            f'Done collecting relations. Found a total of {len(unique_relations)} unique relations in database {database}'
        )
        relations = list()
        for relation in unique_relations:
            logger.debug(f'Building relation { database + "." + relation }...')
            attributes = list()

            for attribute in relations_frame.loc[(
                    relations_frame['schema'] + '.' +
                    relations_frame['relation']) == relation].itertuples():
                logger.debug(
                    f'adding attribute {attribute.attribute} to relation..')
                attributes.append(
                    Attribute(self._correct_case(attribute.attribute),
                              self._get_data_type(attribute.data_type)))

            relation = Relation(
                self._correct_case(database),
                self._correct_case(attribute.schema),
                self._correct_case(attribute.relation),
                self.MATERIALIZATION_MAPPINGS[attribute.materialization],
                attributes)
            logger.debug(f'Added relation {relation.dot_notation} to pool.')
            relations.append(relation)

        logger.debug(
            f'Acquired {len(relations)} total relations from database {database}.'
        )
        return relations
def test_x00_replacement():
    adapter = PostgresAdapter(replica_metadata={})
    id_col = "id"
    content_col = "content"
    normal_val = "normal_value"
    weird_value = "weird\x00value"
    custom_replacement = "__CUSTOM_VALUE__"

    cols = [
        Attribute(id_col, data_types.BIGINT),
        Attribute(content_col, data_types.VARCHAR)
    ]
    # test default replacement
    relation = Relation("db", "schema", "relation", TABLE, cols)
    relation.data = DataFrame({
        id_col: [1, 2],
        content_col: [normal_val, weird_value]
    })

    fixed_relation = adapter.replace_x00_values(relation)
    assert all(fixed_relation.data.loc[fixed_relation.data[id_col] == 1,
                                       [content_col]] == normal_val)
    assert all(fixed_relation.data.loc[fixed_relation.data[id_col] == 2,
                                       [content_col]] == "weirdvalue")

    # test custom replacement
    adapter = PostgresAdapter(replica_metadata={},
                              pg_0x00_replacement=custom_replacement)
    relation = Relation("db", "schema", "relation", TABLE, cols)
    relation.data = DataFrame({
        id_col: [1, 2],
        content_col: [normal_val, weird_value]
    })

    fixed_relation = adapter.replace_x00_values(relation)
    assert all(fixed_relation.data.loc[fixed_relation.data[id_col] == 1,
                                       [content_col]] == normal_val)
    assert all(fixed_relation.data.loc[fixed_relation.data[id_col] == 2,
                                       [content_col]] ==
               f"weird{custom_replacement}value")
示例#3
0
def test_run_deps_bidirectional_exclude_outliers(stub_relation_set):
    upstream = stub_relation_set.upstream_relation
    downstream = stub_relation_set.downstream_relation
    for relation in (
            downstream,
            upstream,
    ):
        relation.attributes = [Attribute('id', dt.INTEGER)]
        relation = stub_out_sampling(relation)
    upstream.data = pd.DataFrame([dict(id=1), dict(id=2), dict(id=3)])

    dag = nx.DiGraph()
    dag.add_edge(upstream,
                 downstream,
                 direction="bidirectional",
                 remote_attribute='id',
                 local_attribute='id')
    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(upstream, dag, adapter,
                                                       False)
    RuntimeSourceCompiler.compile_queries_for_relation(downstream, dag,
                                                       adapter, False)
    assert query_equalize(downstream.compiled_query) == query_equalize(f"""
            SELECT
                *
            FROM {downstream.quoted_dot_notation}
            WHERE id IN (1,2,3)
    """)

    assert query_equalize(upstream.compiled_query) == query_equalize(f"""
        WITH {upstream.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
            {upstream.quoted_dot_notation} 
        WHERE 
            id 
        in (SELECT 
                id 
            FROM 
                {downstream.quoted_dot_notation}) ) 
        ,{upstream.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
            SELECT 
                * 
            FROM 
                {upstream.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {upstream.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
示例#4
0
    def _get_relations_from_database(
            self,
            schema_obj: BaseTargetAdapter._DatabaseObject) -> List[Relation]:
        quoted_database = schema_obj.full_relation.quoted(
            schema_obj.full_relation.database)  # quoted db name
        relation_database = schema_obj.full_relation.database  # case corrected db name
        case_sensitive_schema = schema_obj.case_sensitive_name  # case sensitive schame name
        relations_sql = f"""
                                 SELECT
                                    m.table_schema AS schema,
                                    m.table_name AS relation,
                                    m.table_type AS materialization,
                                    c.column_name AS attribute,
                                    c.ordinal_position AS ordinal,
                                    c.data_type AS data_type
                                 FROM
                                    {quoted_database}.information_schema.tables m
                                 INNER JOIN
                                    {quoted_database}.information_schema.columns c
                                 ON
                                    c.table_schema = m.table_schema
                                 AND
                                    c.table_name = m.table_name
                                 WHERE
                                    m.table_schema = '{case_sensitive_schema}'
                                    AND m.table_schema NOT IN ('information_schema', 'pg_catalog')
                                    AND m.table_type <> 'external'
                              """

        logger.debug(
            f'Collecting detailed relations from database {quoted_database}...'
        )
        relations_frame = self._safe_query(relations_sql, quoted_database)
        unique_relations = (relations_frame['schema'] + '.' +
                            relations_frame['relation']).unique().tolist()
        logger.debug(
            f'Done collecting relations. Found a total of {len(unique_relations)} '
            f'unique relations in database {quoted_database}')
        relations = list()
        for relation in unique_relations:
            logger.debug(
                f'Building relation { quoted_database + "." + relation }...')
            attributes = list()

            for attribute in relations_frame.loc[(
                    relations_frame['schema'] + '.' +
                    relations_frame['relation']) == relation].itertuples():
                logger.debug(
                    f'adding attribute {attribute.attribute} to relation..')
                attributes.append(
                    Attribute(self._correct_case(attribute.attribute),
                              self._get_data_type(attribute.data_type)))

                relation = Relation(
                    relation_database,
                    self._correct_case(attribute.schema),  # noqa pylint: disable=undefined-loop-variable
                    self._correct_case(attribute.relation),  # noqa pylint: disable=undefined-loop-variable
                    self.MATERIALIZATION_MAPPINGS[
                        attribute.materialization.replace(" ", "_")],  # noqa pylint: disable=undefined-loop-variable
                    attributes)
            logger.debug(f'Added relation {relation.dot_notation} to pool.')
            relations.append(relation)
        logger.debug(
            f'Acquired {len(relations)} total relations from database {quoted_database}.'
        )
        return relations
示例#5
0
def test_run_deps_mixed_multi_deps():
    r"""
        a --bidir--> c <--dir-- b
         \          / \
          \     bidir  dir
           \       |   |
            \      V   V
             dir-> d   e
    """
    relation_helper = RelationTestHelper()
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [
        Attribute('col_a_c', dt.INTEGER),
        Attribute('col_a_d', dt.VARCHAR)
    ]
    relation_a.data = pd.DataFrame({
        "col_a_c": [
            1,
            2,
            3,
            4,
            5,
        ],
        "col_a_d": ["var_a_1", "var_a_2", "var_a_3", "var_a_1", "var_a_2"],
    })
    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b_c', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b_c": [
            "val1",
            "val2",
            "val3",
            "val4",
            "val5",
        ],
    })

    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_ae', dt.INTEGER),
        Attribute('col_c_bd', dt.VARCHAR)
    ]
    relation_c.data = pd.DataFrame({
        "col_c_ae": [
            1,
            1,
            2,
            2,
            5,
            5,
            5,
        ],
        "col_c_bd": [
            "val1",
            "val1",
            "val2",
            "val2",
            "val5",
            "val5",
            "val5",
        ]
    })

    relation_d = Relation(name='rel_d',
                          **relation_helper.rand_relation_helper())
    relation_d.attributes = [
        Attribute('col_d_a', dt.INTEGER),
        Attribute('col_d_c', dt.INTEGER)
    ]

    relation_e = Relation(name='rel_e',
                          **relation_helper.rand_relation_helper())
    relation_e.attributes = [Attribute('col_e_c', dt.INTEGER)]

    for relation in (
            relation_a,
            relation_b,
            relation_c,
            relation_d,
            relation_e,
    ):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a_c",
                 local_attribute="col_c_ae")
    dag.add_edge(relation_a,
                 relation_d,
                 direction="directional",
                 remote_attribute="col_a_d",
                 local_attribute="col_d_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="directional",
                 remote_attribute="col_b_c",
                 local_attribute="col_c_bd")
    dag.add_edge(relation_c,
                 relation_d,
                 direction="bidirectional",
                 remote_attribute="col_c_bd",
                 local_attribute="col_d_c")
    dag.add_edge(relation_c,
                 relation_e,
                 direction="directional",
                 remote_attribute="col_c_ae",
                 local_attribute="col_e_c")
    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_d, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_e, dag,
                                                       adapter, False)
    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
            {relation_a.quoted_dot_notation} 
        WHERE 
            col_a_c 
        in (SELECT 
                col_c_ae 
            FROM 
                {relation_c.quoted_dot_notation}) ) 
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
            SELECT 
                * 
            FROM 
                {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        SELECT
            *
        FROM
            {relation_b.quoted_dot_notation}
        SAMPLE BERNOULLI (1500 ROWS)
    """)
    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
        SELECT 
            * 
        FROM 
            {relation_c.quoted_dot_notation} 
        WHERE 
            col_c_bd 
        in (SELECT 
                col_d_c 
            FROM 
                {relation_d.quoted_dot_notation})
        AND
            col_c_ae IN (1,2,3,4,5)
        AND
            col_c_bd IN ('val1','val2','val3','val4','val5')
    """)
    assert query_equalize(relation_d.compiled_query) == query_equalize(f"""
        SELECT 
            * 
        FROM 
            {relation_d.quoted_dot_notation}
        WHERE 
            col_d_a IN ('var_a_1','var_a_2','var_a_3') 
        AND
            col_d_c IN ('val1','val2','val5') 
    """)
    assert query_equalize(relation_e.compiled_query) == query_equalize(f"""
        WITH 
        {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_e.quoted_dot_notation}
        WHERE 
            col_e_c IN (1,2,5) 
        )
        ,{relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
示例#6
0
def test_run_deps_bidirectional_multi_deps():
    """
        a --bidir--> c <--bidir-- b
    """
    relation_helper = RelationTestHelper()
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [Attribute('col_a', dt.INTEGER)]
    relation_a.data = pd.DataFrame({"col_a": [
        1,
        2,
        3,
        4,
        5,
    ]})

    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b": [
            "val1",
            "val2",
            "val3",
            "val4",
            "val5",
        ],
    })

    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_a', dt.INTEGER),
        Attribute('col_c_b', dt.VARCHAR)
    ]

    for relation in (
            relation_a,
            relation_b,
            relation_c,
    ):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a",
                 local_attribute="col_c_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_b",
                 local_attribute="col_c_b")
    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag,
                                                       adapter, False)
    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
            {relation_a.quoted_dot_notation} 
        WHERE 
            col_a 
        in (SELECT 
                col_c_a 
            FROM 
                {relation_c.quoted_dot_notation}) ) 
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
            SELECT 
                * 
            FROM 
                {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        WITH 
        {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_b.quoted_dot_notation}
        WHERE 
            col_b 
        in (SELECT 
                col_c_b 
            FROM 
                {relation_c.quoted_dot_notation})
        )
        ,{relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
            SELECT
                *
            FROM {relation_c.quoted_dot_notation} 
            WHERE
                col_c_a IN (1,2,3,4,5)
            AND
                col_c_b IN ('val1','val2','val3','val4','val5')
    """)