def get_relations_from_database(self, database: str) -> List[Relation]:
    """Introspect *database* and build one Relation per table/view.

    Runs a single INFORMATION_SCHEMA query joining TABLES to COLUMNS,
    then groups the rows by ``schema.table`` to assemble each
    :class:`Relation` with its full attribute list.

    Args:
        database: name of the database to introspect.
            NOTE(review): the name is interpolated directly into the SQL
            string — it must come from a trusted source.

    Returns:
        A list of fully-populated Relation objects, one per unique
        ``schema.table`` pair found (INFORMATION_SCHEMA itself excluded).
    """
    relations_sql = f"""
        SELECT
            m.table_schema AS schema,
            m.table_name AS relation,
            m.table_type AS materialization,
            c.column_name AS attribute,
            c.ordinal_position AS ordinal,
            c.data_type AS data_type
        FROM
            {database}.INFORMATION_SCHEMA.TABLES m
        INNER JOIN
            {database}.INFORMATION_SCHEMA.COLUMNS c
        ON
            c.table_schema = m.table_schema
        AND
            c.table_name = m.table_name
        WHERE
            m.table_schema <> 'INFORMATION_SCHEMA'
    """
    logger.debug(
        f'Collecting detailed relations from database {database}...')
    relations_frame = self._safe_query(relations_sql)
    # Compute the schema-qualified key series once; the previous version
    # rebuilt this concatenation inside the loop for every relation,
    # making the grouping O(relations * rows).
    qualified = relations_frame['schema'] + '.' + relations_frame['relation']
    unique_relations = qualified.unique().tolist()
    logger.debug(
        f'Done collecting relations. Found a total of {len(unique_relations)} unique relations in database {database}'
    )
    relations = list()
    for relation_key in unique_relations:
        logger.debug(f'Building relation { database + "." + relation_key }...')
        attributes = list()
        last_row = None
        for last_row in relations_frame.loc[qualified == relation_key].itertuples():
            logger.debug(
                f'adding attribute {last_row.attribute} to relation..')
            attributes.append(
                Attribute(self._correct_case(last_row.attribute),
                          self._get_data_type(last_row.data_type)))
        # `last_row` is never None here: every key in unique_relations came
        # from at least one row of relations_frame, so the loop above ran.
        relation = Relation(
            self._correct_case(database),
            self._correct_case(last_row.schema),
            self._correct_case(last_row.relation),
            self.MATERIALIZATION_MAPPINGS[last_row.materialization],
            attributes)
        logger.debug(f'Added relation {relation.dot_notation} to pool.')
        relations.append(relation)
    logger.debug(
        f'Acquired {len(relations)} total relations from database {database}.'
    )
    return relations
def test_x00_replacement():
    """NUL (0x00) bytes in string data are stripped by default and swapped
    for the configured token when ``pg_0x00_replacement`` is supplied."""
    id_col = "id"
    content_col = "content"
    normal_val = "normal_value"
    weird_value = "weird\x00value"
    custom_replacement = "__CUSTOM_VALUE__"
    cols = [
        Attribute(id_col, data_types.BIGINT),
        Attribute(content_col, data_types.VARCHAR)
    ]

    def _fresh_relation():
        # each scenario gets its own relation: row 1 is clean, row 2 dirty
        rel = Relation("db", "schema", "relation", TABLE, cols)
        rel.data = DataFrame({
            id_col: [1, 2],
            content_col: [normal_val, weird_value]
        })
        return rel

    def _content_of(relation, row_id):
        return relation.data.loc[relation.data[id_col] == row_id,
                                 [content_col]]

    scenarios = (
        # (adapter under test, expected cleaned value for the dirty row)
        (PostgresAdapter(replica_metadata={}), "weirdvalue"),
        (PostgresAdapter(replica_metadata={},
                         pg_0x00_replacement=custom_replacement),
         f"weird{custom_replacement}value"),
    )
    for adapter, expected_weird in scenarios:
        fixed_relation = adapter.replace_x00_values(_fresh_relation())
        # the clean row must pass through untouched
        assert all(_content_of(fixed_relation, 1) == normal_val)
        # the dirty row must have its NUL byte handled
        assert all(_content_of(fixed_relation, 2) == expected_weird)
def test_run_deps_bidirectional_exclude_outliers(stub_relation_set):
    """A bidirectional edge filters the downstream relation to the upstream
    id values and samples the upstream relation restricted to the ids that
    survive downstream."""
    up_rel = stub_relation_set.upstream_relation
    down_rel = stub_relation_set.downstream_relation
    for rel in (down_rel, up_rel):
        rel.attributes = [Attribute('id', dt.INTEGER)]
        rel = stub_out_sampling(rel)
    up_rel.data = pd.DataFrame({'id': [1, 2, 3]})

    dag = nx.DiGraph()
    dag.add_edge(up_rel,
                 down_rel,
                 direction="bidirectional",
                 remote_attribute='id',
                 local_attribute='id')
    adapter = SnowflakeAdapter()
    # upstream must be compiled before downstream, as in the runtime order
    for rel in (up_rel, down_rel):
        RuntimeSourceCompiler.compile_queries_for_relation(
            rel, dag, adapter, False)

    assert query_equalize(down_rel.compiled_query) == query_equalize(f"""
        SELECT * FROM {down_rel.quoted_dot_notation} WHERE id IN (1,2,3)
    """)
    assert query_equalize(up_rel.compiled_query) == query_equalize(f"""
        WITH {up_rel.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS (
            SELECT * FROM {up_rel.quoted_dot_notation}
            WHERE id in (SELECT id FROM {down_rel.quoted_dot_notation})
        )
        ,{up_rel.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS (
            SELECT * FROM {up_rel.scoped_cte('SNOWSHU_FINAL_SAMPLE')}
            SAMPLE BERNOULLI (1500 ROWS)
        )
        SELECT * FROM {up_rel.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
def _get_relations_from_database(
        self, schema_obj: BaseTargetAdapter._DatabaseObject) -> List[Relation]:
    """Build Relation objects for every table/view in one target schema.

    Queries information_schema for the single schema named by *schema_obj*
    (excluding system schemas and external tables), then groups the rows by
    ``schema.table`` into fully-attributed :class:`Relation` instances.

    Args:
        schema_obj: database object wrapper carrying the quoted database
            name, the case-corrected database name, and the case-sensitive
            schema name used in the WHERE clause.

    Returns:
        A list of Relation objects, one per unique ``schema.table`` pair.
    """
    quoted_database = schema_obj.full_relation.quoted(
        schema_obj.full_relation.database)  # quoted db name
    relation_database = schema_obj.full_relation.database  # case corrected db name
    case_sensitive_schema = schema_obj.case_sensitive_name  # case sensitive schema name
    relations_sql = f"""
        SELECT
            m.table_schema AS schema,
            m.table_name AS relation,
            m.table_type AS materialization,
            c.column_name AS attribute,
            c.ordinal_position AS ordinal,
            c.data_type AS data_type
        FROM
            {quoted_database}.information_schema.tables m
        INNER JOIN
            {quoted_database}.information_schema.columns c
        ON
            c.table_schema = m.table_schema
        AND
            c.table_name = m.table_name
        WHERE
            m.table_schema = '{case_sensitive_schema}'
        AND
            m.table_schema NOT IN ('information_schema', 'pg_catalog')
        AND
            m.table_type <> 'external'
    """
    logger.debug(
        f'Collecting detailed relations from database {quoted_database}...'
    )
    relations_frame = self._safe_query(relations_sql, quoted_database)
    # Compute the schema-qualified key series once; previously this
    # concatenation was rebuilt inside the loop for every relation,
    # making the grouping O(relations * rows).
    qualified = relations_frame['schema'] + '.' + relations_frame['relation']
    unique_relations = qualified.unique().tolist()
    logger.debug(
        f'Done collecting relations. Found a total of {len(unique_relations)} '
        f'unique relations in database {quoted_database}')
    relations = list()
    for relation_key in unique_relations:
        logger.debug(
            f'Building relation { quoted_database + "." + relation_key }...')
        attributes = list()
        last_row = None
        for last_row in relations_frame.loc[qualified == relation_key].itertuples():
            logger.debug(
                f'adding attribute {last_row.attribute} to relation..')
            attributes.append(
                Attribute(self._correct_case(last_row.attribute),
                          self._get_data_type(last_row.data_type)))
        # `last_row` is never None here: every key in unique_relations came
        # from at least one row of relations_frame, so the loop above ran.
        # (Replaces the previous undefined-loop-variable lint suppressions.)
        relation = Relation(
            relation_database,
            self._correct_case(last_row.schema),
            self._correct_case(last_row.relation),
            # table_type values such as "BASE TABLE" are keyed with
            # underscores in MATERIALIZATION_MAPPINGS
            self.MATERIALIZATION_MAPPINGS[
                last_row.materialization.replace(" ", "_")],
            attributes)
        logger.debug(f'Added relation {relation.dot_notation} to pool.')
        relations.append(relation)
    logger.debug(
        f'Acquired {len(relations)} total relations from database {quoted_database}.'
    )
    return relations
def test_run_deps_mixed_multi_deps():
    r"""Compile queries for a DAG mixing directional and bidirectional edges.

    Graph under test::

        a --bidir--> c <--dir-- b
         \          / \
          \     bidir  dir
           \       |    |
            \      V    V
             dir-> d    e
    """
    relation_helper = RelationTestHelper()

    # `a` feeds a bidirectional edge to c (col_a_c) and a directional
    # edge to d (col_a_d).
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [
        Attribute('col_a_c', dt.INTEGER),
        Attribute('col_a_d', dt.VARCHAR)
    ]
    relation_a.data = pd.DataFrame({
        "col_a_c": [
            1,
            2,
            3,
            4,
            5,
        ],
        "col_a_d": ["var_a_1", "var_a_2", "var_a_3", "var_a_1", "var_a_2"],
    })

    # `b` feeds a directional edge to c (col_b_c).
    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b_c', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b_c": [
            "val1",
            "val2",
            "val3",
            "val4",
            "val5",
        ],
    })

    # `c` is the hub: receives from a and b, feeds d (bidir) and e (dir).
    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_ae', dt.INTEGER),
        Attribute('col_c_bd', dt.VARCHAR)
    ]
    relation_c.data = pd.DataFrame({
        "col_c_ae": [
            1,
            1,
            2,
            2,
            5,
            5,
            5,
        ],
        "col_c_bd": [
            "val1",
            "val1",
            "val2",
            "val2",
            "val5",
            "val5",
            "val5",
        ]
    })

    # `d` and `e` are leaves: no .data set — their content comes only
    # from the compiled (sampled/filtered) queries.
    relation_d = Relation(name='rel_d',
                          **relation_helper.rand_relation_helper())
    relation_d.attributes = [
        Attribute('col_d_a', dt.INTEGER),
        Attribute('col_d_c', dt.INTEGER)
    ]

    relation_e = Relation(name='rel_e',
                          **relation_helper.rand_relation_helper())
    relation_e.attributes = [Attribute('col_e_c', dt.INTEGER)]

    # Replace real sampling on every relation; the rebinding of the loop
    # variable keeps the (possibly new) stubbed object, matching the
    # stub_out_sampling return convention used throughout these tests.
    for relation in (
            relation_a,
            relation_b,
            relation_c,
            relation_d,
            relation_e,
    ):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a_c",
                 local_attribute="col_c_ae")
    dag.add_edge(relation_a,
                 relation_d,
                 direction="directional",
                 remote_attribute="col_a_d",
                 local_attribute="col_d_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="directional",
                 remote_attribute="col_b_c",
                 local_attribute="col_c_bd")
    dag.add_edge(relation_c,
                 relation_d,
                 direction="bidirectional",
                 remote_attribute="col_c_bd",
                 local_attribute="col_d_c")
    dag.add_edge(relation_c,
                 relation_e,
                 direction="directional",
                 remote_attribute="col_c_ae",
                 local_attribute="col_e_c")

    adapter = SnowflakeAdapter()
    # Compile in topological-ish order: sources first, leaves last.
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_d, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_e, dag,
                                                       adapter, False)

    # a: bidirectional predicate on c, then sampled.
    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS (
            SELECT * FROM {relation_a.quoted_dot_notation}
            WHERE col_a_c in (SELECT col_c_ae FROM {relation_c.quoted_dot_notation})
        )
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS (
            SELECT * FROM {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')}
            SAMPLE BERNOULLI (1500 ROWS)
        )
        SELECT * FROM {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    # b: only directional outgoing edges, so it is just sampled.
    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        SELECT * FROM {relation_b.quoted_dot_notation}
        SAMPLE BERNOULLI (1500 ROWS)
    """)
    # c: bidirectional predicate on d plus IN-lists from a's and b's data.
    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
        SELECT * FROM {relation_c.quoted_dot_notation}
        WHERE col_c_bd in (SELECT col_d_c FROM {relation_d.quoted_dot_notation})
        AND col_c_ae IN (1,2,3,4,5)
        AND col_c_bd IN ('val1','val2','val3','val4','val5')
    """)
    # d: IN-lists from the distinct values of a.col_a_d and c.col_c_bd.
    assert query_equalize(relation_d.compiled_query) == query_equalize(f"""
        SELECT * FROM {relation_d.quoted_dot_notation}
        WHERE col_d_a IN ('var_a_1','var_a_2','var_a_3')
        AND col_d_c IN ('val1','val2','val5')
    """)
    # e: directional filter on c's distinct col_c_ae values, then sampled.
    assert query_equalize(relation_e.compiled_query) == query_equalize(f"""
        WITH {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS (
            SELECT * FROM {relation_e.quoted_dot_notation}
            WHERE col_e_c IN (1,2,5)
        )
        ,{relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS (
            SELECT * FROM {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')}
            SAMPLE BERNOULLI (1500 ROWS)
        )
        SELECT * FROM
        {relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
def test_run_deps_bidirectional_multi_deps():
    """Two bidirectional edges converging on one relation:

        a --bidir--> c <--bidir-- b
    """
    helper = RelationTestHelper()

    rel_a = Relation(name='rel_a', **helper.rand_relation_helper())
    rel_a.attributes = [Attribute('col_a', dt.INTEGER)]
    rel_a.data = pd.DataFrame({"col_a": [1, 2, 3, 4, 5]})

    rel_b = Relation(name='rel_b', **helper.rand_relation_helper())
    rel_b.attributes = [Attribute('col_b', dt.VARCHAR)]
    rel_b.data = pd.DataFrame(
        {"col_b": ["val1", "val2", "val3", "val4", "val5"]})

    rel_c = Relation(name='rel_c', **helper.rand_relation_helper())
    rel_c.attributes = [
        Attribute('col_c_a', dt.INTEGER),
        Attribute('col_c_b', dt.VARCHAR)
    ]

    for rel in (rel_a, rel_b, rel_c):
        rel = stub_out_sampling(rel)

    dag = nx.DiGraph()
    dag.add_edge(rel_a,
                 rel_c,
                 direction="bidirectional",
                 remote_attribute="col_a",
                 local_attribute="col_c_a")
    dag.add_edge(rel_b,
                 rel_c,
                 direction="bidirectional",
                 remote_attribute="col_b",
                 local_attribute="col_c_b")

    adapter = SnowflakeAdapter()
    for rel in (rel_a, rel_b, rel_c):
        RuntimeSourceCompiler.compile_queries_for_relation(
            rel, dag, adapter, False)

    # a and b get symmetric treatment: restrict to ids present in c,
    # then sample.
    for source, local_col, remote_col in (
            (rel_a, 'col_a', 'col_c_a'),
            (rel_b, 'col_b', 'col_c_b'),
    ):
        assert query_equalize(source.compiled_query) == query_equalize(f"""
            WITH {source.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS (
                SELECT * FROM {source.quoted_dot_notation}
                WHERE {local_col} in (SELECT {remote_col} FROM {rel_c.quoted_dot_notation})
            )
            ,{source.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS (
                SELECT * FROM {source.scoped_cte('SNOWSHU_FINAL_SAMPLE')}
                SAMPLE BERNOULLI (1500 ROWS)
            )
            SELECT * FROM {source.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
        """)

    # c is filtered by both upstream value sets.
    assert query_equalize(rel_c.compiled_query) == query_equalize(f"""
        SELECT * FROM {rel_c.quoted_dot_notation}
        WHERE col_c_a IN (1,2,3,4,5)
        AND col_c_b IN ('val1','val2','val3','val4','val5')
    """)