def test_run_deps_polymorphic_parentid(stub_relation_set):
    """A parent with polymorphic deps on three children ORs together each
    child's set of parent ids in the compiled WHERE clause."""
    parentid = stub_relation_set.parentid_key
    parent = stub_relation_set.parent_relation_parentid
    children = [
        stub_relation_set.child_relation_type_1,
        stub_relation_set.child_relation_type_2,
        stub_relation_set.child_relation_type_3,
    ]

    # Disable real sampling on every relation in the graph.
    for rel in children + [parent]:
        rel = stub_out_sampling(rel)

    # Seed each child with a distinct pair of parent ids.
    id_pairs = (("1", "10"), ("2", "20"), ("3", "30"))
    for child, pair in zip(children, id_pairs):
        child.data = pd.DataFrame([{parentid: value} for value in pair])

    dag = nx.DiGraph()
    for child in children:
        dag.add_edge(child,
                     parent,
                     direction="polymorphic",
                     remote_attribute=parentid,
                     local_attribute=parentid)

    adapter = SnowflakeAdapter()
    for idx, child in enumerate(children):
        children[idx] = RuntimeSourceCompiler.compile_queries_for_relation(
            child, dag, adapter, False)
    parent = RuntimeSourceCompiler.compile_queries_for_relation(
        parent, dag, adapter, False)

    expected_query = f""" SELECT * FROM {parent.quoted_dot_notation} WHERE ( {parentid} IN ('1','10') OR {parentid} IN ('2','20') OR {parentid} IN ('3','30') ) """
    assert query_equalize(
        parent.compiled_query) == query_equalize(expected_query)
def test_run_deps_bidirectional_exclude_outliers(stub_relation_set):
    """A bidirectional dep filters the downstream by the upstream's ids and
    restricts the upstream sample to ids present downstream."""
    upstream = stub_relation_set.upstream_relation
    downstream = stub_relation_set.downstream_relation

    for rel in (downstream, upstream):
        rel.attributes = [Attribute('id', dt.INTEGER)]
        rel = stub_out_sampling(rel)

    upstream.data = pd.DataFrame([{'id': n} for n in (1, 2, 3)])

    dag = nx.DiGraph()
    dag.add_edge(upstream,
                 downstream,
                 direction="bidirectional",
                 remote_attribute='id',
                 local_attribute='id')

    adapter = SnowflakeAdapter()
    for rel in (upstream, downstream):
        RuntimeSourceCompiler.compile_queries_for_relation(
            rel, dag, adapter, False)

    assert query_equalize(downstream.compiled_query) == query_equalize(f""" SELECT * FROM {downstream.quoted_dot_notation} WHERE id IN (1,2,3) """)
    assert query_equalize(upstream.compiled_query) == query_equalize(f""" WITH {upstream.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( SELECT * FROM {upstream.quoted_dot_notation} WHERE id in (SELECT id FROM {downstream.quoted_dot_notation}) ) ,{upstream.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( SELECT * FROM {upstream.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) ) SELECT * FROM {upstream.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} """)
def test_run_iso(stub_relation_set):
    """An isolated relation (node with no edges) compiles to a plain sampled
    SELECT.

    Fix: dropped the unused ``compiler`` instance and unused ``result`` local;
    calls ``compile_queries_for_relation`` statically and rebinds ``iso``,
    matching the convention used by the other run-mode tests in this file.
    """
    iso = stub_relation_set.iso_relation
    iso = stub_out_sampling(iso)

    dag = nx.DiGraph()
    dag.add_nodes_from([iso])

    adapter = SnowflakeAdapter()
    iso = RuntimeSourceCompiler.compile_queries_for_relation(
        iso, dag, adapter, False)

    assert query_equalize(iso.compiled_query) == query_equalize(f""" SELECT * FROM {iso.quoted_dot_notation} SAMPLE BERNOULLI (1500 ROWS) """)
def test_analyze_unsampled_mocked_adapter(stub_relation_set):
    """In analyze mode an unsampled relation must be compiled via the
    adapter's ``unsampled_statement``.

    Fix: renamed from ``test_analyze_unsampled`` — a later function in this
    module reuses that exact name, so this test was shadowed at import time
    and never collected by pytest. Also removed the unused ``result`` and
    ``compiler`` locals.
    """
    upstream = stub_relation_set.upstream_relation
    upstream.unsampled = True

    dag = nx.DiGraph()
    dag.add_edges_from([(
        upstream,
        stub_relation_set.downstream_relation,
    )])

    # Mock adapter lets us observe which statement builder gets invoked.
    adapter, unsampled_method = [mock.MagicMock() for _ in range(2)]
    adapter.unsampled_statement = unsampled_method

    RuntimeSourceCompiler().compile_queries_for_relation(
        upstream, dag, adapter, True)
    assert unsampled_method.called
def test_analyze_iso(stub_relation_set):
    """In analyze mode an isolated relation compiles to the population/sample
    counting CTE query.

    Fix: dropped the unused ``compiler`` instance and unused ``result`` local;
    calls ``compile_queries_for_relation`` statically and rebinds ``iso``,
    matching the convention used by the other tests in this file.
    """
    iso = stub_relation_set.iso_relation
    iso = stub_out_sampling(iso)

    dag = nx.DiGraph()
    dag.add_nodes_from([iso])

    adapter = SnowflakeAdapter()
    iso = RuntimeSourceCompiler.compile_queries_for_relation(
        iso, dag, adapter, True)

    assert query_equalize(iso.compiled_query) == query_equalize(f""" WITH {iso.scoped_cte('SNOWSHU_COUNT_POPULATION')} AS ( SELECT COUNT(*) AS population_size FROM {iso.quoted_dot_notation} ) ,{iso.scoped_cte('SNOWSHU_CORE_SAMPLE')} AS ( SELECT * FROM {iso.quoted_dot_notation} SAMPLE BERNOULLI (1500 ROWS) ) ,{iso.scoped_cte('SNOWSHU_CORE_SAMPLE')}_COUNT AS ( SELECT COUNT(*) AS sample_size FROM {iso.scoped_cte('SNOWSHU_CORE_SAMPLE')} ) SELECT s.sample_size AS sample_size ,p.population_size AS population_size FROM {iso.scoped_cte('SNOWSHU_CORE_SAMPLE')}_COUNT s INNER JOIN {iso.scoped_cte('SNOWSHU_COUNT_POPULATION')} p ON 1=1 LIMIT 1 """)
def test_run_unsampled(stub_relation_set):
    """In run mode an unsampled relation compiles to a bare SELECT *."""
    upstream = stub_relation_set.upstream_relation
    upstream.unsampled = True

    dag = nx.DiGraph()
    dag.add_edges_from([(
        upstream,
        stub_relation_set.downstream_relation,
    )])

    upstream = RuntimeSourceCompiler.compile_queries_for_relation(
        upstream, dag, SnowflakeAdapter(), False)

    expected = f""" SELECT * FROM {upstream.quoted_dot_notation} """
    assert query_equalize(upstream.compiled_query) == query_equalize(expected)
def test_analyze_unsampled(stub_relation_set):
    """In analyze mode an unsampled relation still gets the counting CTEs,
    but its core sample is the full (unsampled) SELECT."""
    upstream = stub_relation_set.upstream_relation
    upstream.unsampled = True

    dag = nx.DiGraph()
    dag.add_edges_from([(
        upstream,
        stub_relation_set.downstream_relation,
    )])

    upstream = RuntimeSourceCompiler.compile_queries_for_relation(
        upstream, dag, SnowflakeAdapter(), True)

    expected = f""" WITH {upstream.scoped_cte('SNOWSHU_COUNT_POPULATION')} AS ( SELECT COUNT(*) AS population_size FROM {upstream.quoted_dot_notation} ) ,{upstream.scoped_cte('SNOWSHU_CORE_SAMPLE')} AS ( SELECT * FROM {upstream.quoted_dot_notation} ) ,{upstream.scoped_cte('SNOWSHU_CORE_SAMPLE_COUNT')} AS ( SELECT COUNT(*) AS sample_size FROM {upstream.scoped_cte('SNOWSHU_CORE_SAMPLE')} ) SELECT s.sample_size AS sample_size ,p.population_size AS population_size FROM {upstream.scoped_cte('SNOWSHU_CORE_SAMPLE_COUNT')} s INNER JOIN {upstream.scoped_cte('SNOWSHU_COUNT_POPULATION')} p ON 1=1 LIMIT 1 """
    assert query_equalize(upstream.compiled_query) == query_equalize(expected)
def test_run_deps_mixed_multi_deps():
    r"""Compile a graph mixing bidirectional and directional dependencies.

    Dependency topology (edges in the DAG built below):

        a --bidirectional--> c   (col_a_c  -> col_c_ae)
        a --directional----> d   (col_a_d  -> col_d_a)
        b --directional----> c   (col_b_c  -> col_c_bd)
        c --bidirectional--> d   (col_c_bd -> col_d_c)
        c --directional----> e   (col_c_ae -> col_e_c)

    Verifies that each relation's compiled query combines the IN-list and
    subquery predicates from all of its incoming/outgoing edges.
    """
    relation_helper = RelationTestHelper()

    # relation a: 5 rows, integer key toward c and varchar key toward d.
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [
        Attribute('col_a_c', dt.INTEGER),
        Attribute('col_a_d', dt.VARCHAR)
    ]
    relation_a.data = pd.DataFrame({
        "col_a_c": [
            1,
            2,
            3,
            4,
            5,
        ],
        "col_a_d": ["var_a_1", "var_a_2", "var_a_3", "var_a_1", "var_a_2"],
    })

    # relation b: 5 varchar values feeding c directionally.
    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b_c', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b_c": [
            "val1",
            "val2",
            "val3",
            "val4",
            "val5",
        ],
    })

    # relation c: 7 rows with duplicated keys; distinct values are
    # col_c_ae in {1,2,5} and col_c_bd in {'val1','val2','val5'}.
    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_ae', dt.INTEGER),
        Attribute('col_c_bd', dt.VARCHAR)
    ]
    relation_c.data = pd.DataFrame({
        "col_c_ae": [
            1,
            1,
            2,
            2,
            5,
            5,
            5,
        ],
        "col_c_bd": [
            "val1",
            "val1",
            "val2",
            "val2",
            "val5",
            "val5",
            "val5",
        ]
    })

    # relations d and e carry no data of their own; their predicates come
    # entirely from upstream relations.
    relation_d = Relation(name='rel_d',
                          **relation_helper.rand_relation_helper())
    relation_d.attributes = [
        Attribute('col_d_a', dt.INTEGER),
        Attribute('col_d_c', dt.INTEGER)
    ]
    relation_e = Relation(name='rel_e',
                          **relation_helper.rand_relation_helper())
    relation_e.attributes = [Attribute('col_e_c', dt.INTEGER)]

    # Disable real sampling on every relation.
    for relation in (
            relation_a,
            relation_b,
            relation_c,
            relation_d,
            relation_e,
    ):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a_c",
                 local_attribute="col_c_ae")
    dag.add_edge(relation_a,
                 relation_d,
                 direction="directional",
                 remote_attribute="col_a_d",
                 local_attribute="col_d_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="directional",
                 remote_attribute="col_b_c",
                 local_attribute="col_c_bd")
    dag.add_edge(relation_c,
                 relation_d,
                 direction="bidirectional",
                 remote_attribute="col_c_bd",
                 local_attribute="col_d_c")
    dag.add_edge(relation_c,
                 relation_e,
                 direction="directional",
                 remote_attribute="col_c_ae",
                 local_attribute="col_e_c")

    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_d, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_e, dag,
                                                       adapter, False)

    # a is bidirectional into c: filtered by c's key subquery, then sampled.
    assert query_equalize(relation_a.compiled_query) == query_equalize(f""" WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( SELECT * FROM {relation_a.quoted_dot_notation} WHERE col_a_c in (SELECT col_c_ae FROM {relation_c.quoted_dot_notation}) ) ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( SELECT * FROM {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) ) SELECT * FROM {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} """)
    # b has only outgoing directional edges: plain sampled select.
    assert query_equalize(relation_b.compiled_query) == query_equalize(f""" SELECT * FROM {relation_b.quoted_dot_notation} SAMPLE BERNOULLI (1500 ROWS) """)
    # c combines the bidirectional subquery on d with IN-lists from a and b.
    assert query_equalize(relation_c.compiled_query) == query_equalize(f""" SELECT * FROM {relation_c.quoted_dot_notation} WHERE col_c_bd in (SELECT col_d_c FROM {relation_d.quoted_dot_notation}) AND col_c_ae IN (1,2,3,4,5) AND col_c_bd IN ('val1','val2','val3','val4','val5') """)
    # d is filtered by the distinct values flowing in from a and c.
    assert query_equalize(relation_d.compiled_query) == query_equalize(f""" SELECT * FROM {relation_d.quoted_dot_notation} WHERE col_d_a IN ('var_a_1','var_a_2','var_a_3') AND col_d_c IN ('val1','val2','val5') """)
    # e gets c's distinct key values, then the directional sample wrapper.
    assert query_equalize(relation_e.compiled_query) == query_equalize(f""" WITH {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( SELECT * FROM {relation_e.quoted_dot_notation} WHERE col_e_c IN (1,2,5) ) ,{relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( SELECT * FROM {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) ) SELECT * FROM {relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} """)
def test_run_deps_bidirectional_multi_deps():
    """Two bidirectional dependencies converge on one relation:

        a --bidir--> c <--bidir-- b

    c must be filtered by both upstream IN-lists; a and b are each filtered
    by a subquery against c's corresponding column before sampling.
    """
    relation_helper = RelationTestHelper()

    # relation a: integer key column with 5 values.
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [Attribute('col_a', dt.INTEGER)]
    relation_a.data = pd.DataFrame({"col_a": [
        1,
        2,
        3,
        4,
        5,
    ]})

    # relation b: varchar key column with 5 values.
    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b": [
            "val1",
            "val2",
            "val3",
            "val4",
            "val5",
        ],
    })

    # relation c: holds the local columns matched by a and b; no data needed.
    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_a', dt.INTEGER),
        Attribute('col_c_b', dt.VARCHAR)
    ]

    # Disable real sampling on every relation.
    for relation in (
            relation_a,
            relation_b,
            relation_c,
    ):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a",
                 local_attribute="col_c_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_b",
                 local_attribute="col_c_b")

    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag,
                                                       adapter, False)

    # a: restricted to ids that exist in c, then sampled.
    assert query_equalize(relation_a.compiled_query) == query_equalize(f""" WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( SELECT * FROM {relation_a.quoted_dot_notation} WHERE col_a in (SELECT col_c_a FROM {relation_c.quoted_dot_notation}) ) ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( SELECT * FROM {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) ) SELECT * FROM {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} """)
    # b: same shape as a but against col_c_b.
    assert query_equalize(relation_b.compiled_query) == query_equalize(f""" WITH {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( SELECT * FROM {relation_b.quoted_dot_notation} WHERE col_b in (SELECT col_c_b FROM {relation_c.quoted_dot_notation}) ) ,{relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( SELECT * FROM {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) ) SELECT * FROM {relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} """)
    # c: both upstream IN-lists ANDed together.
    assert query_equalize(relation_c.compiled_query) == query_equalize(f""" SELECT * FROM {relation_c.quoted_dot_notation} WHERE col_c_a IN (1,2,3,4,5) AND col_c_b IN ('val1','val2','val3','val4','val5') """)
def test_run_deps_polymorphic_idtype(stub_relation_set):
    """Polymorphic deps with a type discriminator column.

    Each child edge carries ``local_type_attribute`` (the discriminator
    column on the parent) and ``local_type_overrides`` mapping a child's
    dot notation to a custom type value. The parent's compiled query must
    pair each child's id IN-list with a case-insensitive match on that
    child's type value — using the override for child2 and the default
    derived type names for child1/child3.
    """
    child1 = stub_relation_set.child_relation_type_1
    child2 = stub_relation_set.child_relation_type_2
    child3 = stub_relation_set.child_relation_type_3
    parent = stub_relation_set.parent_relation_childid_type
    childid = stub_relation_set.childid_key
    childtype = stub_relation_set.childtype_key
    child2type_override = stub_relation_set.child2override_key
    # Only child2's type value is overridden; others use their defaults.
    local_overrides = {child2.dot_notation: child2type_override}

    # Disable real sampling on every relation.
    for relation in (
            child1,
            child2,
            child3,
            parent,
    ):
        relation = stub_out_sampling(relation)

    child1.data = pd.DataFrame([{childid: "1"}, {childid: "2"}])
    child2.data = pd.DataFrame([{childid: "1"}, {childid: "3"}])
    child3.data = pd.DataFrame([{childid: "1"}, {childid: "4"}])

    dag = nx.DiGraph()
    dag.add_edge(child1,
                 parent,
                 direction="polymorphic",
                 remote_attribute=childid,
                 local_attribute=childid,
                 local_type_attribute=childtype,
                 local_type_overrides=local_overrides)
    dag.add_edge(child2,
                 parent,
                 direction="polymorphic",
                 remote_attribute=childid,
                 local_attribute=childid,
                 local_type_attribute=childtype,
                 local_type_overrides=local_overrides)
    dag.add_edge(child3,
                 parent,
                 direction="polymorphic",
                 remote_attribute=childid,
                 local_attribute=childid,
                 local_type_attribute=childtype,
                 local_type_overrides=local_overrides)

    adapter = SnowflakeAdapter()
    child1 = RuntimeSourceCompiler.compile_queries_for_relation(
        child1, dag, adapter, False)
    child2 = RuntimeSourceCompiler.compile_queries_for_relation(
        child2, dag, adapter, False)
    child3 = RuntimeSourceCompiler.compile_queries_for_relation(
        child3, dag, adapter, False)
    parent = RuntimeSourceCompiler.compile_queries_for_relation(
        parent, dag, adapter, False)

    # One (id-list AND type-match) group per child, ORed together.
    expected_query = f""" SELECT * FROM {parent.quoted_dot_notation} WHERE ( ({childid} IN ('1','2') AND LOWER({childtype}) = LOWER('child_type_1_record') ) OR ({childid} IN ('1','3') AND LOWER({childtype}) = LOWER('{child2type_override}') ) OR ({childid} IN ('1','4') AND LOWER({childtype}) = LOWER('child_type_3_record') ) ) """
    assert query_equalize(
        parent.compiled_query) == query_equalize(expected_query)
def _traverse_and_execute(
        self, executable: GraphExecutable
) -> None:  # noqa mccabe: disable=MC0001
    """Processes a single graph and loads the data into the replica if required.

    Relations are visited in topological order. In analyze mode only row
    counts are collected; otherwise each relation is extracted from the
    source and loaded into the target. To save memory after processing, the
    loaded dataframes are deleted and garbage collection manually called.

    Args:
        executable (GraphExecutable): object that contains all of the
            necessary info for executing a sample and loading it into
            the target.
    """
    start_time = time.time()
    # Optional debug dump of the graph component before execution.
    if self.barf:
        with open(
                os.path.join(
                    self.barf_output,
                    f'{[n for n in executable.graph.nodes][0].dot_notation}.component'
                ),
                'wb') as cmp_file:  # noqa pylint: disable=unnecessary-comprehension
            nx.write_multiline_adjlist(executable.graph, cmp_file)
    try:
        logger.debug(
            f"Executing graph with {len(executable.graph)} relations in it..."
        )
        # Topological order guarantees dependencies compile before dependents.
        for i, relation in enumerate(
                nx.algorithms.dag.topological_sort(executable.graph)):
            relation.population_size = executable.source_adapter.scalar_query(
                executable.source_adapter.population_count_statement(
                    relation))
            logger.info(
                f'Executing source query for relation {relation.dot_notation} '
                f'({i+1} of {len(executable.graph)} in graph)...')
            relation.sampling.prepare(relation, executable.source_adapter)
            relation = RuntimeSourceCompiler.compile_queries_for_relation(
                relation, executable.graph, executable.source_adapter,
                executable.analyze)
            if executable.analyze:
                if relation.is_view:
                    # Views are not analyzed; counts are reported as N/A.
                    relation.population_size = "N/A"
                    relation.sample_size = "N/A"
                    logger.info(
                        f'Relation {relation.dot_notation} is a view, skipping.'
                    )
                else:
                    result = executable.source_adapter.check_count_and_query(
                        relation.compiled_query, MAX_ALLOWED_ROWS,
                        relation.unsampled).iloc[0]
                    relation.population_size = result.population_size
                    relation.sample_size = result.sample_size
                    logger.info(
                        f'Analysis of relation {relation.dot_notation} completed in {duration(start_time)}.'
                    )
            else:
                # Ensure target namespace exists before loading.
                executable.target_adapter.create_database_if_not_exists(
                    relation.quoted(relation.database))
                executable.target_adapter.create_schema_if_not_exists(
                    relation.quoted(relation.database),
                    relation.quoted(relation.schema))
                if relation.is_view:
                    # Views are replicated by DDL, not by copying rows.
                    logger.info(
                        f'Retrieving DDL statement for view {relation.dot_notation} in source...'
                    )
                    relation.population_size = "N/A"
                    relation.sample_size = "N/A"
                    try:
                        relation.view_ddl = executable.source_adapter.scalar_query(
                            relation.compiled_query)
                    except Exception:
                        raise SystemError(
                            f'Failed to extract DDL statement: {relation.compiled_query}'
                        )
                    logger.info(
                        f'Successfully extracted DDL statement for view {relation.quoted_dot_notation}'
                    )
                else:
                    logger.info(
                        f'Retrieving records from source {relation.dot_notation}...'
                    )
                    try:
                        relation.data = executable.source_adapter.check_count_and_query(
                            relation.compiled_query, MAX_ALLOWED_ROWS,
                            relation.unsampled)
                    except Exception as exc:
                        raise SystemError(
                            f'Failed execution of extraction sql statement: {relation.compiled_query} {exc}'
                        )
                    relation.sample_size = len(relation.data)
                    logger.info(
                        f'{relation.sample_size} records retrieved for relation {relation.dot_notation}.'
                    )
                    logger.info(
                        f'Inserting relation {relation.quoted_dot_notation} into target...'
                    )
                    try:
                        executable.target_adapter.create_and_load_relation(
                            relation)
                    except Exception as exc:
                        raise SystemError(
                            f'Failed to load relation {relation.quoted_dot_notation} into target: {exc}'
                        )
                    logger.info(
                        f'Done replication of relation {relation.dot_notation} in {duration(start_time)}.'
                    )
                    relation.target_loaded = True
            relation.source_extracted = True
            logger.info(
                f'population:{relation.population_size}, sample:{relation.sample_size}'
            )
            # Optional debug dump of the compiled SQL per relation.
            if self.barf:
                with open(
                        os.path.join(self.barf_output,
                                     f'{relation.dot_notation}.sql'),
                        'w') as barf_file:
                    barf_file.write(relation.compiled_query)
        # Free the loaded dataframes; AttributeError means no data was set.
        try:
            for relation in executable.graph.nodes:
                del relation.data
        except AttributeError:
            pass
        gc.collect()
    except Exception as exc:
        logger.error(f'failed with error of type {type(exc)}: {str(exc)}')
        raise exc
def _traverse_and_execute(self, executable: GraphExecutable,
                          start_time: int) -> None:
    """Process a single graph: analyze or extract-and-load each relation.

    Relations are visited in topological order so dependencies are compiled
    first. In analyze mode only population/sample counts are collected;
    otherwise rows (or view DDL) are pulled from the source and loaded into
    the target. Loaded dataframes are deleted afterwards and garbage
    collection is triggered to reclaim memory.

    Args:
        executable (GraphExecutable): bundles the graph, the source and
            target adapters, and the analyze flag.
        start_time (int): epoch timestamp used for duration logging.
    """
    try:
        logger.debug(
            f"Executing graph with {len(executable.graph)} relations in it..."
        )
        # Topological order guarantees dependencies compile before dependents.
        for i, relation in enumerate(
                nx.algorithms.dag.topological_sort(executable.graph)):
            relation.population_size = executable.source_adapter.scalar_query(
                executable.source_adapter.population_count_statement(
                    relation))
            logger.info(
                f'Executing graph {i+1} of {len(executable.graph)} source query for relation {relation.dot_notation}...'
            )
            relation.sampling.prepare(relation, executable.source_adapter)
            relation = RuntimeSourceCompiler.compile_queries_for_relation(
                relation, executable.graph, executable.source_adapter,
                executable.analyze)
            if executable.analyze:
                if relation.is_view:
                    # Views are not analyzed; counts are reported as N/A.
                    relation.population_size = "N/A"
                    relation.sample_size = "N/A"
                    logger.info(
                        f'Relation {relation.dot_notation} is a view, skipping.'
                    )
                else:
                    # Single result row carrying population and sample sizes.
                    result = [
                        row for row in
                        executable.source_adapter.check_count_and_query(
                            relation.compiled_query,
                            MAX_ALLOWED_ROWS).itertuples()
                    ][0]
                    relation.population_size = result.population_size
                    relation.sample_size = result.sample_size
                    logger.info(
                        f'Analysis of relation {relation.dot_notation} completed in {duration(start_time)}.'
                    )
            else:
                # Ensure target namespace exists before loading.
                executable.target_adapter.create_database_if_not_exists(
                    relation.quoted(relation.database))
                executable.target_adapter.create_schema_if_not_exists(
                    relation.quoted(relation.database),
                    relation.quoted(relation.schema))
                if relation.is_view:
                    # Views are replicated by DDL, not by copying rows.
                    logger.info(
                        f'Retrieving DDL statement for view {relation.dot_notation} in source...'
                    )
                    relation.population_size = "N/A"
                    relation.sample_size = "N/A"
                    try:
                        relation.view_ddl = executable.source_adapter.scalar_query(
                            relation.compiled_query)
                    except Exception:
                        raise SystemError(
                            f'Failed to extract DDL statement: {relation.compiled_query}'
                        )
                    logger.info(
                        f'Successfully extracted DDL statement for view {relation.quoted_dot_notation}'
                    )
                else:
                    logger.info(
                        f'Retrieving records from source {relation.dot_notation}...'
                    )
                    try:
                        relation.data = executable.source_adapter.check_count_and_query(
                            relation.compiled_query, MAX_ALLOWED_ROWS)
                    except Exception as e:
                        raise SystemError(
                            f'Failed execution of extraction sql statement: {relation.compiled_query} {e}'
                        )
                    relation.sample_size = len(relation.data)
                    logger.info(
                        f'{relation.sample_size} records retrieved for relation {relation.dot_notation}.'
                    )
                    logger.info(
                        f'Inserting relation {relation.quoted_dot_notation} into target...'
                    )
                    try:
                        executable.target_adapter.create_and_load_relation(
                            relation)
                    except Exception as e:
                        raise SystemError(
                            f'Failed to load relation {relation.quoted_dot_notation} into target: {e}'
                        )
                    logger.info(
                        f'Done replication of relation {relation.dot_notation} in {duration(start_time)}.'
                    )
                    relation.target_loaded = True
            relation.source_extracted = True
            logger.info(
                f'population:{relation.population_size}, sample:{relation.sample_size}'
            )
            # Optional debug dump of the compiled SQL per relation.
            if self.barf:
                with open(
                        os.path.join(self.barf_output,
                                     f'{relation.dot_notation}.sql'),
                        'w') as f:
                    f.write(relation.compiled_query)
        # Free the loaded dataframes; AttributeError means no data was set.
        try:
            for relation in executable.graph.nodes:
                del relation.data
        except AttributeError:
            pass
        gc.collect()
    except Exception as e:
        logger.error(f'failed with error of type {type(e)}: {str(e)}')
        raise e