def test_sets_only_existing_adapters():
    """Unknown sampling names fail to parse; a known one overrides the node's sampling."""
    graph = SnowShuGraph()
    relation = Relation(
        database='SNOWSHU_DEVELOPMENT',
        schema='SOURCE_SYSTEM',
        name='ORDER_ITEMS',
        materialization=mz.TABLE,
        attributes=[],
    )
    relation.include_outliers = False
    relation.unsampled = False
    relation.sampling = DefaultSampling()

    raw_config = copy.deepcopy(CONFIGURATION)
    raw_config['preserve_case'] = True
    second_spec = raw_config['source']['specified_relations'][1]

    # A sampling name the parser does not know must be rejected while parsing.
    second_spec['sampling'] = 'lucky_guess'
    with pytest.raises(AttributeError):
        ConfigurationParser().from_file_or_path(
            StringIO(yaml.dump(raw_config)))

    # The failed parse must leave the relation's sampling untouched.
    assert isinstance(relation.sampling, DefaultSampling)

    second_spec['sampling'] = 'brute_force'
    parsed = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))
    overridden = graph._set_overriding_params_for_node(relation, parsed)
    assert isinstance(overridden.sampling, BruteForceSampling)
def test_schema_verification(tmpdir, stub_creds, stub_configs):
    """Check that well-formed replica and credentials files both load as dicts."""
    replica_path = Path(tmpdir / 'replica_file.yml')
    creds_path = Path(tmpdir / 'credentials_file.yml')

    creds = stub_creds()
    configs = stub_configs()
    # Point the replica config at the credentials file written below.
    configs["credpath"] = str(creds_path.absolute())

    creds_path.write_text(json.dumps(creds))
    replica_path.write_text(json.dumps(configs))

    loaded_replica = ConfigurationParser()._get_dict_from_anything(
        replica_path, REPLICA_JSON_SCHEMA)
    loaded_creds = ConfigurationParser()._get_dict_from_anything(
        creds_path, CREDENTIALS_JSON_SCHEMA)

    assert isinstance(loaded_replica, dict)
    assert isinstance(loaded_creds, dict)
def test_unsampled(stub_graph_set):
    """A relation specified with ``unsampled=True`` is flagged once specifications are applied."""
    shgraph = SnowShuGraph()
    _, vals = stub_graph_set
    full_catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    config_dict = copy.deepcopy(CONFIGURATION)
    config_dict['source']['specified_relations'] = [
        dict(relation=vals.iso_relation.name,
             database=vals.iso_relation.database,
             schema=vals.iso_relation.schema,
             unsampled=True)
    ]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    # Precondition: the flag starts out unset.
    assert vals.iso_relation.unsampled == False  # noqa: E712

    # Apply the specifications exactly once. The returned graph is not
    # inspected; the test only checks the flag on the catalog relation.
    shgraph._apply_specifications(config, nx.DiGraph(), full_catalog)

    assert vals.iso_relation.unsampled == True  # noqa: E712
def test_build_graph_fails_many_to_many(stub_graph_set):
    """build_graph must raise when a directional spec pairs many relations with many."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation$",  # non birelations
        "remote_attribute": vals.directional_key,
    }
    downstream_spec = {
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation_.*$",  # birelations
        "relationships": {"directional": [upstream_spec]},
    }
    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    raw_config["source"]["specified_relations"] = [downstream_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        # The mocked adapter hands back the stub catalog.
        adapter_mock.build_catalog.return_value = catalog
        config.source_profile.adapter = adapter_mock
        with pytest.raises(InvalidRelationshipException) as exc:
            graph.build_graph(config)

    assert "defines a many-to-many relationship" in str(exc.value)
    assert "Many-to-many relationship are not allowed by SnowShu" in str(exc.value)
def test_graph_allows_upstream_wildcards(stub_graph_set):
    """An empty database/schema in a bidirectional spec still yields the upstream -> downstream edge."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    upstream = vals.upstream_relation

    # Put both relations in the same database and schema before matching.
    upstream.database = downstream.database
    upstream.schema = downstream.schema

    catalog = [
        vals.iso_relation,
        vals.view_relation,
        downstream,
        upstream,
        vals.birelation_left,
        vals.birelation_right,
    ]

    bidirectional_spec = dict(
        relation=upstream.name,
        database='',
        schema='',
        local_attribute=downstream.attributes[0].name,
        remote_attribute=upstream.attributes[0].name,
    )
    relation_spec = dict(
        relation=downstream.name,
        database=downstream.database,
        schema=downstream.schema,
        unsampled=False,
        relationships=dict(directional=[], bidirectional=[bidirectional_spec]),
    )
    raw_config = copy.deepcopy(CONFIGURATION)
    raw_config['source']['specified_relations'] = [relation_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    result_graph = graph._apply_specifications(config, nx.DiGraph(), catalog)

    assert (upstream, downstream) in result_graph.edges
def load_config(self, config: Union[Path, str, TextIO]):
    """Parse ``config`` and store the result on ``self.config``.

    Args:
        config: A path, string or text stream handed straight to
            ``ConfigurationParser.from_file_or_path``.

    Logs a message before parsing and the elapsed time afterwards.
    """
    logger.info('Loading configuration...')
    started_at = time.time()
    parser = ConfigurationParser()
    self.config = parser.from_file_or_path(config)
    logger.info('Configuration loaded in %s.', duration(started_at))
def test_casing_polymorphic_overrides(stub_configs):
    """The parsed polymorphic ``local_type_overrides`` are keyed by the lower-cased dotted relation name."""
    config_stream = StringIO(yaml.dump(stub_configs()))
    parsed = ConfigurationParser().from_file_or_path(config_stream)

    parent_specs = [
        spec for spec in parsed.specified_relations
        if spec.relation_pattern == 'parent_table'
    ]
    first_polymorphic = parent_specs[0].relationships.polymorphic[0]
    overrides = first_polymorphic.local_type_overrides
    expected_key = 'snowshu_development.polymorphic_data.child_type_2_items'

    assert overrides
    assert expected_key in overrides
    assert overrides[expected_key] == 'type_2'
def test_errors_on_bad_profile(stub_configs):
    """Random source and storage profile names make parsing raise ValueError."""
    configs = stub_configs()
    # Three random names are drawn, as before; the middle one is never used.
    source_profile, _target_profile, storage_profile = (
        rand_string(10) for _ in range(3))
    configs['source']['profile'] = source_profile
    configs['storage']['profile'] = storage_profile

    with pytest.raises(ValueError):
        config_stream = StringIO(yaml.dump(configs))
        ConfigurationParser().from_file_or_path(config_stream)
def test_schema_verification_errors(stub_creds, stub_configs):
    """Schema-invalid credentials and a missing credentials file each raise the expected error."""
    creds = stub_creds()
    configs = stub_configs()

    # Introduce a type error into the credentials: password must be a string.
    creds['sources'][0]['password'] = True
    with tempfile.NamedTemporaryFile(mode='w') as creds_file:
        json.dump(creds, creds_file)
        creds_file.seek(0)
        configs['credpath'] = creds_file.name
        with pytest.raises(ValidationError) as exc:
            ConfigurationParser()._build_adapter_profile('source', configs)
    assert "True is not of type 'string'" in str(exc.value)

    # The temporary file is closed by now, so ``credpath`` points at a
    # credentials file that no longer exists.
    with pytest.raises(FileNotFoundError) as fnf_err:
        config_stream = StringIO(yaml.dump(configs))
        ConfigurationParser().from_file_or_path(config_stream)
    assert "Credentials specified in replica.yml not found" in fnf_err.value.strerror
def test_graph_difference_more_both_isolated_non_isolated_relations_source(
        stub_graph_set, stub_relation_set):
    """catalog_difference keeps the source nodes, isolated or not, that the target lacks."""
    _, vals = stub_graph_set

    def random_relation():
        # A new relation with a random name and random helper attributes.
        return Relation(name=rand_string(10),
                        **stub_relation_set.rand_relation_helper())

    shared = random_relation()
    source_catalog = [
        shared,
        random_relation(),
        random_relation(),
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_right,
    ]
    target_catalog = {
        shared,
        random_relation(),
        random_relation(),
        vals.birelation_left,
        vals.birelation_right,
    }

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",
        "remote_attribute": vals.directional_key,
    }
    downstream_spec = {
        "database": vals.downstream_relation.database,
        "schema": vals.downstream_relation.schema,
        "relation": vals.downstream_relation.name,
        "relationships": {"directional": [upstream_spec]},
    }
    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    raw_config["source"]["specified_relations"] = [downstream_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    graph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = source_catalog
        config.source_profile.adapter = adapter_mock
        graph.build_graph(config)

        # Everything except the first (shared) source relation is expected.
        expected_nodes = source_catalog[1:]
        difference = SnowShuGraph.catalog_difference(graph, target_catalog)
        assert list(difference.nodes) == expected_nodes
def test_fills_empty_top_level_values(stub_configs):
    """Missing optional top-level and source keys are filled with defaults by the parser."""
    configs = stub_configs()
    del configs['long_description']

    # Remove the optional outlier settings if the stub provides them.
    source_section = configs['source']
    for optional_key in ('include_outliers', 'max_number_of_outliers'):
        source_section.pop(optional_key, None)

    config_stream = StringIO(yaml.dump(configs))
    parsed = ConfigurationParser().from_file_or_path(config_stream)

    assert parsed.long_description == ''
    assert parsed.include_outliers == False  # noqa: E712
    assert parsed.max_number_of_outliers == DEFAULT_MAX_NUMBER_OF_OUTLIERS
def test_included_and_excluded(adapter, target):
    """Only the expected MOCKED_CATALOG entries end up as graph nodes.

    This variant passes the catalog directly to ``build_graph``.
    """
    # NOTE(review): another ``test_included_and_excluded`` is defined later in
    # this source. If both live in the same module, the later definition
    # replaces this one and this test is never collected.
    graph = SnowShuGraph()
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(MOCKED_CONFIG)))
    graph.build_graph(config, MOCKED_CATALOG)
    matched = graph.graph

    # Catalog index -> whether that relation should be present in the graph.
    expected_membership = {
        0: True,
        1: True,
        2: False,
        3: False,
        4: False,
        5: False,
        6: True,
    }
    for index, should_be_present in expected_membership.items():
        assert (MOCKED_CATALOG[index] in matched.nodes) is should_be_present
def test_included_and_excluded(target, adapter):
    """Only the expected MOCKED_CATALOG entries end up as graph nodes.

    The catalog is supplied through a mocked source adapter.
    """
    graph = SnowShuGraph()
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(MOCKED_CONFIG)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = MOCKED_CATALOG
        config.source_profile.adapter = adapter_mock
        graph.build_graph(config)
    matched = graph.graph

    # Catalog index -> whether that relation should be present in the graph.
    expected_membership = {
        0: True,
        1: True,
        2: False,
        3: False,
        4: False,
        5: False,
        6: True,
    }
    for index, should_be_present in expected_membership.items():
        assert (MOCKED_CATALOG[index] in matched.nodes) is should_be_present
def test_loads_good_creds(stub_creds, stub_configs):
    """A source profile built from a credentials file carries that file's name and password."""
    creds = stub_creds()
    configs = stub_configs()
    # Three random strings are drawn, as before; the last one is never used.
    source_name, source_password, _storage_account = (
        rand_string(10) for _ in range(3))

    with tempfile.NamedTemporaryFile(mode='w') as creds_file:
        first_source = creds['sources'][0]
        first_source['name'] = source_name
        first_source['password'] = source_password
        configs['source']['profile'] = source_name

        json.dump(creds, creds_file)
        creds_file.seek(0)
        configs['credpath'] = creds_file.name

        profile = ConfigurationParser()._build_adapter_profile('source', configs)

        assert profile.name == source_name
        assert profile.adapter.credentials.password == source_password
def test_build_graph_fails_no_downstream():
    """build_graph must raise when the catalog holds no relation matching the config."""
    graph = SnowShuGraph()
    empty_catalog = []  # no relations in filtered catalog

    # use the "live" config on random test data
    raw_config = copy.deepcopy(CONFIGURATION)
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = empty_catalog
        config.source_profile.adapter = adapter_mock
        # Building the graph should raise when no downstream relations are found.
        with pytest.raises(InvalidRelationshipException) as exc:
            graph.build_graph(config)

    assert "does not match any relations" in str(exc.value)
def test_build_graph_partitions_wildcards(stub_graph_set):
    """With empty upstream database/schema, each downstream links only to its own upstream."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    down_1 = vals.downstream_wildcard_relation_1
    down_2 = vals.downstream_wildcard_relation_2
    up_1 = vals.upstream_wildcard_relation_1
    up_2 = vals.upstream_wildcard_relation_2
    catalog = [down_1, down_2, up_1, up_2]

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": "",
        "schema": "",
        "relation": ".*upstream.*$",
        "remote_attribute": vals.directional_key,
    }
    downstream_spec = {
        "database": f"({down_1.database}|{down_2.database})",
        "schema": f"({down_1.schema}|{down_2.schema})",
        "relation": ".*downstream.*$",
        "relationships": {"directional": [upstream_spec]},
    }
    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    raw_config["source"]["specified_relations"] = [downstream_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = catalog
        config.source_profile.adapter = adapter_mock
        graph.build_graph(config)

    # Exactly two edges: each upstream pairs with the same-numbered downstream.
    assert len(graph.graph.edges()) == 2
    assert (up_1, down_1) in graph.graph.edges()
    assert (up_1, down_2) not in graph.graph.edges()
    assert (up_2, down_1) not in graph.graph.edges()
    assert (up_2, down_2) in graph.graph.edges()
def test_no_duplicates(stub_graph_set):
    """No node appears in more than one of the graphs returned by ``get_graphs``.

    This variant passes the catalog directly to ``build_graph``.
    """
    # NOTE(review): another ``test_no_duplicates`` is defined later in this
    # source. If both live in the same module, the later definition replaces
    # this one and this test is never collected.
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]
    raw_config = copy.deepcopy(CONFIGURATION)
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    graph.build_graph(config, catalog)

    # Flatten the nodes of every returned graph into one list.
    collected = []
    for subgraph in graph.get_graphs():
        collected.extend(subgraph.nodes)

    assert len(set(collected)) == len(collected)
def test_build_graph_allows_upstream_regex(stub_graph_set):
    """A regex upstream pattern links every matching relation to the one downstream."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    catalog = [
        downstream,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",  # incl birelations
        "remote_attribute": vals.directional_key,
    }
    downstream_spec = {
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [upstream_spec]},
    }
    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    raw_config["source"]["specified_relations"] = [downstream_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = catalog
        config.source_profile.adapter = adapter_mock
        graph.build_graph(config)

    assert len(graph.graph.edges()) == 3
    assert (vals.upstream_relation, downstream) in graph.graph.edges()
    assert (vals.birelation_left, downstream) in graph.graph.edges()
    assert (vals.birelation_right, downstream) in graph.graph.edges()
def test_build_graph_fails_no_distinct_upstream(stub_graph_set):
    """build_graph must raise when a relation names itself as its only upstream."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    catalog = [
        vals.iso_relation,
        vals.view_relation,
        downstream,
        vals.upstream_relation,
    ]

    # The upstream side targets the same relation as the downstream side.
    self_reference = {
        "local_attribute": vals.directional_key,
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "remote_attribute": vals.directional_key,
    }
    downstream_spec = {
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [self_reference]},
    }
    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    raw_config["source"]["specified_relations"] = [downstream_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = catalog
        config.source_profile.adapter = adapter_mock
        with pytest.raises(InvalidRelationshipException) as exc:
            graph.build_graph(config)

    assert "was specified as a dependency, but it does not exist." in str(exc.value)
def test_graph_difference_less_relations_source(stub_graph_set):
    """catalog_difference is empty when the target catalog is a superset of the source."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    source_catalog = [
        downstream,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]
    # Same relations as the source, plus one extra.
    target_catalog = [
        downstream,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
        vals.iso_relation,
    ]

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",  # incl birelations
        "remote_attribute": vals.directional_key,
    }
    downstream_spec = {
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [upstream_spec]},
    }
    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    raw_config["source"]["specified_relations"] = [downstream_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = source_catalog
        config.source_profile.adapter = adapter_mock
        graph.build_graph(config)

        difference = SnowShuGraph.catalog_difference(graph, target_catalog)
        assert len(difference) == 0
def test_sets_outliers(stub_graph_set):
    """Source-level outlier settings are copied onto catalog relations by build_graph.

    This variant passes the catalog directly to ``build_graph``.
    """
    # NOTE(review): another ``test_sets_outliers`` is defined later in this
    # source. If both live in the same module, the later definition replaces
    # this one and this test is never collected.
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    raw_config = copy.deepcopy(CONFIGURATION)
    source_section = raw_config['source']
    source_section['include_outliers'] = True
    source_section['max_number_of_outliers'] = 1000
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    # Only the effect on the relation is checked; the return value is unused.
    graph.build_graph(config, catalog)

    assert vals.iso_relation.include_outliers == True  # noqa: E712
    assert vals.iso_relation.max_number_of_outliers == 1000
def test_no_duplicates(stub_graph_set):
    """No node appears in more than one connected subgraph of the built graph."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]
    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = catalog
        config.source_profile.adapter = adapter_mock
        graph.build_graph(config)

    # Flatten the nodes of every subgraph into one list.
    collected = []
    for subgraph in graph.get_connected_subgraphs():
        collected.extend(subgraph.nodes)

    assert len(set(collected)) == len(collected)
def test_graph_difference_empty_target(stub_graph_set):
    """catalog_difference against an empty target equals the full source graph."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    catalog = [
        downstream,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",
        "remote_attribute": vals.directional_key,
    }
    downstream_spec = {
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [upstream_spec]},
    }
    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    raw_config["source"]["specified_relations"] = [downstream_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = catalog
        config.source_profile.adapter = adapter_mock
        graph.build_graph(config)

        expected_graph = graph.graph
        empty_target = set()
        difference = SnowShuGraph.catalog_difference(graph, empty_target)
        assert difference == expected_graph
def test_sets_outliers(stub_graph_set):
    """Source-level outlier settings are copied onto catalog relations by build_graph."""
    graph = SnowShuGraph()
    _, vals = stub_graph_set
    catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    raw_config = copy.deepcopy(BASIC_CONFIGURATION)
    source_section = raw_config['source']
    source_section['include_outliers'] = True
    source_section['max_number_of_outliers'] = 1000
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = catalog
        config.source_profile.adapter = adapter_mock
        # Only the effect on the relation is checked; the return value is unused.
        graph.build_graph(config)

    assert vals.iso_relation.include_outliers is True
    assert vals.iso_relation.max_number_of_outliers == 1000
def test_sets_sampling_for_all_patterns(stub_configs):
    """The parsed stub configuration exposes a ``DefaultSampling`` instance as ``sampling``."""
    config_stream = StringIO(yaml.dump(stub_configs()))
    parsed = ConfigurationParser().from_file_or_path(config_stream)

    assert isinstance(parsed.sampling, DefaultSampling)
def test_errors_on_missing_section(stub_configs):
    """Parsing a configuration without a ``source`` section raises KeyError or AttributeError."""
    configs = stub_configs()
    del configs['source']

    expected_errors = (KeyError, AttributeError)
    with pytest.raises(expected_errors):
        config_stream = StringIO(yaml.dump(configs))
        ConfigurationParser().from_file_or_path(config_stream)
def stub_replica_configuration():
    """Return the parsed configuration for the module-level ``CONFIGURATION`` mapping.

    The mapping is dumped to YAML and parsed back through
    ``ConfigurationParser.from_file_or_path`` from an in-memory stream.
    """
    parser = ConfigurationParser()
    config_stream = StringIO(yaml.dump(CONFIGURATION))
    return parser.from_file_or_path(config_stream)