def test_sections_in_order3():
    """An explicit metadata ``section_run`` list drives sections_in_order."""
    raw = {
        "metadata": {"section_run": ["writer_config"]},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": ["recipe_csv_writer"],
                }
            },
            "writer_config": {
                "recipe_csv_writer": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                }
            },
        },
    }
    cfg = Configuration(
        config_location=None, is_dict_config=True, dict_config=raw
    )
    ordered_sections, origin = cfg.sections_in_order()
    assert ordered_sections == ["writer_config"]
    assert origin == "section_run"
def test_init_ok(config):
    """CsvWriter.run writes the keyed DataFrame to ``dir``/``filename`` on disk."""
    expected_df = pd.read_csv("test/minimal.csv")
    cfg = Configuration(None, is_dict_config=True, dict_config=config)
    writer = CsvWriter(cfg, "recipe_csv_writer")
    dobj = DataObject(cfg)
    source_node = CsvReader(cfg, "csv_reader")
    dobj.add(source_node, key="test_data", data=expected_df)
    instance_cfg = cfg.config_for_instance("recipe_csv_writer")
    out_path = instance_cfg["dir"] + os.path.sep + instance_cfg["filename"]
    # remove any stale artifact so a leftover file cannot mask a failure
    if os.path.exists(out_path):
        os.remove(out_path)
    writer.run(dobj)
    assert os.path.exists(out_path)
    round_tripped = pd.read_csv(out_path)
    assert expected_df.equals(round_tripped)
def test_init_other_ok(config):
    """Serializer.run rejects a serializer type it does not support."""
    writer_node = config["implementation_config"]["writer_config"]["recipe_file_writer"]
    writer_node["filename"] = "unittest_file_writer.other"
    writer_node["serializer"] = "other"
    payload = "some test data"
    cfg = Configuration(None, is_dict_config=True, dict_config=config)
    dobj = DataObject(cfg)
    source_node = CsvReader(cfg, "csv_reader")
    dobj.add(source_node, payload, "test_data")
    serializer = Serializer(cfg, "recipe_file_writer")
    instance_cfg = cfg.config_for_instance("recipe_file_writer")
    out_path = instance_cfg["dir"] + os.path.sep + instance_cfg["filename"]
    # start from a clean slate: drop any artifact from a previous run
    if os.path.exists(out_path):
        os.remove(out_path)
    with pytest.raises(Exception, match=r"Unsupported"):
        serializer.run(dobj)
def test_init_ok(config):
    # NOTE(review): several definitions named test_init_ok appear in this view;
    # if they actually share one module, pytest collects only the last —
    # confirm they originate from separate test files.
    """FileWriter.run persists the raw string and does not request termination."""
    payload = "some test data"
    cfg = Configuration(None, is_dict_config=True, dict_config=config)
    dobj = DataObject(cfg)
    source_node = CsvReader(cfg, "csv_reader")
    dobj.add(source_node, payload, "test_data")
    writer = FileWriter(cfg, "recipe_file_writer")
    instance_cfg = cfg.config_for_instance("recipe_file_writer")
    out_path = instance_cfg["dir"] + os.path.sep + instance_cfg["filename"]
    # drop any leftover output so existence of the file proves this run wrote it
    if os.path.exists(out_path):
        os.remove(out_path)
    dobj, terminate = writer.run(dobj)
    assert not terminate
    assert os.path.exists(out_path)
    with open(out_path) as handle:
        assert handle.read() == payload
def test_perform_any_config_fragment_substitution_bad():
    """A ``{% include %}`` pointing at a missing file raises and names the path.

    Fix: assert against ``str(e.value)`` (the exception's message) instead of
    ``str(e)`` (the ``ExceptionInfo`` repr) — the documented pytest idiom,
    stable across pytest versions.
    """
    config_str = """
    {
        {% include "does/not/exist" %}
        "implementation_config": {
        }
    }
    """
    with pytest.raises(Exception) as e:
        Configuration.perform_any_config_fragment_substitution(config_str)
    assert "Substitution files do not exist: does/not/exist" in str(e.value)
def test_yaml_perform_any_config_fragment_substitution_env_var(monkeypatch):
    """YAML fragment substitution expands includes and environment variables."""
    monkeypatch.setenv("TEST", "foo")
    # NOTE(review): the exact whitespace of these literals was lost in the
    # source formatting; the layout below is reconstructed — confirm it against
    # the fragment files, since the equality assertion is whitespace-sensitive.
    config_str = """
{% include "test/metadata_fragment.yml" %}
implementation_config:
  {% include "test/read_write_fragment.yml" %}
"""
    final_str = Configuration.perform_any_config_fragment_substitution(config_str)
    expected = """
metadata:
  test: foo
implementation_config:
  reader_config:
    read_data:
      class: CsvReader
      destinations:
      - write_output
      filename: data/tennis.csv
  writer_config:
    write_output:
      class: CsvWriter
      dir: cache
      filename: tennis_output.csv
      key: data
"""
    assert final_str == expected
def test_init_ok_pickle():
    """Deserializer loads a pickled dict holding test data and a sklearn model."""
    raw = {
        "implementation_config": {
            "reader_config": {
                "pickle_reader": {
                    "class": "Deserializer",
                    "filename": "test/tinymodel.pickle",
                    "deserializer": "pickle",
                    "destinations": [],
                }
            }
        }
    }
    cfg = Configuration(config_location=None, is_dict_config=True, dict_config=raw)
    dobj = DataObject(cfg)
    node = Deserializer(cfg, "pickle_reader")
    dobj, terminate = node.run(dobj)
    assert not terminate
    payload = dobj.get("pickle_reader", rtype=DataObjectResponseType.VALUE.value)
    assert payload is not None
    assert set(payload.keys()) == {"test", "model"}
    assert payload["test"] == [1, 2, 3]
    assert isinstance(payload["model"], DecisionTreeClassifier)
def test_class_package(mock_env):
    """External node classes resolve via metadata class_package / class_prefix."""
    by_package = {
        "metadata": {"class_package": "test"},
        "implementation_config": {
            "reader_config": {"read_data": {"class": "TestExtNode", "destinations": []}}
        },
    }
    by_file_path = {
        "metadata": {"class_package": "test/ext_node_example.py"},
        "implementation_config": {
            "reader_config": {"read_data": {"class": "TestExtNode", "destinations": []}}
        },
    }
    by_prefix = {
        "metadata": {"class_package": "test"},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "TestExtNode",
                    "class_prefix": "ext_node_example",
                    "destinations": [],
                }
            }
        },
    }
    # same resolution order as before: full path, bare package, dotted prefix
    for raw in [by_file_path, by_package, by_prefix]:
        cfg = Configuration(
            config_location=None, is_dict_config=True, dict_config=raw
        )
        assert cfg.config_string
        assert cfg.config_hash
    # leave the global factory clean for other tests
    NodeFactory().unregister("TestExtNode")
def test_init_traverser_from_config():
    """DagRunner instantiates the traverser named in ``metadata.traverser``."""
    class TestTraverser(DagTraverser):
        def traversal_list(self):
            """Return an empty traversal order (stub)."""
            return []

        def run_section_by_section(self):
            """Disable section-by-section execution (stub)."""
            return False

    TraverserFactory().register("TestTraverser", TestTraverser)
    raw = {
        "metadata": {"traverser": "TestTraverser"},
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    cfg = Configuration(None, is_dict_config=True, dict_config=raw)
    dag_runner = DagRunner(cfg)
    assert isinstance(dag_runner.dag_traverser, TestTraverser)
def test_transform():
    """DataFrameJoiner merges two CSV-sourced frames on the ``first`` column."""
    raw = {
        "implementation_config": {
            "reader_config": {
                "myreader_left": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                },
                "myreader_right": {
                    "class": "CsvReader",
                    "filename": "test/merge_right3.csv",
                    "destinations": ["mypipeline"],
                },
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "DataFrameJoiner",
                    "join_key": ["first"],
                    "start_table": "myreader_left",
                    "is_training": True,
                }
            },
        }
    }
    cfg = Configuration(
        config_location=None, is_dict_config=True, dict_config=raw
    )
    dobj = DataObject(cfg)
    # seed the data object with both join inputs, as the readers would
    left_frame = pd.read_csv("test/minimal.csv")
    dobj.add(CsvReader(cfg, "myreader_left"), left_frame)
    right_frame = pd.read_csv("test/merge_right3.csv")
    dobj.add(CsvReader(cfg, "myreader_right"), right_frame)
    joiner = DataFrameJoiner(cfg, "mypipeline")
    dobj, terminate = joiner.run(dobj)
    assert not terminate
    joined = dobj.get(
        "mypipeline", pop_data=True, rtype=DataObjectResponseType.VALUE.value
    )
    assert joined.shape[0] == 2
    rows = list(joined.T.to_dict().values())
    assert rows[0] == {"first": "joe", "last": "doe", "age": 47}
    assert rows[1] == {"first": "mary", "last": "poppins", "age": 42}
def config():
    """Build a Configuration wiring a CsvReader into a StringTransformer pipeline.

    NOTE(review): likely used as a pytest fixture — confirm the ``@pytest.fixture``
    decorator survives in the real file; it is not visible in this view.
    """
    raw = {
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "data/tennis.csv",
                    "destinations": ["transformers"],
                }
            },
            "pipeline_config": {
                "transformers": {
                    "class": "TransformerPipeline",
                    "transformer_sequence": [
                        {
                            "class": "primrose.transformers.strings.StringTransformer",
                            "method": "replace",
                            "columns": "outlook",
                            "pat": "sunny",
                            "repl": "rainy",
                        }
                    ],
                }
            },
        }
    }
    return Configuration(None, is_dict_config=True, dict_config=raw)
def test_init_ok():
    """CsvReader.run loads the CSV and exposes its necessary-config contract."""
    raw = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    cfg = Configuration(config_location=None, is_dict_config=True, dict_config=raw)
    dobj = DataObject(cfg)
    reader_node = CsvReader(cfg, "csv_reader")
    dobj, terminate = reader_node.run(dobj)
    assert not terminate
    frame = dobj.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
    assert frame is not None
    assert frame.shape == (2, 2)
    node_config = {
        "class": "CsvReader",
        "filename": "test/minimal.csv",
        "destinations": [],
    }
    required = CsvReader.necessary_config(node_config)
    assert isinstance(required, set)
    assert len(required) > 0
def run(config, dry_run=False):
    """Run a primrose job.

    Args:
        config: path to the job configuration file.
        dry_run: when True, report the plan without executing nodes.
    """
    # imported lazily so the CLI stays cheap when this command is unused
    from primrose.configuration.configuration import Configuration
    from primrose.dag_runner import DagRunner

    job_configuration = Configuration(config_location=config)
    DagRunner(job_configuration).run(dry_run=dry_run)
def test_run():
    """RReader loads the built-in iris dataset with R-style columns."""
    raw = {
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "RReader",
                    "dataset": "iris",
                    "destinations": [],
                }
            }
        }
    }
    cfg = Configuration(config_location=None, is_dict_config=True, dict_config=raw)
    dobj = DataObject(cfg)
    node = RReader(cfg, "read_data")
    dobj, terminate = node.run(dobj)
    assert not terminate
    frame = dobj.get("read_data", rtype=DataObjectResponseType.VALUE.value)
    assert frame is not None
    assert frame.shape == (150, 6)
    expected_columns = [
        "Sepal.Length",
        "Sepal.Width",
        "Petal.Length",
        "Petal.Width",
        "Species",
        "row_names",
    ]
    assert list(frame.columns) == expected_columns
def test_init_error8():
    """A config path with an unsupported extension is rejected with ValueError.

    Fix: assert against ``str(e.value)`` (the exception's message) instead of
    ``str(e)`` (the ``ExceptionInfo`` repr) — the documented pytest idiom,
    stable across pytest versions.
    """
    with pytest.raises(ValueError) as e:
        Configuration("test/tennis.csv")
    assert (
        "config file at: test/tennis.csv has improper extension type - please use a .json or .yml file"
        in str(e.value)
    )
def test_run_bad():
    """Unregistering a node class makes DagRunner.run fail at instantiation.

    Fix: assert against ``str(e.value)`` (the exception's message) instead of
    ``str(e)`` (the ``ExceptionInfo`` repr) — the documented pytest idiom.
    """
    class TestWriterTmp(AbstractWriter):
        @staticmethod
        def necessary_config(node_config):
            """No required keys for this stub writer."""
            return set([])

        def run(self, data_object):
            """Pass the data object through without terminating."""
            return data_object, False

    NodeFactory().register("TestWriterTmp", TestWriterTmp)
    config = {
        "implementation_config": {
            "writer_config": {
                "mywriter": {
                    "class": "TestWriterTmp",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    runner = DagRunner(configuration)
    # unregister the class so instantiation during run() must fail
    del NodeFactory().name_dict["TestWriterTmp"]
    with pytest.raises(Exception) as e:
        runner.run()
    assert "Issue instantiating mywriter and class TestWriterTmp" in str(e.value)
def test_run_bad2():
    """An exception raised inside a node's run() propagates out of DagRunner.run.

    Fix: assert against ``str(e.value)`` (the exception's message) instead of
    ``str(e)`` (the ``ExceptionInfo`` repr) — the documented pytest idiom.
    """
    class TestWriterTmp(AbstractWriter):
        @staticmethod
        def necessary_config(node_config):
            """No required keys for this stub writer."""
            return set([])

        def run(self, data_object):
            """Fail on purpose so the runner's error path is exercised."""
            raise Exception("Deliberate error")

    NodeFactory().register("TestWriterTmp", TestWriterTmp)
    config = {
        "implementation_config": {
            "writer_config": {
                "mywriter": {
                    "class": "TestWriterTmp",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    runner = DagRunner(configuration)
    with pytest.raises(Exception) as e:
        runner.run()
    assert "Deliberate error" in str(e.value)
def test_destinations_to_prune():
    """SimpleSwitch prunes every destination except the configured path."""
    raw = {
        "implementation_config": {
            "reader_config": {
                "conditional_node": {
                    "class": "SimpleSwitch",
                    "path_to_travel": "left",
                    "destinations": ["left", "right"],
                }
            },
            "writer_config": {
                "left": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                },
                "right": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                },
            },
        }
    }
    cfg = Configuration(None, is_dict_config=True, dict_config=raw)
    switch_node = SimpleSwitch(cfg, "conditional_node")
    # path_to_travel is "left", so only "right" should be cut
    assert switch_node.destinations_to_prune() == ["right"]
def test_filter_sequence4():
    """filter_sequence keeps only nodes from the sections listed in section_run."""
    raw = {
        "metadata": {"section_run": ["writer_config"]},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": ["recipe_csv_writer"],
                }
            },
            "writer_config": {
                "recipe_csv_writer": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                }
            },
        },
    }
    cfg = Configuration(None, is_dict_config=True, dict_config=raw)
    runner = DagRunner(cfg)
    filtered = runner.filter_sequence(["read_data", "recipe_csv_writer"])
    # the reader belongs to reader_config, which section_run excludes
    assert filtered == ["recipe_csv_writer"]
def test_run2():
    """A dry run logs the execution plan without running any node."""
    raw = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    cfg = Configuration(None, is_dict_config=True, dict_config=raw)
    runner = DagRunner(cfg)
    with LogCapture() as captured:
        runner.run(dry_run=True)
        captured.check(
            ("root", "INFO", "Taking nodes to run from default"),
            (
                "root",
                "INFO",
                "DRY RUN 0: would run node csv_reader of type reader_config and class CsvReader",
            ),
            ("root", "INFO", "All done. Bye bye!"),
        )
def test_run_notification_error():
    """A failing run posts exactly one error notification through the client.

    Fixes: the hand-built ``mock_client`` / ``mock_get_notification_client``
    objects were dead code — ``with mock.patch(path) as …`` immediately
    rebound the name — and the assertion inspected ``post_message`` on the
    patched *factory* rather than on the client it returns. Assert on
    ``return_value.post_message``, which is the object the runner actually
    calls after ``get_notification_client(...)``.
    """
    config = {
        "metadata": {
            "section_registry": ["phase1"],
            "notify_on_error": {
                "client": "SlackClient",
                "channel": "some-channel",
                "token": "slack-api-token",
                "member_id": "optional-key",
            },
        },
        "implementation_config": {
            "phase1": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "bad/path.csv",
                }
            }
        },
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    runner = DagRunner(configuration)
    path = "primrose.notification_utils.get_notification_client"
    with mock.patch(path) as mock_get_notification_client:
        # the bad filename makes the reader raise; the runner should notify
        with pytest.raises(Exception):
            runner.run()
        assert mock_get_notification_client.return_value.post_message.call_count == 1
def test_init_pipeline():
    """DataFrameJoiner.init_pipeline yields a TransformerSequence."""
    raw = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                }
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "DataFrameJoiner",
                    "join_key": ["first"],
                    "start_table": "myreader",
                }
            },
        }
    }
    cfg = Configuration(
        config_location=None, is_dict_config=True, dict_config=raw
    )
    joiner = DataFrameJoiner(cfg, "mypipeline")
    sequence = joiner.init_pipeline()
    assert isinstance(sequence, TransformerSequence)
def test_kwargs():
    """CsvReader forwards its ``kwargs`` (header, sep) to the CSV parser."""
    raw = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "kwargs": {"header": None, "sep": ":"},
                    "destinations": [],
                }
            }
        }
    }
    cfg = Configuration(config_location=None, is_dict_config=True, dict_config=raw)
    dobj = DataObject(cfg)
    reader_node = CsvReader(cfg, "csv_reader")
    dobj, terminate = reader_node.run(dobj)
    assert not terminate
    frame = dobj.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
    assert frame is not None
    # with no header row and ":" as separator the file parses to 3 rows x 1 col
    assert frame.shape == (3, 1)
def test_run_node(self):
    """ClientNotification posts the node-supplied message exactly once."""
    path = "primrose.notifications.success_notification.get_notification_client"
    with mock.patch(path) as get_client_mock:
        get_client_mock.return_value = mock.Mock()
        NodeFactory().register("SlackDataMock", SlackDataMock)
        cfg = Configuration(None, is_dict_config=True, dict_config=config_dict_node_message)
        dobj = DataObject(cfg)
        node = SlackDataMock(cfg, "test_node")
        # NOTE(review): SlackDataMock.run appears to return only the data
        # object (no terminate flag) — confirm against its definition.
        dobj = node.run(dobj)
        notifier = ClientNotification(
            configuration=cfg,
            instance_name="node_notification",
        )
        notifier.client = get_client_mock.return_value
        notifier.run(dobj)
        notifier.client.post_message.assert_called_once_with(
            message="Node Success!")
def test_comments_in_json():
    """A JSON config containing comments still parses without raising."""
    cfg = Configuration(config_location="test/config_with_comments.json")
    assert list(cfg.config.keys()) == ["reader_config", "writer_config"]
    read_data = cfg.config["reader_config"]["read_data"]
    assert read_data["class"] == "CsvReader"
    assert list(read_data["destinations"]) == ["write_output"]
def test_init_error7():
    """An unknown top-level key is rejected with a message listing the valid ones.

    Fix: assert against ``str(e.value)`` (the exception's message) instead of
    ``str(e)`` (the ``ExceptionInfo`` repr) — the documented pytest idiom.
    """
    config = {"junk": {}}
    with pytest.raises(Exception) as e:
        Configuration(config_location=None, is_dict_config=True, dict_config=config)
    assert (
        "Unsupported top-level key: junk. Supported keys are ['metadata', 'implementation_config']"
        in str(e.value)
    )
def test_init_error1():
    """Configuration(None) propagates os.stat's TypeError; wording varies by Python.

    Fix: assert against ``str(e.value)`` (the exception's message) instead of
    ``str(e)`` (the ``ExceptionInfo`` repr) — the documented pytest idiom.
    """
    with pytest.raises(Exception) as e:
        Configuration(None)
    if sys.version_info[:2] == (3, 5):
        # Python 3.5 used a different os.stat error message
        assert "stat: can't specify None for path argument" in str(e.value)
    else:
        assert (
            "stat: path should be string, bytes, os.PathLike or integer, not NoneType"
            in str(e.value)
        )
def test_run4():
    """A node returning terminate=True stops the DAG run early."""
    class TestWriter(AbstractFileWriter):
        def __init__(self, configuration, instance_name):
            """Skip base initialisation; this stub needs no state."""
            pass

        @staticmethod
        def necessary_config(node_config):
            """No required keys for this stub writer."""
            return set([])

        def run(self, data_object):
            """Signal early termination to the runner."""
            return data_object, True

    NodeFactory().register("TestWriter", TestWriter)
    raw = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["csv_writer"],
                }
            },
            "writer_config": {
                "csv_writer": {
                    "class": "TestWriter",
                }
            },
        }
    }
    cfg = Configuration(None, is_dict_config=True, dict_config=raw)
    runner = DagRunner(cfg)
    with LogCapture() as captured:
        runner.run(dry_run=False)
        captured.check(
            ("root", "INFO", "Taking nodes to run from default"),
            (
                "root",
                "INFO",
                "received node csv_reader of type reader_config and class CsvReader",
            ),
            ("root", "INFO", "Reading test/minimal.csv from CSV"),
            (
                "root",
                "INFO",
                "received node csv_writer of type writer_config and class TestWriter",
            ),
            ("root", "INFO", "Terminating early due to signal from csv_writer"),
            ("root", "INFO", "All done. Bye bye!"),
        )
def test_nodes_of_type():
    """nodes_of_type / upstream_nodes_of_type select DAG nodes by OperationType."""
    raw = {
        "metadata": {},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": ["decision_tree_model"],
                },
                "read_data2": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": ["decision_tree_model"],
                },
            },
            "model_config": {
                "decision_tree_model": {
                    "class": "SklearnClassifierModel",
                    "mode": "predict",
                    "sklearn_classifier_name": "tree.DecisionTreeClassifier",
                    "grid_search_scoring": "roc_auc",
                    "cv_folds": 3,
                    "model_parameters": {},
                    "destinations": ["write_output"],
                }
            },
            "writer_config": {
                "write_output": {
                    "class": "CsvWriter",
                    "key": "predictions",
                    "dir": "cache",
                    "filename": "hello_world_predictions.csv",
                }
            },
        },
    }
    cfg = Configuration(config_location=None, is_dict_config=True, dict_config=raw)
    # both readers, and nothing else, are of OperationType.reader
    assert cfg.dag.nodes_of_type(OperationType.reader) == set(["read_data", "read_data2"])
    assert cfg.dag.nodes_of_type(OperationType.pipeline) == set([])
    # upstream filtering from the writer reaches both readers but no cleanup nodes
    upstream_readers = cfg.dag.upstream_nodes_of_type("write_output", OperationType.reader)
    assert upstream_readers == set(["read_data", "read_data2"])
    upstream_cleanup = cfg.dag.upstream_nodes_of_type("write_output", OperationType.cleanup)
    assert upstream_cleanup == set([])
def test_sections_in_order2():
    """Without section_run metadata, sections_in_order falls back to the default order."""
    raw = {
        "metadata": {},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": [],
                }
            },
            "writer_config": {},
        },
    }
    cfg = Configuration(
        config_location=None, is_dict_config=True, dict_config=raw
    )
    ordered_sections, origin = cfg.sections_in_order()
    assert ordered_sections == ["reader_config", "writer_config"]
    assert origin == "default"