def test_init_ok():
    """CsvReader.run should load test/minimal.csv into a (2, 2) frame.

    Also checks the node's declared necessary_config is a non-empty set.
    """
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)
    reader = CsvReader(configuration, "csv_reader")

    data_object, terminate = reader.run(data_object)
    assert not terminate

    df = data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (2, 2)

    node_config = {
        "class": "CsvReader",
        "filename": "test/minimal.csv",
        "destinations": [],
    }
    # Call the static config check once instead of twice.
    necessary = CsvReader.necessary_config(node_config)
    assert isinstance(necessary, set)
    assert len(necessary) > 0
def test_transform():
    """DataFrameJoiner joins the left/right reader frames on 'first'."""
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader_left": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                },
                "myreader_right": {
                    "class": "CsvReader",
                    "filename": "test/merge_right3.csv",
                    "destinations": ["mypipeline"],
                },
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "DataFrameJoiner",
                    "join_key": ["first"],
                    "start_table": "myreader_left",
                    "is_training": True,
                }
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)

    # Seed the data object with both sides of the join, read straight from disk.
    for instance, path in (
        ("myreader_left", "test/minimal.csv"),
        ("myreader_right", "test/merge_right3.csv"),
    ):
        data_object.add(CsvReader(configuration, instance), pd.read_csv(path))

    joiner = DataFrameJoiner(configuration, "mypipeline")
    data_object, terminate = joiner.run(data_object)
    assert not terminate

    joined = data_object.get(
        "mypipeline", pop_data=True, rtype=DataObjectResponseType.VALUE.value
    )
    assert joined.shape[0] == 2

    rows = joined.to_dict(orient="records")
    assert rows[0] == {"first": "joe", "last": "doe", "age": 47}
    assert rows[1] == {"first": "mary", "last": "poppins", "age": 42}
def test_run():
    """RReader should expose the R 'iris' dataset as a 150x6 frame."""
    config = {
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "RReader",
                    "dataset": "iris",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)

    data_object, terminate = RReader(configuration, "read_data").run(data_object)
    assert not terminate

    iris = data_object.get("read_data", rtype=DataObjectResponseType.VALUE.value)
    assert iris is not None
    assert iris.shape == (150, 6)

    expected_columns = [
        "Sepal.Length",
        "Sepal.Width",
        "Petal.Length",
        "Petal.Width",
        "Species",
        "row_names",
    ]
    assert list(iris.columns) == expected_columns
def test_kwargs():
    """CsvReader should forward kwargs (header, sep) to the CSV parser.

    With sep=':' and no header row, test/minimal.csv parses as 3 rows x 1 col.
    """
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "kwargs": {"header": None, "sep": ":"},
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)

    data_object, terminate = CsvReader(configuration, "csv_reader").run(data_object)
    assert not terminate

    frame = data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
    assert frame is not None
    assert frame.shape == (3, 1)
def test_init_ok_pickle():
    """Deserializer with deserializer='pickle' should load the fixture dict."""
    config = {
        "implementation_config": {
            "reader_config": {
                "pickle_reader": {
                    "class": "Deserializer",
                    "filename": "test/tinymodel.pickle",
                    "deserializer": "pickle",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)

    reader = Deserializer(configuration, "pickle_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate

    payload = data_object.get(
        "pickle_reader", rtype=DataObjectResponseType.VALUE.value
    )
    assert payload is not None
    # Fixture holds a small list plus a fitted sklearn tree.
    assert set(payload.keys()) == {"test", "model"}
    assert payload["test"] == [1, 2, 3]
    assert isinstance(payload["model"], DecisionTreeClassifier)
def test_run(monkeypatch):
    """PostgresReader should run its query and store the frame under query_0.

    The DB connection is patched out and pd.read_sql is stubbed, so no real
    Postgres instance is needed.
    """
    config = {
        "implementation_config": {
            "reader_config": {
                "mynode": {
                    "class": "PostgresReader",
                    "query_json": [{"query": "test/test_mysql.sql"}],
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)

    # The reader pulls connection parameters from the environment.
    keys = [
        "POSTGRES_HOST",
        "POSTGRES_PORT",
        "POSTGRES_DB",
        "POSTGRES_USER",
        "POSTGRES_PASS",
    ]
    os.environ.update({key: str(index) for index, key in enumerate(keys)})

    reader = PostgresReader(configuration, "mynode")

    def _stub_read_sql(query, con):
        return pd.DataFrame(
            {
                "Name": ["Tom", "nick", "krish", "jack"],
                "Age": [20, 21, 19, 18],
            }
        )

    # Patch the driver so no socket is opened; stub read_sql to return a frame.
    with patch("psycopg2.connect"):
        monkeypatch.setattr(pd, "read_sql", _stub_read_sql)
        data_object, terminate = reader.run(data_object)

    assert not terminate

    results = data_object.get("mynode", rtype=DataObjectResponseType.KEY_VALUE.value)
    assert "query_0" in results
    first_row = list(results["query_0"].T.to_dict().values())[0]
    assert first_row == {"Name": "Tom", "Age": 20}
def test_concatenate_data(pipeline_obj, configuration):
    """Two keyed frames under one reader should concatenate before the split."""
    data_object = DataObject(configuration)
    csv_reader = CsvReader(configuration, "read_data")

    # Register the same tennis fixture twice under distinct keys.
    for key in ("query1", "query2"):
        data_object.add(csv_reader, pd.read_csv("test/tennis.csv"), key)

    data_object, terminate = pipeline_obj.run(data_object)

    train = data_object.get("encode_and_split")["data_train"]
    assert len(train) == 18
def test_init_ok():
    """DillReader.run should load test/tinymodel.dill into a dict payload.

    Also checks the node's declared necessary_config is a non-empty set.
    """
    config = {
        "implementation_config": {
            "reader_config": {
                "dill_reader": {
                    "class": "DillReader",
                    "filename": "test/tinymodel.dill",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)
    reader = DillReader(configuration, "dill_reader")

    data_object, terminate = reader.run(data_object)
    assert not terminate

    data = data_object.get("dill_reader", rtype=DataObjectResponseType.VALUE.value)
    assert data is not None
    assert set(data.keys()) == {"test", "model"}

    node_config = {
        "class": "DillReader",
        "filename": "test/tinymodel.dill",
        "destinations": [],
    }
    # Call the static config check once instead of twice.
    necessary = DillReader.necessary_config(node_config)
    assert isinstance(necessary, set)
    assert len(necessary) > 0

    assert data["test"] == [1, 2, 3]
    assert isinstance(data["model"], DecisionTreeClassifier)
def test_init():
    """A registered postprocess node should run and store its payload.

    Defines two throwaway node classes, registers them with the NodeFactory,
    wires them together in a config, then runs the postprocess node and checks
    the stored value. (Dead commented-out code from earlier revisions removed.)
    """

    class TestPostprocess(AbstractPostprocess):
        @staticmethod
        def necessary_config(node_config):
            return set(["key1"])

        def run(self, data_object):
            data_object.add(self, "some data")
            return data_object, False

    NodeFactory().register("TestPostprocess", TestPostprocess)

    class TestModel(AbstractNode):
        def __init__(self, configuration, instance_name):
            pass

        @staticmethod
        def necessary_config(node_config):
            return set(["key1"])

        def run(self, data_object):
            return data_object, False

    NodeFactory().register("TestModel", TestModel)

    config = {
        "implementation_config": {
            "model_config": {
                "modelname": {
                    "class": "TestModel",
                    "key1": "val1",
                    "destinations": ["nodename"],
                }
            },
            "postprocess_config": {
                "nodename": {
                    "class": "TestPostprocess",
                    "key1": "val1",
                    "key2": "val2",
                    "destinations": [],
                }
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)

    tp = TestPostprocess(configuration, "nodename")
    node_config = {
        "class": "TestPostprocess",
        "key1": "val1",
        "key2": "val2",
        "destinations": [],
    }
    assert set(["key1"]) == tp.necessary_config(node_config)

    data_object, terminate = tp.run(data_object)
    assert not terminate
    assert (
        data_object.get("nodename", rtype=DataObjectResponseType.VALUE.value)
        == "some data"
    )