Example #1
def test_init_ok():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
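    # build the Configuration from an in-memory dict rather than a config file on disk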
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

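    # run the reader node against a fresh DataObject and fetch the parsed DataFrame by instance name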
    reader = CsvReader(configuration, "csv_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    df = data_object.get("csv_reader",
                         rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (2, 2)

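    # necessary_config() should report the non-empty set of keys this node requires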
    node_config = {
        "class": "CsvReader",
        "filename": "test/minimal.csv",
        "destinations": [],
    }

    assert isinstance(CsvReader.necessary_config(node_config), set)
    assert len(CsvReader.necessary_config(node_config)) > 0
Example #2
def test_transform():
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader_left": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                },
                "myreader_right": {
                    "class": "CsvReader",
                    "filename": "test/merge_right3.csv",
                    "destinations": ["mypipeline"],
                },
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "DataFrameJoiner",
                    "join_key": ["first"],
                    "start_table": "myreader_left",
                    "is_training": True,
                }
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )

    data_object = DataObject(configuration)

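    # pre-populate the DataObject with the left and right frames, keyed by their reader nodes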
    left_df = pd.read_csv("test/minimal.csv")
    reader_left = CsvReader(configuration, "myreader_left")
    data_object.add(reader_left, left_df)

    right_df = pd.read_csv("test/merge_right3.csv")
    reader_right = CsvReader(configuration, "myreader_right")
    data_object.add(reader_right, right_df)

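    # run the DataFrameJoiner node, then pop its joined output from the DataObject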
    pipeline = DataFrameJoiner(configuration, "mypipeline")

    data_object, terminate = pipeline.run(data_object)

    assert not terminate

    joined_data = data_object.get(
        "mypipeline", pop_data=True, rtype=DataObjectResponseType.VALUE.value
    )
    assert joined_data.shape[0] == 2

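    # the joined rows carry 'first'/'last' from the left file and pick up 'age' from the right file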
    assert list(joined_data.T.to_dict().values())[0] == {
        "first": "joe",
        "last": "doe",
        "age": 47,
    }
    assert list(joined_data.T.to_dict().values())[1] == {
        "first": "mary",
        "last": "poppins",
        "age": 42,
    }
Example #3
def test_run():
    config = {
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "RReader",
                    "dataset": "iris",
                    "destinations": []
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

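    # run the RReader node; the R 'iris' dataset should arrive as a 150-row DataFrame with R-style column names plus row_names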
    reader = RReader(configuration, "read_data")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    df = data_object.get("read_data", rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (150, 6)
    assert list(df.columns) == [
        "Sepal.Length",
        "Sepal.Width",
        "Petal.Length",
        "Petal.Width",
        "Species",
        "row_names",
    ]
Example #4
def test_kwargs():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "kwargs": {
                        "header": None,
                        "sep": ":"
                    },
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

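    # kwargs are handed through to the underlying CSV parse, so header=None and sep=':' read the file as three rows in a single unnamed column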
    reader = CsvReader(configuration, "csv_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    df = data_object.get("csv_reader",
                         rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (3, 1)
Example #5
def test_init_ok_pickle():
    config = {
        "implementation_config": {
            "reader_config": {
                "pickle_reader": {
                    "class": "Deserializer",
                    "filename": "test/tinymodel.pickle",
                    "deserializer": "pickle",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

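    # run the Deserializer; the pickled payload is expected to be a dict holding a small list and a fitted DecisionTreeClassifier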
    reader = Deserializer(configuration, "pickle_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    data = data_object.get("pickle_reader",
                           rtype=DataObjectResponseType.VALUE.value)

    assert data is not None
    assert set(data.keys()) == {"test", "model"}

    assert data["test"] == [1, 2, 3]
    assert isinstance(data["model"], DecisionTreeClassifier)
Example #6
def test_run(monkeypatch):
    config = {
        "implementation_config": {
            "reader_config": {
                "mynode": {
                    "class": "PostgresReader",
                    "query_json": [{
                        "query": "test/test_mysql.sql"
                    }],
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    data_object = DataObject(configuration)

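    # set dummy values for the connection-related environment variables the reader expects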
    keys = [
        "POSTGRES_HOST",
        "POSTGRES_PORT",
        "POSTGRES_DB",
        "POSTGRES_USER",
        "POSTGRES_PASS",
    ]
    for i, k in enumerate(keys):
        os.environ[k] = str(i)

    reader = PostgresReader(configuration, "mynode")

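        # patch psycopg2.connect and stub pandas.read_sql so no real database is touched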
    with patch("psycopg2.connect") as mock_connect:

        def fake_df(query, con):
            return pd.DataFrame({
                "Name": ["Tom", "nick", "krish", "jack"],
                "Age": [20, 21, 19, 18]
            })

        monkeypatch.setattr(pd, "read_sql", fake_df)

        data_object, terminate = reader.run(data_object)

        assert not terminate

        dd = data_object.get("mynode",
                             rtype=DataObjectResponseType.KEY_VALUE.value)
        assert "query_0" in dd
        df = dd["query_0"]
        assert list(df.T.to_dict().values())[0] == {"Name": "Tom", "Age": 20}
Example #7
def test_concatenate_data(pipeline_obj, configuration):

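    # pipeline_obj and configuration are pytest fixtures supplied by the surrounding test module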
    df1 = pd.read_csv("test/tennis.csv")
    df2 = pd.read_csv("test/tennis.csv")

    data_object = DataObject(configuration)
    csv_reader = CsvReader(configuration, "read_data")

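    # register two copies of the tennis data on the same reader node under the keys 'query1' and 'query2'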
    data_object.add(csv_reader, df1, "query1")
    data_object.add(csv_reader, df2, "query2")

    data_object, terminate = pipeline_obj.run(data_object)

    encoded_data = data_object.get("encode_and_split")["data_train"]

    assert len(encoded_data) == 18
Example #8
def test_init_ok():
    config = {
        "implementation_config": {
            "reader_config": {
                "dill_reader": {
                    "class": "DillReader",
                    "filename": "test/tinymodel.dill",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

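    # run the DillReader, then verify both the deserialized payload and the node's required-config contract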
    reader = DillReader(configuration, "dill_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    data = data_object.get("dill_reader",
                           rtype=DataObjectResponseType.VALUE.value)

    assert data is not None
    assert set(data.keys()) == {"test", "model"}

    node_config = {
        "class": "DillReader",
        "filename": "test/tinymodel.dill",
        "destinations": [],
    }

    assert isinstance(DillReader.necessary_config(node_config), set)
    assert len(DillReader.necessary_config(node_config)) > 0

    assert data["test"] == [1, 2, 3]
    assert isinstance(data["model"], DecisionTreeClassifier)
Example #9
def test_init():
    class TestPostprocess(AbstractPostprocess):
        #        def __init__(self, configuration, instance_name):
        #            super(TestPostprocess, self).__init__(configuration, instance_name)
        @staticmethod
        def necessary_config(node_config):
            return set(["key1"])

        def run(self, data_object):
            data_object.add(self, "some data")
            return data_object, False

        # def process(self, data):
        #    return "some data"

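    # register the ad-hoc classes so the class names used in the config below can be resolved by name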
    NodeFactory().register("TestPostprocess", TestPostprocess)

    class TestModel(AbstractNode):
        def __init__(self, configuration, instance_name):
            pass

        @staticmethod
        def necessary_config(node_config):
            return set(["key1"])

        def run(self, data_object):
            return data_object, False

    NodeFactory().register("TestModel", TestModel)

    config = {
        "implementation_config": {
            "model_config": {
                "modelname": {
                    "class": "TestModel",
                    "key1": "val1",
                    "destinations": ["nodename"],
                }
            },
            "postprocess_config": {
                "nodename": {
                    "class": "TestPostprocess",
                    "key1": "val1",
                    "key2": "val2",
                    "destinations": [],
                }
            },
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)

    data_object = DataObject(configuration)

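    # exercise the postprocess node's necessary_config() and run() contract directly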
    tp = TestPostprocess(configuration, "nodename")
    node_config = {
        "class": "TestPostprocess",
        "key1": "val1",
        "key2": "val2",
        "destinations": [],
    }
    assert set(["key1"]) == tp.necessary_config(node_config)

    # assert tp.process(None) == "some data"

    data_object, terminate = tp.run(data_object)

    assert not terminate

    assert (data_object.get(
        "nodename", rtype=DataObjectResponseType.VALUE.value) == "some data")