Example #1
    def test_slice(self):
        wf_json = {
            "id":
            "My workflow",
            "tables": [{
                "id": "My table",
                "row_filter": {
                    "slice": {
                        "start": 1,
                        "end": 4,
                        "step": 2
                    }
                }
            }]
        }

        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3, 4, 5, 6]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertEqual(len(tb.data.columns), 1)  # The single input column is preserved
        self.assertEqual(len(tb.data), 2)

        self.assertEqual(tb.data["A"][0], 2)
        self.assertEqual(tb.data["A"][1], 4)
Example #2
    def test_predicate(self):
        wf_json = {
            "id": "My workflow",
            "tables": [{
                "id": "My table",
                "row_filter": {
                    "predicate": ["B", "C"]
                }
            }]
        }

        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {
            'A': [1, 2, 3],
            'B': [True, True, False],
            'C': [True, False, False]
        }
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertEqual(len(tb.data.columns), 1)  # Predicate columns will be removed by default
        self.assertEqual(len(tb.data), 1)
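
The predicate filter appears to keep only the rows where every listed boolean column is True and then drop those columns; a rough pandas cross-check of the asserted shape:

    df = pd.DataFrame({'A': [1, 2, 3],
                       'B': [True, True, False],
                       'C': [True, False, False]})
    kept = df[df['B'] & df['C']].drop(columns=['B', 'C'])
    # kept has 1 row and the single column 'A'
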
Example #3
    def test_exclude(self):
        wf_json = {
            "id":
            "My workflow",
            "tables": [{
                "id":
                "My table",
                "columns": [{
                    "id": "A"
                }, {
                    "id": "B",
                    "exclude": True
                }, {
                    "id": "C",
                    "exclude": True
                }]
            }]
        }

        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {
            'A': [1, 2, 3],
            'B': [True, True, False],
            'C': [True, False, False]
        }
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertEqual(len(tb.data.columns), 1)
        self.assertEqual(len(tb.data), 3)
Example #4
    def test_sample(self):
        wf_json = {
            "id": "My workflow",
            "tables": [{
                "id": "My table",
                "row_filter": {
                    "sample": {
                        "frac": 0.6
                    }
                }
            }]
        }

        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertEqual(len(tb.data.columns), 1)  # Sampling does not change the columns
        self.assertEqual(len(tb.data), 2)
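
The expected row count follows from how pandas rounds the fraction; a minimal check, assuming the sample filter delegates to DataFrame.sample:

    df = pd.DataFrame({'A': [1, 2, 3]})
    sampled = df.sample(frac=0.6)  # pandas draws round(0.6 * 3) = 2 rows
    # len(sampled) -> 2
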
Example #5
    def test_read_csv(self):

        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "function": "pandas:read_csv",
                    "inputs": [],
                    "model": {
                        "filepath_or_buffer": "./tests/test1.csv",
                        "nrows": 4
                    }
                }
            ]
        }

        wf = Workflow(wf_json)

        wf.execute()

        tb = wf.tables[0].data

        self.assertEqual(len(tb.columns), 3)
        self.assertEqual(len(tb), 4)
Example #6
File: main.py  Project: steve21124/lambdo
def run(workflow_file):

    with open(workflow_file, encoding='utf-8') as f:
        wf_json = json.loads(f.read())
    wf = Workflow(wf_json)
    wf.execute()

    return 0
Example #7
    def test_aggregation_simple(self):
        """
        Test simple aggregation.
        """

        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "Fact Table"
                },
                {
                    "id": "Group Table",
                    "function": "lambdo.std:aggregate",
                    "inputs": ["Fact Table"],
                    "model": {
                        "keys": ["A"],
                        "aggregations": [
                            {
                                "id": "size",
                                "function": "numpy.core.fromnumeric:size",
                                "inputs": []
                            },
                            {
                                "id": "sum(B)",
                                "function": "numpy.core.fromnumeric:sum",
                                "inputs": ["B"]
                            }
                        ]
                    }
                }
            ]
        }

        wf = Workflow(wf_json)

        # Fact table
        data = {'A': [0, 1, 0, 1], 'B': [1.0, 2.0, 3.0, 4.0]}
        df = pd.DataFrame(data)
        main_tb = wf.tables[0]
        main_tb.data = df

        wf.execute()

        df2 = wf.tables[1]

        self.assertEqual(len(df2.data.columns), 2)
        self.assertEqual(len(df2.data), 2)

        self.assertAlmostEqual(df2.data['size'][0], 2)
        self.assertAlmostEqual(df2.data['size'][1], 2)

        self.assertAlmostEqual(df2.data['sum(B)'][0], 4.0)
        self.assertAlmostEqual(df2.data['sum(B)'][1], 6.0)
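
The expected group values can be reproduced with a plain pandas groupby; this cross-check is only an illustration of the aggregation model above, not lambdo's code path:

    facts = pd.DataFrame({'A': [0, 1, 0, 1], 'B': [1.0, 2.0, 3.0, 4.0]})
    groups = facts.groupby('A').agg(**{'size': ('B', 'size'), 'sum(B)': ('B', 'sum')})
    # groups['size']   -> [2, 2]
    # groups['sum(B)'] -> [4.0, 6.0]
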
Example #8
    def test_join_by_key(self):

        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "Main Table"
                },
                {
                    "id": "Second Table"
                },
                {
                    "id": "Merged Table",
                    "function": "lambdo.std:join",
                    "inputs": ["Main Table", "Second Table"],
                    "model": {"suffixes": ["", "_JOINED"]}
                }
            ]
        }

        wf = Workflow(wf_json)

        # Main table
        data = {'A': [0, 1, 2]}
        df = pd.DataFrame(data)
        main_tb = wf.tables[0]
        main_tb.data = df

        # Secondary table (more rows than in the main table)
        data = {'A': [3, 4, 5, 6, 7]}
        df = pd.DataFrame(data)
        sec_tb = wf.tables[1]
        sec_tb.data = df

        wf.execute()

        merged_tb = wf.tables[2]

        self.assertEqual(len(merged_tb.data.columns), 2)
        self.assertEqual(len(merged_tb.data), 3)
        self.assertEqual(merged_tb.data.columns[1], 'A_JOINED')

        # Secondary table (fewer rows than in the main table)
        data = {'B': [3, 4]}
        df = pd.DataFrame(data)
        sec_tb = wf.tables[1]
        sec_tb.data = df

        wf.execute()

        merged_tb = wf.tables[2]

        self.assertEqual(len(merged_tb.data.columns), 2)
        self.assertEqual(len(merged_tb.data), 3)
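
For the first case, the asserted shape matches an index-based left join; a minimal sketch, assuming lambdo.std:join aligns the tables on the row index when no keys are given:

    main = pd.DataFrame({'A': [0, 1, 2]})
    sec = pd.DataFrame({'A': [3, 4, 5, 6, 7]})
    merged = main.join(sec, rsuffix='_JOINED')  # left join on the index
    # merged.columns -> ['A', 'A_JOINED'], len(merged) -> 3
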
Example #9
    def test_two_keys(self):

        #
        # One key to another table
        #
        wf_json = {
            "id":
            "My workflow",
            "tables": [{
                "id":
                "Table 1",
                "columns": [{
                    "id": "My Link",
                    "operation": "link",
                    "keys": ["A", "B"],
                    "linked_table": "Table 2",
                    "linked_keys": ["A", "B"]
                }]
            }, {
                "id": "Table 2",
                "operation": "noop",
                "columns": []
            }]
        }
        wf = Workflow(wf_json)

        # Main table
        df = pd.DataFrame({
            'A': ['a', 'b', 'b', 'a'],
            'B': ['b', 'c', 'c', 'a']
        })
        main_tb = wf.tables[0]
        main_tb.data = df

        # Secondary table (more data than used in the main table)
        df = pd.DataFrame({
            'A': ['a', 'b', 'a'],
            'B': ['b', 'c', 'c'],
            'C': [1, 2, 3]
        })
        sec_tb = wf.tables[1]
        sec_tb.data = df

        wf.execute()

        merged_tb = wf.tables[0]
        self.assertEqual(len(merged_tb.data), 4)  # Same number of rows
        self.assertEqual(len(merged_tb.data.columns), 3)

        link_column = main_tb.data['My Link']
        self.assertEqual(link_column[0], 0)
        self.assertEqual(link_column[1], 1)
        self.assertEqual(link_column[2], 1)
        self.assertTrue(pd.isna(link_column[3]))
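
The link column stores, for each row of Table 1, the position of the matching (A, B) row in Table 2, or NaN when there is no match; a sketch of that lookup with a plain merge (using Table 2 row positions as surrogate keys is an assumption consistent with the asserted values):

    main = pd.DataFrame({'A': ['a', 'b', 'b', 'a'], 'B': ['b', 'c', 'c', 'a']})
    sec = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'c', 'c'], 'C': [1, 2, 3]})
    lookup = sec.reset_index()[['index', 'A', 'B']]
    link = main.merge(lookup, on=['A', 'B'], how='left')['index']
    # link -> [0.0, 1.0, 1.0, NaN]
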
Example #10
def run(workflow_file):

    with open(workflow_file, encoding='utf-8') as f:
        wf_str = f.read()

        # Remove everything starting with // till the end of line
        wf_str = re.sub(r"//.*$", "", wf_str, flags=re.M)

        wf_json = json.loads(wf_str)
    wf = Workflow(wf_json)
    wf.execute()

    return 0
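
A quick illustration of what the comment-stripping regex does before json.loads is called (the sample string below is only for illustration):

    import re
    wf_str = '{\n  "id": "My workflow"  // not valid JSON\n}'
    cleaned = re.sub(r"//.*$", "", wf_str, flags=re.M)
    # the "//..." tail is removed, so json.loads(cleaned) succeeds
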
Example #11
    def test_project(self):

        #
        # Project one column
        #
        wf_json = {
            "id":
            "My workflow",
            "tables": [
                {
                    "id": "Source",
                    "attributes": ["A"],
                    "columns": []
                },
                {
                    "id": "Destination",
                    "operation": "project",
                    "source_table": "Source",
                    "inputs": [
                        "A"
                    ],  # Source columns to be projected. If not specified then all columns will be used.
                    "outputs": [
                        "B"
                    ],  # New names in the target table. If not specified then the same names will be used.
                    "attributes":
                    [],  # Alternatively, we could declare attributes as names of the target columns
                    "columns": []
                }
            ]
        }
        wf = Workflow(wf_json)

        # Source table
        df = pd.DataFrame({
            'A': ['a', 'a', 'b', 'b'],
            'M': [1.0, 2.0, 3.0, 4.0]
        })
        facts_tb = wf.tables[0]
        facts_tb.data = df

        wf.execute()

        proj_tb = wf.tables[1]
        self.assertEqual(len(proj_tb.data), 2)  # Number of unique records in the source table
        self.assertEqual(len(proj_tb.data.columns), 1)  # Number of input columns

        out_column = proj_tb.data["B"]
        self.assertEqual(out_column[0], 'a')
        self.assertEqual(out_column[1], 'b')
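
The projection collects the distinct values of the selected source column into a new table under the output name; a rough pandas equivalent for this example:

    src = pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})
    proj = (src[['A']].drop_duplicates()
                      .reset_index(drop=True)
                      .rename(columns={'A': 'B'}))
    # proj['B'] -> ['a', 'b'], one column, two rows
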
Example #12
    def test_column_filter(self):
        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "column_filter": {"exclude": ["B", "C"]}
                }
            ]
        }
        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertEqual(len(tb.data.columns), 1)
        self.assertEqual(len(tb.data), 3)

        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "column_filter": ["A", "B"]
                }
            ]
        }
        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertEqual(len(tb.data.columns), 2)
        self.assertEqual(len(tb.data), 3)
Example #13
    def test_calculate(self):

        #
        # Row-based apply
        #
        wf_json = {
            "id":
            "My workflow",
            "tables": [{
                "id":
                "My table",
                "columns": [{
                    "id": "My column",
                    "function": "builtins:float",
                    "window": "one",
                    "inputs": ["A"],
                    "outputs": ["float(A)"]
                }]
            }]
        }
        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        v0 = tb.data['float(A)'][0]
        v1 = tb.data['float(A)'][1]
        v2 = tb.data['float(A)'][2]

        self.assertAlmostEqual(v0, 1.0)
        self.assertAlmostEqual(v1, 2.0)
        self.assertAlmostEqual(v2, 3.0)

        self.assertIsInstance(v0, float)
        self.assertIsInstance(v1, float)
        self.assertIsInstance(v2, float)
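
With "window": "one" the function is applied to each input value individually (a row-based apply, as the comment above says), so the result appears equivalent to an element-wise apply; a sketch, not lambdo's internal code path:

    df = pd.DataFrame({'A': [1, 2, 3]})
    df['float(A)'] = df['A'].apply(float)
    # -> [1.0, 2.0, 3.0]
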
Example #14
    def test_join_by_columns(self):

        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "Main Table"
                },
                {
                    "id": "Second Table"
                },
                {
                    "id": "Merged Table",
                    "function": "lambdo.std:join",
                    "inputs": ["Main Table", "Second Table"],
                    "model": {"keys": ["A", "B"]}
                }
            ]
        }

        wf = Workflow(wf_json)

        # Main table
        data = {'A': ['a', 'a', 'b', 'b']}
        df = pd.DataFrame(data)
        main_tb = wf.tables[0]
        main_tb.data = df

        # Secondary table (more data than required by the main table)
        data = {'B': ['a', 'b', 'c'], 'C': [1, 2, 3]}
        df = pd.DataFrame(data)
        sec_tb = wf.tables[1]
        sec_tb.data = df

        wf.execute()

        merged_tb = wf.tables[2]

        self.assertEqual(len(merged_tb.data.columns), 2)
        self.assertEqual(len(merged_tb.data), 4)
Example #15
    def test_single_columns(self):

        #
        # Weighted rolling mean
        #
        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "columns": [
                        {
                            "id": "mean_w(A)",
                            "function": "lambdo.std:mean_weighted",
                            "window": "2",
                            "inputs": ["A","W"],
                            "model": {}
                        }
                    ]
                }
            ]
        }
        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3], 'W': [3, 2, 1]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        v0 = tb.data['mean_w(A)'][0]
        v1 = tb.data['mean_w(A)'][1]
        v2 = tb.data['mean_w(A)'][2]

        self.assertTrue(pd.isna(v0))
        self.assertAlmostEqual(v1, 1.4)
        self.assertAlmostEqual(v2, 2.33333333)
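
The asserted values are consistent with a weighted mean sum(A*W)/sum(W) over each window of size 2; the formula is inferred from the assertions, not taken from lambdo.std:mean_weighted itself:

    a = pd.Series([1, 2, 3], dtype=float)
    w = pd.Series([3, 2, 1], dtype=float)
    wmean = (a * w).rolling(2).sum() / w.rolling(2).sum()
    # wmean -> [NaN, 1.4, 2.3333...]
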
Example #16
    def test_roll(self):

        #
        # Rolling sum
        #
        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "columns": [
                        {
                            "id": "sum(A)",
                            "function": "numpy.core.fromnumeric:sum",
                            "window": "2",
                            "inputs": ["A"],
                            "model": {}
                        }
                    ]
                }
            ]
        }
        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        v0 = tb.data['sum(A)'][0]
        v1 = tb.data['sum(A)'][1]
        v2 = tb.data['sum(A)'][2]

        self.assertTrue(pd.isna(v0))
        self.assertAlmostEqual(v1, 3.0)
        self.assertAlmostEqual(v2, 5.0)
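
The same values fall out of an ordinary pandas rolling sum with window 2; a minimal cross-check:

    df = pd.DataFrame({'A': [1, 2, 3]})
    rolled = df['A'].rolling(2).sum()
    # rolled -> [NaN, 3.0, 5.0]
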
Example #17
    def test_standard_functions(self):

        #
        # Shift one column: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shift.html
        #
        wf_json = \
            {
                "id": "My workflow",
                "tables": [
                    {
                        "id": "My table",
                        "columns": [
                            {
                                "id": "My Column",
                                "function": "pandas.core.series:Series.shift",
                                "window": "all",
                                "inputs": ["A"],
                                "outputs": ["next(A)"],
                                "model": {"periods": -1}
                            }
                        ]
                    }
                ]
            }
        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertAlmostEqual(tb.data['next(A)'][0], 2.0)
        self.assertAlmostEqual(tb.data['next(A)'][1], 3.0)
        self.assertTrue(pd.isna(tb.data['next(A)'][2]))
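
Since the column definition calls pandas' own Series.shift with periods=-1, the expected output can be checked directly:

    s = pd.Series([1, 2, 3])
    shifted = s.shift(periods=-1)
    # shifted -> [2.0, 3.0, NaN]
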
Example #18
    def test_grouping(self):
        """
        Test only how records are grouped without aggregation.
        """

        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "Fact Table"
                },
                {
                    "id": "Group Table",
                    "function": "lambdo.std:aggregate",
                    "inputs": ["Fact Table"],
                    "model": {
                        "keys": ["A"],
                        "aggregations": []
                    }
                }
            ]
        }

        wf = Workflow(wf_json)

        # Fact table
        data = {'A': [0, 1, 0, 1], 'B': [1.0, 2.0, 3.0, 4.0]}
        df = pd.DataFrame(data)
        main_tb = wf.tables[0]
        main_tb.data = df

        wf.execute()

        df2 = wf.tables[1]

        self.assertEqual(len(df2.data.columns), 0)
        self.assertEqual(len(df2.data), 2)
Example #19
    def test_imports(self):
        wf_json = {
            "id":
            "My workflow",
            "imports": ["tests.udf", "os.path"],
            "tables": [{
                "id":
                "My table",
                "columns": [{
                    "id":
                    "A",
                    "inputs": ["A"],
                    "window":
                    "1",
                    "extensions": [{
                        "function": "tests.udf:user_import_fn",
                        "outputs": "Success"
                    }]
                }]
            }]
        }

        wf = Workflow(wf_json)

        self.assertEqual(len(wf.modules), 2)
        self.assertTrue(hasattr(wf.modules[0], 'user_import_fn'))

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertEqual(wf.tables[0].data['Success'][0], 'Success')
        self.assertEqual(wf.tables[0].data['Success'].nunique(), 1)
Example #20
    def test_dropna(self):

        wf_json = {
            "id": "My workflow",
            "tables": [{
                "id": "My table",
                "row_filter": {
                    "dropna": True
                }
            }]
        }

        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [np.nan, 2, 3], 'B': [np.nan, 5, np.nan]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        self.assertEqual(len(tb.data.columns), 2)
        self.assertEqual(len(tb.data), 1)
Example #21
    X_array = X.values
    y_array = y.values.ravel()

    model = ensemble.GradientBoostingClassifier(**hyper_model)

    model.fit(X_array, y_array)

    return model


def rf_fit(X, y, **hyper_model):
    X = X[:-1]
    y = y[:-1]

    X_array = X.values
    y_array = y.values.ravel()

    model = ensemble.RandomForestClassifier(**hyper_model)
    model.fit(X_array, y_array)

    return model


if __name__ == '__main__':
    with open('./examples/example10.json', encoding='utf-8') as f:
        wf_json = json.loads(f.read())
    wf = Workflow(wf_json)
    wf.execute()
    pass
Example #22
    def test_single_columns(self):

        #
        # Row-based apply
        #
        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "columns": [
                        {
                            "id": "My column",
                            "function": "builtins:float",
                            "window": "one",
                            "inputs": ["A"],
                            "outputs": ["float(A)"]
                        }
                    ]
                }
            ]
        }
        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        v0 = tb.data['float(A)'][0]
        v1 = tb.data['float(A)'][1]
        v2 = tb.data['float(A)'][2]

        self.assertAlmostEqual(v0, 1.0)
        self.assertAlmostEqual(v1, 2.0)
        self.assertAlmostEqual(v2, 3.0)

        self.assertIsInstance(v0, float)
        self.assertIsInstance(v1, float)
        self.assertIsInstance(v2, float)

        #
        # Rolling sum
        #
        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "columns": [
                        {
                            "id": "sum(A)",
                            "function": "numpy.core.fromnumeric:sum",
                            "window": "2",
                            "inputs": ["A"],
                            "model": {}
                        }
                    ]
                }
            ]
        }
        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        v0 = tb.data['sum(A)'][0]
        v1 = tb.data['sum(A)'][1]
        v2 = tb.data['sum(A)'][2]

        self.assertTrue(pd.isna(v0))
        self.assertAlmostEqual(v1, 3.0)
        self.assertAlmostEqual(v2, 5.0)
Example #23
    def test_family_columns(self):
        #
        # Same function and inputs but different windows
        #
        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "columns": [
                        {
                            "id": "sum(A)",
                            "function": "numpy.core.fromnumeric:sum",
                            "inputs": ["A"],
                            "extensions": [
                                {"window": "2"},
                                {"window": "3", "outputs": ["sum(A)_win3"]}
                            ]
                        }
                    ]
                }
            ]
        }

        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3, 4]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        col0 = tb.data['sum(A)_0']
        col1 = tb.data['sum(A)_win3']

        self.assertAlmostEqual(col0[2], 5.0)
        self.assertAlmostEqual(col1[2], 6.0)

        self.assertAlmostEqual(col0[3], 7.0)
        self.assertAlmostEqual(col1[3], 9.0)

        #
        # Same input, different functions
        #
        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "My table",
                    "columns": [
                        {
                            "id": "A",
                            "inputs": ["A"],
                            "window": "2",
                            "extensions": [
                                {"function": "numpy.core.fromnumeric:sum", "outputs": "A_sum"},
                                {"function": "numpy.core.fromnumeric:mean", "outputs": "A_mean"}
                            ]
                        }
                    ]
                }
            ]
        }

        wf = Workflow(wf_json)

        # Provide data directly (without table population)
        data = {'A': [1, 2, 3]}
        df = pd.DataFrame(data)
        tb = wf.tables[0]
        tb.data = df

        wf.execute()

        col0 = tb.data['A_sum']
        col1 = tb.data['A_mean']

        self.assertAlmostEqual(col0[1], 3.0)
        self.assertAlmostEqual(col0[2], 5.0)

        self.assertAlmostEqual(col1[1], 1.5)
        self.assertAlmostEqual(col1[2], 2.5)
Example #24
    def test_compose_simple(self):
        """Materialize a simple (two segments) column path as a new column of the table using an explicit definition"""

        wf_json = {
            "id":
            "My workflow",
            "tables": [{
                "id":
                "Table 1",
                "attributes": ["A"],
                "columns": [{
                    "id": "Compose",
                    "operation": "compose",
                    "inputs": ["Link", "B"]
                }, {
                    "id": "Link",
                    "operation": "link",
                    "keys": ["A"],
                    "linked_table": "Table 2",
                    "linked_keys": ["A"]
                }]
            }, {
                "id": "Table 2",
                "operation": "noop",
                "attributes": ["A", "B"],
                "columns": []
            }]
        }
        wf = Workflow(wf_json)

        # Main table
        df = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})
        main_tb = wf.tables[0]
        main_tb.data = df

        # Secondary table (more data than used in the main table)
        df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [1, 2, 3]})
        sec_tb = wf.tables[1]
        sec_tb.data = df

        tp = Topology(wf)
        tp.translate()

        layers = tp.layers
        # Layers:
        # 0 "Table 1" "Table 2"
        # 1 "Link"
        # 2 "Compose"
        self.assertEqual(len(layers), 3)
        self.assertEqual(len(layers[0]), 2)
        self.assertEqual(len(layers[1]), 1)
        self.assertEqual(len(layers[2]), 1)

        self.assertEqual(layers[1][0].id, 'Link')
        self.assertEqual(layers[2][0].id, 'Compose')

        wf.execute()

        # Complex column values:  [1, 1, 2, 2]
        compose_column = main_tb.data['Compose']
        self.assertEqual(compose_column[0], 1)
        self.assertEqual(compose_column[1], 1)
        self.assertEqual(compose_column[2], 2)
        self.assertEqual(compose_column[3], 2)
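
The composed column materializes the value of B in Table 2 reached by following the Link column; a rough pandas equivalent of that two-segment path:

    main = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})
    sec = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [1, 2, 3]})
    composed = main.merge(sec, on='A', how='left')['B']
    # composed -> [1, 1, 2, 2]
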
Example #25
    def test_aggregate(self):

        #
        # One key to another table
        #
        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "Facts",
                    "columns": [
                        {
                            "id": "Group Link",
                            "operation": "link",

                            "keys": ["A"],

                            "linked_table": "Groups",
                            "linked_keys": ["A"]
                        }
                    ]
                },
                {
                    "id": "Groups",
                    "operation": "noop",
                    "columns": [
                        {
                            "id": "Aggregate",
                            "operation": "aggregate",

                            "fact_table": "Facts",
                            "group_column": "Group Link",

                            # Computational (functional) definitions
                            "function": "numpy.core.fromnumeric:sum", # One input is expected
                            "inputs": ["M"],  # Select measure columns from the fact table: single or multiple
                            "model": {}, # Passed to the aggregation function as usual
                            #"outputs": ["M"]  # In the case, the function returns several results we need column ids

                            # Post-processing options
                            "fillna_value": 0.0,  # Replace NaN in the result, for instance, of an empty group has no fact, a function will never be called and the value will be NaN

                            # "function": "numpy.core.fromnumeric:size", # No need in inputs - how it works then? The function is a applied to a subset but this means there are parameters?
                        }
                    ]
                }
            ]
        }
        wf = Workflow(wf_json)

        # Facts
        df = pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})
        facts_tb = wf.tables[0]
        facts_tb.data = df

        # Secondary table (more data than used in the main table)
        df = pd.DataFrame({'A': ['a', 'b', 'c']})
        groups_tb = wf.tables[1]
        groups_tb.data = df

        wf.execute()

        groups_tb = wf.tables[1]
        self.assertEqual(len(groups_tb.data), 3)  # Same number of rows
        self.assertEqual(len(groups_tb.data.columns), 3)  # One aggregate column was added (and one technical "id" column was added which might be removed in future)

        agg_column = groups_tb.data['Aggregate']
        self.assertEqual(agg_column[0], 3.0)
        self.assertEqual(agg_column[1], 7.0)
        self.assertEqual(agg_column[2], 0.0)
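
The expected column can be reproduced with groupby plus reindex and fillna; the reindex/fillna steps below are the assumed counterpart of linking the groups and applying fillna_value, shown only as a cross-check:

    facts = pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})
    groups = pd.DataFrame({'A': ['a', 'b', 'c']})
    agg = (facts.groupby('A')['M'].sum()
                .reindex(groups['A'])
                .fillna(0.0)
                .reset_index(drop=True))
    # agg -> [3.0, 7.0, 0.0]
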
Example #26
    def test_extend(self):

        wf_json = {
            "id": "My workflow",
            "tables": [
                {
                    "id": "Base Table",
                    "operation": "noop",

                    "columns": [
                        {
                            "id": "B",
                            "operation": "calculate",
                            "function": "lambda x: x + 1",
                            "inputs": ["A"]
                        }
                    ]
                },
                {
                    "id": "Extended Table",
                    # "operation": "extend" - by default

                    "columns": [
                        {
                            "id": "C",
                            "operation": "calculate",
                            "function": "lambda x: x + 1",
                            "inputs": ["B"]
                        }
                    ]
                }
            ]
        }
        wf = Workflow(wf_json)

        tp = Topology(wf)
        tp.translate()

        layers = tp.layers
        # Layers:
        # 0 "Base Table"
        # 1 "B"
        # 2 "Extended Table"
        # 3 "C"

        self.assertEqual(len(layers), 4)

        # Base table
        df = pd.DataFrame({'A': [1.0, 2.0, 3.0]})
        base_tb = wf.tables[0]
        base_tb.data = df

        wf.execute()

        ext_tb = wf.tables[1]
        self.assertEqual(len(ext_tb.data), 3)
        self.assertEqual(len(ext_tb.data.columns), 3)

        ext_column = ext_tb.data["C"]
        self.assertEqual(ext_column[0], 3.0)
        self.assertEqual(ext_column[1], 4.0)
        self.assertEqual(ext_column[2], 5.0)