def test_exclude(self):
    """Columns flagged with 'exclude' must be dropped after workflow execution."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [
                {"id": "A"},
                {"id": "B", "exclude": True},
                {"id": "C", "exclude": True},
            ],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({
        'A': [1, 2, 3],
        'B': [True, True, False],
        'C': [True, False, False],
    })

    workflow.execute()

    # Only column 'A' survives; the row count is untouched.
    self.assertEqual(len(table.data.columns), 1)
    self.assertEqual(len(table.data), 3)
def test_slice(self):
    """A 'slice' row filter keeps the rows at positions [start:end:step]."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "row_filter": {"slice": {"start": 1, "end": 4, "step": 2}},
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6]})

    workflow.execute()

    self.assertEqual(len(table.data.columns), 1)
    # Positions 1 and 3 remain, i.e. values 2 and 4.
    self.assertEqual(len(table.data), 2)
    self.assertEqual(table.data["A"][0], 2)
    self.assertEqual(table.data["A"][1], 4)
def test_predicate(self):
    """A 'predicate' row filter keeps only rows where every predicate column
    is True; the predicate columns themselves are removed afterwards."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "row_filter": {"predicate": ["B", "C"]},
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({
        'A': [1, 2, 3],
        'B': [True, True, False],
        'C': [True, False, False],
    })

    workflow.execute()

    # Predicate columns 'B' and 'C' are removed by default; only the first
    # row satisfies both predicates.
    self.assertEqual(len(table.data.columns), 1)
    self.assertEqual(len(table.data), 1)
def test_sample(self):
    """A 'sample' row filter keeps the requested fraction of rows
    (0.6 of 3 rows rounds to 2)."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "row_filter": {"sample": {"frac": 0.6}},
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3]})

    workflow.execute()

    self.assertEqual(len(table.data.columns), 1)
    self.assertEqual(len(table.data), 2)
def test_read_csv(self):
    """A table populated via 'pandas:read_csv' loads at most nrows rows from
    the CSV file referenced by the model."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "function": "pandas:read_csv",
            "inputs": [],
            "model": {
                "filepath_or_buffer": "./tests/test1.csv",
                "nrows": 4,
            },
        }],
    }
    workflow = Workflow(wf_json)
    workflow.execute()

    frame = workflow.tables[0].data
    self.assertEqual(len(frame.columns), 3)
    self.assertEqual(len(frame), 4)
def run(workflow_file):
    """Load a workflow definition from a JSON file and execute it.

    :param workflow_file: path to the workflow JSON document
    :return: 0 on success (exit-code style)
    """
    with open(workflow_file, encoding='utf-8') as f:
        wf_json = json.load(f)
    workflow = Workflow(wf_json)
    workflow.execute()
    return 0
def test_aggregatoin_simple(self):  # NOTE(review): "aggregatoin" is a typo; kept to preserve the test id
    """
    Test simple aggregation: group the fact table by 'A' and compute the
    group size and the per-group sum of 'B'.
    """
    wf_json = {
        "id": "My workflow",
        "tables": [
            {"id": "Fact Table"},
            {
                "id": "Group Table",
                "function": "lambdo.std:aggregate",
                "inputs": ["Fact Table"],
                "model": {
                    "keys": ["A"],
                    "aggregations": [
                        {"id": "size", "function": "numpy.core.fromnumeric:size", "inputs": []},
                        {"id": "sum(B)", "function": "numpy.core.fromnumeric:sum", "inputs": ["B"]},
                    ],
                },
            },
        ],
    }
    wf = Workflow(wf_json)

    # Fact table: two groups (A=0 and A=1) with two rows each.
    data = {'A': [0, 1, 0, 1], 'B': [1.0, 2.0, 3.0, 4.0]}
    df = pd.DataFrame(data)
    main_tb = wf.tables[0]
    main_tb.data = df

    wf.execute()

    df2 = wf.tables[1]
    self.assertEqual(len(df2.data.columns), 2)
    self.assertEqual(len(df2.data), 2)
    self.assertAlmostEqual(df2.data['size'][0], 2)
    # BUG FIX: the original asserted row 0 twice; the second check must
    # cover the second group's size.
    self.assertAlmostEqual(df2.data['size'][1], 2)
    self.assertAlmostEqual(df2.data['sum(B)'][0], 4.0)
    self.assertAlmostEqual(df2.data['sum(B)'][1], 6.0)
def test_two_keys(self):
    """Link one table to another over a composite (two-column) key."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "Table 1",
            "columns": [{
                "id": "My Link",
                "operation": "link",
                "keys": ["A", "B"],
                "linked_table": "Table 2",
                "linked_keys": ["A", "B"],
            }],
        }, {
            "id": "Table 2",
            "operation": "noop",
            "columns": [],
        }],
    }
    workflow = Workflow(wf_json)

    # Main table.
    main_tb = workflow.tables[0]
    main_tb.data = pd.DataFrame({
        'A': ['a', 'b', 'b', 'a'],
        'B': ['b', 'c', 'c', 'a'],
    })

    # Secondary table (holds more data than the main table references).
    sec_tb = workflow.tables[1]
    sec_tb.data = pd.DataFrame({
        'A': ['a', 'b', 'a'],
        'B': ['b', 'c', 'c'],
        'C': [1, 2, 3],
    })

    workflow.execute()

    merged_tb = workflow.tables[0]
    self.assertEqual(len(merged_tb.data), 4)          # Row count unchanged
    self.assertEqual(len(merged_tb.data.columns), 3)  # Link column added

    link_column = main_tb.data['My Link']
    self.assertEqual(link_column[0], 0)
    self.assertEqual(link_column[1], 1)
    self.assertEqual(link_column[2], 1)
    # ('a', 'a') has no match in the linked table.
    self.assertTrue(pd.isna(link_column[3]))
def run(workflow_file):
    """Load and execute a workflow from a JSON file that may contain
    //-style line comments (stripped before parsing).

    :param workflow_file: path to the workflow JSON document
    :return: 0 on success (exit-code style)
    """
    with open(workflow_file, encoding='utf-8') as f:
        raw_text = f.read()
    # Remove everything from // to the end of each line so the remainder
    # parses as strict JSON.
    cleaned = re.sub(r"//.*$", "", raw_text, flags=re.M)
    workflow = Workflow(json.loads(cleaned))
    workflow.execute()
    return 0
def test_project(self):
    """Project one source column into a new table of its unique values."""
    wf_json = {
        "id": "My workflow",
        "tables": [
            {
                "id": "Source",
                "attributes": ["A"],
                "columns": [],
            },
            {
                "id": "Destination",
                "operation": "project",
                "source_table": "Source",
                # Source columns to project; all columns if omitted.
                "inputs": ["A"],
                # New names in the target table; same names if omitted.
                "outputs": ["B"],
                # Alternatively, attributes could name the target columns.
                "attributes": [],
                "columns": [],
            },
        ],
    }
    workflow = Workflow(wf_json)

    # Source table.
    facts_tb = workflow.tables[0]
    facts_tb.data = pd.DataFrame({
        'A': ['a', 'a', 'b', 'b'],
        'M': [1.0, 2.0, 3.0, 4.0],
    })

    workflow.execute()

    proj_tb = workflow.tables[1]
    self.assertEqual(len(proj_tb.data), 2)          # Unique source records
    self.assertEqual(len(proj_tb.data.columns), 1)  # One projected column

    out_column = proj_tb.data["B"]
    self.assertEqual(out_column[0], 'a')
    self.assertEqual(out_column[1], 'b')
def test_extend(self):
    """Topology: an extended table (no function, has a parent) is placed in
    a layer after its base table."""
    wf_json = {
        "id": "My workflow",
        "tables": [
            {
                "id": "Base Table",  # noop - no function, no parent
                "columns": [],
            },
            {
                "id": "Extended Table",  # extend - no function, there is parent
                "columns": [],
            },
        ],
    }
    workflow = Workflow(wf_json)

    topology = Topology(workflow)
    topology.translate()
    layers = topology.layers

    # Expected layers:
    #   0: "Base Table"
    #   1: "Extended Table"
    self.assertEqual(len(layers), 2)
    self.assertEqual(layers[0][0].id, 'Base Table')
    self.assertEqual(layers[1][0].id, 'Extended Table')
def test_calculate(self):
    """Topology: a calculate column lands in a layer after its table."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "C",
                "operation": "calculate",
                "inputs": ["A", "B"],
            }],
        }],
    }
    workflow = Workflow(wf_json)

    topology = Topology(workflow)
    topology.translate()
    layers = topology.layers

    # Expected layers:
    #   0: "My table"
    #   1: "C"
    self.assertEqual(len(layers), 2)
    self.assertEqual(len(layers[0]), 1)
    self.assertEqual(len(layers[1]), 1)
def test_compose_complex(self):
    """Materialize a long (three segments) column path as a new column of the
    table using an explicit definition. A nested compose column has to be
    automatically created.

    BUG FIX: the original test built the workflow and its data but never
    executed it and asserted nothing; execution and result checks are added.
    """
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "Table 1",
            "attributes": ["A"],
            "columns": [{
                "id": "Compose",
                "operation": "compose",
                "inputs": ["Link", "Link", "C"],
            }, {
                "id": "Link",
                "operation": "link",
                "keys": ["A"],
                "linked_table": "Table 2",
                "linked_keys": ["A"],
            }],
        }, {
            "id": "Table 2",
            "operation": "noop",
            "attributes": ["A", "B"],
            "columns": [{
                "id": "Link",
                "operation": "link",
                "keys": ["B"],
                "linked_table": "Table 3",
                "linked_keys": ["B"],
            }],
        }, {
            "id": "Table 3",
            "operation": "noop",
            "attributes": ["B", "C"],
            "columns": [],
        }],
    }
    wf = Workflow(wf_json)

    # Main table
    df = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})
    main_tb = wf.tables[0]
    main_tb.data = df

    # Secondary table (more data than used in the main table)
    df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['e', 'f', 'g']})
    sec_tb = wf.tables[1]
    sec_tb.data = df

    # Third table
    df = pd.DataFrame({'B': ['e', 'f'], 'C': [1, 2]})
    thd_tb = wf.tables[2]
    thd_tb.data = df

    wf.execute()

    # Path per row of Table 1: A -> Table 2 (via Link) -> Table 3 (via Link) -> C.
    # 'a' -> B='e' -> C=1 and 'b' -> B='f' -> C=2, hence [1, 1, 2, 2].
    compose_column = main_tb.data['Compose']
    self.assertEqual(compose_column[0], 1)
    self.assertEqual(compose_column[1], 1)
    self.assertEqual(compose_column[2], 2)
    self.assertEqual(compose_column[3], 2)
def test_column_filter(self):
    """A column filter either excludes listed columns or keeps only the
    listed columns, depending on its form."""
    # Exclusion form: {"exclude": [...]} drops the named columns.
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "column_filter": {"exclude": ["B", "C"]},
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

    table.execute()

    self.assertEqual(len(table.data.columns), 1)
    self.assertEqual(len(table.data), 3)

    # Inclusion form: a plain list keeps only the named columns.
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "column_filter": ["A", "B"],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

    table.execute()

    self.assertEqual(len(table.data.columns), 2)
    self.assertEqual(len(table.data), 3)
def test_calculate(self):
    """Row-based apply: window 'one' applies the function to each row value."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "My column",
                "function": "builtins:float",
                "window": "one",
                "inputs": ["A"],
                "outputs": ["float(A)"],
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3]})

    workflow.execute()

    result = table.data['float(A)']
    for idx, expected in enumerate([1.0, 2.0, 3.0]):
        self.assertAlmostEqual(result[idx], expected)
        self.assertIsInstance(result[idx], float)
def test_join_by_columns(self):
    """Join two tables on explicitly named (differently named) columns."""
    wf_json = {
        "id": "My workflow",
        "tables": [
            {"id": "Main Table"},
            {"id": "Second Table"},
            {
                "id": "Merged Table",
                "function": "lambdo.std:join",
                "inputs": ["Main Table", "Second Table"],
                "model": {"keys": ["A", "B"]},
            },
        ],
    }
    workflow = Workflow(wf_json)

    # Main table.
    main_tb = workflow.tables[0]
    main_tb.data = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})

    # Secondary table (more data than required by the main table).
    sec_tb = workflow.tables[1]
    sec_tb.data = pd.DataFrame({'B': ['a', 'b', 'c'], 'C': [1, 2, 3]})

    workflow.execute()

    merged_tb = workflow.tables[2]
    self.assertEqual(len(merged_tb.data.columns), 2)
    self.assertEqual(len(merged_tb.data), 4)
def test_single_columns(self):
    """Weighted rolling mean over a window of 2 using column 'W' as weights."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "mean_w(A)",
                "function": "lambdo.std:mean_weighted",
                "window": "2",
                "inputs": ["A","W"],
                "model": {},
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3], 'W': [3, 2, 1]})

    workflow.execute()

    result = table.data['mean_w(A)']
    # The first window is incomplete, so the first value is NaN.
    self.assertTrue(pd.isna(result[0]))
    # (1*3 + 2*2) / (3+2) = 1.4 ; (2*2 + 3*1) / (2+1) = 2.333...
    self.assertAlmostEqual(result[1], 1.4)
    self.assertAlmostEqual(result[2], 2.33333333)
def test_roll(self):
    """Rolling sum over a window of 2 rows."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "sum(A)",
                "function": "numpy.core.fromnumeric:sum",
                "window": "2",
                "inputs": ["A"],
                "model": {},
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3]})

    workflow.execute()

    result = table.data['sum(A)']
    # The first window is incomplete, so the first value is NaN.
    self.assertTrue(pd.isna(result[0]))
    self.assertAlmostEqual(result[1], 3.0)
    self.assertAlmostEqual(result[2], 5.0)
def test_link(self):
    """Topology: a link column is layered after both its key column and the
    linked table."""
    wf_json = {
        "id": "My workflow",
        "tables": [
            {
                "id": "Facts",
                "columns": [
                    {
                        "id": "A",
                        "operation": "calculate",
                        "inputs": ["B", "C"],
                    },
                    {
                        "id": "Link",
                        "operation": "link",
                        "keys": ["A"],
                        "linked_table": "Groups",
                        "linked_keys": ["A"],
                    },
                ],
            },
            {
                "id": "Groups",
                "operation": "noop",
                "columns": [],
            },
        ],
    }
    workflow = Workflow(wf_json)

    topology = Topology(workflow)
    topology.translate()
    layers = topology.layers

    # Expected layers:
    #   0: "Facts", "Groups"
    #   1: "A"
    #   2: "Link"
    self.assertEqual(len(layers), 3)
    self.assertEqual(len(layers[0]), 2)
    self.assertEqual(len(layers[1]), 1)
    self.assertEqual(len(layers[2]), 1)
    self.assertEqual(layers[1][0].id, 'A')
    self.assertEqual(layers[2][0].id, 'Link')
def test_standard_functions(self):
    """Apply a standard pandas function (Series.shift) to a whole column.

    See: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shift.html
    """
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "My Column",
                "function": "pandas.core.series:Series.shift",
                "window": "all",
                "inputs": ["A"],
                "outputs": ["next(A)"],
                "model": {"periods": -1},
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3]})

    workflow.execute()

    # Shifting by -1 moves each next value up; the last entry becomes NaN.
    shifted = table.data['next(A)']
    self.assertAlmostEqual(shifted[0], 2.0)
    self.assertAlmostEqual(shifted[1], 3.0)
    self.assertTrue(pd.isna(shifted[2]))
def test_imports(self):
    """Modules listed in the workflow 'imports' field are loaded and their
    functions can be used by column definitions."""
    wf_json = {
        "id": "My workflow",
        "imports": ["tests.udf", "os.path"],
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "A",
                "inputs": ["A"],
                "window": "1",
                "extensions": [{
                    "function": "tests.udf:user_import_fn",
                    "outputs": "Success",
                }],
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Both listed modules must have been imported, with the UDF visible
    # in the first one.
    self.assertEqual(len(workflow.modules), 2)
    self.assertTrue(hasattr(workflow.modules[0], 'user_import_fn'))

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3]})

    workflow.execute()

    out = workflow.tables[0].data['Success']
    self.assertEqual(out[0], 'Success')
    self.assertEqual(out.nunique(), 1)
def test_grouping(self):
    """Test only how records are grouped, with no aggregations defined."""
    wf_json = {
        "id": "My workflow",
        "tables": [
            {"id": "Fact Table"},
            {
                "id": "Group Table",
                "function": "lambdo.std:aggregate",
                "inputs": ["Fact Table"],
                "model": {
                    "keys": ["A"],
                    "aggregations": [],
                },
            },
        ],
    }
    workflow = Workflow(wf_json)

    # Fact table: two groups (A=0 and A=1).
    fact_tb = workflow.tables[0]
    fact_tb.data = pd.DataFrame({'A': [0, 1, 0, 1], 'B': [1.0, 2.0, 3.0, 4.0]})

    workflow.execute()

    group_tb = workflow.tables[1]
    # No aggregations were requested, so the group table has no columns,
    # only one record per distinct key.
    self.assertEqual(len(group_tb.data.columns), 0)
    self.assertEqual(len(group_tb.data), 2)
def test_dropna(self):
    """A 'dropna' row filter removes every row containing a NaN."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "row_filter": {"dropna": True},
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({
        'A': [np.nan, 2, 3],
        'B': [np.nan, 5, np.nan],
    })

    workflow.execute()

    # Only the middle row is fully populated.
    self.assertEqual(len(table.data.columns), 2)
    self.assertEqual(len(table.data), 1)
def test_join_by_key(self):
    """Join two tables positionally (by row number) with column suffixes for
    clashing names; the main table's row count is preserved."""
    wf_json = {
        "id": "My workflow",
        "tables": [
            {"id": "Main Table"},
            {"id": "Second Table"},
            {
                "id": "Merged Table",
                "function": "lambdo.std:join",
                "inputs": ["Main Table", "Second Table"],
                "model": {"suffixes": ["", "_JOINED"]},
            },
        ],
    }
    workflow = Workflow(wf_json)

    # Main table.
    main_tb = workflow.tables[0]
    main_tb.data = pd.DataFrame({'A': [0, 1, 2]})

    # Secondary table (more rows than the main table).
    sec_tb = workflow.tables[1]
    sec_tb.data = pd.DataFrame({'A': [3, 4, 5, 6, 7]})

    workflow.execute()

    merged_tb = workflow.tables[2]
    self.assertEqual(len(merged_tb.data.columns), 2)
    self.assertEqual(len(merged_tb.data), 3)
    # The clashing 'A' column from the second table got the suffix.
    self.assertEqual(merged_tb.data.columns[1], 'A_JOINED')

    # Secondary table (fewer rows than the main table).
    sec_tb = workflow.tables[1]
    sec_tb.data = pd.DataFrame({'B': [3, 4]})

    workflow.execute()

    merged_tb = workflow.tables[2]
    self.assertEqual(len(merged_tb.data.columns), 2)
    self.assertEqual(len(merged_tb.data), 3)
def test_compose_simple(self):
    """Materialize a simple (two segments) column path as a new column of the
    table using an explicit definition."""
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "Table 1",
            "attributes": ["A"],
            "columns": [{
                "id": "Compose",
                "operation": "compose",
                "inputs": ["Link", "B"],
            }, {
                "id": "Link",
                "operation": "link",
                "keys": ["A"],
                "linked_table": "Table 2",
                "linked_keys": ["A"],
            }],
        }, {
            "id": "Table 2",
            "operation": "noop",
            "attributes": ["A", "B"],
            "columns": [],
        }],
    }
    workflow = Workflow(wf_json)

    # Main table.
    main_tb = workflow.tables[0]
    main_tb.data = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})

    # Secondary table (more data than used in the main table).
    sec_tb = workflow.tables[1]
    sec_tb.data = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [1, 2, 3]})

    topology = Topology(workflow)
    topology.translate()
    layers = topology.layers

    # Expected layers:
    #   0: "Table 1", "Table 2"
    #   1: "Link"
    #   2: "Compose"
    self.assertEqual(len(layers), 3)
    self.assertEqual(len(layers[0]), 2)
    self.assertEqual(len(layers[1]), 1)
    self.assertEqual(len(layers[2]), 1)
    self.assertEqual(layers[1][0].id, 'Link')
    self.assertEqual(layers[2][0].id, 'Compose')

    workflow.execute()

    # Composed values follow A -> Table 2 -> B, i.e. [1, 1, 2, 2].
    compose_column = main_tb.data['Compose']
    self.assertEqual(compose_column[0], 1)
    self.assertEqual(compose_column[1], 1)
    self.assertEqual(compose_column[2], 2)
    self.assertEqual(compose_column[3], 2)
def test_single_columns(self):
    """Single-column derivation in two scopes: row-wise apply and a rolling
    window of 2."""
    # Row-based apply: scope 'one' converts each value individually.
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "My column",
                "function": "builtins:float",
                "scope": "one",
                "inputs": ["A"],
                "outputs": ["float(A)"],
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3]})

    table.execute()

    result = table.data['float(A)']
    for idx, expected in enumerate([1.0, 2.0, 3.0]):
        self.assertAlmostEqual(result[idx], expected)
        self.assertIsInstance(result[idx], float)

    # Rolling sum over a window of 2.
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "sum(A)",
                "function": "numpy.core.fromnumeric:sum",
                "scope": "2",
                "inputs": ["A"],
                "model": {},
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3]})

    table.execute()

    result = table.data['sum(A)']
    # The first window is incomplete, so the first value is NaN.
    self.assertTrue(pd.isna(result[0]))
    self.assertAlmostEqual(result[1], 3.0)
    self.assertAlmostEqual(result[2], 5.0)
def test_family_columns(self):
    """Column extensions: one base definition fanned out into several derived
    columns, varying either the scope or the function."""
    # Same function and inputs, different scopes (window sizes).
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "sum(A)",
                "function": "numpy.core.fromnumeric:sum",
                "inputs": ["A"],
                "extensions": [
                    {"scope": "2"},
                    {"scope": "3", "outputs": ["sum(A)_win3"]},
                ],
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3, 4]})

    table.execute()

    # The unnamed extension gets an auto-generated suffix.
    win2 = table.data['sum(A)_0']
    win3 = table.data['sum(A)_win3']
    self.assertAlmostEqual(win2[2], 5.0)
    self.assertAlmostEqual(win3[2], 6.0)
    self.assertAlmostEqual(win2[3], 7.0)
    self.assertAlmostEqual(win3[3], 9.0)

    # Same input and scope, different functions.
    wf_json = {
        "id": "My workflow",
        "tables": [{
            "id": "My table",
            "columns": [{
                "id": "A",
                "inputs": ["A"],
                "scope": "2",
                "extensions": [
                    {"function": "numpy.core.fromnumeric:sum", "outputs": "A_sum"},
                    {"function": "numpy.core.fromnumeric:mean", "outputs": "A_mean"},
                ],
            }],
        }],
    }
    workflow = Workflow(wf_json)

    # Inject the input frame directly instead of populating the table.
    table = workflow.tables[0]
    table.data = pd.DataFrame({'A': [1, 2, 3]})

    table.execute()

    sums = table.data['A_sum']
    means = table.data['A_mean']
    self.assertAlmostEqual(sums[1], 3.0)
    self.assertAlmostEqual(sums[2], 5.0)
    self.assertAlmostEqual(means[1], 1.5)
    self.assertAlmostEqual(means[2], 2.5)
def test_extend(self):
    """An extended table inherits its parent's (computed) columns and can add
    its own calculated columns on top."""
    wf_json = {
        "id": "My workflow",
        "tables": [
            {
                "id": "Base Table",
                "operation": "noop",
                "columns": [{
                    "id": "B",
                    "operation": "calculate",
                    "function": "lambda x: x + 1",
                    "inputs": ["A"],
                }],
            },
            {
                "id": "Extended Table",
                # "operation": "extend" - by default
                "columns": [{
                    "id": "C",
                    "operation": "calculate",
                    "function": "lambda x: x + 1",
                    "inputs": ["B"],
                }],
            },
        ],
    }
    workflow = Workflow(wf_json)

    topology = Topology(workflow)
    topology.translate()
    layers = topology.layers

    # Expected layers:
    #   0: "Base Table"
    #   1: "B"
    #   2: "Extended Table"
    #   3: "C"
    self.assertEqual(len(layers), 4)

    # Base table data.
    base_tb = workflow.tables[0]
    base_tb.data = pd.DataFrame({'A': [1.0, 2.0, 3.0]})

    workflow.execute()

    ext_tb = workflow.tables[1]
    self.assertEqual(len(ext_tb.data), 3)
    self.assertEqual(len(ext_tb.data.columns), 3)

    # C = B + 1 = (A + 1) + 1.
    ext_column = ext_tb.data["C"]
    self.assertEqual(ext_column[0], 3.0)
    self.assertEqual(ext_column[1], 4.0)
    self.assertEqual(ext_column[2], 5.0)
def test_aggregate(self):
    """An aggregate column sums a fact-table measure per group reached via a
    link column, filling groups without facts with 'fillna_value'."""
    wf_json = {
        "id": "My workflow",
        "tables": [
            {
                "id": "Facts",
                "columns": [{
                    "id": "Group Link",
                    "operation": "link",
                    "keys": ["A"],
                    "linked_table": "Groups",
                    "linked_keys": ["A"],
                }],
            },
            {
                "id": "Groups",
                "operation": "noop",
                "columns": [{
                    "id": "Aggregate",
                    "operation": "aggregate",
                    "fact_table": "Facts",
                    "group_column": "Group Link",
                    # Computational (functional) definitions.
                    "function": "numpy.core.fromnumeric:sum",  # One input is expected
                    "inputs": ["M"],  # Measure column(s) from the fact table
                    "model": {},  # Passed to the aggregation function as usual
                    # "outputs": ["M"]  # Needed when the function returns several results
                    # Post-processing: replace NaN produced by empty groups
                    # (a group with no facts never calls the function).
                    "fillna_value": 0.0,
                }],
            },
        ],
    }
    workflow = Workflow(wf_json)

    # Fact table.
    facts_tb = workflow.tables[0]
    facts_tb.data = pd.DataFrame({
        'A': ['a', 'a', 'b', 'b'],
        'M': [1.0, 2.0, 3.0, 4.0],
    })

    # Group table ('c' has no facts).
    groups_tb = workflow.tables[1]
    groups_tb.data = pd.DataFrame({'A': ['a', 'b', 'c']})

    workflow.execute()

    groups_tb = workflow.tables[1]
    self.assertEqual(len(groups_tb.data), 3)  # Row count unchanged
    # One aggregate column was added (plus one technical "id" column,
    # which may be removed in the future).
    self.assertEqual(len(groups_tb.data.columns), 3)

    agg_column = groups_tb.data['Aggregate']
    self.assertEqual(agg_column[0], 3.0)
    self.assertEqual(agg_column[1], 7.0)
    self.assertEqual(agg_column[2], 0.0)  # Empty group filled with 0.0
X_array = X.values y_array = y.values.ravel() model = ensemble.GradientBoostingClassifier(**hyper_model) model.fit(X_array, y_array) return model def rf_fit(X, y, **hyper_model): X = X[:-1] y = y[:-1] X_array = X.values y_array = y.values.ravel() model = ensemble.RandomForestClassifier(**hyper_model) ensemble.RandomForestClassifier() model.fit(X_array, y_array) return model if __name__ == '__main__': with open('./examples/example10.json', encoding='utf-8') as f: wf_json = json.loads(f.read()) wf = Workflow(wf_json) wf.execute() pass