def test_exe_output():
    """Execute a single source node and verify its output block, alias, and log.

    Checks that the produced block carries the expected records and schemas,
    that an Alias row pointing at the block was created, and that exactly one
    OUTPUT-direction DataBlockLog was written.
    """
    env = make_test_env()
    env.add_module(core)
    g = Graph(env)
    # env.add_storage("python://test")
    # rt = env.runtimes[0]
    # TODO: this is error because no data copy between SAME storage engines (but DIFFERENT storage urls) currently
    # ec = env.get_run_context(g, current_runtime=rt, target_storage=env.storages[0])
    # ec = env.get_run_context(g, current_runtime=rt, target_storage=rt.as_storage())
    output_alias = "node_output"
    node = g.create_node(key="node", snap=snap_dl_source, output_alias=output_alias)
    exe = env.get_executable(node)
    result = ExecutionManager(exe).execute()
    # Metadata assertions must run inside a metadata-API transaction.
    with env.md_api.begin():
        block = result.get_output_block(env)
        assert block is not None
        assert block.as_records() == mock_dl_output
        # Nominal schema is the declared one; realized schema is inferred but
        # should have the same field count here.
        assert block.nominal_schema is TestSchema4
        assert len(block.realized_schema.fields) == len(TestSchema4.fields)
        # Test alias was created correctly
        assert (
            env.md_api.execute(select(Alias).filter(Alias.alias == output_alias))
            .scalar_one_or_none()
            .data_block_id
            == block.data_block_id
        )
        # Exactly one block log entry, recording this block as an output.
        assert env.md_api.count(select(DataBlockLog)) == 1
        dbl = env.md_api.execute(select(DataBlockLog)).scalar_one_or_none()
        assert dbl.data_block_id == block.data_block_id
        assert dbl.direction == Direction.OUTPUT
def test_data_block_methods():
    """Exercise DataBlockMetadata accessors: schemas, storage name, record count."""
    env = make_test_env()
    block_meta = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key="_test.TestSchema1",
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema3",
    )
    storage = env.get_default_local_python_storage()
    payload = [{"a": 1}]
    stored_meta = StoredDataBlockMetadata(
        id=get_stored_datablock_id(),
        data_block_id=block_meta.id,
        data_block=block_meta,
        storage_url=storage.url,
        data_format=RecordsFormat,
    )
    with env.md_api.begin():
        env.md_api.add(block_meta)
        env.md_api.add(stored_meta)
        # Name is lazily generated on first request, then sticks.
        assert stored_meta.name is None
        storage_name = stored_meta.get_name_for_storage()
        assert len(storage_name) > 10
        assert stored_meta.name == storage_name
        storage.get_api().put(stored_meta.get_name_for_storage(), payload)
        # Each schema accessor resolves its key to the schema object.
        assert block_meta.inferred_schema(env) == TestSchema1
        assert block_meta.nominal_schema(env) == TestSchema2
        assert block_meta.realized_schema(env) == TestSchema3
        block_meta.compute_record_count()
        assert block_meta.record_count == 1
def test_worker_output():
    """Run a source pipe through a Worker directly and verify output + alias.

    Builds the bound interface by hand, executes via Worker, then checks the
    output block's records/schemas and that the alias row points at it.
    """
    env = make_test_env()
    env.add_module(core)
    g = Graph(env)
    # env.add_storage("python://test")
    with env.session_scope() as sess:
        rt = env.runtimes[0]
        # TODO: this is error because no data copy between SAME storage engines (but DIFFERENT storage urls) currently
        # ec = env.get_run_context(g, current_runtime=rt, target_storage=env.storages[0])
        ec = env.get_run_context(g, current_runtime=rt, target_storage=rt.as_storage())
        output_alias = "node_output"
        node = g.create_node(key="node", pipe=pipe_dl_source, output_alias=output_alias)
        w = Worker(ec)
        dfi_mgr = NodeInterfaceManager(ec, sess, node)
        bdfi = dfi_mgr.get_bound_interface()
        r = Executable(
            node.key,
            CompiledPipe(node.pipe.key, node.pipe),
            bdfi,
        )
        run_result = w.execute(r)
        outputblock = run_result.output_block
        assert outputblock is not None
        # Re-attach the returned object to this session before using it.
        outputblock = sess.merge(outputblock)
        block = outputblock.as_managed_data_block(ec, sess)
        assert block.as_records() == mock_dl_output
        assert block.nominal_schema is TestSchema4
        assert len(block.realized_schema.fields) == len(TestSchema4.fields)
        # Test alias was created correctly
        assert (
            sess.query(Alias).filter(Alias.alias == output_alias).first().data_block_id
            == block.data_block_id
        )
def test_node_no_inputs():
    """A source node (pipe API) exposes an empty input list and a non-null output.

    NOTE(review): another ``test_node_no_inputs`` appears later in this file;
    under pytest collection the later definition shadows this one — confirm
    whether both are meant to exist.
    """
    env = make_test_env()
    g = Graph(env)
    df = pipe(pipe_t1_source)
    node1 = g.create_node(key="node1", pipe=df)
    assert {node1: node1}[node1] is node1  # Test hash
    pi = node1.get_interface()
    # Source pipe: no inputs, but it does declare an output.
    assert pi.inputs == []
    assert pi.output is not None
    assert node1.declared_inputs == {}
def test_non_terminating_snap():
    """A snap that never emits output should yield no output block."""

    def never_stop(input: Optional[DataBlock] = None) -> DataFrame:
        # Always returns None: simulates a snap that never terminates with data.
        pass

    test_env = make_test_env()
    graph = Graph(test_env)
    node = graph.create_node(key="node", snap=never_stop)
    executable = test_env.get_executable(node)
    exec_result = ExecutionManager(executable).execute()
    assert exec_result.get_output_block(test_env) is None
def test_node_no_inputs():
    """A source node (datafunction API) has empty inputs and non-empty outputs.

    NOTE(review): this duplicates the name of an earlier ``test_node_no_inputs``
    in this file; under pytest only this later definition runs — confirm intent.
    """
    env = make_test_env()
    g = Graph(env)
    df = datafunction(function_t1_source)
    node1 = g.create_node(key="node1", function=df)
    assert {node1: node1}[node1] is node1  # Test hash
    pi = node1.get_interface()
    # In this API generation inputs/outputs are mappings, not lists.
    assert pi.inputs == {}
    assert pi.outputs != {}
    assert node1.declared_inputs == {}
def test_node_inputs():
    """Wiring a sink node to an upstream source yields one bound input (pipe API).

    NOTE(review): later definitions share this test name; under pytest only the
    last one runs — confirm whether the duplicates are intentional.
    """
    env = make_test_env()
    g = Graph(env)
    df = pipe(pipe_t1_source)
    node = g.create_node(key="node", pipe=df)
    df = pipe(pipe_t1_sink)
    node1 = g.create_node(key="node1", pipe=df, upstream=node)
    pi = node1.get_interface()
    assert len(pi.inputs) == 1
    # Sink declares no explicit output, so the default output annotation applies.
    assert pi.output == make_default_output_annotation()
    assert list(node1.declared_inputs.keys()) == ["input"]
def test_node_inputs():
    """Wiring a sink snap to an upstream source yields one bound input (Snap API).

    NOTE(review): duplicate test name within this file — under pytest only the
    last definition is collected; confirm intent.
    """
    env = make_test_env()
    g = Graph(env)
    df = Snap(snap_t1_source)
    node = g.create_node(key="node", snap=df)
    df = Snap(snap_t1_sink)
    node1 = g.create_node(key="node1", snap=df, input=node)
    pi = node1.get_interface()
    assert len(pi.inputs) == 1
    # Sink declares no explicit output, so the default output applies.
    assert pi.output == make_default_output()
    assert list(node1.declared_inputs.keys()) == ["input"]
def test_node_inputs():
    """Wiring a sink to a source yields one bound input (datafunction API).

    NOTE(review): duplicate test name within this file — under pytest only the
    last definition is collected; confirm intent.
    """
    env = make_test_env()
    g = Graph(env)
    df = datafunction(function_t1_source)
    node = g.create_node(key="node", function=df)
    df = datafunction(function_t1_sink)
    node1 = g.create_node(key="node1", function=df, input=node)
    pi = node1.get_interface()
    assert len(pi.inputs) == 1
    # No explicit output declared, so the framework defaults are used.
    assert pi.outputs == DEFAULT_OUTPUTS
    assert list(node1.declared_inputs.keys()) == ["input"]
def test_node_params():
    """A param given at node creation is injected into the function signature."""
    test_env = make_test_env()
    graph = Graph(test_env)
    captured = []

    def function_ctx(ctx: DataFunctionContext, test: str):
        # Record the injected param value so the assertion below can see it.
        captured.append(test)

    node = graph.create_node(key="ctx", function=function_ctx, params={"test": 1})
    test_env.run_node(node, graph)
    assert captured == [1]
def test_any_schema_interface():
    """Un-annotated DataBlock input/DataFrame output default to the 'Any' schema."""
    env = make_test_env()
    env.add_module(core)

    def pipe_any(input: DataBlock) -> DataFrame:
        pass

    wrapped = pipe(pipe_any)
    interface = wrapped.get_interface()
    # No schema annotation given, so both sides fall back to "Any".
    assert interface.inputs[0].schema_like == "Any"
    assert interface.output.schema_like == "Any"
def test_any_schema_interface():
    """Un-annotated input and output resolve to the 'Any' schema (datafunction API)."""
    env = make_test_env()
    env.add_module(core)

    def function_any(input: DataBlock) -> DataFrame:
        pass

    wrapped = datafunction(function_any)
    interface = wrapped.get_interface()
    # Without explicit schema annotations both sides fall back to "Any".
    assert interface.get_single_non_recursive_input().schema_like == "Any"
    assert interface.get_default_output().schema_like == "Any"
def test_node_config():
    """Config values set at node creation are readable from the PipeContext.

    Extra config keys not read by the pipe ("extra_arg") are tolerated.
    """
    env = make_test_env()
    g = Graph(env)
    config_vals = []

    def pipe_ctx(ctx: PipeContext):
        # Capture the configured value so the assertion below can see it.
        config_vals.append(ctx.get_config_value("test"))

    n = g.create_node(key="ctx", pipe=pipe_ctx, config={"test": 1, "extra_arg": 2})
    with env.run(g) as exe:
        exe.execute(n)
    assert config_vals == [1]
def test_non_terminating_pipe():
    """A pipe that never emits output yields None even when run to exhaustion."""

    def never_stop(input: Optional[DataBlock] = None) -> DataFrame:
        # Always returns None: simulates a pipe that never produces data.
        pass

    test_env = make_test_env()
    graph = Graph(test_env)
    runtime = test_env.runtimes[0]
    run_ctx = test_env.get_run_context(graph, current_runtime=runtime)
    node = graph.create_node(key="node", pipe=never_stop)
    manager = ExecutionManager(run_ctx)
    output = manager.execute(node, to_exhaustion=True)
    assert output is None
def test_pipe_interface(pipe: PipeLike, expected: PipeInterface):
    """Parametrized check that a pipe's interface matches the expected one.

    Accepts either a wrapped Pipe or a raw callable definition; both the
    standalone interface and the node-level interface must equal ``expected``.

    Raises:
        TypeError: if ``pipe`` is neither a Pipe nor a callable.
    """
    env = make_test_env()
    if isinstance(pipe, Pipe):
        val = pipe.get_interface()
    elif isinstance(pipe, Callable):
        val = PipeInterface.from_pipe_definition(pipe)
    else:
        # Was a bare `raise`, which raises an uninformative
        # "RuntimeError: No active exception to re-raise". Name the problem.
        raise TypeError(f"Unsupported pipe type: {type(pipe)!r}")
    assert val == expected
    node = DeclaredNode(key="_test", pipe=pipe, upstream={"input": "mock"}).instantiate(
        env
    )
    assert node.get_interface() == expected
def test_generated_schema():
    """GeneratedSchema round-trips through the metadata store intact."""
    new_schema = infer_schema_from_records(sample_records)
    record = GeneratedSchema(key=new_schema.key, definition=asdict(new_schema))
    env = make_test_env()
    with env.session_scope() as sess:
        sess.add(record)
        # Read it back by key and compare the reconstructed schema field-by-field.
        fetched = (
            sess.query(GeneratedSchema)
            .filter(GeneratedSchema.key == new_schema.key)
            .first()
        )
        round_tripped = fetched.as_schema()
        assert asdict(round_tripped) == asdict(new_schema)
        assert env.get_generated_schema(new_schema.key, sess).key == new_schema.key
        # Unknown keys resolve to None rather than raising.
        assert env.get_generated_schema("pizza", sess) is None
def test_exe():
    """Executing a source snap writes exactly one SnapLog with expected fields.

    The snap here produces no output blocks; the test focuses on the log row:
    node key, graph hash, empty start/end state, snap key and params.
    """
    env = make_test_env()
    g = Graph(env)
    node = g.create_node(key="node", snap=snap_t1_source)
    exe = env.get_executable(node)
    result = ExecutionManager(exe).execute()
    # Log assertions must run inside a metadata-API transaction.
    with env.md_api.begin():
        assert not result.output_blocks
        assert env.md_api.count(select(SnapLog)) == 1
        pl = env.md_api.execute(select(SnapLog)).scalar_one_or_none()
        assert pl.node_key == node.key
        assert pl.graph_id == g.get_metadata_obj().hash
        assert pl.node_start_state == {}
        assert pl.node_end_state == {}
        assert pl.snap_key == node.snap.key
        assert pl.snap_params == {}
def test_node_params():
    """A declared @Param is retrievable from the SnapContext at run time."""
    test_env = make_test_env()
    graph = Graph(test_env)
    captured = []

    @Param("test", "str")
    def snap_ctx(ctx: SnapContext):
        # Capture the param value so the assertion below can see it.
        captured.append(ctx.get_param("test"))

    # Extra params not declared by the snap ("extra_arg") are tolerated.
    node = graph.create_node(
        key="ctx",
        snap=snap_ctx,
        params={"test": 1, "extra_arg": 2},
    )
    test_env.run_node(node, graph)
    assert captured == [1]
def test_cast_to_schema(cast_level, inferred, nominal, expected):
    """Parametrized check of cast_to_realized_schema across cast levels.

    ``expected`` is either a field list (success case), or one of the
    sentinels ERROR / WARN indicating the cast should raise or warn.
    """
    inferred = create_quick_schema("Inf", fields=inferred)
    nominal = create_quick_schema("Nom", fields=nominal)
    if expected not in (ERROR, WARN):
        expected = create_quick_schema("Exp", fields=expected)
    env = make_test_env()
    with env.md_api.begin():
        if expected == ERROR:
            with pytest.raises(SchemaTypeError):
                s = cast_to_realized_schema(env, inferred, nominal, cast_level)
        elif expected == WARN:
            with pytest.warns(UserWarning):
                s = cast_to_realized_schema(env, inferred, nominal, cast_level)
        else:
            s = cast_to_realized_schema(env, inferred, nominal, cast_level)
            # Field-by-field comparison only applies in the success case:
            # in the ERROR/WARN branches ``expected`` is still a sentinel
            # string with no fields — NOTE(review): confirm this placement
            # matches the original intent.
            for f in s.fields:
                e = expected.get_field(f.name)
                assert f == e
def test_worker():
    """A Worker executing a source pipe that emits nothing yields no output block."""
    env = make_test_env()
    g = Graph(env)
    rt = env.runtimes[0]
    ec = env.get_run_context(g, current_runtime=rt)
    with env.session_scope() as sess:
        node = g.create_node(key="node", pipe=pipe_t1_source)
        w = Worker(ec)
        # Build the bound interface by hand, as the scheduler would.
        dfi_mgr = NodeInterfaceManager(ec, sess, node)
        bdfi = dfi_mgr.get_bound_interface()
        r = Executable(
            node.key,
            CompiledPipe(node.pipe.key, node.pipe),
            bdfi,
        )
        run_result = w.execute(r)
        output = run_result.output_block
        assert output is None
def make_graph() -> Graph:
    """Build a small fixture graph with seven nodes for graph-topology tests.

    Topology: node1 and node2 are independent sources; node3/node4 transform
    them; node5 and node6 consume node4; node7 takes two named inputs.
    Note both ``create_node`` and the ``node`` shorthand are exercised —
    presumably equivalent APIs; confirm if one is deprecated.
    """
    env = make_test_env()
    env.add_module(core)
    g = Graph(env)
    g.create_node(key="node1", function=function_t1_source)
    g.node(key="node2", function=function_t1_source)
    g.node(key="node3", function=function_t1_to_t2, input="node1")
    g.node(key="node4", function=function_t1_to_t2, input="node2")
    g.node(key="node5", function=function_generic, input="node4")
    g.node(key="node6", function=function_self, input="node4")
    # node7 has multiple named inputs, bound via a mapping.
    g.node(
        key="node7",
        function=function_multiple_input,
        inputs={"input": "node4", "other_t2": "node3"},
    )
    return g
def test_non_terminating_function_with_reference_input():
    """A function with a Reference input that never emits yields no output block."""

    def never_stop(input: Optional[Reference]) -> DataFrame:
        # Does not use input but doesn't matter cause reference
        pass

    test_env = make_test_env()
    graph = Graph(test_env)
    source = graph.create_node(
        function="core.import_dataframe",
        params={"dataframe": pd.DataFrame({"a": range(10)})},
    )
    node = graph.create_node(key="node", function=never_stop, input=source)
    # First run the source so the reference input has something to point at.
    source_exe = test_env.get_executable(source)
    # TODO: reference inputs need to log too? (So they know when to update)
    # with env.md_api.begin():
    #     assert env.md_api.count(select(DataBlockLog)) == 1
    ExecutionManager(source_exe).execute()
    node_exe = test_env.get_executable(node)
    node_result = ExecutionManager(node_exe).execute()
    assert node_result.get_output_block(test_env) is None
def test_schema_translation():
    """An implementation mapping between schemas yields the inverse translation.

    ``t_impl`` declares it implements ``t_base`` via {base_field: impl_field};
    the derived translation maps impl fields back to base fields.
    """
    env = make_test_env()
    t_base = create_quick_schema(
        "t_base", fields=[("f1", "Unicode"), ("f2", "Integer")]
    )
    t_impl = create_quick_schema(
        "t_impl",
        fields=[("g1", "Unicode"), ("g2", "Integer")],
        implementations=[Implementation("t_base", {"f1": "g1", "f2": "g2"})],
    )
    env.add_schema(t_base)
    env.add_schema(t_impl)
    with env.session_scope() as sess:
        trans = get_schema_translation(
            env, sess, source_schema=t_impl, target_schema=t_base
        )
        # Translation is source-field -> target-field, i.e. the inverse of
        # the implementation declaration above.
        assert trans.translation == {"g1": "f1", "g2": "f2"}
def test_any_schema():
    """The built-in 'Any' schema exists in core and declares no fields."""
    env = make_test_env()
    env.add_module(core)
    with env.session_scope() as session:
        any_schema = env.get_schema("Any", session)
        assert any_schema.fields == []