Python InsertFIFO 예제들, finn.transformation.fpgadataflow.insert_fifo.InsertFIFO Python 예제들

예제 #1

0

파일 보기

def test_end2end_cnv_w1a1_fold_and_tlastmarker():
    """Fold the CNV-w1a1 dataflow model, add stream infrastructure and save it.

    Loads the dataflow model from ``build_dir``, sets PE, SIMD and inFIFODepth
    on the StreamingFCLayer_Batch nodes, copies the SIMD value of the i-th
    folding entry onto the i-th ConvolutionInputGenerator node, then applies
    InsertDWC, InsertFIFO, InsertTLastMarker, GiveUniqueNodeNames and
    AnnotateResources("estimate") before saving the folded model.
    """
    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx")

    # one (PE, SIMD, in_fifo_depth) entry per layer
    folding = [
        (16, 3, 128),
        (32, 32, 128),
        (16, 32, 128),
        (16, 32, 128),
        (4, 32, 81),
        (1, 32, 2),
        (1, 4, 2),
        (1, 8, 128),
        (5, 1, 3),
    ]

    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fc_node, layer_folding in zip(fc_nodes, folding):
        pe, simd, in_fifo_depth = layer_folding
        fc_op = getCustomOp(fc_node)
        fc_op.set_nodeattr("PE", pe)
        fc_op.set_nodeattr("SIMD", simd)
        fc_op.set_nodeattr("inFIFODepth", in_fifo_depth)

    # the i-th sliding window generator takes the SIMD of the i-th entry
    swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator")
    for idx, swg_node in enumerate(swg_nodes):
        getCustomOp(swg_node).set_nodeattr("SIMD", folding[idx][1])

    # each transformation is instantiated right before it is applied
    transform_factories = (
        InsertDWC,
        InsertFIFO,
        InsertTLastMarker,
        GiveUniqueNodeNames,
        lambda: AnnotateResources("estimate"),
    )
    for make_transform in transform_factories:
        model = model.transform(make_transform())

    model.save(build_dir + "/end2end_cnv_w1a1_folded.onnx")

예제 #2

0

파일 보기

파일: custom_steps.py 프로젝트: Xilinx/finn-examples

def step_resnet50_set_fifo_depths(model: ModelWrapper,
                                  cfg: DataflowBuildConfig):
    """
    Set the FIFO depths of the model according to ``cfg.auto_fifo_depths``.

    * ``auto_fifo_depths=True``: run the `InsertAndSetFIFODepths`
      transformation to attempt to determine the FIFO sizes that provide full
      throughput. Involves running stitched-IP rtlsim and may take a long time.
    * ``auto_fifo_depths=False``: assume the folding config file also contains
      the FIFO sizes. Runs `InsertDWC` and `InsertFIFO`, then
      `ApplyConfig(cfg.folding_config_file)` (if a file is set), and finally
      `RemoveShallowFIFOs`. `GiveUniqueNodeNames` is called beforehand so that
      node names stay coherent with the config file.

    Afterwards the final hardware attributes are written to
    ``final_hw_config.json`` in ``cfg.output_dir`` and `PrepareIP`,
    `HLSSynthIP` and `ReplaceVerilogRelPaths` are applied. Returns the
    transformed model.
    """

    if cfg.auto_fifo_depths:
        fifo_sizing = InsertAndSetFIFODepths(
            cfg._resolve_fpga_part(),
            cfg._resolve_hls_clk_period(),
            vivado_ram_style=cfg.large_fifo_mem_style.value,
        )
        model = model.transform(fifo_sizing)
    else:
        # the folding config json is expected to carry the FIFO sizes, so
        # insert DWCs and FIFOs first and then run ApplyConfig once more
        model = model.transform(InsertDWC())
        # create_shallow_fifos=True makes sure every FIFO exists so that
        # ApplyConfig is able to set its depth
        all_fifos = InsertFIFO(create_shallow_fifos=True)
        model = model.transform(all_fifos)
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(GiveReadableTensorNames())
        if cfg.folding_config_file is not None:
            model = model.transform(ApplyConfig(cfg.folding_config_file))
        # drop the shallow FIFOs again
        model = model.transform(RemoveShallowFIFOs())

    # dump the final configuration as json
    exported_attrs = [
        "PE",
        "SIMD",
        "ram_style",
        "depth",
        "impl_style",
        "resType",
        "mem_mode",
        "runtime_writeable_weights",
    ]
    config_json_path = cfg.output_dir + "/final_hw_config.json"
    extract_model_config_to_json(model, config_json_path, exported_attrs)

    # with the FIFOs in place, call PrepareIP and HLSSynthIP again;
    # this will only run for the new nodes (e.g. FIFOs and DWCs)
    prepare_ip = PrepareIP(cfg._resolve_fpga_part(),
                           cfg._resolve_hls_clk_period())
    model = model.transform(prepare_ip)
    model = model.transform(HLSSynthIP())
    return model.transform(ReplaceVerilogRelPaths())

예제 #3

0

파일 보기

파일: vitis_build.py 프로젝트: yuanchunyu/finn

    def apply(self, model):
        """Build one Vitis kernel per dataflow partition and link the design.

        Checks the Vitis environment variables, infers data layouts, applies
        the global preparation transformations, floorplans and partitions the
        model, then builds every StreamingDataflowPartition into a stitched IP
        and Vitis XO before linking everything with VitisLink.

        Returns the tuple ``(model, False)``.
        """

        def _refresh_names(graph_model):
            # give nodes unique names, then tensors readable names
            graph_model = graph_model.transform(GiveUniqueNodeNames())
            return graph_model.transform(GiveReadableTensorNames())

        _check_vitis_envvars()
        # layouts have to be inferred first
        model = model.transform(InferDataLayouts())
        # global-level preparation, done before splitting into kernels
        global_prep = [
            MakePYNQDriver(platform="alveo"),
            InsertIODMA(512),
            InsertDWC(),
        ]
        for prep_step in global_prep:
            model = _refresh_names(model.transform(prep_step))

        model = model.transform(Floorplan(floorplan=self.floorplan_file))

        model = _refresh_names(model.transform(CreateDataflowPartition()))

        # every partition is built into its own kernel
        for partition_node in model.get_nodes_by_op_type(
                "StreamingDataflowPartition"):
            partition = getCustomOp(partition_node)
            kernel_path = partition.get_nodeattr("model")
            kernel = ModelWrapper(kernel_path)
            kernel = kernel.transform(InsertFIFO())
            tlast_marker = InsertTLastMarker(
                both=True, external=False, dynamic=False)
            kernel = kernel.transform(tlast_marker)
            kernel = kernel.transform(GiveUniqueNodeNames())
            kernel.save(kernel_path)
            kernel = kernel.transform(
                PrepareIP(self.fpga_part, self.period_ns))
            kernel = kernel.transform(HLSSynthIP())
            stitch = CreateStitchedIP(self.fpga_part, self.period_ns,
                                      partition.onnx_node.name, True)
            kernel = kernel.transform(stitch)
            kernel = kernel.transform(
                CreateVitisXO(partition.onnx_node.name))
            kernel.set_metadata_prop("platform", "alveo")
            kernel.save(kernel_path)

        # link the kernels into the full design
        link_step = VitisLink(
            self.platform,
            round(1000 / self.period_ns),
            strategy=self.strategy,
            enable_debug=self.enable_debug,
        )
        model = model.transform(link_step)
        # platform attribute is needed for correct remote execution
        model.set_metadata_prop("platform", "alveo")

        return (model, False)

예제 #4

0

파일 보기

파일: make_zynq_proj.py 프로젝트: pete-lennart/finn

    def apply(self, model):
        """Build a stitched IP per dataflow partition and create the Zynq project.

        Infers data layouts, applies the global preparation transformations
        (IODMA, DWC, floorplan, dataflow partitioning), builds every
        StreamingDataflowPartition into a stitched IP, assembles the design
        with MakeZYNQProject and finally creates the PYNQ driver.

        Returns the tuple ``(model, False)``.
        """
        # layouts have to be inferred first
        model = model.transform(InferDataLayouts())
        # global-level preparation, done before splitting into kernels
        global_prep = [
            InsertIODMA(64),
            InsertDWC(),
            Floorplan(),
            CreateDataflowPartition(),
        ]
        for prep_step in global_prep:
            model = model.transform(prep_step)
            # refresh node and tensor names after every step
            model = model.transform(GiveUniqueNodeNames())
            model = model.transform(GiveReadableTensorNames())

        # every partition is built into its own kernel
        for partition_node in model.get_nodes_by_op_type(
                "StreamingDataflowPartition"):
            # nodes inside the kernel get the partition name as prefix
            name_prefix = partition_node.name + "_"
            partition = getCustomOp(partition_node)
            kernel_path = partition.get_nodeattr("model")
            kernel = ModelWrapper(kernel_path)
            kernel = kernel.transform(InsertFIFO())
            kernel = kernel.transform(GiveUniqueNodeNames(name_prefix))
            kernel.save(kernel_path)
            kernel = kernel.transform(
                PrepareIP(self.fpga_part, self.period_ns))
            kernel = kernel.transform(HLSSynthIP())
            stitch = CreateStitchedIP(self.fpga_part, self.period_ns,
                                      partition.onnx_node.name, True)
            kernel = kernel.transform(stitch)
            kernel.set_metadata_prop("platform", "zynq-iodma")
            kernel.save(kernel_path)

        # put the design together from the generated IPs
        project_step = MakeZYNQProject(
            self.platform, enable_debug=self.enable_debug)
        model = model.transform(project_step)

        # platform attribute is needed for correct remote execution
        model.set_metadata_prop("platform", "zynq-iodma")

        # the driver is created last
        driver_step = MakePYNQDriver(platform="zynq-iodma")
        model = model.transform(driver_step)
        return (model, False)

예제 #5

0

파일 보기

 def test_ipstitch_rtlsim(self, topology, wbits, abits, kind):
     """Run the stitched-IP rtlsim for a network and compare to the golden output.

     Starts from the "ipgen_<kind>" checkpoint (or skips), inserts DWCs and
     FIFOs, builds the stitched IP, prepares rtlsim, saves the
     "ipstitch_rtlsim_<kind>" checkpoint, executes it through the
     "dataflow_parent" checkpoint and asserts the result is close to the
     golden output. The measured ``cycles_rtlsim`` is pushed to the dashboard.
     """
     ipgen_chkpt = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
     model = load_test_checkpoint_or_skip(ipgen_chkpt)
     test_fpga_part = get_build_env(kind, target_clk_ns)["part"]

     # stream infrastructure, naming and cycle annotation
     for make_transform in (
         InsertDWC,
         InsertFIFO,
         GiveUniqueNodeNames,
         AnnotateCycles,
     ):
         model = model.transform(make_transform())
     perf = model.analysis(dataflow_performance)
     latency = perf["critical_path_cycles"]

     # IP generation, stitching and rtlsim preparation
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
     model = model.transform(PrepareRTLSim())
     model.set_metadata_prop("exec_mode", "rtlsim")
     # liveness threshold is the estimated latency plus a 10% margin
     os.environ["LIVENESS_THRESHOLD"] = str(int(latency * 1.1))
     if rtlsim_trace:
         trace_file = "%s_w%da%d.vcd" % (topology, wbits, abits)
         model.set_metadata_prop("rtlsim_trace", trace_file)
         os.environ["RTLSIM_TRACE_DEPTH"] = "3"

     rtlsim_chkpt = get_checkpoint_name(
         topology, wbits, abits, "ipstitch_rtlsim_" + kind
     )
     model.save(rtlsim_chkpt)
     parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
     golden_in, golden_out = get_golden_io_pair(
         topology, wbits, abits, return_topk=1
     )
     produced = execute_parent(parent_chkpt, rtlsim_chkpt, golden_in)

     # reload the checkpoint to read back the cycle count from its metadata
     model = ModelWrapper(rtlsim_chkpt)
     perf["cycles_rtlsim"] = model.get_metadata_prop("cycles_rtlsim")
     # only cycles_rtlsim is reported; the other perf entries are not pushed
     update_dashboard_data(
         topology, wbits, abits, "cycles_rtlsim", perf["cycles_rtlsim"]
     )
     assert np.isclose(produced, golden_out).all()

예제 #6

0

파일 보기

파일: test_end2end_tfc_w1a2.py 프로젝트: zxh0717/finn

def test_end2end_tfc_w1a2_fold_and_tlastmarker():
    """Fold the TFC-w1a2 dataflow model, add stream infrastructure and save it.

    Loads the dataflow model from ``build_dir``, sets PE, SIMD, FIFO depths
    and ram_style on the StreamingFCLayer_Batch nodes, then applies InsertDWC,
    InsertFIFO, InsertTLastMarker, GiveUniqueNodeNames and
    AnnotateResources("estimate") before saving the folded model.
    """
    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx")

    # per-layer node attributes, one entry for each layer
    attr_names = ("PE", "SIMD", "inFIFODepth", "outFIFODepth", "ram_style")
    config = [
        (16, 49, 16, 64, "block"),
        (8, 8, 64, 64, "auto"),
        (8, 8, 64, 64, "auto"),
        (10, 8, 64, 10, "distributed"),
    ]

    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fc_node, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_nodes, config):
        fc_op = getCustomOp(fc_node)
        layer_values = (pe, simd, ififo, ofifo, ramstyle)
        for attr_name, attr_value in zip(attr_names, layer_values):
            fc_op.set_nodeattr(attr_name, attr_value)

    # each transformation is instantiated right before it is applied
    transform_factories = (
        InsertDWC,
        InsertFIFO,
        InsertTLastMarker,
        GiveUniqueNodeNames,
        lambda: AnnotateResources("estimate"),
    )
    for make_transform in transform_factories:
        model = model.transform(make_transform())

    model.save(build_dir + "/end2end_tfc_w1a2_folded.onnx")

예제 #7

0

파일 보기

def test_runtime_thresholds_single_layer():
    """Check runtime-writeable thresholds on a single thresholding layer.

    The test builds a single-layer thresholding model in "decoupled" mem_mode
    with ``runtime_writeable_weights`` enabled, creates the stitched IP and
    runs two rtlsim executions:

    1. read the thresholds back over AXI-lite and compare them with the
       weight stream generated from the original thresholds ``T``, and check
       the output against ``multithreshold`` with ``T``;
    2. write a freshly generated set of thresholds over AXI-lite and check
       the output against ``multithreshold`` with the new thresholds.
    """
    mem_mode = "decoupled"
    act = DataType["INT4"]
    idt = DataType["INT16"]
    nf = 8
    ich = 16
    pe = ich // nf
    assert ich % pe == 0

    # generate input data
    in_tensor = gen_finn_dt_tensor(idt, (1, ich))

    odt = act
    n_steps = act.get_num_possible_values() - 1
    T = np.random.randint(idt.min(),
                          idt.max() + 1, (ich, n_steps)).astype(np.float32)
    # provide non-decreasing thresholds
    T = np.sort(T, axis=1)

    if odt == DataType["BIPOLAR"]:
        actval = 0
    else:
        actval = odt.min()

    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval,
                                                  mem_mode)
    op_inst = getCustomOp(model.graph.node[0])
    op_inst.set_nodeattr("runtime_writeable_weights", 1)
    # dump the original thresholds as a weight file and parse it back as a
    # list of integers (the file holds one hexadecimal word per line)
    op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat")
    with open("old_weights.dat", "r") as f:
        old_weight_stream = f.read().strip()
    os.remove("old_weights.dat")
    old_weight_stream = [int(x, 16) for x in old_weight_stream.split("\n")]
    # need to create stitched IP for runtime weight testing
    model = model.transform(InsertFIFO(True))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model = model.transform(PrepareRTLSim())
    model.set_metadata_prop("exec_mode", "rtlsim")
    # add two copies of the input tensor as the first one is just used to
    # "flush out" the pipeline (as mvau already starts receiving old weights while
    # we read/write new ones and reads seem to cause a disturbance too)
    in_tensor = np.tile(in_tensor, (2, 1))
    exec_ctx = {"inp": in_tensor}
    extracted_weight_stream = []

    def read_weights(sim):
        # read one 32-bit word per entry of the old weight stream;
        # consecutive words are 4 bytes apart
        addr = 0
        for _ in old_weight_stream:
            extracted_weight_stream.append(
                axilite_read(sim, addr, basename="s_axilite_0_"))
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
    assert extracted_weight_stream == old_weight_stream
    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    y = exec_ctx["outp"][1]
    expected = multithreshold(in_tensor, T)[1]
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        expected = 2 * expected - 1
    else:
        # signed offset
        expected += act.min()
    assert (y == expected).all()

    new_weights = np.random.randint(idt.min(),
                                    idt.max() + 1,
                                    (ich, n_steps)).astype(np.float32)
    # provide non-decreasing thresholds; sort the NEW thresholds here, as
    # sorting T again would silently re-use the old ones and the write path
    # would never be tested with changed values
    new_weights = np.sort(new_weights, axis=1)
    op_inst.make_weight_file(new_weights, "decoupled_runtime",
                             "new_weights.dat")
    with open("new_weights.dat", "r") as f:
        new_weight_stream = f.read().strip()
    os.remove("new_weights.dat")
    new_weight_stream = [int(x, 16) for x in new_weight_stream.split("\n")]

    def write_weights(sim):
        # write the new weight stream word by word, 4 bytes apart
        addr = 0
        for nw in new_weight_stream:
            axilite_write(sim, addr, nw, basename="s_axilite_0_")
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
    y = exec_ctx["outp"][1]
    expected = multithreshold(in_tensor, new_weights)[1]
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        expected = 2 * expected - 1
    else:
        # signed offset
        expected += act.min()
    assert (y == expected).all()

예제 #8

0

파일 보기

    def apply(self, model):
        """Determine FIFO depths by simulating the design with maximum-depth FIFOs.

        Steps performed, in order:

        1. Verify that every node is an fpgadataflow node and that no
           StreamingFIFO exists yet; set inFIFODepth/outFIFODepth of every
           node to ``self.max_depth``. StreamingFCLayer_Batch nodes in
           "external" mem_mode are temporarily switched to "decoupled".
        2. Insert DWCs and FIFOs, force every FIFO depth to
           ``self.max_depth`` and build the stitched IP.
        3. Simulate the stitched IP and record, per FIFO, the highest
           occupancy count observed.
        4. Write the observed (and ``optimize_depth``-adjusted) depths back
           into the FIFO nodes, choose their implementation style, zero the
           in/outFIFODepth of all other nodes and restore the "external"
           mem_mode where it was changed.
        5. Optionally apply ``CapConvolutionFIFODepths`` and finally
           ``RemoveShallowFIFOs``.

        Returns the tuple ``(model, False)``.
        NOTE(review): the ``False`` presumably tells the caller that the
        transformation does not need to be applied again; confirm against the
        Transformation base class.
        """
        # change external to decoupled and warn user
        # this way we are sure we have exactly one input/output
        modified_fc_nodes = []
        for node in model.graph.node:
            # verify assumptions
            assert is_fpgadataflow_node(
                node), "Found non-fpgadataflow node: " + str(node)
            assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
            # from here on `node` is the custom-op wrapper; the underlying
            # ONNX node stays reachable through node.onnx_node
            node = getCustomOp(node)
            node.set_nodeattr("inFIFODepth", self.max_depth)
            node.set_nodeattr("outFIFODepth", self.max_depth)
            if node.onnx_node.op_type == "StreamingFCLayer_Batch":
                mmode = node.get_nodeattr("mem_mode")
                if mmode == "external":
                    # remember the node name so that mem_mode can be restored
                    # once the depths have been determined
                    modified_fc_nodes.append(node.onnx_node.name)
                    node.set_nodeattr("mem_mode", "decoupled")
                    reset_implementation(node)
                    warnings.warn(
                        "Changed mem_mode from external to decoupled for " +
                        node.onnx_node.name)

        # insert stream infrastructure (DWC/FIFO)
        model = model.transform(InsertDWC())
        model = model.transform(InsertFIFO())
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(GiveReadableTensorNames())

        # gather FIFO names, check they are of expected depth
        # fifos maps FIFO node name -> highest occupancy seen so far
        fifos = {}
        for node in model.graph.node:
            if node.op_type == "StreamingFIFO":
                fifos[node.name] = 0
                node = getCustomOp(node)
                # check depths and fix as necessary
                if node.get_nodeattr("depth") != self.max_depth:
                    node.set_nodeattr("depth", self.max_depth)

        # insert FIFOs and do all transformations for RTLsim
        model = model.transform(AnnotateCycles())
        perf = model.analysis(dataflow_performance)
        latency = perf["critical_path_cycles"]
        max_cycles = perf["max_cycles"]
        model = model.transform(PrepareIP(self.fpgapart, self.clk_ns))
        model = model.transform(HLSSynthIP())
        model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
        model.set_metadata_prop("exec_mode", "rtlsim")

        # calculate input frequency (number of cycles for each input word)
        # i.e. max_cycles divided by the product of all folded input
        # dimensions except the last one, rounded up and at least 1
        first_node = getCustomOp(model.graph.node[0])
        ncycles_per_input = max(
            1,
            int(
                math.ceil(perf["max_cycles"] /
                          (np.prod(first_node.get_folded_input_shape()) /
                           first_node.get_folded_input_shape()[-1]))),
        )

        # set sufficiently large threshold for 1 image to fully execute and exit
        ncycles = int(latency + max_cycles)

        # prepare pyverilator model
        sim = pyverilate_stitched_ip(model)

        reset_rtlsim(sim)
        toggle_clk(sim)

        # set all input valids to 0 and output readies to 1
        # set input data to some constant
        set_signal(sim, "tvalid", 0)
        set_signal(sim, "tready", 1)
        set_signal(sim, "tdata", 0)

        output_detected = False
        # ncycles counts down the remaining simulation cycles
        while ncycles > 0:
            toggle_clk(sim)
            # set/unset valids
            # input valid is asserted once every ncycles_per_input cycles
            if ncycles % ncycles_per_input == 0:
                set_signal(sim, "tvalid", 1)
            else:
                set_signal(sim, "tvalid", 0)

            # check/update all fifo counts
            for key in fifos:
                current_state = sim.internals["finn_design_i"][key]["inst"][
                    key + "_" + key]["state"]
                current_addr = sim.internals["finn_design_i"][key]["inst"][
                    key + "_" + key]["addr"]
                # NOTE(review): presumably state values below 2 equal the
                # element count directly, while state 2 means the count has
                # to be derived from the address; confirm against the FIFO RTL
                if current_state == 2:
                    current_count = current_addr + 2
                else:
                    current_count = current_state
                # keep the running maximum per FIFO
                if current_count > fifos[key]:
                    fifos[key] = current_count

            # since latency estimation is very pessimistic, detect first output
            # and fast-forward the sim
            if get_signal(sim, "tvalid") != 0 and not output_detected:
                # first output seen: run only max_cycles more cycles
                ncycles = max_cycles
                output_detected = True
            else:
                ncycles = ncycles - 1

        if not output_detected:
            warnings.warn(
                "No output detected, calculated FIFO depths may not be correct"
            )

        # Apply depths back into the model;
        # also set in/outFIFODepth to zero for non-FIFO
        # nodes, preventing further FIFO insertion
        for node in model.graph.node:
            # set FIFO depth, reset FIFO implementation,
            # and set implementation/ram styles
            if node.op_type == "StreamingFIFO":
                assert node.name in fifos, "FIFO node not found in size dictionary"
                # set depth of FIFO
                depth = optimize_depth(fifos[node.name])
                node_inst = getCustomOp(node)
                node_inst.set_nodeattr("depth", depth)
                # Set FIFO implementation/ram styles
                # FIFOs deeper than max_qsrl_depth use the "vivado" style with
                # the configured ram style, all others the "rtl" style
                if depth > self.max_qsrl_depth:
                    node_inst.set_nodeattr("impl_style", "vivado")
                    node_inst.set_nodeattr("ram_style", self.vivado_ram_style)
                else:
                    node_inst.set_nodeattr("impl_style", "rtl")
                # reset implementation
                reset_implementation(node_inst)
                # entries are removed as they are handled so that leftovers
                # can be detected by the assertion below
                del fifos[node.name]
            else:
                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
                # for every FC node we changed from external to decoupled,
                # change back and reset implementation
                if node.op_type == "StreamingFCLayer_Batch":
                    if node.name in modified_fc_nodes:
                        node_inst = getCustomOp(node)
                        node_inst.set_nodeattr("mem_mode", "external")
                        reset_implementation(node_inst)
                        modified_fc_nodes.remove(node.name)

        # both bookkeeping containers must be empty now, otherwise some node
        # recorded earlier was not found again in the graph
        assert (len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
                ), "FIFO/FC nodes left untouched after model reconfiguration"

        # handle custom sizing for SWG FIFOs if desired
        if self.swg_exception:
            model = model.transform(
                CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth))
        # remove shallow FIFOs
        model = model.transform(RemoveShallowFIFOs())

        return (model, False)

예제 #9

0

파일 보기

파일: test_runtime_weights.py 프로젝트: Xilinx/finn

def test_runtime_weights_single_layer():
    """Check runtime-writeable weights on a single StreamingFCLayer_Batch.

    Builds a one-layer random MLP in "decoupled" mem_mode with
    ``runtime_writeable_weights`` enabled, creates the stitched IP and runs
    two rtlsim executions: the first reads the weights back over AXI-lite and
    checks the output against the original weights, the second writes newly
    generated weights over AXI-lite and checks the output against those.
    """
    idt = DataType["UINT32"]
    wdt = DataType["UINT4"]
    act = None
    mw = 64
    mh = 32
    pe = 4
    simd = 16
    layer_spec = dict(idt=idt, wdt=wdt, mw=mw, mh=mh, act=act, pe=pe,
                      simd=simd)
    model = hls_random_mlp_maker([layer_spec])

    fcl = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
    op_inst = getCustomOp(fcl)
    op_inst.set_nodeattr("mem_mode", "decoupled")
    op_inst.set_nodeattr("runtime_writeable_weights", 1)

    # dump the original weights to a file and parse the hex words back
    old_weights = model.get_initializer(fcl.input[1])
    op_inst.make_weight_file(old_weights, "decoupled_runtime",
                             "old_weights.dat")
    with open("old_weights.dat", "r") as f:
        old_weights_text = f.read().strip()
    os.remove("old_weights.dat")
    old_weight_stream = [int(word, 16)
                         for word in old_weights_text.split("\n")]

    # build the stitched IP and switch execution to rtlsim
    model = model.transform(InsertFIFO(True))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model.set_metadata_prop("exec_mode", "rtlsim")

    # the input is duplicated: the first copy only "flushes out" the pipeline
    # (mvau already starts receiving old weights while we read/write new ones
    # and reads seem to cause a disturbance too)
    in_tensor = np.tile(np.asarray(range(mw), dtype=np.float32), (2, 1))
    exec_ctx = {"act_0": in_tensor}
    extracted_weight_stream = []

    def read_weights(sim):
        # one word per entry of the old stream, at 4-byte address steps
        for word_idx in range(len(old_weight_stream)):
            word = axilite_read(sim, 4 * word_idx, basename="s_axilite_0_")
            extracted_weight_stream.append(word)

    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
    assert extracted_weight_stream == old_weight_stream
    y = exec_ctx["act_1"]
    # the first batch element is invalid because of the old weights (see
    # above), so only the second one is compared
    assert (y[1] == np.dot(in_tensor[1], old_weights)).all()

    # generate fresh weights and parse their weight file the same way
    new_weights = gen_finn_dt_tensor(wdt, (mw, mh))
    op_inst.make_weight_file(new_weights, "decoupled_runtime",
                             "new_weights.dat")
    with open("new_weights.dat", "r") as f:
        new_weights_text = f.read().strip()
    os.remove("new_weights.dat")
    new_weight_stream = [int(word, 16)
                         for word in new_weights_text.split("\n")]

    def write_weights(sim):
        # write the new stream word by word, at 4-byte address steps
        for word_idx, nw in enumerate(new_weight_stream):
            axilite_write(sim, 4 * word_idx, nw, basename="s_axilite_0_")

    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
    y = exec_ctx["act_1"]
    # again only the second batch element is valid (see above)
    assert (y[1] == np.dot(in_tensor[1], new_weights)).all()