示例#1
0
def rtlsim_exec(model, execution_context):
    """Use PyVerilator to execute given model with stitched IP. The execution
    context contains the input values."""

    if PyVerilator is None:
        raise ImportError("Installation of PyVerilator is required.")
    # ensure stitched ip project already exists
    assert os.path.isfile(model.get_metadata_prop("wrapper_filename")), """The
    file name from metadata property "wrapper_filename" doesn't exist."""
    assert os.path.isdir(model.get_metadata_prop("vivado_stitch_proj")), """The
    directory from metadata property "vivado_stitch_proj" doesn't exist"""
    trace_file = model.get_metadata_prop("rtlsim_trace")
    # extract input shape
    # TODO extend for multiple inputs
    i_name = model.graph.input[0].name
    i_tensor = execution_context[i_name]
    i_dt = model.get_tensor_datatype(i_name)
    first_node = getCustomOp(model.find_consumer(i_name))
    i_stream_w = first_node.get_instream_width()
    # convert input into time multiplexed shape
    i_folded_shape = first_node.get_folded_input_shape()
    # TODO any other layout transformations need to happen here!
    i_tensor = i_tensor.reshape(i_folded_shape)
    # extract output shape
    o_name = model.graph.output[0].name
    o_shape = model.get_tensor_shape(o_name)
    o_dt = model.get_tensor_datatype(o_name)
    last_node = getCustomOp(model.find_producer(o_name))
    o_folded_shape = last_node.get_folded_output_shape()
    o_stream_w = last_node.get_outstream_width()
    packedBits = o_stream_w
    targetBits = o_dt.bitwidth()
    # pack input
    packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
    num_out_values = last_node.get_number_output_values()
    # prepare pyverilator model
    rtlsim_so = model.get_metadata_prop("rtlsim_so")
    if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
        sim = pyverilate_stitched_ip(model)
        model.set_metadata_prop("rtlsim_so", sim.lib._name)
    else:
        sim = PyVerilator(rtlsim_so)
    _reset_rtlsim(sim)
    _toggle_clk(sim)
    ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
    packed_output = ret[0]
    model.set_metadata_prop("sim_cycles", str(ret[1]))
    # unpack output and put into context
    o_folded_tensor = rtlsim_output_to_npy(packed_output, None, o_dt,
                                           o_folded_shape, packedBits,
                                           targetBits)
    execution_context[o_name] = o_folded_tensor.reshape(o_shape)
示例#2
0
def test_fpgadataflow_ipstitch_rtlsim(mem_mode):
    model = load_test_checkpoint_or_skip(
        ip_stitch_model_dir +
        "/test_fpgadataflow_ip_stitch_%s.onnx" % mem_mode)
    model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
    sim = pyverilate_stitched_ip(model)
    exp_io = [
        "ap_clk",
        "ap_rst_n",
        "s_axis_0_tdata",
        "s_axis_0_tready",
        "s_axis_0_tvalid",
        "m_axis_0_tdata",
        "m_axis_0_tkeep",
        "m_axis_0_tlast",
        "m_axis_0_tready",
        "m_axis_0_tvalid",
        "s_axi_control_araddr",
        "s_axi_control_arready",
        "s_axi_control_arvalid",
        "s_axi_control_awaddr",
        "s_axi_control_awready",
        "s_axi_control_awvalid",
        "s_axi_control_bready",
        "s_axi_control_bresp",
        "s_axi_control_bvalid",
        "s_axi_control_rdata",
        "s_axi_control_rready",
        "s_axi_control_rresp",
        "s_axi_control_rvalid",
        "s_axi_control_wdata",
        "s_axi_control_wready",
        "s_axi_control_wstrb",
        "s_axi_control_wvalid",
    ]
    assert sorted(dir(sim.io)) == sorted(exp_io)
    model.set_metadata_prop("exec_mode", "rtlsim")
    idt = model.get_tensor_datatype("inp")
    ishape = model.get_tensor_shape("inp")
    x = gen_finn_dt_tensor(idt, ishape)
    # x = np.zeros(ishape, dtype=np.float32)
    # x = np.asarray([[-2, -1, 0, 1]], dtype=np.float32)
    rtlsim_res = execute_onnx(model, {"inp": x})["outp"]
    assert (rtlsim_res == x).all()
示例#3
0
def test_fpgadataflow_ipstitch_rtlsim():
    model = ModelWrapper(ip_stitch_model_dir +
                         "/test_fpgadataflow_ip_stitch.onnx")
    model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
    sim = pyverilate_stitched_ip(model)
    exp_io = [
        "ap_clk_0",
        "ap_rst_n_0",
        "in0_V_V_0_tdata",
        "in0_V_V_0_tready",
        "in0_V_V_0_tvalid",
        "out_r_0_tdata",
        "out_r_0_tkeep",
        "out_r_0_tlast",
        "out_r_0_tready",
        "out_r_0_tvalid",
        "s_axi_control_0_araddr",
        "s_axi_control_0_arready",
        "s_axi_control_0_arvalid",
        "s_axi_control_0_awaddr",
        "s_axi_control_0_awready",
        "s_axi_control_0_awvalid",
        "s_axi_control_0_bready",
        "s_axi_control_0_bresp",
        "s_axi_control_0_bvalid",
        "s_axi_control_0_rdata",
        "s_axi_control_0_rready",
        "s_axi_control_0_rresp",
        "s_axi_control_0_rvalid",
        "s_axi_control_0_wdata",
        "s_axi_control_0_wready",
        "s_axi_control_0_wstrb",
        "s_axi_control_0_wvalid",
    ]
    assert dir(sim.io) == exp_io
    model.set_metadata_prop("exec_mode", "rtlsim")
    idt = model.get_tensor_datatype("inp")
    ishape = model.get_tensor_shape("inp")
    x = gen_finn_dt_tensor(idt, ishape)
    # x = np.zeros(ishape, dtype=np.float32)
    # x = np.asarray([[-2, -1, 0, 1]], dtype=np.float32)
    rtlsim_res = execute_onnx(model, {"inp": x})["outp"]
    assert (rtlsim_res == x).all()
示例#4
0
def test_fpgadataflow_ipstitch_rtlsim():
    model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx")
    sim = pyverilate_stitched_ip(model)
    exp_io = [
        "ap_clk_0",
        "ap_rst_n_0",
        "in0_V_V_0_tdata",
        "in0_V_V_0_tready",
        "in0_V_V_0_tvalid",
        "out_r_0_tdata",
        "out_r_0_tkeep",
        "out_r_0_tlast",
        "out_r_0_tready",
        "out_r_0_tvalid",
    ]
    assert dir(sim.io) == exp_io
    model.set_metadata_prop("exec_mode", "rtlsim")
    idt = model.get_tensor_datatype("inp")
    ishape = model.get_tensor_shape("inp")
    x = gen_finn_dt_tensor(idt, ishape)
    rtlsim_res = execute_onnx(model, {"inp": x})["outp"]
    assert (rtlsim_res == x).all()
示例#5
0
def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None):
    """Use PyVerilator to execute given model with stitched IP. The execution
    context contains the input values. Hook functions can be optionally
    specified to observe/alter the state of the circuit, receiving the
    PyVerilator sim object as their first argument:
    - pre_hook : hook function to be called before sim start (after reset)
    - post_hook : hook function to be called after sim end
    """

    if PyVerilator is None:
        raise ImportError("Installation of PyVerilator is required.")
    # ensure stitched ip project already exists
    assert os.path.isfile(
        model.get_metadata_prop("wrapper_filename")
    ), """The
    file name from metadata property "wrapper_filename" doesn't exist."""
    assert os.path.isdir(
        model.get_metadata_prop("vivado_stitch_proj")
    ), """The
    directory from metadata property "vivado_stitch_proj" doesn't exist"""
    trace_file = model.get_metadata_prop("rtlsim_trace")
    # extract input shape
    # TODO extend for multiple inputs
    i_name = model.graph.input[0].name
    i_tensor = execution_context[i_name]
    i_dt = model.get_tensor_datatype(i_name)
    first_node = getCustomOp(model.find_consumer(i_name))
    i_stream_w = first_node.get_instream_width()
    # convert input into time multiplexed shape
    i_folded_shape = first_node.get_folded_input_shape()
    batchsize = i_tensor.shape[0]
    # override batch size for input
    i_folded_shape = list(i_folded_shape)
    i_folded_shape[0] = batchsize
    i_folded_shape = tuple(i_folded_shape)
    # TODO any other layout transformations need to happen here!
    i_tensor = i_tensor.reshape(i_folded_shape)
    # extract output shape
    o_name = model.graph.output[0].name
    o_shape = model.get_tensor_shape(o_name)
    o_dt = model.get_tensor_datatype(o_name)
    last_node = getCustomOp(model.find_producer(o_name))
    o_folded_shape = last_node.get_folded_output_shape()
    # override batch size from actual input
    o_shape = list(o_shape)
    o_shape[0] = batchsize
    o_shape = tuple(o_shape)
    o_folded_shape = list(o_folded_shape)
    o_folded_shape[0] = batchsize
    o_folded_shape = tuple(o_folded_shape)
    o_stream_w = last_node.get_outstream_width()
    packedBits = o_stream_w
    targetBits = o_dt.bitwidth()
    # pack input
    packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
    num_out_values = last_node.get_number_output_values()
    num_out_values *= batchsize
    # prepare pyverilator model
    rtlsim_so = model.get_metadata_prop("rtlsim_so")
    if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
        sim = pyverilate_stitched_ip(model)
        model.set_metadata_prop("rtlsim_so", sim.lib._name)
    else:
        sim = PyVerilator(rtlsim_so, auto_eval=False)
    ret = _run_rtlsim(
        sim,
        packed_input,
        num_out_values,
        trace_file,
        pre_hook=pre_hook,
        post_hook=post_hook,
    )
    packed_output = ret[0]
    model.set_metadata_prop("cycles_rtlsim", str(ret[1]))
    # unpack output and put into context
    o_folded_tensor = rtlsim_output_to_npy(
        packed_output, None, o_dt, o_folded_shape, packedBits, targetBits
    )
    execution_context[o_name] = o_folded_tensor.reshape(o_shape)
示例#6
0
    def apply(self, model):
        # change external to decoupled and warn user
        # this way we are sure we have exactly one input/output
        modified_fc_nodes = []
        for node in model.graph.node:
            # verify assumptions
            assert is_fpgadataflow_node(
                node), "Found non-fpgadataflow node: " + str(node)
            assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
            node = getCustomOp(node)
            node.set_nodeattr("inFIFODepth", self.max_depth)
            node.set_nodeattr("outFIFODepth", self.max_depth)
            if node.onnx_node.op_type == "StreamingFCLayer_Batch":
                mmode = node.get_nodeattr("mem_mode")
                if mmode == "external":
                    modified_fc_nodes.append(node.onnx_node.name)
                    node.set_nodeattr("mem_mode", "decoupled")
                    reset_implementation(node)
                    warnings.warn(
                        "Changed mem_mode from external to decoupled for " +
                        node.onnx_node.name)

        # insert stream infrastructure (DWC/FIFO)
        model = model.transform(InsertDWC())
        model = model.transform(InsertFIFO())
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(GiveReadableTensorNames())

        # gather FIFO names, check they are of expected depth
        fifos = {}
        for node in model.graph.node:
            if node.op_type == "StreamingFIFO":
                fifos[node.name] = 0
                node = getCustomOp(node)
                # check depths and fix as necessary
                if node.get_nodeattr("depth") != self.max_depth:
                    node.set_nodeattr("depth", self.max_depth)

        # insert FIFOs and do all transformations for RTLsim
        model = model.transform(AnnotateCycles())
        perf = model.analysis(dataflow_performance)
        latency = perf["critical_path_cycles"]
        max_cycles = perf["max_cycles"]
        model = model.transform(PrepareIP(self.fpgapart, self.clk_ns))
        model = model.transform(HLSSynthIP())
        model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
        model.set_metadata_prop("exec_mode", "rtlsim")

        # calculate input frequency (number of cycles for each input word)
        first_node = getCustomOp(model.graph.node[0])
        ncycles_per_input = max(
            1,
            int(
                math.ceil(perf["max_cycles"] /
                          (np.prod(first_node.get_folded_input_shape()) /
                           first_node.get_folded_input_shape()[-1]))),
        )

        # set sufficiently large threshold for 1 image to  fully execute and exit
        ncycles = int(latency + max_cycles)

        # prepare pyverilator model
        sim = pyverilate_stitched_ip(model)

        reset_rtlsim(sim)
        toggle_clk(sim)

        # set all input valids to 0 and output readies to 1
        # set input data to some constant
        set_signal(sim, "tvalid", 0)
        set_signal(sim, "tready", 1)
        set_signal(sim, "tdata", 0)

        output_detected = False
        while ncycles > 0:
            toggle_clk(sim)
            # set/unset valids
            if ncycles % ncycles_per_input == 0:
                set_signal(sim, "tvalid", 1)
            else:
                set_signal(sim, "tvalid", 0)

            # check/update all fifo counts
            for key in fifos:
                current_state = sim.internals["finn_design_i"][key]["inst"][
                    key + "_" + key]["state"]
                current_addr = sim.internals["finn_design_i"][key]["inst"][
                    key + "_" + key]["addr"]
                if current_state == 2:
                    current_count = current_addr + 2
                else:
                    current_count = current_state
                if current_count > fifos[key]:
                    fifos[key] = current_count

            # since latency estimation is very pessimistic, detect first output
            # and fast-forward the sim
            if get_signal(sim, "tvalid") != 0 and not output_detected:
                ncycles = max_cycles
                output_detected = True
            else:
                ncycles = ncycles - 1

        if not output_detected:
            warnings.warn(
                "No output detected, calculated FIFO depths may not be correct"
            )

        # Apply depths back into the model;
        # also set in/outFIFODepth to zero for non-FIFO
        # nodes, preventing further FIFO insertion
        for node in model.graph.node:
            # set FIFO depth, reset FIFO implementation,
            # and set implementation/ram styles
            if node.op_type == "StreamingFIFO":
                assert node.name in fifos, "FIFO node not found in size dictionary"
                # set depth of FIFO
                depth = optimize_depth(fifos[node.name])
                node_inst = getCustomOp(node)
                node_inst.set_nodeattr("depth", depth)
                # Set FIFO implementation/ram styles
                if depth > self.max_qsrl_depth:
                    node_inst.set_nodeattr("impl_style", "vivado")
                    node_inst.set_nodeattr("ram_style", self.vivado_ram_style)
                else:
                    node_inst.set_nodeattr("impl_style", "rtl")
                # reset implementation
                reset_implementation(node_inst)
                del fifos[node.name]
            else:
                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
                # for every FC node we changed from external to decoupled,
                # change back and reset implementation
                if node.op_type == "StreamingFCLayer_Batch":
                    if node.name in modified_fc_nodes:
                        node_inst = getCustomOp(node)
                        node_inst.set_nodeattr("mem_mode", "external")
                        reset_implementation(node_inst)
                        modified_fc_nodes.remove(node.name)

        assert (len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
                ), "FIFO/FC nodes left untouched after model reconfiguration"

        # handle custom sizing for SWG FIFOs if desired
        if self.swg_exception:
            model = model.transform(
                CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth))
        # remove shallow FIFOs
        model = model.transform(RemoveShallowFIFOs())

        return (model, False)