예제 #1
0
def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch,
                                       exec_mode):
    ifm_dim_h = ifm_dim
    k_h = k
    if dim_1d:
        ifm_dim_w = 1
        k_w = 1
    else:
        ifm_dim_w = ifm_dim_h
        k_w = k_h
    ifm_dim = (ifm_dim_h, ifm_dim_w)
    k = (k_h, k_w)

    stride_h = k_h
    stride_w = k_w
    ofm_dim_h = int(((ifm_dim_h - k_h) / stride_h) + 1)
    ofm_dim_w = int(((ifm_dim_w - k_w) / stride_w) + 1)
    ofm_dim = (ofm_dim_h, ofm_dim_w)
    if idt == DataType["BIPOLAR"] and dim_1d:
        pytest.skip("Skipping binary StreamingMaxPool_1d (not implemented)")
    if ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0:
        pytest.skip("Skipping StreamingMaxPool test w/ ImgDim % PoolDim != 0")

    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
    # prepare input data
    input_dict = prepare_inputs(x)

    golden = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim,
                                                  idt)
    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]

    model = make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim,
                                                      ofm_dim, idt)

    if exec_mode == "cppsim":
        model = model.transform(SetExecMode("cppsim"))
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception(
            "Unknown exec_mode in test_layer_streaming_maxpool_batch")

    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    assert (y_produced == y_expected).all()

    if exec_mode == "rtlsim":
        node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
        assert exp_cycles != 0
예제 #2
0
def test_end2end_tfc_w1a1_verify_dataflow_part():
    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx")
    x = np.zeros((1, 784), dtype=np.float32)
    inp_name = model.graph.input[0].name
    out_name = model.graph.output[0].name
    inp_dict = {inp_name: x}
    # cppsim
    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())
    model = model.transform(SetExecMode("cppsim"))
    model.save(build_dir + "/end2end_tfc_w1a1_ipstitch_cppsim.onnx")
    ret_cppsim = execute_onnx(model, inp_dict, True)
    res_cppsim = ret_cppsim[out_name]
    # node-by-node rtlsim
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(PrepareRTLSim())
    model.save(build_dir + "/end2end_tfc_w1a1_ipstitch_nodebynode_rtlsim.onnx")
    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
    # whole-network (ip-stitched) rtlsim
    model.set_metadata_prop("exec_mode", "rtlsim")
    model.save(build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx")
    ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
    res_rtlsim_whole = ret_rtlsim_whole[out_name]
    assert np.isclose(res_cppsim, res_rtlsim_nodebynode).all()
    assert np.isclose(res_cppsim, res_rtlsim_whole).all()
def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride):
    simd = ifm_ch
    ofm_dim = int(((ifm_dim - k) / stride) + 1)

    x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))
    model = make_single_slidingwindow_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim,
                                                   simd, stride, idt)
    model = model.transform(SetExecMode("npysim"))
    model = model.transform(CodeGen_npysim())
    model = model.transform(Compile())

    # prepare input data
    input_dict = prepare_inputs(x, idt)

    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    y_expected = im2col_indices(x, k, stride)
    # reshape expected output to match node output
    oshape = y_produced.shape
    y_expected = y_expected.reshape(oshape)

    assert (y_produced == y_expected).all(), "npysim failed"

    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
    model = model.transform(HLSSynth_IPGen())
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    assert (y_produced == y_expected).all(), "rtlsim failed"
예제 #4
0
def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode):
    np.random.seed(0)
    if fold == -1:
        pe = 1
    else:
        pe = labels // fold
    assert labels % pe == 0

    if k == -1:
        k = labels

    # generate input data
    x = gen_finn_dt_tensor(idt, (1, labels))

    model = make_labelselect_modelwrapper(labels, pe, k, idt)

    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    # prepare input data and execute
    input_dict = prepare_inputs(x, idt)
    y = oxe.execute_onnx(model, input_dict)["outp"]

    assert soft_verify_topk(x, y, k), exec_mode + " failed"
def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode):
    stride = k
    ofm_dim = int(((ifm_dim - k) / stride) + 1)
    if ifm_dim % k != 0:
        pytest.skip("Skipping StreamingMaxPool test w/ ImgDim % PoolDim != 0")

    x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
    # prepare input data
    input_dict = prepare_inputs(x)

    golden = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim,
                                                  idt)
    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]

    model = make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim,
                                                      ofm_dim, idt)

    if exec_mode == "cppsim":
        model = model.transform(SetExecMode("cppsim"))
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")

    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    assert (y_produced == y_expected).all()
예제 #6
0
def test_fpgadataflow_packed_dsp(ich, och, idim, k, s, pad, wdt, idt, tdt, odt, mode):
    model = make_model(ich, och, idim, k, s, pad, wdt, idt, tdt, odt)
    cdp_model = model.transform(InferDoublePackedConv())
    assert (len(cdp_model.graph.node) == 3 and
            cdp_model.graph.node[1].op_type == "ConvDoublePacked_Batch" and
            cdp_model.graph.node[0].op_type == "Transpose" and
            cdp_model.graph.node[-1].op_type == "Transpose"), "Incorrect model"
    # execute models and compare
    x = gen_finn_dt_tensor(idt, (1, ich, idim, idim))
    input_dict = {"inp": x}
    y_expected = oxe.execute_onnx(model, input_dict)["outp"]

    if mode == "cppsim":
        cdp_model = cdp_model.transform(SetExecMode("cppsim"))
        cdp_model = cdp_model.transform(PrepareCppSim())
        cdp_model = cdp_model.transform(CompileCppSim())
        y_produced = oxe.execute_onnx(cdp_model, input_dict)["outp"]
    elif mode == "rtlsim":
        cdp_model = cdp_model.transform(SetExecMode("rtlsim"))
        cdp_model = cdp_model.transform(GiveUniqueNodeNames())
        cdp_model = cdp_model.transform(GiveReadableTensorNames())
        cdp_model = cdp_model.transform(PrepareIP("xc7z020clg400-1", 5))
        cdp_model = cdp_model.transform(HLSSynthIP())
        cdp_model = cdp_model.transform(PrepareRTLSim())
        input_dict = {"global_in": x}
        y_produced = oxe.execute_onnx(cdp_model, input_dict)["global_out"]

    assert (y_produced.flatten() == y_expected.flatten()).all(), "cppsim failed"
예제 #7
0
def test_end2end_cnv_w1a1_verify_dataflow_part():
    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
    x = np.zeros((1, 32, 32, 3), dtype=np.float32)
    inp_name = model.graph.input[0].name
    out_name = model.graph.output[0].name
    inp_dict = {inp_name: x}
    # cppsim
    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())
    model = model.transform(SetExecMode("cppsim"))
    model.save(build_dir + "/end2end_cnv_w1a1_ipgen_cppsim.onnx")
    ret_cppsim = execute_onnx(model, inp_dict, True)
    res_cppsim = ret_cppsim[out_name]
    # node-by-node rtlsim
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(PrepareRTLSim())
    model.save(build_dir + "/end2end_cnv_w1a1_ipgen_nodebynode_rtlsim.onnx")
    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
    # whole-network (ip-stitched) rtlsim
    model.set_metadata_prop("exec_mode", "rtlsim")
    model.save(build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx")
    # this is a particularly long-running test, set liveness thr. to unlimited
    os.environ["LIVENESS_THRESHOLD"] = "-1"
    ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
    res_rtlsim_whole = ret_rtlsim_whole[out_name]
    assert np.isclose(res_cppsim, res_rtlsim_nodebynode).all()
    assert np.isclose(res_cppsim, res_rtlsim_whole).all()
def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs,
                                      exec_mode):
    if nf == -1:
        nf = ich
    pe = ich // nf
    assert ich % pe == 0

    # generate input and param data
    x = gen_finn_dt_tensor(idt, tuple(vecs + [ich]))
    # C = np.random.randint(idt.min(), idt.max() + 1, ich).astype(np.float32)
    C = gen_finn_dt_tensor(pdt, (ich))

    odt = act

    model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs)

    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    # package input data as dictionary
    input_dict = {"inp": x}

    oshape = model.get_tensor_shape("outp")

    C_reshaped = np.broadcast_to(C.flatten(), x.shape)
    if func == "add":
        y = x + C_reshaped
    elif func == "mul":
        y = x * C_reshaped

    y_expected = y.reshape(oshape)
    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]

    y_produced = y_produced.reshape(y_expected.shape)

    assert (y_produced == y_expected).all(), "cppsim failed"

    if exec_mode == "rtlsim":
        hls_synt_res_est = model.analysis(hls_synth_res_estimation)
        assert "ChannelwiseOp_Batch_0" in hls_synt_res_est

        node = model.get_nodes_by_op_type("ChannelwiseOp_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0
예제 #9
0
def test_convert_to_hls_channelwise_layer(pdt, idt, onnx_op_name, scalar_param,
                                          exec_mode):
    ifm_ch = 16
    ifm_dim = 5
    ishape = (1, ifm_ch, ifm_dim, ifm_dim)
    if scalar_param:
        pshape = (1, )
    else:
        pshape = (1, ifm_ch, 1, 1)

    np.random.seed(0)
    model = make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt,
                                             pshape)

    # Since the aren't Data types with a bit width of a non power of 2,
    # there are cases where the input won't use it full range.
    if idt == DataType["INT32"]:
        x = gen_finn_dt_tensor(DataType["INT16"],
                               (1, ifm_ch, ifm_dim, ifm_dim))
    elif idt == DataType["UINT32"]:
        x = gen_finn_dt_tensor(DataType["UINT16"],
                               (1, ifm_ch, ifm_dim, ifm_dim))
    else:
        x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))

    input_dict = prepare_inputs(x)
    y_expected = oxe.execute_onnx(model, input_dict)["outp"]

    new_model = model.transform(to_hls.InferChannelwiseLinearLayer())
    new_model = new_model.transform(GiveUniqueNodeNames())

    if exec_mode == "cppsim":
        new_model = new_model.transform(PrepareCppSim())
        new_model = new_model.transform(CompileCppSim())
        new_model = new_model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        new_model = new_model.transform(SetExecMode("rtlsim"))
        new_model = new_model.transform(GiveUniqueNodeNames())
        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
        new_model = new_model.transform(HLSSynthIP())
        new_model = new_model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    ctx_produced = oxe.execute_onnx(new_model,
                                    input_dict,
                                    return_full_exec_context=True)
    y_produced = ctx_produced["outp"]

    assert (y_produced == y_expected).all()
    assert new_model.graph.node[1].op_type == "ChannelwiseOp_Batch"
def test_fpgadataflow_slidingwindow(
    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw
):
    ofm_dim = int(((ifm_dim - k) / stride) + 1)

    x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
    model = make_single_slidingwindow_modelwrapper(
        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw
    )

    if exec_mode == "cppsim":
        model = model.transform(SetExecMode("cppsim"))
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")

    # prepare input data
    input_dict = prepare_inputs(x)
    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    golden = make_single_im2col_modelwrapper(
        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
    )
    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]

    if dw == 0:
        assert (y_produced == y_expected).all()
    else:
        y_expected = y_expected.reshape(
            1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd
        )
        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
        y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k)
        assert (y_produced == y_expected).all()

    if exec_mode == "rtlsim":
        node = model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0
예제 #11
0
def test_fpgadataflow_fclayer_npysim(idt, wdt, act, nf, sf, mw, mh):
    if nf == -1:
        nf = mh
    if sf == -1:
        sf = mw
    pe = mh // nf
    simd = mw // sf
    assert mh % pe == 0
    assert mw % sf == 0
    # generate weights
    W = gen_finn_dt_tensor(wdt, (mw, mh))
    # generate input data
    x = gen_finn_dt_tensor(idt, (1, mw))
    if act is None:
        # no activation, produce accumulators
        T = None
        tdt = None
        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
            odt = DataType.UINT32
        else:
            odt = DataType.INT32
    else:
        odt = act
        (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
        n_steps = act.get_num_possible_values() - 1
        T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
        # provide non-decreasing thresholds
        T = np.sort(T, axis=1)
        # generate thresholds for activation
        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
            tdt = DataType.UINT32
            # bias thresholds to be positive
            T = np.ceil((T + mw) / 2)
            assert (T >= 0).all()
        else:
            tdt = DataType.INT32
    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
    model = model.transform(SetExecMode("npysim"))
    model = model.transform(CodeGen_npysim())
    model = model.transform(Compile())
    # prepare input data
    input_dict = prepare_inputs(x, idt, wdt)
    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
        # convert inputs to binary and use xnorpopcountmatmul
        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
    else:
        y = np.matmul(x, W)
    if T is not None:
        y = multithreshold(y, T)
        if act == DataType.BIPOLAR:
            # binary to bipolar
            y = 2 * y - 1
        else:
            # signed offset
            y += act.min()
    oshape = model.get_tensor_shape("outp")
    y_expected = y.reshape(oshape)
    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "npysim failed"
예제 #12
0
def test_partitioned_model_using_rtlsim(stage,
                                        parent_model,
                                        list_of_child_models,
                                        input_dict,
                                        produced_golden,
                                        src_dir="/tmp/finn_dev_justin",
                                        return_full_exec_context=False):
    print("Making deep copies of each of the models")
    parent_model_for_rtlsim = deepcopy(parent_model)
    list_of_child_models_for_rtlsim = deepcopy(list_of_child_models)
    num_child_models = len(list_of_child_models_for_rtlsim)
    prepped_child_model_filepaths = []
    # Prepare each child model for RTL sim testing
    for i in range(0, num_child_models):
        print(f"Preparing child model {i}")
        list_of_child_models_for_rtlsim[i] = prep_model_for_rtlsim(
            list_of_child_models_for_rtlsim[i], src_dir)
        prepped_child_model_path = f"/workspace/finn/tutorial/sfc_onnx_models/SFC1W1A_Child_RTLSim{i}.onnx"
        list_of_child_models_for_rtlsim[i].save(prepped_child_model_path)
        prepped_child_model_filepaths.append(prepped_child_model_path)
    parent_model_for_rtlsim = attach_child_models_to_parent_model(
        parent_model_for_rtlsim, prepped_child_model_filepaths)
    parent_model_for_rtlsim = parent_model_for_rtlsim.transform(
        SetExecMode("rtlsim"))
    parent_model_for_rtlsim.save("/workspace/finn/tutorial/parent_rtlsim.onnx")
    print(f"Running RTL Simulation")
    output = test_onnx_model(stage,
                             parent_model_for_rtlsim,
                             input_dict,
                             produced_golden,
                             return_full_exec_context=return_full_exec_context)
    if (return_full_exec_context):
        print(f"Context from run: {output}")
    parent_model_for_rtlsim.save("/workspace/finn/tutorial/parent_rtlsim.onnx")
    return output
예제 #13
0
def test_end2end_mobilenet_rtlsim():
    model = load_test_checkpoint_or_skip(build_dir +
                                         "/end2end_mobilenet_ipgen.onnx")
    x = np.load(build_dir + "/end2end_mobilenet_input.npy")
    inp_name = model.graph.input[0].name
    out_name = model.graph.output[0].name
    inp_dict = {inp_name: x}
    # node-by-node rtlsim
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(PrepareRTLSim())
    model.save(build_dir + "/end2end_mobilenet_ipgen_nodebynode_rtlsim.onnx")
    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
    np.save(
        build_dir + "/end2end_mobilenet_result_rtlsim_nodebynode.npy",
        res_rtlsim_nodebynode,
    )
    a0 = np.load(build_dir + "/end2end_mobilenet_topk_scale.npy")
    res_rtlsim_nodebynode_prob = (
        ret_rtlsim_nodebynode[model.graph.node[-2].output[0]] * a0)
    np.save(
        build_dir + "/end2end_mobilenet_result_rtlsim_nodebynode_prob.npy",
        res_rtlsim_nodebynode_prob,
    )

    # check result with golden values
    golden = np.load(build_dir + "/end2end_mobilenet_golden_top5.npy")
    golden_prob = np.load(build_dir +
                          "/end2end_mobilenet_golden_top5_prob.npy")

    assert (golden == res_rtlsim_nodebynode).all()
    assert np.isclose(golden_prob, res_rtlsim_nodebynode_prob).all()
예제 #14
0
def test_end2end_mobilenet_cppsim():
    model = load_test_checkpoint_or_skip(build_dir +
                                         "/end2end_mobilenet_folded.onnx")
    x = np.load(build_dir + "/end2end_mobilenet_input.npy")
    inp_name = model.graph.input[0].name
    out_name = model.graph.output[0].name
    inp_dict = {inp_name: x}
    start = time.time()
    # cppsim
    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())
    model = model.transform(SetExecMode("cppsim"))
    end = time.time()
    elapsed_time = end - start
    f = open(build_dir + "/end2end_mobilenet_compile_time.txt", "w+")
    f.write("Execution time in seconds: " + str(elapsed_time))
    f.close()
    model.save(build_dir + "/end2end_mobilenet_cppsim.onnx")
    ret_cppsim = execute_onnx(model, inp_dict, True)
    res_cppsim = ret_cppsim[out_name]
    np.save(build_dir + "/end2end_mobilenet_result_cppsim.npy", res_cppsim)
    a0 = np.load(build_dir + "/end2end_mobilenet_topk_scale.npy")
    res_cppsim_prob = ret_cppsim[model.graph.node[-2].output[0]] * a0
    np.save(build_dir + "/end2end_mobilenet_result_cppsim_prob.npy",
            res_cppsim_prob)

    # check result with golden values
    golden = np.load(build_dir + "/end2end_mobilenet_golden_top5.npy")
    golden_prob = np.load(build_dir +
                          "/end2end_mobilenet_golden_top5_prob.npy")

    assert (golden == res_cppsim).all()
    assert np.isclose(golden_prob, res_cppsim_prob).all()
예제 #15
0
def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding):
    idt = wdt = DataType.INT4
    ifm_dim = 6
    ifm_ch = 4

    # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold)
    model = set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride,
                                   padding)

    input_tensor = gen_finn_dt_tensor(idt, [1, ifm_dim, ifm_dim, ifm_ch])
    input_dict = {"inp": input_tensor}

    new_model = model.transform(InferConvInpGen())
    new_model = new_model.transform(InferVVAU())

    # set SIMD in ConvInputGen node and PE in VVAU node

    for n in new_model.graph.node:
        if n.op_type == "ConvolutionInputGenerator":
            convinputgen_node = getCustomOp(n)
            convinputgen_node.set_nodeattr("SIMD", pe)
        elif n.op_type == "Vector_Vector_Activate_Batch":
            vvau_node = getCustomOp(n)
            vvau_node.set_nodeattr("PE", pe)

    new_model = new_model.transform(SetExecMode("rtlsim"))
    new_model = new_model.transform(GiveUniqueNodeNames())
    new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
    new_model = new_model.transform(HLSSynthIP())
    new_model = new_model.transform(PrepareRTLSim())

    assert oxe.compare_execution(model, new_model, input_dict)
예제 #16
0
def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode):
    if fold == -1:
        pe = 1
    else:
        pe = max(1, ch // fold)
    assert ch % pe == 0

    # generate input data
    x1 = gen_finn_dt_tensor(idt, (1, ch))
    x2 = gen_finn_dt_tensor(idt, (1, ch))

    model = make_addstreams_modelwrapper(ch, pe, idt)

    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    # prepare input data
    input_dict = prepare_inputs(x1, x2)

    oshape = model.get_tensor_shape("outp")
    y = x1 + x2
    y_expected = y.reshape(oshape)
    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    y_produced = y_produced.reshape(y_expected.shape)

    assert (y_produced == y_expected).all(), exec_mode + " failed"

    if exec_mode == "rtlsim":
        node = model.get_nodes_by_op_type("AddStreams_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0
예제 #17
0
def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode):
    if fold == -1:
        pe = 1
    else:
        pe = ch // fold
    assert ch % pe == 0

    # generate input data
    x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch))

    model = make_accpool_modelwrapper(ch, pe, imdim, idt)

    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    # prepare input data and execute
    input_dict = prepare_inputs(x, idt)
    y = oxe.execute_onnx(model, input_dict)["outp"]
    expected_y = np.sum(x, axis=(1, 2)).flatten()

    assert (y == expected_y).all(), exec_mode + " failed"

    if exec_mode == "rtlsim":
        node = model.get_nodes_by_op_type("GlobalAccPool_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        # commented out, needs performance debug:
        # test_fpgadataflow_globalaccpool[rtlsim-7-1-64-DataType.UINT4]
        # assert False where False =
        # <function isclose at 0x7eff26d5ca60>(50, 103, atol=(0.1 * 103))
        # assert np.isclose(exp_cycles, cycles_rtlsim, atol=0.1 * cycles_rtlsim)
        assert exp_cycles != 0
        assert cycles_rtlsim != 0
예제 #18
0
def prep_model_for_rtlsim(model, src_dir="/tmp/finn_dev_justin"):
    # Make copies of all IP Cores, so that ReplaceVerilogRelPaths is not a permanent change
    randstring = get_rand_string(10)
    copy_dir = "/tmp/finn_dev_justin/rtlsim_" + randstring
    print(f"Copying IP to folder {copy_dir}")
    model = copy_ip(model, copy_dir, src_dir)
    model = model.transform(ReplaceVerilogRelPaths())
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(PrepareRTLSim())
    return model
예제 #19
0
def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode):
    if fold == -1:
        pe = 1
    else:
        pe = ch // fold
    assert ch % pe == 0

    # generate input data
    x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch))

    model = make_dupstreams_modelwrapper(ch, pe, imdim, idt)

    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    # prepare input data and execute
    input_dict = prepare_inputs(x, idt)
    output_dict = oxe.execute_onnx(model, input_dict)
    y0 = output_dict["outp0"]
    y1 = output_dict["outp1"]
    expected_y = x

    assert (y0 == expected_y).all(), exec_mode + " failed"
    assert (y1 == expected_y).all(), exec_mode + " failed"

    if exec_mode == "rtlsim":
        node = model.get_nodes_by_op_type("DuplicateStreams_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0
def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode):
    stride = k
    ofm_dim = int(((ifm_dim - k) / stride) + 1)
    if ifm_dim % k != 0:
        pytest.skip("Skipping StreamingMaxPool test w/ ImgDim % PoolDim != 0")

    x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
    # prepare input data
    input_dict = prepare_inputs(x)

    golden = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim,
                                                  idt)
    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]

    model = make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim,
                                                      ofm_dim, idt)

    if exec_mode == "cppsim":
        model = model.transform(SetExecMode("cppsim"))
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")

    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    assert (y_produced == y_expected).all()

    if exec_mode == "rtlsim":
        node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
        assert exp_cycles != 0
예제 #21
0
def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode):
    ishape = (1, 10)
    num_embeddings, idt, embedding_dim = embedding_cfg
    eshape = (num_embeddings, embedding_dim)
    exp_oshape = tuple(list(ishape) + [embedding_dim])
    embeddings = gen_finn_dt_tensor(edt, eshape)
    model = make_lookup_model(embeddings, ishape, idt, edt)
    assert len(model.graph.node) == 1
    assert model.graph.node[0].op_type == "Gather"
    iname = model.graph.input[0].name
    ename = model.graph.node[0].input[0]
    oname = model.graph.output[0].name
    assert model.get_tensor_datatype(iname) == idt
    assert model.get_tensor_datatype(ename) == edt
    assert model.get_tensor_datatype(oname) == edt
    assert tuple(model.get_tensor_shape(ename)) == eshape
    assert tuple(model.get_tensor_shape(oname)) == exp_oshape
    assert (model.get_initializer(ename) == embeddings).all()
    itensor = gen_finn_dt_tensor(idt, ishape).astype(np.int64)
    itensor = np.clip(itensor, 0, num_embeddings - 1)
    ret = execute_onnx(model, {iname: itensor})
    exp_out = np.take(embeddings, itensor, axis=0)
    assert (exp_out == ret[oname]).all()
    # call transformation to convert to HLS and verify conversion
    model = model.transform(InferLookupLayer())
    assert model.graph.node[0].op_type == "Lookup"
    assert model.graph.node[0].input[0] == iname
    assert model.graph.node[0].input[1] == ename
    assert model.graph.node[0].output[0] == oname
    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 10))
        model = model.transform(HLSSynthIP())
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(PrepareRTLSim())
    ret_sim = execute_onnx(model, {iname: itensor})
    assert (exp_out == ret_sim[oname]).all()
예제 #22
0
def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
    if num_ch % simd != 0:
        pytest.skip(" num_ch % simd != 0, skipping")
    # generate input data
    x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
    input_dict = {"inp": x}
    odim = idim + pad

    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt,
                                               pad_style)
    model = model.transform(InferShapes())
    model = model.transform(SetExecMode(mode))
    model = model.transform(GiveUniqueNodeNames())
    if mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
    elif mode == "rtlsim":
        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    expected_oshape = (1, odim, odim, num_ch)
    assert y_produced.shape == expected_oshape

    # calculate reference
    # calculate correct pad according to parameters
    if pad_style == 2:
        if pad % 2 == 0:
            pad_up = pad // 2
            pad_left = pad // 2
        else:
            pad_up = pad // 2 + 1
            pad_left = pad // 2 + 1
    else:
        pad_up = pad // 2
        pad_left = pad // 2

    pad_down = pad - pad_up
    pad_right = pad - pad_left

    y_expected = np.pad(x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right),
                            (0, 0)), "constant")

    assert (y_produced == y_expected).all()

    if mode == "rtlsim":
        node = model.get_nodes_by_op_type("FMPadding_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0
예제 #23
0
 def test_cppsim(self, topology, wbits, abits):
     prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold")
     model = load_test_checkpoint_or_skip(prev_chkpt_name)
     model = model.transform(PrepareCppSim())
     model = model.transform(CompileCppSim())
     model = model.transform(SetExecMode("cppsim"))
     cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim")
     model.save(cppsim_chkpt)
     parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
     (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
         topology, wbits, abits, return_topk=1
     )
     y = execute_parent(parent_chkpt, cppsim_chkpt, input_tensor_npy)
     assert np.isclose(y, output_tensor_npy).all()
예제 #24
0
def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Apply the folding configuration file onto the model to set folding (parallelization)
    and other attributes, if config file is specified."""

    if cfg.folding_config_file is not None:
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(ApplyConfig(cfg.folding_config_file))

    if VerificationStepType.FOLDED_HLS_CPPSIM in cfg._resolve_verification_steps():
        # prepare cppsim
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
        verify_step(model, cfg, "folded_hls_cppsim", need_parent=True)
    return model
예제 #25
0
def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):

    # generate input data
    x = gen_finn_dt_tensor(finn_dtype, Shape)
    input_dict = prepare_inputs(x, finn_dtype)

    model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype)

    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(InsertTLastMarker())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(PrepareRTLSim())
    y = oxe.execute_onnx(model, input_dict)["outp"]
    assert (
        y == x
    ).all(), """The output values are not the same as the
       input values anymore."""
    assert y.shape == tuple(Shape), """The output shape is incorrect."""

    model = model.transform(ReplaceVerilogRelPaths())
    model = model.transform(CreateStitchedIP(test_fpga_part))
    model = model.transform(MakePYNQProject(test_pynq_board))
    model = model.transform(SynthPYNQProject())
    model = model.transform(MakePYNQDriver())
    ip = os.environ["PYNQ_IP"]
    username = os.getenv("PYNQ_USERNAME", "xilinx")
    password = os.getenv("PYNQ_PASSWORD", "xilinx")
    port = os.getenv("PYNQ_PORT", 22)
    target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
    model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))

    res = throughput_test(model)
    expected_dict = {}
    expected_dict["runtime[ms]"] = []
    expected_dict["throughput[images/s]"] = []
    expected_dict["DRAM_in_bandwidth[Mb/s]"] = []
    expected_dict["DRAM_out_bandwidth[Mb/s]"] = []
    for key in expected_dict:
        assert (
            key in res
        ), """Throughput test not successful, no value for {}
        in result dictionary""".format(
            key
        )
예제 #26
0
def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype):

    # generate input data
    x = gen_finn_dt_tensor(finn_dtype, Shape)
    input_dict = prepare_inputs(x, finn_dtype)

    model = make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype)

    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
    model = model.transform(HLSSynthIP())
    model = model.transform(PrepareRTLSim())
    y = oxe.execute_onnx(model, input_dict)["outp"]

    assert (y == x).all(), """The output values are not the same as the
        input values anymore."""
    assert y.shape == tuple(Shape), """The output shape is incorrect."""
예제 #27
0
def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):

    # generate input data
    x = gen_finn_dt_tensor(finn_dtype, Shape)
    input_dict = prepare_inputs(x, finn_dtype)

    model = make_single_fifo_modelwrapper(Shape, depth, folded_shape,
                                          finn_dtype)

    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(PrepareRTLSim())
    y = oxe.execute_onnx(model, input_dict)["outp"]
    assert (y == x).all(), """The output values are not the same as the
       input values anymore."""
    assert y.shape == tuple(Shape), """The output shape is incorrect."""
예제 #28
0
def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
    if nf == -1:
        nf = ich
    pe = ich // nf
    assert ich % pe == 0

    # generate input data
    x = gen_finn_dt_tensor(idt, (1, ich))

    odt = act
    n_steps = act.get_num_possible_values() - 1
    T = np.random.randint(idt.min(),
                          idt.max() + 1, (ich, n_steps)).astype(np.float32)
    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
    # threshold of first channel is zero, while using BIPOLAR output)
    if act == DataType["BIPOLAR"]:
        T[0][0] = 0
    # provide non-decreasing thresholds
    T = np.sort(T, axis=1)

    if odt == DataType["BIPOLAR"]:
        actval = 0
    else:
        actval = odt.min()

    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval,
                                                  mem_mode)

    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    # package input data as dictionary
    input_dict = {"inp": x}

    y = multithreshold(x, T)
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        y = 2 * y - 1
    else:
        # signed offset
        y += act.min()

    oshape = model.get_tensor_shape("outp")
    y_expected = y.reshape(oshape)
    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]

    y_produced = y_produced.reshape(y_expected.shape)

    assert (y_produced == y_expected).all(), "cppsim failed"

    if exec_mode == "rtlsim":
        hls_synt_res_est = model.analysis(hls_synth_res_estimation)
        assert "Thresholding_Batch_0" in hls_synt_res_est

        node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0
def test_convert_to_hls_layers_cnv_w1a1(fused_activation):
    cnv = get_test_model_trained("CNV", 1, 1)
    bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv)
    model = ModelWrapper(export_onnx_path_cnv)
    model = model.transform(InferShapes())
    model = model.transform(FoldConstants())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())
    model = model.transform(Streamline())
    model = model.transform(LowerConvsToMatMul())
    model = model.transform(MakeMaxPoolNHWC())
    model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
    model = model.transform(ConvertBipolarMatMulToXnorPopcount())
    model = model.transform(Streamline())
    model = model.transform(InferDataLayouts())
    # model.save("golden.onnx")
    # load one of the test vectors
    fn = pk.resource_filename("finn.qnn-data",
                              "cifar10/cifar10-test-data-class3.npz")
    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
    input_tensor = input_tensor / 255
    assert input_tensor.shape == (1, 3, 32, 32)
    # generate expected value from streamlined net
    input_dict = {"global_in": input_tensor}
    expected_ctx = oxe.execute_onnx(model, input_dict, True)
    expected = expected_ctx[model.graph.output[0].name]

    # if we infer thresholding first, all MultiThresholds get converted to HLS
    # subsequently, the FC inference will generate passthrough MVAUs
    if not fused_activation:
        model = model.transform(to_hls.InferThresholdingLayer())
    model = model.transform(to_hls.InferBinaryStreamingFCLayer())
    model = model.transform(to_hls.InferQuantizedStreamingFCLayer())
    for node in model.graph.node:
        if node.op_type == "StreamingFCLayer_Batch":
            inst = getCustomOp(node)
            inst.set_nodeattr("mem_mode", "decoupled")
            mw = inst.get_nodeattr("MW")
            mh = inst.get_nodeattr("MH")
            if mh % 4 == 0:
                pe = mh // 4
            else:
                pe = mh
            inst.set_nodeattr("PE", pe)
            if mw % 16 == 0:
                simd = mw // 16
            else:
                simd = mw
            inst.set_nodeattr("SIMD", simd)
    model = model.transform(to_hls.InferConvInpGen())
    model = model.transform(to_hls.InferStreamingMaxPool())
    # check topology status
    finn_nodes = model.get_finn_nodes()
    if fused_activation:
        assert len(finn_nodes) == 18
    else:
        assert len(finn_nodes) == 26
        thr_nodes = model.get_nodes_by_op_type("Thresholding_Batch")
        assert len(thr_nodes) == 8
    non_finn_nodes = model.get_non_finn_nodes()
    assert len(non_finn_nodes) == 4
    exp_non_finn_nodes = ["Transpose", "Reshape", "Mul", "Add"]
    assert [x.op_type for x in non_finn_nodes] == exp_non_finn_nodes
    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    assert len(fc_nodes) == 9
    swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator")
    assert len(swg_nodes) == 6
    mp_nodes = model.get_nodes_by_op_type("StreamingMaxPool_Batch")
    assert len(mp_nodes) == 2
    # model.save("cnv-pre-compile.onnx")
    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())
    model = model.transform(SetExecMode("cppsim"))
    # model.save("cnv-post-compile.onnx")
    produced_ctx = oxe.execute_onnx(model, input_dict, True)
    produced = produced_ctx[model.graph.output[0].name]
    assert np.isclose(expected, produced, atol=1e-3).all()
    assert np.argmax(produced) == 3
    os.remove(export_onnx_path_cnv)
def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
    kernel_size, stride, pad = conv_config
    np.random.seed(0)
    idt = DataType.UINT4

    in_feature_dim = 7
    in_chn = 16

    if depthwise is True:
        group = out_chn = in_chn
        conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
    else:
        group = 1
        out_chn = 20
        conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size]

    out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad)

    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
    output_shape = [1, out_chn, out_feature_dim, out_feature_dim]

    conv_weight_dt = DataType.UINT4

    conv_config = {}
    conv_config["dilations"] = [1, 1]
    conv_config["group"] = group
    conv_config["kernel_shape"] = [kernel_size, kernel_size]
    conv_config["pads"] = [pad, pad, pad, pad]
    conv_config["strides"] = [stride, stride]

    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
    value_info = [
        helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
    ]

    modelproto = helper.make_model(
        helper.make_graph(
            name="conv_test",
            inputs=[top_in],
            outputs=[top_out],
            value_info=value_info,
            nodes=[
                helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)
            ],
        )
    )

    model = ModelWrapper(modelproto)
    model.set_tensor_datatype("top_in", idt)
    model.set_tensor_datatype("top_out", idt)
    model.set_tensor_datatype("p1", conv_weight_dt)
    model.set_initializer("p1", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape))

    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())

    new_model = model.transform(LowerConvsToMatMul())
    new_model = new_model.transform(to_hls.InferConvInpGen())
    if depthwise is True:
        new_model = new_model.transform(to_hls.InferVVAU())
    else:
        new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
        fc_node = new_model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
        fc_inst = getCustomOp(fc_node)
        mw = fc_inst.get_nodeattr("MW")
        mh = fc_inst.get_nodeattr("MH")
        pe_cands = list(filter(lambda x: mh % x == 0, range(2, mh + 1)))
        simd_cands = list(filter(lambda x: mw % x == 0, range(2, mw + 1)))
        fc_inst.set_nodeattr("PE", pe_cands[0])
        fc_inst.set_nodeattr("SIMD", simd_cands[0])

    new_model = new_model.transform(GiveUniqueNodeNames())
    new_model = new_model.transform(InferShapes())
    new_model = new_model.transform(InferDataTypes())

    if exec_mode == "cppsim":
        new_model = new_model.transform(PrepareCppSim())
        new_model = new_model.transform(CompileCppSim())
        new_model = new_model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        new_model = new_model.transform(SetExecMode("rtlsim"))
        new_model = new_model.transform(GiveUniqueNodeNames())
        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
        new_model = new_model.transform(HLSSynthIP())
        new_model = new_model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    x = gen_finn_dt_tensor(idt, input_shape)
    inp_dict = {model.graph.input[0].name: x}
    assert oxe.compare_execution(model, new_model, inp_dict)
    if kernel_size == 1 and stride > 1 and pad == 0:
        assert new_model.graph.node[1].op_type == "DownSampler"
        if exec_mode == "rtlsim":
            node = new_model.get_nodes_by_op_type("DownSampler")[0]
            inst = getCustomOp(node)
            cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
            exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
            exp_cycles = exp_cycles_dict[node.name]
            assert np.isclose(exp_cycles, cycles_rtlsim, atol=11)
            assert exp_cycles != 0

    if pad == 1:
        padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
        padding_inst = getCustomOp(padding_node)
        assert padding_inst.get_nodeattr("SIMD") == in_chn

    if depthwise is True and exec_mode == "rtlsim":
        node = new_model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=11)
        assert exp_cycles != 0