def test_end2end_cnv_w1a1_fold_and_tlastmarker():
    """Fold the CNV-w1a1 dataflow model and add its stream infrastructure.

    Loads the dataflow model from ``build_dir``, writes PE / SIMD /
    inFIFODepth onto the StreamingFCLayer_Batch nodes, copies the SIMD of
    the i-th folding entry onto the i-th ConvolutionInputGenerator node,
    then applies InsertDWC, InsertFIFO, InsertTLastMarker,
    GiveUniqueNodeNames and AnnotateResources("estimate") before saving
    the result back into ``build_dir``.
    """
    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx")
    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    # one (PE, SIMD, in_fifo_depth) triple per FC layer, paired with the
    # layers in the order get_nodes_by_op_type returns them
    folding = [
        (16, 3, 128),
        (32, 32, 128),
        (16, 32, 128),
        (16, 32, 128),
        (4, 32, 81),
        (1, 32, 2),
        (1, 4, 2),
        (1, 8, 128),
        (5, 1, 3),
    ]
    for fc_node, fold_cfg in zip(fc_layers, folding):
        pe, simd, in_fifo_depth = fold_cfg
        fc_op = getCustomOp(fc_node)
        fc_op.set_nodeattr("PE", pe)
        fc_op.set_nodeattr("SIMD", simd)
        fc_op.set_nodeattr("inFIFODepth", in_fifo_depth)

    # the sliding window generators reuse the SIMD of the folding entry
    # that has the same position in the list
    swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
    for position, swg_node in enumerate(swg_layers):
        getCustomOp(swg_node).set_nodeattr("SIMD", folding[position][1])

    # each transformation is instantiated right before it is applied
    for make_transform in (
        InsertDWC,
        InsertFIFO,
        InsertTLastMarker,
        GiveUniqueNodeNames,
    ):
        model = model.transform(make_transform())
    model = model.transform(AnnotateResources("estimate"))
    model.save(build_dir + "/end2end_cnv_w1a1_folded.onnx")
def step_resnet50_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Insert and size the FIFOs of the model, then regenerate the new IPs.

    The FIFO sizing strategy is picked by ``cfg.auto_fifo_depths``:

    * truthy: apply `InsertAndSetFIFODepths`, parameterized with the
      resolved FPGA part, the resolved HLS clock period and
      ``cfg.large_fifo_mem_style.value`` as the Vivado RAM style.
    * falsy: the folding config file is assumed to carry the FIFO sizes.
      Apply `InsertDWC`, `InsertFIFO(create_shallow_fifos=True)`,
      `GiveUniqueNodeNames` and `GiveReadableTensorNames`, then
      `ApplyConfig(cfg.folding_config_file)` if a config file is set, and
      finally `RemoveShallowFIFOs`.

    In both cases the selected hardware attributes are written to
    ``final_hw_config.json`` under ``cfg.output_dir``, after which
    `PrepareIP`, `HLSSynthIP` and `ReplaceVerilogRelPaths` are applied.

    Returns the transformed model.
    """
    if not cfg.auto_fifo_depths:
        # manual sizing: the folding config json is expected to contain the
        # FIFO depths, so build the DWC/FIFO infrastructure first
        model = model.transform(InsertDWC())
        # every FIFO has to exist for ApplyConfig to be able to set its
        # depth, hence create_shallow_fifos=True
        all_fifos = InsertFIFO(create_shallow_fifos=True)
        model = model.transform(all_fifos)
        # node names must line up with the names used in the config file
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(GiveReadableTensorNames())
        if cfg.folding_config_file is not None:
            model = model.transform(ApplyConfig(cfg.folding_config_file))
        # drop the shallow FIFOs again
        model = model.transform(RemoveShallowFIFOs())
    else:
        auto_sizing = InsertAndSetFIFODepths(
            cfg._resolve_fpga_part(),
            cfg._resolve_hls_clk_period(),
            vivado_ram_style=cfg.large_fifo_mem_style.value,
        )
        model = model.transform(auto_sizing)

    # dump the final hardware configuration as json
    hw_attrs = [
        "PE",
        "SIMD",
        "ram_style",
        "depth",
        "impl_style",
        "resType",
        "mem_mode",
        "runtime_writeable_weights",
    ]
    final_config_path = cfg.output_dir + "/final_hw_config.json"
    extract_model_config_to_json(model, final_config_path, hw_attrs)

    # with the FIFOs in place, run PrepareIP and HLSSynthIP once more;
    # this will only run for the new nodes (e.g. FIFOs and DWCs)
    ip_preparation = PrepareIP(
        cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()
    )
    model = model.transform(ip_preparation)
    model = model.transform(HLSSynthIP())
    model = model.transform(ReplaceVerilogRelPaths())
    return model
def apply(self, model):
    """Build the model kernel by kernel and link it with Vitis.

    After checking the Vitis environment variables and inferring data
    layouts, the model is prepared globally (PYNQ driver for "alveo",
    IODMA insertion, DWC insertion), floorplanned using
    ``self.floorplan_file`` and split into StreamingDataflowPartition
    nodes. The model referenced by each partition is then turned into a
    stitched IP and a Vitis XO, and saved back to its file. Finally
    `VitisLink` is applied and the "platform" metadata property is set
    to "alveo".

    Returns the tuple ``(model, False)``.
    """
    _check_vitis_envvars()
    # data layouts have to be inferred before anything else
    model = model.transform(InferDataLayouts())

    # global preparation; names are refreshed after every step
    global_prep = [
        MakePYNQDriver(platform="alveo"),
        InsertIODMA(512),
        InsertDWC(),
    ]
    for prep_step in global_prep:
        model = model.transform(prep_step)
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(GiveReadableTensorNames())

    # floorplan, then break the design up into kernels
    model = (
        model.transform(Floorplan(floorplan=self.floorplan_file))
        .transform(CreateDataflowPartition())
        .transform(GiveUniqueNodeNames())
        .transform(GiveReadableTensorNames())
    )

    # every partition is built on its own
    for partition_node in model.get_nodes_by_op_type("StreamingDataflowPartition"):
        partition_op = getCustomOp(partition_node)
        kernel_path = partition_op.get_nodeattr("model")
        kernel_model = (
            ModelWrapper(kernel_path)
            .transform(InsertFIFO())
            .transform(InsertTLastMarker(both=True, external=False, dynamic=False))
            .transform(GiveUniqueNodeNames())
        )
        # checkpoint the kernel before the IP generation steps
        kernel_model.save(kernel_path)
        kernel_model = (
            kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns))
            .transform(HLSSynthIP())
            .transform(
                CreateStitchedIP(
                    self.fpga_part,
                    self.period_ns,
                    partition_op.onnx_node.name,
                    True,
                )
            )
            .transform(CreateVitisXO(partition_op.onnx_node.name))
        )
        kernel_model.set_metadata_prop("platform", "alveo")
        kernel_model.save(kernel_path)

    # put the design together from the individual kernels
    linker = VitisLink(
        self.platform,
        round(1000 / self.period_ns),
        strategy=self.strategy,
        enable_debug=self.enable_debug,
    )
    model = model.transform(linker)
    # the platform attribute is required for correct remote execution
    model.set_metadata_prop("platform", "alveo")
    return (model, False)
def test_ipstitch_rtlsim(self, topology, wbits, abits, kind):
    """Stitch the FIFO-sized model into one IP and verify it in rtlsim.

    Starts from the "fifodepth_<kind>" checkpoint (or skips), inserts
    DWCs, forces every StreamingFIFO to impl_style "rtl", generates and
    stitches the IPs and prepares the rtlsim. The result is saved as the
    "ipstitch_rtlsim_<kind>" checkpoint and executed through the
    "dataflow_parent" checkpoint on the golden input. The cycle count
    reported by rtlsim is pushed to the dashboard data and the output is
    compared against the golden output with ``np.isclose``.
    """
    fifodepth_chkpt = get_checkpoint_name(
        topology, wbits, abits, "fifodepth_" + kind
    )
    model = load_test_checkpoint_or_skip(fifodepth_chkpt)
    fpga_part = get_build_env(kind, target_clk_ns)["part"]
    model = (
        model.transform(InsertDWC())
        .transform(GiveUniqueNodeNames())
        .transform(AnnotateCycles())
    )
    perf = model.analysis(dataflow_performance)
    latency = perf["critical_path_cycles"]

    # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that
    for fifo_node in model.get_nodes_by_op_type("StreamingFIFO"):
        fifo_op = getCustomOp(fifo_node)
        fifo_op.set_nodeattr("impl_style", "rtl")

    model = (
        model.transform(PrepareIP(fpga_part, target_clk_ns))
        .transform(HLSSynthIP())
        .transform(CreateStitchedIP(fpga_part, target_clk_ns))
        .transform(PrepareRTLSim())
    )
    model.set_metadata_prop("exec_mode", "rtlsim")
    # liveness threshold: estimated critical path latency plus 10 percent
    os.environ["LIVENESS_THRESHOLD"] = str(int(latency * 1.1))
    if rtlsim_trace:
        trace_file = "%s_w%da%d.vcd" % (topology, wbits, abits)
        model.set_metadata_prop("rtlsim_trace", trace_file)
        os.environ["RTLSIM_TRACE_DEPTH"] = "3"

    rtlsim_chkpt = get_checkpoint_name(
        topology, wbits, abits, "ipstitch_rtlsim_" + kind
    )
    model.save(rtlsim_chkpt)
    parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
    golden_input, golden_output = get_golden_io_pair(
        topology, wbits, abits, return_topk=1
    )
    produced = execute_parent(parent_chkpt, rtlsim_chkpt, golden_input)

    # reload the checkpoint to read back the cycle count
    model = ModelWrapper(rtlsim_chkpt)
    perf["cycles_rtlsim"] = model.get_metadata_prop("cycles_rtlsim")
    update_dashboard_data(
        topology, wbits, abits, "cycles_rtlsim", perf["cycles_rtlsim"]
    )
    assert np.isclose(produced, golden_output).all()
def apply(self, model):
    """Build the model kernel by kernel and assemble a Zynq project.

    After inferring data layouts, the model is prepared globally (IODMA
    insertion, DWC insertion, floorplanning, dataflow partitioning). The
    model referenced by each StreamingDataflowPartition node gets FIFOs,
    node names prefixed with the partition's name, and is turned into a
    stitched IP that is saved back to its file. Finally `MakeZYNQProject`
    is applied, the "platform" metadata property is set to "zynq-iodma"
    and the PYNQ driver is created.

    Returns the tuple ``(model, False)``.
    """
    # data layouts have to be inferred before anything else
    model = model.transform(InferDataLayouts())

    # global preparation, ending with the break-up into kernels;
    # names are refreshed after every step
    global_prep = [
        InsertIODMA(64),
        InsertDWC(),
        Floorplan(),
        CreateDataflowPartition(),
    ]
    for prep_step in global_prep:
        model = model.transform(prep_step)
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(GiveReadableTensorNames())

    # every partition is built on its own
    for partition_node in model.get_nodes_by_op_type("StreamingDataflowPartition"):
        name_prefix = partition_node.name + "_"
        partition_op = getCustomOp(partition_node)
        kernel_path = partition_op.get_nodeattr("model")
        kernel_model = (
            ModelWrapper(kernel_path)
            .transform(InsertFIFO())
            .transform(GiveUniqueNodeNames(name_prefix))
        )
        # checkpoint the kernel before the IP generation steps
        kernel_model.save(kernel_path)
        kernel_model = (
            kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns))
            .transform(HLSSynthIP())
            .transform(
                CreateStitchedIP(
                    self.fpga_part,
                    self.period_ns,
                    partition_op.onnx_node.name,
                    True,
                )
            )
        )
        kernel_model.set_metadata_prop("platform", "zynq-iodma")
        kernel_model.save(kernel_path)

    # put the design together from the individual IPs
    project_builder = MakeZYNQProject(self.platform, enable_debug=self.enable_debug)
    model = model.transform(project_builder)
    # the platform attribute is required for correct remote execution
    model.set_metadata_prop("platform", "zynq-iodma")
    # the driver is created last
    model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
    return (model, False)
def test_end2end_tfc_w1a2_fold_and_tlastmarker():
    """Fold the TFC-w1a2 dataflow model and add its stream infrastructure.

    Loads the dataflow model from ``build_dir``, writes PE, SIMD,
    inFIFODepth, outFIFODepth and ram_style onto the
    StreamingFCLayer_Batch nodes, then applies InsertDWC, InsertFIFO,
    InsertTLastMarker, GiveUniqueNodeNames and
    AnnotateResources("estimate") before saving the result back into
    ``build_dir``.
    """
    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx")
    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    # node attribute behind each position of a config entry
    attr_names = ("PE", "SIMD", "inFIFODepth", "outFIFODepth", "ram_style")
    # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
    config = [
        (16, 49, 16, 64, "block"),
        (8, 8, 64, 64, "auto"),
        (8, 8, 64, 64, "auto"),
        (10, 8, 64, 10, "distributed"),
    ]
    for fc_node, layer_cfg in zip(fc_layers, config):
        fc_op = getCustomOp(fc_node)
        for attr_name, attr_value in zip(attr_names, layer_cfg):
            fc_op.set_nodeattr(attr_name, attr_value)

    model = (
        model.transform(InsertDWC())
        .transform(InsertFIFO())
        .transform(InsertTLastMarker())
        .transform(GiveUniqueNodeNames())
        .transform(AnnotateResources("estimate"))
    )
    model.save(build_dir + "/end2end_tfc_w1a2_folded.onnx")
def apply(self, model):
    """Insert FIFOs and size them from occupancies observed in an RTL sim.

    Steps, in order:

    1. Check that every node is an fpgadataflow node and that no
       StreamingFIFO exists yet; set inFIFODepth/outFIFODepth of every
       node to ``self.max_depth``. StreamingFCLayer_Batch nodes with
       mem_mode "external" are temporarily switched to "decoupled"
       (a warning is issued) and remembered.
    2. Insert DWCs and FIFOs, name nodes and tensors, and force the depth
       of every StreamingFIFO to ``self.max_depth``.
    3. Annotate cycles, generate and stitch the IPs, and simulate the
       stitched design while recording the largest count seen for each
       FIFO.
    4. Write the recorded counts (passed through ``optimize_depth``) back
       as FIFO depths, pick impl_style/ram_style based on
       ``self.max_qsrl_depth``, zero inFIFODepth/outFIFODepth on all
       non-FIFO nodes and restore mem_mode "external" where it was
       changed in step 1.
    5. Optionally apply `CapConvolutionFIFODepths` (if
       ``self.swg_exception``) and remove shallow FIFOs.

    Returns the tuple ``(model, False)``.

    Raises AssertionError if the graph violates the assumptions of step 1
    or if FIFO/FC nodes are left unprocessed after step 4.
    """
    # change external to decoupled and warn user
    # this way we are sure we have exactly one input/output
    modified_fc_nodes = []
    for node in model.graph.node:
        # verify assumptions
        assert is_fpgadataflow_node(
            node), "Found non-fpgadataflow node: " + str(node)
        assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
        # from here on, node refers to the custom op wrapper; the ONNX
        # node itself stays reachable as node.onnx_node
        node = getCustomOp(node)
        node.set_nodeattr("inFIFODepth", self.max_depth)
        node.set_nodeattr("outFIFODepth", self.max_depth)
        if node.onnx_node.op_type == "StreamingFCLayer_Batch":
            mmode = node.get_nodeattr("mem_mode")
            if mmode == "external":
                # remembered by name so mem_mode can be restored later
                modified_fc_nodes.append(node.onnx_node.name)
                node.set_nodeattr("mem_mode", "decoupled")
                reset_implementation(node)
                warnings.warn(
                    "Changed mem_mode from external to decoupled for " +
                    node.onnx_node.name)

    # insert stream infrastructure (DWC/FIFO)
    model = model.transform(InsertDWC())
    model = model.transform(InsertFIFO())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())

    # gather FIFO names, check they are of expected depth
    # fifos maps FIFO node name -> largest count observed so far
    fifos = {}
    for node in model.graph.node:
        if node.op_type == "StreamingFIFO":
            fifos[node.name] = 0
            node = getCustomOp(node)
            # check depths and fix as necessary
            if node.get_nodeattr("depth") != self.max_depth:
                node.set_nodeattr("depth", self.max_depth)

    # insert FIFOs and do all transformations for RTLsim
    model = model.transform(AnnotateCycles())
    perf = model.analysis(dataflow_performance)
    latency = perf["critical_path_cycles"]
    max_cycles = perf["max_cycles"]
    model = model.transform(PrepareIP(self.fpgapart, self.clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
    model.set_metadata_prop("exec_mode", "rtlsim")

    # calculate input frequency (number of cycles for each input word)
    # the divisor is the product of all folded input dimensions except
    # the last one; the result is clamped to at least 1
    first_node = getCustomOp(model.graph.node[0])
    ncycles_per_input = max(
        1,
        int(
            math.ceil(perf["max_cycles"] /
                      (np.prod(first_node.get_folded_input_shape()) /
                       first_node.get_folded_input_shape()[-1]))),
    )

    # set sufficiently large threshold for 1 image to fully execute and exit
    ncycles = int(latency + max_cycles)

    # prepare pyverilator model
    sim = pyverilate_stitched_ip(model)

    reset_rtlsim(sim)
    toggle_clk(sim)

    # set all input valids to 0 and output readies to 1
    # set input data to some constant
    set_signal(sim, "tvalid", 0)
    set_signal(sim, "tready", 1)
    set_signal(sim, "tdata", 0)

    output_detected = False
    while ncycles > 0:
        toggle_clk(sim)
        # set/unset valids
        if ncycles % ncycles_per_input == 0:
            set_signal(sim, "tvalid", 1)
        else:
            set_signal(sim, "tvalid", 0)

        # check/update all fifo counts
        for key in fifos:
            current_state = sim.internals["finn_design_i"][key]["inst"][
                key + "_" + key]["state"]
            current_addr = sim.internals["finn_design_i"][key]["inst"][
                key + "_" + key]["addr"]
            # NOTE(review): presumably state 2 marks a FIFO holding at
            # least two words, with addr counting the words beyond that;
            # confirm against the FIFO RTL
            if current_state == 2:
                current_count = current_addr + 2
            else:
                current_count = current_state
            if current_count > fifos[key]:
                fifos[key] = current_count

        # since latency estimation is very pessimistic, detect first output
        # and fast-forward the sim
        # on the first detected output the remaining budget is replaced
        # by max_cycles; in every other iteration it is decremented by one
        if get_signal(sim, "tvalid") != 0 and not output_detected:
            ncycles = max_cycles
            output_detected = True
        else:
            ncycles = ncycles - 1

    if not output_detected:
        warnings.warn(
            "No output detected, calculated FIFO depths may not be correct"
        )

    # Apply depths back into the model;
    # also set in/outFIFODepth to zero for non-FIFO
    # nodes, preventing further FIFO insertion
    for node in model.graph.node:
        # set FIFO depth, reset FIFO implementation,
        # and set implementation/ram styles
        if node.op_type == "StreamingFIFO":
            assert node.name in fifos, "FIFO node not found in size dictionary"
            # set depth of FIFO
            depth = optimize_depth(fifos[node.name])
            node_inst = getCustomOp(node)
            node_inst.set_nodeattr("depth", depth)
            # Set FIFO implementation/ram styles
            if depth > self.max_qsrl_depth:
                node_inst.set_nodeattr("impl_style", "vivado")
                node_inst.set_nodeattr("ram_style", self.vivado_ram_style)
            else:
                node_inst.set_nodeattr("impl_style", "rtl")
            # reset implementation
            reset_implementation(node_inst)
            # processed entries are removed so that leftovers can be
            # detected by the assertion after this loop
            del fifos[node.name]
        else:
            getCustomOp(node).set_nodeattr("inFIFODepth", 0)
            getCustomOp(node).set_nodeattr("outFIFODepth", 0)
            # for every FC node we changed from external to decoupled,
            # change back and reset implementation
            if node.op_type == "StreamingFCLayer_Batch":
                if node.name in modified_fc_nodes:
                    node_inst = getCustomOp(node)
                    node_inst.set_nodeattr("mem_mode", "external")
                    reset_implementation(node_inst)
                    modified_fc_nodes.remove(node.name)

    # both bookkeeping containers must have been emptied by the loop above
    assert (len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
            ), "FIFO/FC nodes left untouched after model reconfiguration"

    # handle custom sizing for SWG FIFOs if desired
    if self.swg_exception:
        model = model.transform(
            CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth))
    # remove shallow FIFOs
    model = model.transform(RemoveShallowFIFOs())

    return (model, False)