def finnpy_to_packed_bytearray(ndarray, dtype, reverse_inner=False, reverse_endian=False):
    """Pack the innermost dimension of an array into bytes.

    Given a numpy ndarray with FINN DataType dtype, pack the innermost
    dimension and return the packed representation as an ndarray of uint8.
    The packed innermost dimension is padded to the nearest multiple of
    8 bits. The returned ndarray has the same number of dimensions as the
    input.

    Arguments:
    * ndarray : array (or array-like) of values; converted to float32 if it
      is not already a float32 ndarray
    * dtype : FINN DataType of the values, used for its bitwidth and packing
    * reverse_inner : forwarded to pack_innermost_dim_as_hex_string
    * reverse_endian : if set, the byte order along the last axis of the
      result is reversed
    """
    if (not isinstance(ndarray, np.ndarray)) or ndarray.dtype != np.float32:
        # try to convert to a float numpy array (container dtype is float)
        ndarray = np.asarray(ndarray, dtype=np.float32)
    # pack innermost dim to hex strings padded to 8 bits
    bits = dtype.bitwidth() * ndarray.shape[-1]
    bits_padded = roundup_to_integer_multiple(bits, 8)
    packed_hexstring = pack_innermost_dim_as_hex_string(
        ndarray, dtype, bits_padded, reverse_inner=reverse_inner
    )

    def fn(x):
        return np.asarray(list(map(hexstring2npbytearray, x)))

    if packed_hexstring.ndim == 0:
        # scalar: ndarray.item() replaces np.asscalar, which is no longer
        # available in current NumPy releases
        ret = hexstring2npbytearray(packed_hexstring.item())
    else:
        # convert ndarray of hex strings to byte array
        ret = np.apply_along_axis(fn, packed_hexstring.ndim - 1, packed_hexstring)
    if reverse_endian:
        # reverse the endianness of packing dimension
        ret = np.flip(ret, axis=-1)
    return ret
def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=True):
    """Turn a NumPy array (or .npy file) into a flat sequence for rtlsim.

    The multidimensional array of integers (stored as floats) is flattened
    into a sequence of Python arbitrary-precision integers, packing the
    innermost dimension. See
    finn.util.basic.pack_innermost_dim_as_hex_string() for more info on how
    the packing works. If reverse_inner is set, the innermost dimension will
    be reversed prior to packing.

    Raises Exception if input_file is neither an ndarray nor an existing file.
    """
    # hex strings are produced below, so the width is rounded to 4-bit steps
    pad_to_nbits = roundup_to_integer_multiple(pad_to_nbits, 4)
    if isinstance(input_file, np.ndarray):
        inp = input_file
    elif os.path.isfile(input_file):
        inp = np.load(input_file)
    else:
        raise Exception("input_file must be ndarray or filename for .npy")

    nothing_to_pack = inp.shape[-1] == 1 and input_dtype.is_integer()
    if nothing_to_pack:
        # single integer element per innermost dim: a plain cast is enough
        return inp.flatten().astype(input_dtype.to_numpy_dt())

    hex_values = pack_innermost_dim_as_hex_string(
        inp, input_dtype, pad_to_nbits, reverse_inner=reverse_inner
    ).flatten()
    # drop the first two characters (presumably the "0x" prefix) before parsing
    return [int(hex_value[2:], 16) for hex_value in hex_values]
def numpy_to_hls_code(ndarray, dtype, hls_var_name, pack_innermost_dim=True, no_decl=False):
    """Return C++ code representation of a numpy ndarray.

    The array is interpreted with FINN DataType dtype, and hls_var_name is
    used as the resulting C++ variable name.

    Arguments:
    * ndarray : array (or array-like); converted to a float32 ndarray
    * dtype : FINN DataType providing the HLS type string and bitwidth
    * hls_var_name : name of the emitted C++ variable
    * pack_innermost_dim : if set, the innermost dimension is packed into a
      hex string (padded to a multiple of 4 bits) and the element type
      becomes ap_uint<N>
    * no_decl : if set, no variable name and type are emitted, only the
      initializer followed by ";"

    Raises Exception if an element is neither a string nor a np.float32.
    """
    hls_dtype = dtype.get_hls_datatype_str()
    # container dtype is float; np.asarray returns the input unchanged when it
    # already is a plain float32 ndarray
    ndarray = np.asarray(ndarray, dtype=np.float32)
    if pack_innermost_dim:
        idimlen = ndarray.shape[-1]
        idimbits = idimlen * dtype.bitwidth()
        idimbits = roundup_to_integer_multiple(idimbits, 4)
        ndarray = pack_innermost_dim_as_hex_string(ndarray, dtype, idimbits)
        hls_dtype = "ap_uint<%d>" % idimbits
    # type string, variable name and dimensions,
    # e.g. "const ap_uint<64>" "weightMem0" "[4][8]"
    dims = "".join("[%d]" % dim for dim in ndarray.shape)
    decl = "%s %s%s" % (hls_dtype, hls_var_name, dims)

    # convert a single element into a C++ init string;
    # a single element can be a hex string if we are using packing
    def elem2str(x):
        # np.str_ is a subclass of str, so this covers both
        if isinstance(x, str):
            return '%s("%s", 16)' % (hls_dtype, x)
        if isinstance(x, np.float32):
            if dtype.is_integer():
                return str(int(x))
            return str(x)
        raise Exception("Unsupported type for numpy_to_hls_code")

    orig_printops = np.get_printoptions()
    # disable summarization ("...") so that every element is emitted
    np.set_printoptions(threshold=sys.maxsize)
    try:
        strarr = np.array2string(ndarray, separator=", ", formatter={"all": elem2str})
    finally:
        # always restore the caller's print options, even if formatting fails
        np.set_printoptions(**orig_printops)
    # numpy brackets -> C++ braces
    strarr = strarr.replace("[", "{").replace("]", "}")
    if no_decl:
        return strarr + ";"
    return decl + " = \n" + strarr + ";"
def __init__(self, onnx_node):
    """Initialize the op and fill in "outputDataType" when it is empty.

    If the node attribute "outputDataType" is the empty string, the smallest
    DataType able to hold (Labels - 1) is chosen, with its bitwidth rounded
    up to a multiple of 8, and stored back into the attribute.
    """
    super().__init__(onnx_node)
    if self.get_nodeattr("outputDataType") != "":
        # an explicit output datatype was provided, keep it
        return
    # not provided: compute the minimum size from the number of labels
    num_labels = self.get_nodeattr("Labels")
    min_dt = DataType.get_smallest_possible(num_labels - 1)
    # ensure a datatype divisible by 8-bits in case this is the last node
    padded_bw = roundup_to_integer_multiple(min_dt.bitwidth(), 8)
    padded_name = min_dt.name.replace(str(min_dt.bitwidth()), str(padded_bw))
    self.set_nodeattr("outputDataType", DataType[padded_name].name)
def code_generation_ipgen(self, model, fpgapart, clk):
    """Generate IP code and, in decoupled mode, the Verilog memstream wrapper.

    The parent implementation is always run first. When the node attribute
    "mem_mode" is "decoupled", the decoupled_wrapper template is filled in and
    written to "<node name>_memstream.v" inside the directory given by the
    "code_gen_dir_ipgen" attribute. The code generation dictionary is cleared
    before and after building the wrapper.
    """
    # generate code for all mem_mode of MVAU/FCLayer unit
    super().code_generation_ipgen(model, fpgapart, clk)

    # if mem_mode = "decoupled" generate code for verilog wrapper
    mem_mode = self.get_nodeattr("mem_mode")
    if mem_mode == "decoupled":
        node_name = self.onnx_node.name
        # empty code gen dictionary for new entries
        self.code_gen_dict.clear()
        self.code_gen_dict["$TOPNAME$"] = ["{}_memstream".format(node_name)]
        self.code_gen_dict["$LAYER_NAME$"] = ["{}_{}".format(node_name, node_name)]
        # make instream width a multiple of 8 for AXI stream interface
        in_width = self.get_instream_width_padded()
        self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
        self.code_gen_dict["$OUT_RANGE$"] = [
            "[{}:0]".format(self.get_outstream_width_padded() - 1)
        ]
        # make weight stream width a multiple of 8 for AXI stream interface
        weight_width = self.get_weightstream_width_padded()
        self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
        self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
        self.code_gen_dict["$WSTREAM_DEPTH$"] = [str(self.calc_wmem())]
        # memory depth is the weight memory depth rounded up to a multiple of 1024
        self.code_gen_dict["$MEM_DEPTH$"] = [
            str(roundup_to_integer_multiple(self.calc_wmem(), 1024))
        ]
        self.code_gen_dict["$RAM_STYLE$"] = [self.get_nodeattr("ram_style")]

        template = self.decoupled_wrapper
        for key in self.code_gen_dict:
            # transform list into long string separated by '\n'
            code_gen_line = "\n".join(self.code_gen_dict[key])
            template = template.replace(key, code_gen_line)
        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        wrapper_path = os.path.join(code_gen_dir, "{}_memstream.v".format(node_name))
        # context manager guarantees the file is closed even if the write fails
        with open(wrapper_path, "w") as f:
            f.write(template)
        self.code_gen_dict.clear()
def to_external_tensor(init, w_dtype):
    """Return a packed, flat numpy byte array for an external parameter tensor.

    Each entry along the first axis of init is packed into a "0x"-prefixed hex
    string (width padded to a multiple of 4 bits), converted to bytes, and
    appended to the output in reversed byte order.
    """
    padded_width = roundup_to_integer_multiple(init.shape[1] * w_dtype.bitwidth(), 4)
    hex_rows = pack_innermost_dim_as_hex_string(
        init, w_dtype, padded_width, prefix="0x"
    )
    packed = np.array([], dtype=np.uint8)
    for hex_row in hex_rows:
        row_bytes = hexstring2npbytearray(hex_row, remove_prefix="0x")
        # bytes of each row are stored in reversed order
        packed = np.append(packed, list(reversed(row_bytes)))
    return packed
def finnpy_to_packed_bytearray(
    ndarray, dtype, reverse_inner=False, reverse_endian=False, fast_mode=False
):
    """Pack the innermost dimension of an array into bytes.

    Given a numpy ndarray with FINN DataType dtype, pack the innermost
    dimension and return the packed representation as an ndarray of uint8.
    The packed innermost dimension is padded to the nearest multiple of
    8 bits. The returned ndarray has the same number of dimensions as the
    input.

    If fast_mode is enabled, will attempt to use shortcuts (casting) to save
    on runtime for certain cases. This mode is currently not well-tested, use
    at your own risk.

    Arguments:
    * ndarray : array (or array-like) of values
    * dtype : FINN DataType of the values
    * reverse_inner : forwarded to pack_innermost_dim_as_hex_string
    * reverse_endian : if set, reverse the byte order along the last axis
    * fast_mode : enable the casting shortcut for 8-bit in / 8-bit out
    """
    # handle no-packing cases (if fast_mode) via casting to save on compute
    if isinstance(ndarray, np.ndarray) and fast_mode:
        inp_is_byte = ndarray.dtype in [np.uint8, np.int8]
        out_is_byte = dtype.bitwidth() == 8
        double_reverse = reverse_inner and reverse_endian
        if inp_is_byte and out_is_byte and double_reverse:
            # reinterpret the existing bytes, no copy and no packing needed
            return ndarray.view(np.uint8)
    if (not isinstance(ndarray, np.ndarray)) or ndarray.dtype != np.float32:
        # try to convert to a float numpy array (container dtype is float)
        ndarray = np.asarray(ndarray, dtype=np.float32)
    # pack innermost dim to hex strings padded to 8 bits
    bits = dtype.bitwidth() * ndarray.shape[-1]
    bits_padded = roundup_to_integer_multiple(bits, 8)
    packed_hexstring = pack_innermost_dim_as_hex_string(
        ndarray, dtype, bits_padded, reverse_inner=reverse_inner
    )

    def fn(x):
        return np.asarray(list(map(hexstring2npbytearray, x)))

    if packed_hexstring.ndim == 0:
        # scalar: ndarray.item() replaces np.asscalar, which is no longer
        # available in current NumPy releases
        ret = hexstring2npbytearray(packed_hexstring.item())
    else:
        # convert ndarray of hex strings to byte array
        ret = np.apply_along_axis(fn, packed_hexstring.ndim - 1, packed_hexstring)
    if reverse_endian:
        # reverse the endianness of packing dimension
        ret = np.flip(ret, axis=-1)
    return ret
def minimize_accumulator_width(self, model):
    """Set "accDataType" to the smallest datatype covering the accumulator.

    The accumulator range is computed from the weight initializer and the
    input datatype. If the node has a threshold initializer, the range is
    widened to also cover the threshold values and every threshold is checked
    against the chosen type. Without thresholds, the bitwidth is rounded up to
    a multiple of 8 and "outputDataType" is set to the same type.

    Returns the DataType stored in "accDataType".
    Raises AssertionError if a threshold cannot be expressed with the type.
    """

    def smallest_dt_for_range(range_min, range_max):
        # unsigned range: only the maximum matters
        if range_min >= 0:
            return DataType.get_smallest_possible(range_max)
        if abs(range_min) > range_max:
            return DataType.get_smallest_possible(range_min)
        # a signed type whose minimum is -range_max tops out at range_max - 1,
        # so ask for -range_max - 1 to guarantee +range_max is representable
        return DataType.get_smallest_possible(-range_max - 1)

    weights = model.get_initializer(self.onnx_node.input[1])
    if len(self.onnx_node.input) > 2:
        thresholds = model.get_initializer(self.onnx_node.input[2])
    else:
        thresholds = None
    idt = self.get_input_datatype()
    # calculate minimum and maximum values of accumulator
    (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
    if thresholds is not None:
        threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
        # set threshold datatype (and accumulator datatype implicitly)
        min_threshold = thresholds.min()
        max_threshold = thresholds.max()
        # get range required by threshold values
        tdt_min = min(acc_min, min_threshold)
        tdt_max = max(acc_max, max_threshold)
        tdt = smallest_dt_for_range(tdt_min, tdt_max)
        assert np.vectorize(tdt.allowed)(
            threshold_tensor
        ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
        self.set_nodeattr("accDataType", tdt.name)
    else:
        adt = smallest_dt_for_range(acc_min, acc_max)
        # ensure a datatype divisible by 8-bits in case this is the last node
        bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
        new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
        adt = DataType[new_adt_name]
        self.set_nodeattr("accDataType", adt.name)
        # for no-activation nodes, output dt = acc dt
        self.set_nodeattr("outputDataType", adt.name)
    return DataType[self.get_nodeattr("accDataType")]
def finnpy_to_packed_bytearray(ndarray, dtype, reverse_inner=False, reverse_endian=False, fast_mode=False):
    """Pack the innermost dimension of an array into bytes.

    Given a numpy ndarray with FINN DataType dtype, pack the innermost
    dimension and return the packed representation as an ndarray of uint8.
    The packed innermost dimension is padded to the nearest multiple of
    8 bits. The returned ndarray has the same number of dimensions as the
    input.

    If fast_mode is enabled, will attempt to use shortcuts to save on runtime
    for certain cases:
    * 8-bit ndarray -> 8-bit
    * ndarray -> 1-bit and total bits % 8 == 0
    Both shortcuts are only taken when reverse_inner and reverse_endian are
    both set. This mode is currently not well-tested, use at your own risk!
    """
    # handle fast_mode cases (currently only called from driver):
    if isinstance(ndarray, np.ndarray) and fast_mode:
        inp_is_byte = ndarray.dtype in [np.uint8, np.int8]
        out_is_byte = dtype.bitwidth() == 8
        double_reverse = reverse_inner and reverse_endian
        # fast mode case: byte -> byte: cast
        if inp_is_byte and out_is_byte and double_reverse:
            return ndarray.view(np.uint8)
        # fast mode case: xxx -> bit with nbits % 8 == 0: np.packbits
        out_is_bit = dtype.bitwidth() == 1
        bits = dtype.bitwidth() * ndarray.shape[-1]
        bits_padded = roundup_to_integer_multiple(bits, 8)
        no_pad = bits_padded == bits
        if out_is_bit and no_pad and double_reverse:
            in_as_int8 = ndarray.astype(np.int8)
            # bipolar -> binary if needed: maps -1 to 0 and +1 to 1
            if dtype == DataType["BIPOLAR"]:
                in_as_int8 = (in_as_int8 + 1) // 2
            # reverse inner
            in_as_int8 = np.flip(in_as_int8, axis=-1)
            # pack with numpy
            packed_data = np.packbits(in_as_int8, axis=-1)
            # reverse endianness and return
            return np.flip(packed_data, axis=-1)

    if (not isinstance(ndarray, np.ndarray)) or ndarray.dtype != np.float32:
        # try to convert to a float numpy array (container dtype is float)
        ndarray = np.asarray(ndarray, dtype=np.float32)
    # pack innermost dim to hex strings padded to 8 bits
    bits = dtype.bitwidth() * ndarray.shape[-1]
    bits_padded = roundup_to_integer_multiple(bits, 8)
    packed_hexstring = pack_innermost_dim_as_hex_string(
        ndarray, dtype, bits_padded, reverse_inner=reverse_inner
    )

    def fn(x):
        return np.asarray(list(map(hexstring2npbytearray, x)))

    if packed_hexstring.ndim == 0:
        # scalar: ndarray.item() replaces np.asscalar, which is no longer
        # available in current NumPy releases
        ret = hexstring2npbytearray(packed_hexstring.item())
    else:
        # convert ndarray of hex strings to byte array
        ret = np.apply_along_axis(fn, packed_hexstring.ndim - 1, packed_hexstring)
    if reverse_endian:
        # reverse the endianness of packing dimension
        ret = np.flip(ret, axis=-1)
    return ret
def generate_params(self, model, path):
    """Write the weight and threshold parameter files for this node.

    Depending on the "mem_mode" node attribute, the following files are
    written into the directory given by path:
    * "const": params.h with the weights as an HLS initializer
    * "decoupled" / "external": weights.npy; for "decoupled" additionally
      memblock_0.dat with one hex string per line (opened in append mode)
    If the node has a threshold initializer, thresh.h is written as well,
    using the datatype stored in "accDataType".

    Raises Exception for any other mem_mode, and AssertionError if a
    threshold cannot be expressed with the accumulator datatype.
    """
    mem_mode = self.get_nodeattr("mem_mode")
    code_gen_dir = path
    # weights, if not external
    weights = model.get_initializer(self.onnx_node.input[1])
    # convert weights into hlslib-compatible format
    weight_tensor = self.get_hls_compatible_weight_tensor(weights)
    export_wdt = self.get_weight_datatype()
    # we have converted bipolar weights to binary for export,
    # so use it as such for weight generation
    if export_wdt == DataType.BIPOLAR:
        export_wdt = DataType.BINARY
    if mem_mode == "const":
        # save weights into params.h
        weight_hls_code = numpy_to_hls_code(
            weight_tensor, export_wdt, "weights", True, True
        )
        if export_wdt.bitwidth() != 1:
            weights_decl = "const FixedPointWeights<{},{},{},{}> weights = ".format(
                self.get_nodeattr("SIMD"),
                export_wdt.get_hls_datatype_str(),
                self.get_nodeattr("PE"),
                self.calc_wmem(),
            )
        else:
            weights_decl = "const BinaryWeights<{},{},{}> weights = ".format(
                self.get_nodeattr("SIMD"),
                self.get_nodeattr("PE"),
                self.calc_wmem(),
            )
        # context manager guarantees the header is closed even on failure
        with open("{}/params.h".format(code_gen_dir), "w") as f_weights:
            f_weights.write(weights_decl)
            f_weights.write(weight_hls_code)
    elif mem_mode == "decoupled" or mem_mode == "external":
        # save weights in corresponding file format for cppsim or rtlsim
        # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
        weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
        # reverse SIMD flip for saving weights in .npy
        weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
        # PE flip for saving weights in .dat
        weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
        # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
        pe = self.get_nodeattr("PE")
        simd = self.get_nodeattr("SIMD")
        # simd_flipped
        weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
            1, -1, pe * simd
        )
        weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
        # pe_flipped
        weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd)
        weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
        # save weights into .npy file
        np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped)
        if mem_mode == "decoupled":
            # save weights into .dat file: convert weight values into hexstring
            weight_width = self.get_weightstream_width()
            # pad to nearest 4 bits to get hex strings
            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
            )
            weight_stream = weight_tensor_pe_flipped.flatten()
            weight_stream = weight_stream.copy()
            # NOTE: append mode is kept from the original implementation
            with open("{}/memblock_0.dat".format(code_gen_dir), "a+") as f:
                for val in weight_stream:
                    f.write(val + "\n")
    else:
        raise Exception(
            'Please set mem_mode to "const", "decoupled", or "external", '
            "currently no other parameter value is supported!"
        )

    # save thresholds in thresh.h
    if len(self.onnx_node.input) > 2:
        thresholds = model.get_initializer(self.onnx_node.input[2])
        if thresholds is not None:
            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
            # get computed threshold datatype from attribute
            tdt = DataType[self.get_nodeattr("accDataType")]
            assert np.vectorize(tdt.allowed)(
                threshold_tensor
            ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
            thresholds_hls_code = numpy_to_hls_code(
                threshold_tensor, tdt, "thresholds", False, True
            )
            tdt_hls = tdt.get_hls_datatype_str()
            # use binary to export bipolar activations
            export_odt = self.get_output_datatype()
            if export_odt == DataType.BIPOLAR:
                export_odt = DataType.BINARY
            odt_hls = export_odt.get_hls_datatype_str()
            thresh_decl = (
                "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs = ".format(
                    self.calc_tmem(),
                    self.get_nodeattr("PE"),
                    threshold_tensor.shape[-1],
                    tdt_hls,
                    odt_hls,
                    self.get_nodeattr("ActVal"),
                    "std::less_equal<%s>" % tdt_hls,
                )
            )
            # write thresholds into thresh.h
            with open("{}/thresh.h".format(code_gen_dir), "w") as f_thresh:
                f_thresh.write(thresh_decl)
                f_thresh.write(thresholds_hls_code)
def get_weightstream_width_padded(self):
    """Return the weight stream width rounded up to a multiple of 8.

    This is required by the AXI Stream spec. Used in decoupled mode.
    """
    return roundup_to_integer_multiple(self.get_weightstream_width(), 8)
def get_outstream_width_padded(self):
    """Return the output stream width rounded up to a multiple of 8.

    This is required by the AXI Stream spec.
    """
    return roundup_to_integer_multiple(self.get_outstream_width(), 8)
def make_weight_file(self, weights, weight_file_mode, weight_file_name):
    """Produce a file containing given weights (thresholds) in appropriate
    format for this layer. This file can be used for either synthesis or
    run-time reconfig of weights.

    Arguments:
    * weights : numpy array with weights to be put into the file
    * weight_file_mode : one of {hls_header, decoupled_npy,
      decoupled_verilog_dat, decoupled_runtime}
    * weight_file_name : filename for the weight file to be generated

    Raises Exception for an unknown weight_file_mode, and AssertionError if a
    threshold cannot be expressed with the weight datatype.
    """
    threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
    tdt = self.get_weight_datatype()
    assert np.vectorize(tdt.allowed)(
        threshold_tensor
    ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
    if weight_file_mode == "hls_header":
        # save thresholds in thresh.h
        thresholds_hls_code = numpy_to_hls_code(
            threshold_tensor, tdt, "thresholds", False, True
        )
        tdt_hls = tdt.get_hls_datatype_str()
        # use binary to export bipolar activations
        export_odt = self.get_output_datatype()
        if export_odt == DataType.BIPOLAR:
            export_odt = DataType.BINARY
        odt_hls = export_odt.get_hls_datatype_str()
        thresh_decl = (
            "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs = ".format(
                self.calc_tmem(),
                self.get_nodeattr("PE"),
                threshold_tensor.shape[-1],
                tdt_hls,
                odt_hls,
                self.get_nodeattr("ActVal"),
                "comp::less_equal<%s>" % tdt_hls,
            )
        )
        # context manager guarantees the header is closed even on failure
        with open(weight_file_name, "w") as f_thresh:
            f_thresh.write(thresh_decl)
            f_thresh.write(thresholds_hls_code)
    elif "decoupled" in weight_file_mode:
        # streaming thresholds need to be organized differently
        # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps)
        decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3))
        # TODO add flips/reversals as needed here
        # (1, tmem, pe, n_thres_steps) -> (1, tmem, pe * n_thres_steps)
        pe = self.get_nodeattr("PE")
        n_thres_steps = self.get_nodeattr("numSteps")
        decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2)
        decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps)
        decoupled_thres = decoupled_thres.copy()
        decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape(
            1, -1, pe * n_thres_steps
        )
        decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy()

        if weight_file_mode == "decoupled_npy":
            # save weight stream into npy for cppsim
            np.save(weight_file_name, decoupled_thres)
        elif weight_file_mode == "decoupled_verilog_dat":
            # convert weight values into hexstring
            weight_width = self.get_weightstream_width()
            # pad to nearest 4 bits to get hex strings
            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
                decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
            )
            weight_stream = weight_tensor_pe_flipped.flatten()
            weight_stream = weight_stream.copy()
            with open(weight_file_name, "w") as f:
                for val in weight_stream:
                    f.write(val + "\n")
        elif weight_file_mode == "decoupled_runtime":
            # memstream axi-lite interface will map each mem line to
            # one or multiple 32-bit words
            weight_width = self.get_weightstream_width()
            words_per_memwidth = 2 ** ceil(log2(weight_width / 32))
            # widths below 32 bits give a fractional power of two; use one word
            if words_per_memwidth < 1:
                words_per_memwidth = 1
            weight_width_padded = words_per_memwidth * 32
            # first, pack and ensure padding to 32 bits
            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
                decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
            )
            weight_stream = weight_tensor_pe_flipped.flatten()
            weight_stream = weight_stream.copy()
            with open(weight_file_name, "w") as f:
                for val in weight_stream:
                    # split into groups of 8 hex digits (= 32 bits)
                    words_32b = textwrap.wrap(val, 8)
                    # words are written last group first
                    words_32b.reverse()
                    for word_32b in words_32b:
                        f.write(word_32b + "\n")
        else:
            raise Exception("Decoupled weight export not yet implemented")
    else:
        raise Exception("Unknown weight_file_mode")
def apply(self, model):
    """Create a Vivado PYNQ shell project for the stitched IP of the model.

    Reads the PYNQSHELL_PATH environment variable and the model metadata
    properties "vivado_stitch_proj" and "vivado_stitch_vlnv", collects the
    "ip_path" attribute of every node, writes ip_config.tcl,
    make_project.sh and synth_project.sh into a new build directory, and runs
    make_project.sh with bash. The metadata properties "vivado_pynq_proj" and
    "vivado_synth_rpt" are set on the model.

    Returns the tuple (model, False).
    Raises Exception if the shell path, the stitched project or the vlnv is
    missing; raises AssertionError if a node has no valid "ip_path".
    """
    # a missing PYNQSHELL_PATH variable raises KeyError here
    pynq_shell_path = os.environ["PYNQSHELL_PATH"]
    if not os.path.isdir(pynq_shell_path):
        raise Exception(
            "Ensure the PYNQ-HelloWorld utility repo is cloned.")
    ipstitch_path = model.get_metadata_prop("vivado_stitch_proj")
    if ipstitch_path is None or (not os.path.isdir(ipstitch_path)):
        raise Exception(
            "No stitched IPI design found, apply CreateStitchedIP first.")
    vivado_stitch_vlnv = model.get_metadata_prop("vivado_stitch_vlnv")
    if vivado_stitch_vlnv is None:
        raise Exception(
            "No vlnv for stitched IP found, apply CreateStitchedIP first.")

    # collect list of all IP dirs
    # the leading "list" element makes the joined string read "[list dir1 dir2 ...]"
    ip_dirs = ["list"]
    for node in model.graph.node:
        ip_dir_attribute = get_by_name(node.attribute, "ip_path")
        assert (ip_dir_attribute is not None), """Node attribute "ip_path" is empty. Please run transformation HLSSynth_ipgen first."""
        ip_dir_value = ip_dir_attribute.s.decode("UTF-8")
        assert os.path.isdir(ip_dir_value), """The directory that should contain the generated ip blocks doesn't exist."""
        ip_dirs += [ip_dir_value]
    ip_dirs += [ipstitch_path + "/ip"]
    ip_dirs_str = "[%s]" % (" ".join(ip_dirs))

    # extract HLSCustomOp instances to get i/o stream widths
    # only the first graph input and the first graph output are considered
    i_tensor_name = model.graph.input[0].name
    o_tensor_name = model.graph.output[0].name
    first_node = getCustomOp(model.find_consumer(i_tensor_name))
    last_node = getCustomOp(model.find_producer(o_tensor_name))
    i_bits_per_cycle = first_node.get_instream_width()
    o_bits_per_cycle = last_node.get_outstream_width()
    # ensure i/o is padded to bytes
    i_bits_per_cycle_padded = roundup_to_integer_multiple(
        i_bits_per_cycle, 8)
    o_bits_per_cycle_padded = roundup_to_integer_multiple(
        o_bits_per_cycle, 8)
    assert (i_bits_per_cycle_padded % 8 == 0), """Padded input bits are not a multiple of 8."""
    assert (o_bits_per_cycle_padded % 8 == 0), """Padded output bits are not a multiple of 8."""
    # NOTE(review): true division yields floats here (e.g. 8.0); confirm the
    # placeholders in ip_config_tcl_template format them as intended
    in_bytes = i_bits_per_cycle_padded / 8
    out_bytes = o_bits_per_cycle_padded / 8
    # hard-coded interface, clock and reset names passed to the tcl template
    in_if_name = "in0_V_V_0"
    out_if_name = "out_r_0"
    clk_name = "ap_clk_0"
    nrst_name = "ap_rst_n_0"
    axi_lite_if_name = "s_axi_control_0"
    # empty string when VIVADO_IP_CACHE is not set
    vivado_ip_cache = os.getenv("VIVADO_IP_CACHE", default="")
    # TODO get from Transformation arg or metadata_prop
    fclk_mhz = 100.0

    # create a temporary folder for the project
    vivado_pynq_proj_dir = make_build_dir(prefix="vivado_pynq_proj_")
    model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir)
    # filename for the synth utilization report
    synth_report_filename = vivado_pynq_proj_dir + "/synth_report.xml"
    model.set_metadata_prop("vivado_synth_rpt", synth_report_filename)

    # the order of this tuple must match the placeholders in the template
    ip_config_tcl = templates.ip_config_tcl_template % (
        vivado_pynq_proj_dir,
        ip_dirs_str,
        vivado_pynq_proj_dir,
        synth_report_filename,
        vivado_stitch_vlnv,
        in_bytes,
        out_bytes,
        in_if_name,
        out_if_name,
        clk_name,
        nrst_name,
        axi_lite_if_name,
        vivado_ip_cache,
        fclk_mhz,
    )

    with open(vivado_pynq_proj_dir + "/ip_config.tcl", "w") as f:
        f.write(ip_config_tcl)
    # create a shell script for project creation and synthesis
    make_project_sh = vivado_pynq_proj_dir + "/make_project.sh"
    # a missing PWD variable raises KeyError here
    working_dir = os.environ["PWD"]
    ipcfg = vivado_pynq_proj_dir + "/ip_config.tcl"
    with open(make_project_sh, "w") as f:
        f.write(templates.call_pynqshell_makefile_template %
                (pynq_shell_path, self.platform, ipcfg, "block_design",
                 working_dir))
    # the synthesis script is only written here, not executed
    synth_project_sh = vivado_pynq_proj_dir + "/synth_project.sh"
    with open(synth_project_sh, "w") as f:
        f.write(templates.call_pynqshell_makefile_template %
                (pynq_shell_path, self.platform, ipcfg, "bitstream",
                 working_dir))
    # call the project creation script
    # synthesis script will be called with a separate transformation
    bash_command = ["bash", make_project_sh]
    process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
    # wait for the script to finish; its captured output is discarded
    process_compile.communicate()
    return (model, False)
def generate_params(self, model, path):
    """Write the weight and threshold parameter files for this node.

    Depending on the "mem_mode" node attribute, the following files are
    written into the directory given by path:
    * "const": params.h with the weights as an HLS initializer
    * "decoupled": weights.npy and memblock_<j>.dat files holding 1024 hex
      strings each (the stream is padded with "0" entries to a multiple of
      1024; files are opened in append mode)
    If the node has a threshold initializer, thresh.h is written as well,
    using INT32 thresholds, or UINT32 when both input and weights are
    (treated as) bipolar.

    Raises Exception for any other mem_mode.
    """
    mem_mode = self.get_nodeattr("mem_mode")
    # weights
    weights = model.get_initializer(self.onnx_node.input[1])
    # convert weights into hlslib-compatible format
    weight_tensor = self.get_hls_compatible_weight_tensor(weights)
    export_wdt = self.get_weight_datatype()
    # we have converted bipolar weights to binary for export,
    # so use it as such for weight generation
    if export_wdt == DataType.BIPOLAR:
        export_wdt = DataType.BINARY
    code_gen_dir = path

    if mem_mode == "const":
        # save weights into params.h
        weight_hls_code = numpy_to_hls_code(
            weight_tensor, export_wdt, "weights", True, True
        )
        if export_wdt.bitwidth() != 1:
            weights_decl = "const FixedPointWeights<{},{},{},{}> weights = ".format(
                self.get_nodeattr("SIMD"),
                export_wdt.get_hls_datatype_str(),
                self.get_nodeattr("PE"),
                self.calc_wmem(),
            )
        else:
            weights_decl = "const BinaryWeights<{},{},{}> weights = ".format(
                self.get_nodeattr("SIMD"),
                self.get_nodeattr("PE"),
                self.calc_wmem(),
            )
        # context manager guarantees the header is closed even on failure
        with open("{}/params.h".format(code_gen_dir), "w") as f_weights:
            f_weights.write(weights_decl)
            f_weights.write(weight_hls_code)

    elif mem_mode == "decoupled":
        # save weights in corresponding file format for cppsim or rtlsim
        # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
        # and save as unflipped weight tensor to be able to differentiate between
        # flipped and unflipped weight tensor (has to be flipped for cppsim)
        weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))

        # flip PE dimension and reverse SIMD flip for saving weights in .npy
        weight_tensor_flipped = np.flip(weight_tensor_unflipped, axis=-2)
        weight_tensor_flipped = np.flip(weight_tensor_flipped, axis=-1)

        # reshape weight tensor (flipped and unflipped) to desired shape
        pe = self.get_nodeattr("PE")
        simd = self.get_nodeattr("SIMD")
        # unflipped
        weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd)
        weight_tensor_unflipped = weight_tensor_unflipped.copy()
        # flipped
        weight_tensor_flipped = weight_tensor_flipped.reshape(1, -1, pe * simd)
        weight_tensor_flipped = weight_tensor_flipped.copy()

        # save weights into .npy file
        np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_flipped)

        # save weights into .dat files: convert weight values into hexstring
        weight_width = self.get_weightstream_width()
        # pad to nearest 4 bits to get hex strings
        weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
        weight_tensor_unflipped = pack_innermost_dim_as_hex_string(
            weight_tensor_unflipped, export_wdt, weight_width_padded, prefix=""
        )
        weight_stream_len = np.prod(weight_tensor_unflipped.shape)
        factor = math.ceil(weight_stream_len / 1024)
        # add zeroes to pad out file to 1024 entries
        weight_stream = weight_tensor_unflipped.flatten()
        pad_amt = (factor * 1024) - weight_stream_len
        weight_stream = np.pad(
            weight_stream, (0, pad_amt), mode="constant", constant_values="0"
        )
        weight_stream = weight_stream.copy()
        # one file per block of 1024 entries; each file is opened once
        # instead of once per value (append mode kept from the original)
        for block_idx, start in enumerate(range(0, len(weight_stream), 1024)):
            block_path = "{}/memblock_{}.dat".format(code_gen_dir, block_idx)
            with open(block_path, "a+") as f:
                for val in weight_stream[start : start + 1024]:
                    f.write(val + "\n")

    else:
        raise Exception(
            'Please set mem_mode to "const" or "decoupled", '
            "currently no other parameter value is supported!"
        )

    # save thresholds in thresh.h
    if len(self.onnx_node.input) > 2:
        thresholds = model.get_initializer(self.onnx_node.input[2])
        if thresholds is not None:
            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
            tdt = DataType.INT32
            # use UINT32 threshold export for bipolar times bipolar
            inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
            wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
            # reinterpret inp/wt as bipolar if bin_xnor_mode is set
            inp_is_binary = self.get_input_datatype() == DataType.BINARY
            wt_is_binary = self.get_weight_datatype() == DataType.BINARY
            bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
            inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
            wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
            if inp_is_bipolar and wt_is_bipolar:
                tdt = DataType.UINT32
            thresholds_hls_code = numpy_to_hls_code(
                threshold_tensor, tdt, "thresholds", False, True
            )
            tdt_hls = tdt.get_hls_datatype_str()
            # use binary to export bipolar activations
            export_odt = self.get_output_datatype()
            if export_odt == DataType.BIPOLAR:
                export_odt = DataType.BINARY
            odt_hls = export_odt.get_hls_datatype_str()
            thresh_decl = (
                "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs = ".format(
                    self.calc_tmem(),
                    self.get_nodeattr("PE"),
                    threshold_tensor.shape[-1],
                    tdt_hls,
                    odt_hls,
                    self.get_nodeattr("ActVal"),
                    "std::less_equal<%s>" % tdt_hls,
                )
            )
            # write thresholds into thresh.h
            with open("{}/thresh.h".format(code_gen_dir), "w") as f_thresh:
                f_thresh.write(thresh_decl)
                f_thresh.write(thresholds_hls_code)
def minimize_accumulator_width(self, model):
    """Set "accDataType" to the smallest datatype covering the accumulator.

    The weight initializer is reshaped to (Channels, k_h * k_w) and
    transposed before the accumulator range is computed from it and the input
    datatype. If the node has a threshold initializer, thresholds outside
    [acc_min, acc_max + 1] are clipped (with a warning) and written back to
    the model, the range is widened to cover the thresholds, and every
    threshold is checked against the chosen type. Without thresholds, the
    bitwidth is rounded up to a multiple of 8 and "outputDataType" is set to
    the same type.

    Returns the DataType stored in "accDataType".
    Raises AssertionError if a threshold cannot be expressed with the type.
    """

    def smallest_dt_for_range(range_min, range_max):
        # unsigned range: only the maximum matters
        if range_min >= 0:
            return DataType.get_smallest_possible(range_max)
        if abs(range_min) > range_max:
            return DataType.get_smallest_possible(range_min)
        # a signed type whose minimum is -range_max tops out at range_max - 1,
        # so ask for -range_max - 1 to guarantee +range_max is representable
        return DataType.get_smallest_possible(-range_max - 1)

    weights = model.get_initializer(self.onnx_node.input[1])
    k_h, k_w = self.get_nodeattr("Kernel")
    fm = self.get_nodeattr("Channels")
    # put weights into the shape expected by calculate_matvec_accumulator_range
    weights = weights.reshape(fm, k_h * k_w).transpose()
    if len(self.onnx_node.input) > 2:
        thresholds = model.get_initializer(self.onnx_node.input[2])
    else:
        thresholds = None
    idt = self.get_input_datatype()
    # calculate minimum and maximum values of accumulator
    (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
    if thresholds is not None:
        threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
        # set threshold datatype (and accumulator datatype implicitly)
        min_threshold = thresholds.min()
        max_threshold = thresholds.max()
        # clip threshold values
        clip_upper = None
        clip_lower = None
        if max_threshold > acc_max + 1:
            clip_upper = acc_max + 1
        if min_threshold < acc_min:
            clip_lower = acc_min
        if (clip_lower is not None) or (clip_upper is not None):
            warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
            thresholds = np.clip(thresholds, clip_lower, clip_upper)
            # store the clipped thresholds back and recompute derived values
            model.set_initializer(self.onnx_node.input[2], thresholds)
            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
            min_threshold = thresholds.min()
            max_threshold = thresholds.max()
        # get range required by threshold values
        tdt_min = min(acc_min, min_threshold)
        tdt_max = max(acc_max, max_threshold)
        tdt = smallest_dt_for_range(tdt_min, tdt_max)
        assert np.vectorize(tdt.allowed)(
            threshold_tensor
        ).all(), "Thresholds in %s can't be expressed with type %s" % (
            self.onnx_node.name,
            str(tdt),
        )
        self.set_nodeattr("accDataType", tdt.name)
    else:
        adt = smallest_dt_for_range(acc_min, acc_max)
        # ensure a datatype divisible by 8-bits in case this is the last node
        bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
        new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
        adt = DataType[new_adt_name]
        self.set_nodeattr("accDataType", adt.name)
        # for no-activation nodes, output dt = acc dt
        self.set_nodeattr("outputDataType", adt.name)
    return DataType[self.get_nodeattr("accDataType")]