def fft(ol, data_in):
    """Run one 512-point FFT on the overlay and return the power spectrum.

    Parameters
    ----------
    ol : overlay object exposing ``axi_dma_out``, ``axi_dma_re`` and
        ``axi_dma_im`` DMA engines.
    data_in : indexable sequence with at least 512 numeric samples.

    Returns
    -------
    numpy.ndarray
        512 float values ``re**2 + im**2`` per output bin.
    """
    out = ol.axi_dma_out
    re = ol.axi_dma_re
    im = ol.axi_dma_im
    data_size = 512

    xlnk = Xlnk()
    input_buffer = xlnk.cma_array(shape=(data_size,), dtype=np.int32)
    output_buffer_re = xlnk.cma_array(shape=(data_size,), dtype=np.int32)
    output_buffer_im = xlnk.cma_array(shape=(data_size,), dtype=np.int32)

    for i in range(data_size):
        input_buffer[i] = data_in[i]

    out.sendchannel.transfer(input_buffer)
    re.recvchannel.transfer(output_buffer_re)
    im.recvchannel.transfer(output_buffer_im)
    # transfer() only starts the DMA; block until every channel is done so
    # the output buffers are not read while the hardware is still writing.
    out.sendchannel.wait()
    re.recvchannel.wait()
    im.recvchannel.wait()

    def _to_signed(raw_words):
        # Words >= 0x4000000 have 0x8000000 subtracted, i.e. each word is
        # interpreted as a 27-bit two's-complement value.
        values = np.array(raw_words, dtype=np.float64)
        return np.where(values >= 0x4000000, values - 0x8000000, values)

    data_re = _to_signed(output_buffer_re)
    data_im = _to_signed(output_buffer_im)
    return data_re * data_re + data_im * data_im
def __init__(self, description, pkt_config=1, pkt_reload=128):
    """Allocate the config/reload DMA buffers and apply initial settings.

    Parameters
    ----------
    description : passed straight through to the parent constructor.
    pkt_config : int
        Number of int8 entries in the config buffer.
    pkt_reload : int
        Number of int16 entries in the reload buffer.
    """
    # TODO: find out the correct length of config and reload
    super().__init__(description)

    allocator = Xlnk()
    config_shape = (pkt_config, )
    reload_shape = (pkt_reload, )
    self.buf_config = allocator.cma_array(shape=config_shape, dtype=np.int8)
    self.buf_reload = allocator.cma_array(shape=reload_shape, dtype=np.int16)

    self.BWSelector.enable = 1

    self._taps = 128
    self._fs = 256e6
    self.set_downsample(2)
def __init__(self):
    """Load the overlay, bind its six DMA engines and allocate buffers.

    All buffers are float16 contiguous arrays obtained from ``Xlnk``.
    """
    self.ol = Overlay("/home/xilinx/jupyter_notebooks/IP core/end.bit")

    # self.dma_0 .. self.dma_5 map onto axi_dma_0 .. axi_dma_5.
    for idx in range(6):
        setattr(self, f"dma_{idx}", getattr(self.ol, f"axi_dma_{idx}"))
    self.top = self.ol.top_0

    xlnk = Xlnk()

    def half_buffer(shape):
        # Every buffer in this driver uses half-precision floats.
        return xlnk.cma_array(shape=shape, dtype=np.float16)

    # First buffer set.
    self.map_in_buffer = half_buffer((128, 34, 34))
    self.weight_buffer = half_buffer((128, 64, 3, 3))
    self.bias_buffer = half_buffer((64))
    self.out_buffer = half_buffer((64, 32, 32))

    # Second buffer set.
    self.map_in_buffer_2 = half_buffer((3, 68, 68))
    self.weight_buffer_2 = half_buffer((3, 64, 6, 6))
    self.bias_buffer_2 = half_buffer((64))
    self.out_buffer_2 = half_buffer((64, 32, 32))

    self.map_in_64 = half_buffer((64, 34, 34))
def compute(self, rays, tri_ids, tris):
    """Upload triangles and rays to shared memory and start the accelerator.

    Parameters
    ----------
    rays : flat sequence of floats, 6 values per ray.
    tri_ids : sequence with one integer id per triangle.
    tris : flat sequence of floats, 9 values per triangle.

    Raises
    ------
    IndexError
        If ``rays`` is not a whole number of rays or ``tri_ids`` has fewer
        entries than there are triangles. The check now runs before any
        register is written.

    Notes
    -----
    The method returns right after writing the start bit; results are left
    in ``self._out_ids`` / ``self._out_inter``.
    """
    from pynq import Xlnk
    xlnk = Xlnk()
    num_tris = len(tris) // 9
    num_rays = len(rays) // 6

    if len(rays) != num_rays * 6:
        raise IndexError('rays must contain a multiple of 6 values')
    if len(tri_ids) < num_tris:
        raise IndexError('tri_ids has fewer entries than triangles')

    # Drop references to buffers from any previous call before allocating
    # new ones (presumably so the old CMA memory can be reclaimed first --
    # TODO confirm against the Xlnk allocator).
    self._out_ids = None
    self._out_inter = None
    self._tids = None
    self._tris = None
    self._rays = None

    log.info(f'{self.name}: Allocating shared input arrays')
    self._tids = xlnk.cma_array(shape=(num_tris, ), dtype=np.int32)
    self._tris = xlnk.cma_array(shape=(num_tris * 9, ), dtype=np.float32)
    self._rays = xlnk.cma_array(shape=(num_rays * 6, ), dtype=np.float32)

    log.info(f'{self.name}: Allocating shared output arrays')
    self._out_ids = xlnk.cma_array(shape=(num_rays, ), dtype=np.int32)
    self._out_inter = xlnk.cma_array(shape=(num_rays, ), dtype=np.float32)

    log.info(f'{self.name}: Setting accelerator input physical addresses')
    self.intersect_ip.write(self.ADDR_I_TNUMBER_DATA, num_tris)
    self.intersect_ip.write(self.ADDR_I_TDATA_DATA,
                            self._tris.physical_address)
    self.intersect_ip.write(self.ADDR_I_TIDS_DATA,
                            self._tids.physical_address)
    self.intersect_ip.write(self.ADDR_I_RNUMBER_DATA, num_rays)
    self.intersect_ip.write(self.ADDR_I_RDATA_DATA,
                            self._rays.physical_address)
    self.intersect_ip.write(self.ADDR_O_TIDS_DATA,
                            self._out_ids.physical_address)
    self.intersect_ip.write(self.ADDR_O_TINTERSECTS_DATA,
                            self._out_inter.physical_address)

    ti = time()
    log.info(f'{self.name}: Filling input memory arrays')
    # Bulk slice copies replace the per-element Python loops; only the
    # first num_tris triangles are used, exactly as before.
    self._tids[:] = tri_ids[:num_tris]
    self._tris[:] = tris[:num_tris * 9]
    log.info(
        f'{self.name}: Triangle arrays filled in {time() - ti} seconds')

    ti = time()
    self._rays[:] = rays
    log.info(f'{self.name}: Ray arrays filled in {time() - ti} seconds')

    log.info(f'Starting co-processor {self.name}')
    self.intersect_ip.write(0x00, 1)
def matrixAvg(red, green, blue):
    """Stream ``red + green + blue`` through ``axi_dma_0`` and return 27 words.

    Parameters
    ----------
    red, green, blue : operands combined with ``+``. The combined result
        must fit the 2700-element input buffer (e.g. three lists of 900
        values concatenated -- TODO confirm the intended operand type).

    Returns
    -------
    The 27-element int32 contiguous output buffer filled by the DMA.
    """
    dma0 = ol.axi_dma_0
    xlnk = Xlnk()
    inputs = xlnk.cma_array(shape=(2700), dtype=np.int32)
    outputs = xlnk.cma_array(shape=(27), dtype=np.int32)

    # Copy INTO the contiguous buffer. The previous `inputs = ...` rebound
    # the name and handed the DMA an ordinary (non-CMA) object.
    inputs[:] = red + green + blue

    # Arm both channels before waiting so the receive side is ready as
    # soon as the hardware starts producing output.
    dma0.sendchannel.transfer(inputs)
    dma0.recvchannel.transfer(outputs)
    dma0.sendchannel.wait()
    dma0.recvchannel.wait()
    return outputs
def __init__(self, description):
    """Allocate the 2048-sample capture buffer and reset the inspector.

    Parameters
    ----------
    description : passed straight through to the parent constructor.
    """
    super().__init__(description)

    sample_count = 2048
    allocator = Xlnk()
    self.buf_data = allocator.cma_array(shape=(sample_count, ),
                                        dtype=np.single)
    self.type = 1

    # Initial register state: no transfer requested, reset asserted.
    inspector = self.data_inspector
    inspector.transfer = 0
    inspector.reset = 1
def send(self, window_coeffs, n):
    """Send window coefficients to the hardware through ``axi_dma_window``.

    Parameters
    ----------
    window_coeffs : array-like copied into an int16 buffer of
        ``self.window_length`` entries.
    n : unused; kept so existing callers keep working.
    """
    xlnk = Xlnk()
    self.input_buffer = xlnk.cma_array(shape=(self.window_length, ),
                                       dtype=np.int16)
    try:
        dma = self.axi_dma_window
        np.copyto(self.input_buffer, window_coeffs)
        dma.sendchannel.transfer(self.input_buffer)
        dma.sendchannel.wait()
    finally:
        # Release the contiguous buffer even when the copy or the
        # transfer raises; previously it leaked on any error.
        self.input_buffer.close()
class MatrixOpServicer(matrix_op_pb2_grpc.MatrixOpServicer):
    """gRPC servicer that multiplies two DIM x DIM float32 matrices on the PL."""

    DIM = 128

    def __init__(self):
        """Load the overlay and allocate the DMA input/output buffers."""
        self.overlay = Overlay(
            '/home/xilinx/matmult/overlay/matmult/matmult.bit')
        self.dma = self.overlay.dma
        self.mmult_ip = self.overlay.accel
        self.xlnk = Xlnk()
        dim = MatrixOpServicer.DIM
        # Both operands travel in one stacked buffer of shape (2, DIM, DIM).
        self.in_buf = self.xlnk.cma_array(shape=(2, dim, dim),
                                          dtype=np.float32)
        self.out_buf = self.xlnk.cma_array(shape=(dim, dim),
                                           dtype=np.float32)

    def MatMult(self, request, context):
        """Unpickle ``request.a`` / ``request.b``, multiply, return the result.

        Returns an ``OpReply`` whose ``res`` field is the pickled product.
        """
        print('request received: matrix mult')
        before = time.time()
        # load np arrays from bytes
        # SECURITY NOTE(review): pickle.loads executes arbitrary code
        # embedded in the payload. This is only safe when every client is
        # trusted; consider a non-executable format (e.g. raw bytes plus
        # shape) if that is not guaranteed.
        a = pickle.loads(request.a)
        b = pickle.loads(request.b)
        lat = round((time.time() - before) * 1000000, 2)
        print(f'unpickled data in {lat} microsec')

        # run kernel
        before = time.time()
        self.in_buf[:] = np.stack((a, b))
        self.dma.sendchannel.transfer(self.in_buf)
        self.dma.recvchannel.transfer(self.out_buf)
        self.mmult_ip.write(CTRL_REG, (AP_START | AUTO_RESTART))
        self.dma.sendchannel.wait()
        self.dma.recvchannel.wait()
        # Pickle a plain ndarray copy, not the CMA-backed array object:
        # the reply then does not reference the contiguous-array subclass,
        # so a client can unpickle it with numpy alone.
        result = np.array(self.out_buf)
        ret = matrix_op_pb2.OpReply(res=pickle.dumps(result))
        lat = round((time.time() - before) * 1000000, 2)
        print(f'mult done in {lat} microsec')
        return ret
class _EmptyShapeError(AssertionError, RuntimeError):
    """Raised when an empty shape list is passed to ``make_cma_buf``.

    Derives from ``AssertionError`` so callers that caught the old
    ``assert`` failure keep working, and from ``RuntimeError`` because
    that is the error the original assertion named.
    """


class CmaBufferFactory():
    """Small helper that allocates and releases cacheable CMA buffers."""

    def __init__(self):
        self._xlnk = Xlnk()

    def make_cma_buf(self, shape, data_type):
        """Return a cacheable contiguous array of ``shape`` and ``data_type``.

        Raises
        ------
        AssertionError / RuntimeError
            If ``shape`` is an empty list.
        """
        # An explicit raise is used because ``assert`` is stripped when
        # Python runs with -O, which silently disabled this check.
        if shape == []:
            raise _EmptyShapeError('shape must not be an empty list')
        return self._xlnk.cma_array(shape=shape, cacheable=1,
                                    dtype=data_type)

    def del_cma_buf(self, cma_buf):
        """Release ``cma_buf`` by calling its ``close()`` method."""
        cma_buf.close()
def alloc_descriptor(Control, data_size, NDPL = 0x0, NDPU = 0x0, Status = 0x0,
                     APP0 = 0x0, APP1 = 0x0, APP2 = 0x0, APP3 = 0x0,
                     APP4 = 0x0):
    """Allocate a 13-word descriptor plus its ``(1, data_size)`` data buffer.

    Word layout written into the descriptor:
    0-1 NDPL/NDPU, 2-3 low/high 32 bits of the data buffer's physical
    address, 4-5 zero, 6 Control, 7 Status, 8-12 APP0..APP4.

    Returns
    -------
    tuple
        ``(descriptor, buffer)`` -- both contiguous arrays.
    """
    mmu = Xlnk()
    descriptor = mmu.cma_array([13, ])
    buffer = mmu.cma_array([1, data_size])

    phys_addr = buffer.physical_address
    words = (
        NDPL,
        NDPU,
        phys_addr & 0xffffffff,
        (phys_addr >> 32) & 0xffffffff,
        0x0,  # Reserved
        0x0,  # Reserved
        Control,
        Status,
        APP0,
        APP1,
        APP2,
        APP3,
        APP4,
    )
    for slot, word in enumerate(words):
        descriptor[slot] = word
    return descriptor, buffer
def Predict(self):
    """Send 25 words from ``self.data_in`` through the DMA and store the reply.

    The 25-word response buffer is left in ``self.data_out``.
    """
    print('Start Predict.......')
    self.dma = spi.axi_dma_0

    word_count = 25
    allocator = Xlnk()
    tx_words = allocator.cma_array(shape=(word_count, ), dtype=np.uint32)
    rx_words = allocator.cma_array(shape=(word_count, ), dtype=np.uint32)

    # Each input value is converted with int() before being stored.
    for position in range(word_count):
        tx_words[position] = int(self.data_in[position])

    send_channel = self.dma.sendchannel
    recv_channel = self.dma.recvchannel
    send_channel.transfer(tx_words)
    recv_channel.transfer(rx_words)
    send_channel.wait()
    recv_channel.wait()

    self.data_out = rx_words
def fft2(image, FDV):
    """Run ``image`` through the fft2 overlay and return the result image.

    Parameters
    ----------
    image : array-like convertible with ``np.array``; copied into a
        ``(pic_height, pic_width)`` uint8 buffer.
    FDV : value written to register 0x00 of the ``fft2_0`` IP to start it.

    Returns
    -------
    PIL image built from the output buffer contents.
    """
    fft2_design = Overlay("./bitstream/fft2.bit")
    # Use the overlay handle. The old code read the name `fft2`, which is
    # a local variable here and was still unassigned (UnboundLocalError).
    dma = fft2_design.axi_dma_0
    fft2_ip = fft2_design.fft2_0

    input_array = np.array(image)

    xlnk = Xlnk()
    in_buffer = xlnk.cma_array(shape=(pic_height, pic_width), dtype=np.uint8)
    out_buffer = xlnk.cma_array(shape=(pic_height, pic_width),
                                dtype=np.uint8)
    try:
        np.copyto(in_buffer, input_array)

        dma.sendchannel.transfer(in_buffer)
        dma.recvchannel.transfer(out_buffer)
        fft2_ip.write(0x00, FDV)  # start
        dma.sendchannel.wait()
        dma.recvchannel.wait()

        # Build the image from a private copy: fromarray may share memory
        # with its argument, and out_buffer is released just below.
        result = Image.fromarray(np.array(out_buffer))
    finally:
        in_buffer.close()
        out_buffer.close()
        xlnk.xlnk_reset()
    return result
def __init__(self, description, pkt_size, buf_dtype=np.int16,
             buf_words_per_pkt=2):
    """Configure the packet registers and allocate the capture buffer.

    Parameters
    ----------
    description : passed straight through to the parent constructor.
    pkt_size : int
        Packet size; ``pkt_size - 1`` is written to the ``pkt_size``
        register.
    buf_dtype : numpy dtype
        Element type of the buffer (default ``np.int16``).
    buf_words_per_pkt : int
        Buffer elements per packet entry; the buffer holds
        ``pkt_size * buf_words_per_pkt`` elements.
    """
    super().__init__(description)

    # Init config register
    self.reset = 1
    self.enable = 1
    self.pkt_size = pkt_size - 1
    self.auto_restart = 0
    self.reset = 0

    # Init buffer -- honour buf_dtype; it used to be ignored in favour of
    # a hard-coded np.int16 (still the default, so callers are unaffected).
    xlnk = Xlnk()
    self.buf = xlnk.cma_array(shape=(pkt_size * buf_words_per_pkt, ),
                              dtype=buf_dtype)
def __init__(self, description, pkt_sym=16, pkt_time=128, pkt_fft=1024):
    """Driver for our QPSK TX IP hierarchy

    This encompasses the qpsk tx logic and the DMAs for data
    transfer of exposed signals.

    Parameters
    ----------
    description : passed straight through to the parent constructor.
    pkt_sym : int
        Entries in the uint8 symbol buffer.
    pkt_time : int
        Time-domain packet length; the int16 buffer holds twice as many
        entries.
    pkt_fft : int
        Entries in the uint32 FFT buffer.
    """
    super().__init__(description)

    allocator = Xlnk()
    self.buf_fft = allocator.cma_array(shape=(pkt_fft, ), dtype=np.uint32)
    self.buf_sym = allocator.cma_array(shape=(pkt_sym, ), dtype=np.uint8)
    self.buf_time = allocator.cma_array(shape=(pkt_time * 2, ),
                                        dtype=np.int16)

    qpsk = self.axi_qpsk_tx

    # QPSK IP General Config
    qpsk.lfsr_rst = 1
    qpsk.enable = 1
    qpsk.packetsize_rf = 1024
    qpsk.lfsr_rst = 0
    qpsk.output_gain = 2**32 - 1

    # QPSK IP Symbol Config
    qpsk.reset_symbol = 1
    qpsk.packetsize_symbol = pkt_sym - 1
    qpsk.reset_symbol = 0
    qpsk.autorestart_symbol = 0

    # QPSK IP FFT Config
    qpsk.reset_fft = 1
    qpsk.packetsize_fft = pkt_fft - 1
    qpsk.reset_fft = 0
    qpsk.autorestart_fft = 0

    # QPSK IP Time Config
    qpsk.reset_time = 1
    qpsk.packetsize_time = pkt_time - 1
    qpsk.reset_time = 0
    qpsk.autorestart_time = 0
def run_my_cnn(path, name):
    """Load the overlay at ``path``, run the NNgen IP ``name`` once and
    print the first 16 words of the shared buffer.
    """
    overlay = Overlay(path)
    ip = nngen_ctrl.nngen_ip(overlay, name)

    allocator = Xlnk()
    buf = allocator.cma_array(16 * 1024, dtype=np.int32)
    # Fill the buffer with 0, 1, 2, ... (each entry equals its own index).
    buf[:] = np.arange(len(buf))

    ip.set_global_buffer(buf)
    ip.run()
    ip.wait()

    print(buf[:16])
def __init__(self, description, pkt_wind=2048):
    """Allocate the window buffer and set up a rectangular default window.

    Parameters
    ----------
    description : passed straight through to the parent constructor.
    pkt_wind : int
        Number of int16 entries in the window DMA buffer.
    """
    super().__init__(description)

    # NOTE(review): the defaults below use a fixed length of 2048
    # regardless of pkt_wind -- confirm whether they should follow it.
    default_length = 2048

    allocator = Xlnk()
    self.buf_wind = allocator.cma_array(shape=(pkt_wind, ), dtype=np.int16)

    # Hardware setup: all-ones coefficients, block enabled, DC enabled.
    self.set_window_coeffs(np.ones(default_length))
    self.set_enable(1)
    self.window_0.dc_enable = 1

    # Empty widget containers.
    self.params = ipw.VBox([], layout=ipw.Layout(width='auto'))
    self.window_sel = ipw.HBox([], layout=ipw.Layout(width='auto'))

    # Software-side state.
    self.coeffs = np.ones(default_length)
    self.window_length = default_length
    self.window_type = 'Rectangular'
    self.coeffs_sat = np.ones(default_length)
    self.frequency = 0
    self.input = 0
def __init__(self, fpga_config, overlay):
    """Collect the overlay's IP handles and allocate one chunk per memory.

    Parameters
    ----------
    fpga_config : object providing a ``config`` dict (keys
        ``ddr2fpga_nb``, ``fpga2ddr_nb``, ``mem_nb``, ``mem_depth``) and a
        ``func_layout`` mapping of function name -> instance count.
    overlay : loaded overlay whose IPs are named ``<prefix>_<index>``.
    """
    ddr2fpga_nb = fpga_config.config['ddr2fpga_nb']
    fpga2ddr_nb = fpga_config.config['fpga2ddr_nb']
    mem_nb = fpga_config.config['mem_nb']

    def _units(prefix, count):
        # getattr() is the idiomatic lookup; calling the __getattr__
        # dunder directly skips normal attribute resolution.
        return [getattr(overlay, f'{prefix}_{i}') for i in range(count)]

    self.u_axi_dma_ddr2fpga = _units('axi_dma_ddr2fpga', ddr2fpga_nb)
    self.u_axi_dma_fpga2ddr = _units('axi_dma_fpga2ddr', fpga2ddr_nb)
    self.u_mem = _units('memory', mem_nb)

    self.u_func = []
    for name, nb in fpga_config.func_layout.items():
        self.u_func += _units(name, nb)

    self.u_ddr2fpga = _units('ddr2fpga', ddr2fpga_nb)

    # enable function interrupts
    for func_ip in self.u_func:
        func_ip.write(0x04, 1)
        func_ip.write(0x08, 1)

    xlnk = Xlnk()
    self.chunk_array = [
        xlnk.cma_array(shape=(fpga_config.config['mem_depth'], ),
                       dtype=np.uint64)
        for _ in range(fpga_config.config['mem_nb'])
    ]

    self.state = FPGA_state(fpga_config)
    self.config = fpga_config.config
import time from pynq import Overlay #import numpy as np from pynq import Xlnk M = 200 N = 200 xlnk = Xlnk(); overlay = Overlay('/home/xilinx/pynq/overlays/HMM_v4/HMM_v4_4.bit') overlay.download() HMM_test = overlay.HMM_Scoring_0 arr_m=xlnk.cma_array(shape=(200,),cacheable=0,dtype=int) arr_n=xlnk.cma_array(shape=(200,),cacheable=0,dtype=int) arr_x=xlnk.cma_array(shape=(200,),cacheable=0,dtype=int) s1 = "gcgagcgaactgcggatagttacactaacacacgaggcacgtggttgggagttacggccatgcaatggatagctcctgcatgatcggttattatacagcccattttgggcgccttccaaaggatctacttatcagaaggggtggtgccgcataactctgaccggtgggcgtagtcatagcagacttttgccgggaacgca" s2 = "tggtccatctgcttggtggcagccgcaagatgccaattattggcgcggtcgacggggctgctatctgaatatcatatggtcttcacggagacaggaacttagcaaggtactaatcccacgcaaagtctttttttcaaaaatccagtctagtcctattatatatcctcggaaaacggtattaggacatcgggtacattcta" s3 = "tttattgtttttgatctcgcgtctcaaagtagctccgacacacaagcggcccttggagactgctcccgagtgcctaggggcatttggtacaaggcggttataaaacgacgacctttccccttagtgcacctgggcaggctcacaccattcctccaccgtgtgtattatttgaggggaaggattctcctgtggcggctctt" s4 = "tcaggacccaaggaggtatcaagattggaagattgtctccaggttctataggcaaaatgcaccgccctcaacggccagatgccggccgcagacttagatatgaatagaatcgggtcaagctctgctacatagattctcctccgtgctcgataactgccggagtttacgcgataagattagcggcactcttcgctgggacc" arr1 = list(s1) arr2 = list(s2) arr3 = list(s3) b1 = {} b2 = {} b3 = {} arr1 = [w.replace('a', '1')for w in arr1] arr1 = [w.replace('c', '2') for w in arr1] arr1 = [w.replace('g', '3') for w in arr1] arr1 = [w.replace('t', '4') for w in arr1] arr2 = [x.replace('a', '1')for x in arr2]
class sharedmemOverlay(Overlay):
    """A simple Mem-Mapped Overlay for PYNQ.

    This overlay is implemented with a single Matrix Multiply Core
    connected directly to the ARM Core AXI interface.

    """
    __RESET_VALUE = 0
    __NRESET_VALUE = 1

    # For convenience, we define register offsets that are scraped from
    # the HLS implementation header files.
    __MMULT_AP_CTRL_OFF = 0x00
    __MMULT_AP_CTRL_START_IDX = 0
    __MMULT_AP_CTRL_DONE_IDX = 1
    __MMULT_AP_CTRL_IDLE_IDX = 2
    __MMULT_AP_CTRL_READY_IDX = 3

    __MMULT_GIE_OFF = 0x04
    __MMULT_IER_OFF = 0x08
    __MMULT_ISR_OFF = 0x0C

    __MMULT_ADDR_A_DATA = 0x10
    __MMULT_ADDR_BT_DATA = 0x18
    __MMULT_ADDR_C_DATA = 0x20

    __MMULT_A_SHAPE = (100, 100)
    __MMULT_BT_SHAPE = (100, 100)
    __MMULT_C_SHAPE = (100, 100)
    __MMULT_A_SIZE = __MMULT_A_SHAPE[0] * __MMULT_A_SHAPE[1]
    __MMULT_BT_SIZE = __MMULT_BT_SHAPE[0] * __MMULT_BT_SHAPE[1]
    __MMULT_C_SIZE = __MMULT_C_SHAPE[0] * __MMULT_C_SHAPE[1]

    def __init__(self, bitfile, **kwargs):
        """Initializes a new sharedmemOverlay object.

        Parameters
        ----------
        bitfile : str
            Bitstream file name, resolved relative to this module's
            directory.
        **kwargs
            Forwarded to the ``Overlay`` constructor.
        """
        # The following lines do some path searching to enable a
        # PYNQ-Like API for Overlays. For example, without these
        # lines you cannot call sharedmemOverlay('sharedmem.bit') because
        # sharedmem.bit is not on the bitstream search path. The
        # following lines fix this for any non-PYNQ Overlay
        #
        # You can safely reuse, and ignore the following lines
        #
        # Get file path of the current class
        # (i.e. /opt/python3.6/<...>/sharedmem.py)
        file_path = os.path.abspath(inspect.getfile(inspect.currentframe()))
        # Get directory path of the current class
        # (i.e. /opt/python3.6/<...>/sharedmem/)
        dir_path = os.path.dirname(file_path)
        # Update the bitfile path to search in dir_path
        bitfile = os.path.join(dir_path, bitfile)
        # Upload the bitfile (and parse the colocated .tcl script)
        super().__init__(bitfile, **kwargs)
        # Manually define the GPIO pin that drives reset
        self.__resetPin = GPIO(GPIO.get_gpio_pin(0), "out")
        self.nreset()
        # Define a Register object at address 0x0 of the mmult address space
        # We will use this to set bits and start the core (see __start())
        # Do NOT write to __ap_ctrl unless __resetPin has been set to
        # __NRESET_VALUE
        self.__ap_ctrl = Register(self.mmultCore.mmio.base_addr, 32)
        self.__a_offset = Register(
            self.mmultCore.mmio.base_addr + self.__MMULT_ADDR_A_DATA, 32)
        self.__bt_offset = Register(
            self.mmultCore.mmio.base_addr + self.__MMULT_ADDR_BT_DATA, 32)
        self.__c_offset = Register(
            self.mmultCore.mmio.base_addr + self.__MMULT_ADDR_C_DATA, 32)
        self.xlnk = Xlnk()

    def __start(self):
        """Raise AP_START and enable the HLS core

        """
        self.__ap_ctrl[self.__MMULT_AP_CTRL_START_IDX] = 1

    def __stop(self):
        """Lower AP_START and disable the HLS core

        """
        self.__ap_ctrl[self.__MMULT_AP_CTRL_START_IDX] = 0

    def nreset(self):
        """Set the reset pin to self.__NRESET_VALUE to place the core into
        not-reset (usually run)

        """
        self.__resetPin.write(self.__NRESET_VALUE)

    def reset(self):
        """Set the reset pin to self.__RESET_VALUE to place the core into
        reset

        """
        self.__resetPin.write(self.__RESET_VALUE)

    def run(self, A, B):
        """ Launch computation on the mmult HLS core

        Parameters
        ----------
        A : Numpy ndarray of at most size 100x100 (it will be padded)
            A buffer containing ND Array Elements to be transferred to
            the core

        B : Numpy ndarray of at most size 100x100 (it will be padded)
            A buffer containing ND Array Elements to be transferred to
            the core

        Returns
        -------
        numpy.ndarray
            int32 array of shape ``(A.shape[0], B.shape[1])`` copied from
            the core's output buffer.

        Raises
        ------
        TypeError
            If ``A`` is not a numpy.ndarray.
        RuntimeError
            If ``B`` is not a numpy.ndarray, or either operand exceeds
            the core's fixed dimensions.
        """
        if not isinstance(A, np.ndarray):
            raise TypeError("Parameter A must be an instance of "
                            "numpy.ndarray")

        # RuntimeError (not TypeError) is kept for B so existing callers
        # that catch it keep working.
        if not isinstance(B, np.ndarray):
            raise RuntimeError("Parameter B must be an instance of "
                               "numpy.ndarray")
        sza = A.shape
        if sza[0] > self.__MMULT_A_SHAPE[0]:
            raise RuntimeError(
                f"Dimension 0 of A must be less than or equal to "
                f"{self.__MMULT_A_SHAPE[0]}")
        if sza[1] > self.__MMULT_A_SHAPE[1]:
            raise RuntimeError(
                f"Dimension 1 of A must be less than or equal to "
                f"{self.__MMULT_A_SHAPE[1]}")

        # B is stored transposed, so its dimension 0 is bounded by
        # BT_SHAPE[1] and its dimension 1 by BT_SHAPE[0].
        szb = B.shape
        if szb[0] > self.__MMULT_BT_SHAPE[1]:
            raise RuntimeError(
                f"Dimension 0 of B must be less than or equal to "
                f"{self.__MMULT_BT_SHAPE[1]}")
        if szb[1] > self.__MMULT_BT_SHAPE[0]:
            raise RuntimeError(
                f"Dimension 1 of B must be less than or equal to "
                f"{self.__MMULT_BT_SHAPE[0]}")

        import time

        a = bt = c = None
        try:
            a = self.xlnk.cma_array(self.__MMULT_A_SHAPE, "int")
            bt = self.xlnk.cma_array(self.__MMULT_BT_SHAPE, "int")
            c = self.xlnk.cma_array(self.__MMULT_C_SHAPE, "int")

            # Zero the operand buffers first so the padding around A and
            # B is well defined. Freshly allocated contiguous memory is
            # not known to be zero-filled, and stale values in the
            # padding would feed into the product.
            a[:] = 0
            bt[:] = 0
            # Copy A->a
            a[:A.shape[0], :A.shape[1]] = A
            # Copy BT->bt
            bt[:B.shape[1], :B.shape[0]] = B.transpose()

            # TODO: Enable Interrupts
            # Write address of a, bt, c to HLS core
            self.__a_offset[31:0] = self.xlnk.cma_get_phy_addr(a.pointer)
            self.__bt_offset[31:0] = self.xlnk.cma_get_phy_addr(bt.pointer)
            self.__c_offset[31:0] = self.xlnk.cma_get_phy_addr(c.pointer)
            self.__start()
            # TODO: Wait for ASYNC Interrupt
            # TODO: Clear Interrupt
            # Until interrupts are wired up, give the core a fixed one
            # second to finish.
            time.sleep(1)
            self.__stop()

            # Transform C into a Numpy Array
            C = np.zeros((A.shape[0], B.shape[1]), np.int32)
            C[:A.shape[0], :B.shape[1]] = c[:A.shape[0], :B.shape[1]]
        finally:
            # Always give the contiguous buffers back, even on failure.
            for buf in (a, bt, c):
                if buf is not None:
                    buf.freebuffer()
        return C
def __init__(self, addr_port_client=("192.168.1.100", 3000)):
    """Load the overlay, allocate all DMA buffers and load the weights file.

    Parameters
    ----------
    addr_port_client : tuple
        ``(address, port)`` stored in ``self.client_port``.

    The weights file is read twice: a first pass collects the
    ``total_iterations`` value of every layer so the per-layer buffers
    can be sized, a second pass fills those buffers and the
    ``obj_factor`` / ``box_factor`` tables.
    """
    print('FPGA_Connect_Object init')
    self.resolution = [640, 360]
    self.client_port = addr_port_client
    team_name = 'SystemsETHZ'
    xlnk = Xlnk()
    xlnk.xlnk_reset()

    ###########################variable initializing######################
    OVERLAY_PATH = '/home/xilinx/jupyter_notebooks/dac_2019_contest/common/' + team_name + '/ultra96_v04.bit'
    WEIGHTS_FILE_NAME = '/home/xilinx/jupyter_notebooks/dac_2019_contest/common/' + team_name + '/weights_file_v04_demo.txt'

    ###########################download overlay######################
    overlay = Overlay(OVERLAY_PATH)
    self.dma = overlay.axi_dma_0
    self.nn_ctrl = MMIO(0xA0010000, length=1024)

    ###########################allocate buffers######################
    self.MINIBATCH_SIZE = 1
    self.height = 176
    self.width = 320
    pixel_bits = 24
    pixels_per_line = 384/pixel_bits
    self.num_lines = int((self.height*self.width)/pixels_per_line)

    self.in_buffer = xlnk.cma_array(
        shape=(self.MINIBATCH_SIZE*self.num_lines, 64), dtype=np.uint8)

    fire1_num_out_lines = (self.height/4)*(self.width/4)*self.MINIBATCH_SIZE
    self.fire1_out_buffer = xlnk.cma_array(
        shape=(int(16*fire1_num_out_lines),), dtype=np.uint32)
    fire2_num_out_lines = (self.height/8)*(self.width/8)*self.MINIBATCH_SIZE
    self.fire2_out_buffer = xlnk.cma_array(
        shape=(int(16*fire2_num_out_lines),), dtype=np.uint32)
    fire3_num_out_lines = (self.height/16)*(self.width/16)*self.MINIBATCH_SIZE
    self.fire3_out_buffer = xlnk.cma_array(
        shape=(int(16*fire3_num_out_lines),), dtype=np.uint32)
    # fire4 and fire5 reuse the fire3 output size.
    self.fire4_out_buffer = xlnk.cma_array(
        shape=(int(16*fire3_num_out_lines),), dtype=np.uint32)
    self.fire5_out_buffer = xlnk.cma_array(
        shape=(int(16*fire3_num_out_lines),), dtype=np.uint32)

    final_num_lines = int((self.height/16)*(self.width/16))
    self.bndboxes = [
        xlnk.cma_array(shape=(self.MINIBATCH_SIZE, final_num_lines, 16),
                       dtype=np.int32)
        for _ in range(4)
    ]
    self.obj_array = np.zeros((self.MINIBATCH_SIZE, final_num_lines))

    ###########################download weights######################
    NUM_LAYERS = 3+4*4

    # Pass 1: record total_iterations for each layer.
    layer = 0
    total_iterations = np.zeros(NUM_LAYERS)
    with open(WEIGHTS_FILE_NAME, "r") as weights_file:
        for line in weights_file:
            if "layer" in line:
                temp = line.split(": ")
                layer = int(temp[1])
            if "total_iterations" in line:
                temp = line.split(": ")
                total_iterations[layer] = int(temp[1])

    # One buffer of 16 words per iteration for every layer.
    self.weightsfactors = []
    for i in range(0, NUM_LAYERS):
        self.weightsfactors.append(
            xlnk.cma_array(shape=(16 * int(total_iterations[i]),),
                           dtype=np.uint32))

    self.obj_factors = np.zeros(4)
    self.box_factors = np.zeros(4)

    # Pass 2: fill the factor tables and the per-layer weight buffers.
    # The file is opened with a context manager; the old code never
    # closed this second handle.
    index = 0
    with open(WEIGHTS_FILE_NAME, "r") as weights_file:
        for line in weights_file:
            if "layer" in line:
                temp = line.split(": ")
                layer = int(temp[1])
                index = 0
            elif "total_iterations" not in line:
                if "obj_factor" in line:
                    temp = line.split(' ')
                    self.obj_factors[int(temp[1])] = int(temp[2])
                elif "box_factor" in line:
                    temp = line.split(' ')
                    self.box_factors[int(temp[1])] = int(temp[2])
                else:
                    # Data line: take the text after the last '0x' and
                    # walk it from the end in 8-character hex groups
                    # (skipping the final character), storing the last
                    # group in slot 0 of this line's 16-word row.
                    no0x = line.split('0x')[-1]
                    base = 1
                    while base < len(no0x):
                        part = no0x[-1*(base+8):-1*base]
                        self.weightsfactors[layer][index*16 + int(base/8)] = int(part, 16)
                        base += 8
                    index += 1
from math import ceil
import time
from pynq import Xlnk
import numpy as np
import matplotlib.pyplot as plt
from pynq.lib import Pmod_ADC
from pynq.overlays.base import BaseOverlay

# Load the base overlay.
ol = BaseOverlay("base.bit")

# Create an Xlnk instance and query the CMA statistics.
xlnk = Xlnk()
xlnk.cma_stats()

# Input and output memory buffers, 100 uint32 words each.
py_buffer = xlnk.cma_array(shape=(100, ), dtype=np.uint32)
out_buffer = xlnk.cma_array(shape=(100, ), dtype=np.uint32)

# ADC attached to the PMODA port.
adc = Pmod_ADC(ol.PMODA)

#delay = 0.00
#values = np.linspace(0, 2, 20)
samples = []

# Take 100 readings; only the most recent one is kept in `sample`.
for count in range(1, 101):
    sample = adc.read()
    #time.sleep(0.1)
    #samples.append(sample[0])
print("Loading image ../images/bigBunny_1080.png") img = cv2.imread('../images/bigBunny_1080.png') imgY = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) print("Size of imgY is ", imgY.shape) height, width, channels = img.shape kernel = np.array([[1.0, 2.0, 1.0], [0.0, 0.0, 0.0], [-1.0, -2.0, -1.0]], np.float32) # Sobel Horizontal Edges numberOfIterations = 10 print("Number of loop iterations: " + str(numberOfIterations)) dstSW = np.ones((height, width), np.uint8) xFimgY = mem_manager.cma_array( (height, width), np.uint8) #allocated physically contiguous numpy array xFimgY[:] = imgY[:] # copy source data xFdst = mem_manager.cma_array( (height, width), np.uint8) #allocated physically contiguous numpy array print("Start SW loop") startSW = time.time() for i in range(numberOfIterations): cv2.filter2D(imgY, -1, kernel, dst=dstSW, borderType=cv2.BORDER_CONSTANT) #filter2D on ARM stopSW = time.time() print("Start HW loop") startPL = time.time() for i in range(numberOfIterations):
hdmi_in.start() from pynq import MMIO rgb2yuv = MMIO(base.ip_dict['h264/rgb2yuv_with_axi_0']['phys_addr'], 0x10000) h264 = MMIO(base.ip_dict['h264/h264enc_with_axi_0']['phys_addr'], 0x10000) from h264py.h264 import H264 h264_send = H264() from pynq import Xlnk xlnk = Xlnk() size = 1920 * 1088 * 4 xlnk.xlnk_reset() cma_recv = xlnk.cma_array((size, ), dtype=np.uint8) result = xlnk.cma_array((size, ), dtype=np.uint8) for i in range(200): cma_send = hdmi_in.readframe() rgb2yuv.write(0x04, cma_send.physical_address) rgb2yuv.write(0x08, cma_recv.physical_address) rgb2yuv.write(0x0c, 1088) rgb2yuv.write(0x10, 1920) rgb2yuv.write(0x14, 1920 * 1088) rgb2yuv.write(0x00, 1) rgb2yuv.write(0x00, 0) while rgb2yuv.read(0x18) == 1: pass while rgb2yuv.read(0x18) == 0:
y2 = int(round(bbox[b][3] * 360)) x1 = np.clip(x1, 1, 640) y1 = np.clip(y1, 1, 360) x2 = np.clip(x2, 1, 640) y2 = np.clip(y2, 1, 360) result.write(batch[b].split('.')[0].zfill(3) + '.jpg' + ' ' + str([x1, x2, y1, y2]) + '\n') print(batch[b], [x1, x2, y1, y2]) ################################## Init FPGA ################################## xlnk = Xlnk() xlnk.xlnk_reset() img = xlnk.cma_array(shape=[4, 160, 320, 4], dtype=np.uint8) fm = xlnk.cma_array(shape=(628115 * 32), dtype=np.uint8) weight = xlnk.cma_array(shape=(220672), dtype=np.int16) biasm = xlnk.cma_array(shape=(432 * 16), dtype=np.int16) print("Allocating memory done") parameter = np.fromfile("SkyNet.bin", dtype=np.int16) np.copyto(weight, parameter[0:220672]) np.copyto(biasm[0:428 * 16], parameter[220672:]) print("Parameters loading done") overlay = Overlay("SkyNet.bit") print("Bitstream loaded") SkyNet = overlay.SkyNet SkyNet.write(0x10, img.physical_address)
import itertools from functools import partial # Packages for using hardware import pynq.lib.dma from pynq import Xlnk import numpy as np from pynq import Overlay import sys overlay = Overlay('./sampleRNN_GRU_unroll.bit') # Downloading the bitstream on the FPGA dma1 = overlay.axi_dma_0 # Having an object point to the DMA xlnk = Xlnk() # Allocation of contiguous arrays dim_mv = 64 # Allocating space for both inputs and outputs in_stream = xlnk.cma_array(shape=(2*dim_mv+192*dim_mv+192*dim_mv+3*dim_mv+3*dim_mv,1), dtype=np.float32) out_stream = xlnk.cma_array(shape=(6*dim_mv,1), dtype=np.float32) try: import torch.backends.cudnn.rnn except ImportError: pass # Function used for using hardware designed def GRU_Hardware(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): # Pre-processing data detachedVar = torch.cat((input.t(), hidden.t(), w_ih.reshape(3*dim_mv*dim_mv, 1), w_hh.reshape(3*dim_mv*dim_mv, 1), b_ih.reshape(3*dim_mv, 1), b_hh.reshape(3*dim_mv, 1)), 0).detach() in_stream[:] = detachedVar # Sending out data to the hardware and letting the DMA know the space allocated for the output dma1.sendchannel.transfer(in_stream)
from pynq import Xlnk from pynq import MMIO from pprint import pprint import random M = int(sys.argv[1]) N = int(sys.argv[2]) xlnk = Xlnk() ol = Overlay('./tutorial.bit') ####this prints all the IPs inside pprint(ol.ip_dict) # load inputs in_buffer = xlnk.cma_array(shape=(2 * M * M, ), dtype=np.uint32) out_buffer = xlnk.cma_array(shape=(M * M, ), dtype=np.uint32) for i in range(0, len(in_buffer)): in_buffer[i] = random.randint(1, 9) m0 = np.zeros((M, M)) for i in range(M * M): base_row = int(i * N / (M * M)) base_column = int(int(i % M) / N) column = int(base_column * N) + (int(i % N)) row = base_row + int((i % ((M * M) / N)) / M) * N m0[row][column] = in_buffer[i] m1 = np.zeros((M, M)) for i in range(M * M):
from datetime import datetime from pynq import Xlnk from pynq import Overlay import pynq import struct from multiprocessing import Process, Pipe, Queue, Event, Manager print('\n**** Running SkyNet') xlnk = Xlnk() xlnk.xlnk_reset() ########## Allocate memory for weights and off-chip buffers mytype = 'B,' * 63 + 'B' dt = np.dtype(mytype) img = xlnk.cma_array(shape=(3, 162 * 2, 322 * 2), dtype=np.uint8) conv_weight_1x1_all = xlnk.cma_array(shape=(413, 32), dtype=dt) conv_weight_3x3_all = xlnk.cma_array(shape=(64, 3, 3), dtype=dt) bias_all = xlnk.cma_array(shape=(106), dtype=dt) DDR_pool_3_out = xlnk.cma_array(shape=(2, 164, 324), dtype=dt) DDR_pool_6_out = xlnk.cma_array(shape=(3, 84, 164), dtype=dt) DDR_buf = xlnk.cma_array(shape=(128, 44, 84), dtype=dt) predict_boxes = xlnk.cma_array(shape=(4, 5), dtype=np.float32) constant = xlnk.cma_array(shape=(4, 3), dtype=np.int32) print("Allocating memory done") ########### Load parameters from SD card to DDR params = np.fromfile("SkyNet.bin", dtype=dt) idx = 0
def update_graphs(tsliderValue):
    """Capture motor data and build the list of Dash components to show.

    Parameters
    ----------
    tsliderValue : selector whose string form picks the plotted series:
        'Ia Current', 'Ib Current', 'Angle'; anything else selects rpm.

    Returns
    -------
    list
        ``[dcc.Graph, html.Div, dcc.Graph]`` -- the selected series, a
        heading, and the Ia-vs-Ib scatter plot.
    """
    graphs = []
    motor.capture_mode('ia_ib_angle_rpm')

    allocator = Xlnk()
    input_buffer = allocator.cma_array(shape=(256, ), dtype=np.uint8)
    capture_address = input_buffer.physical_address
    capture_count = 1000

    # Read the capture buffer back 32 bits at a time. Words at even
    # positions (byte offset divisible by 8) feed channels 0/1, the
    # others feed channels 2/3.
    mmio_stream = MMIO(capture_address, 256)
    cap_list = [[], [], [], []]
    for _ in range(capture_count):
        motor.stream_capture(capture_address)
        for offset in range(0, 256, 4):
            word = mmio_stream.read(offset, 4)
            highbits, lowbits = bytesplit(word)
            if offset % 8 == 0:
                low_channel, high_channel = cap_list[0], cap_list[1]
            else:
                low_channel, high_channel = cap_list[2], cap_list[3]
            low_channel.append(np.int16(lowbits))
            high_channel.append(np.int16(highbits))

    # Both current channels are scaled by 0.00039.
    current_Ia = np.array(cap_list[0]) * 0.00039
    current_Ib = np.array(cap_list[1]) * 0.00039
    df = pd.DataFrame(
        {
            'Ia': current_Ia,
            'Ib': current_Ib,
            'angle': cap_list[3],
            'rpm': cap_list[2]
        },
        columns=['Ia', 'Ib', 'angle', 'rpm'])

    selection = str(tsliderValue)
    if selection == 'Ia Current':
        data = df.Ia
    elif selection == 'Ib Current':
        data = df.Ib
    elif selection == 'Angle':
        data = df.angle
    else:
        data = df.rpm

    # Plot 1: the selected series (one trace per DataFrame column).
    series_traces = [
        go.Scatter(
            x=random_x,
            y=data,
            opacity=0.7,
            marker={
                'size': 15,
                'line': {
                    'width': 0.5,
                    'color': 'white'
                }
            },
        ) for i in df.items()
    ]
    series_layout = go.Layout(xaxis={'title': 'Sample'},
                              yaxis={'title': str(tsliderValue)},
                              margin={
                                  'l': 80,
                                  'b': 40,
                                  't': 10,
                                  'r': 10
                              },
                              hovermode='closest')
    graphs.append(
        dcc.Graph(id='Ia',
                  figure={
                      'data': series_traces,
                      'layout': series_layout
                  }))

    # Heading for the second plot.
    graphs.append(
        html.Div([dcc.Markdown(children='### `Plot-2 Ia vs Ib`')],
                 style={'padding': '3px 3px 3px 3px'}))

    # Plot 2: Ia against Ib.
    scatter_traces = [
        go.Scattergl(x=df['Ia'],
                     y=df['Ib'],
                     mode='markers',
                     opacity=0.7,
                     marker=dict(color='#F0598E', line=dict(width=1)),
                     name=i) for i in df.items()
    ]
    scatter_layout = go.Layout(xaxis={'title': 'Current Ia'},
                               yaxis={'title': 'Current Ib'},
                               margin={
                                   'l': 80,
                                   'b': 40,
                                   't': 10,
                                   'r': 10
                               },
                               legend={
                                   'x': 0,
                                   'y': 1
                               },
                               hovermode='closest')
    graphs.append(
        dcc.Graph(id='Ia vs Ib',
                  figure={
                      'data': scatter_traces,
                      'layout': scatter_layout
                  }))
    return graphs
FracNet.register_map # In[4]: # timer.register_map # In[5]: bus512 = 'B,' * 63 + 'B' dt_512 = np.dtype(bus512) bus256 = 'B,' * 31 + 'B' dt_256 = np.dtype(bus256) image_thermo = xlnk.cma_array(shape=(3, 32, 32), dtype=np.uint64) result = xlnk.cma_array(shape=(10), dtype=np.float32) # In[6]: import numpy as np images = np.load('conv1_input_uint64.npy') # In[7]: num_tests = 1000 with open('labels.bin', 'rb') as f: content = f.read() print(len(content)) labels = np.ndarray((num_tests, ))
class _PSTraceAnalyzer:
    """Class for the Trace Analyzer controlled by PS.

    A typical use of this class is on the base overlay.

    This class can capture digital IO patterns / stimulus on all the pins.
    There can be multiple such instances on the defined overlay.

    Attributes
    ----------
    trace_control : MMIO
        The trace controller associated with the analyzer.
    dma : DMA
        The PS controlled DMA object associated with the analyzer.
    intf_spec : dict
        The interface specification, e.g., PYNQZ1_PMODA_SPECIFICATION.
    num_analyzer_samples : int
        The number of samples to be analyzed.
    samples : numpy.ndarray
        The raw data samples expressed in numpy array.
    frequency_mhz: float
        The frequency of the trace analyzer, in MHz.
    clk : Clocks
        The clock management unit for the trace analyzer.
    xlnk : Xlnk
        The Xlnk object to control contiguous memory.

    """

    def __init__(self, ip_info, intf_spec_name):
        """Return a new PS controlled trace analyzer object.

        The maximum sample rate is 100MHz. Usually the sample rate is set
        to no larger than 10MHz in order for the signals to be captured
        on pins / wires.

        For Pmod header, pin numbers 0-7 correspond to the pins on the
        Pmod interface.

        For Arduino header, pin numbers 0-13 correspond to D0-D13;
        pin numbers 14-19 correspond to A0-A5;
        pin numbers 20-21 correspond to SDA and SCL.

        Parameters
        ----------
        ip_info : dict
            The dictionary containing the IP associated with the analyzer.
        intf_spec_name : str/dict
            The name of the interface specification, or the specification
            dictionary itself.

        Raises
        ------
        ValueError
            If `intf_spec_name` is neither a str nor a dict.

        """
        if isinstance(intf_spec_name, str):
            # SECURITY NOTE: the string is evaluated to look up a
            # specification object by name. It must only ever come from
            # trusted code, never from external input.
            self.intf_spec = eval(intf_spec_name)
        elif isinstance(intf_spec_name, dict):
            self.intf_spec = intf_spec_name
        else:
            raise ValueError("Interface specification has to be str or dict.")

        # The controller IP name depends on the monitored bus width.
        trace_cntrl_info = ip_info['trace_cntrl_{}_0'.format(
            self.intf_spec['monitor_width'])]
        trace_dma_info = ip_info['axi_dma_0']
        self.trace_control = MMIO(trace_cntrl_info['phys_addr'],
                                  trace_cntrl_info['addr_range'])
        self.dma = DMA(trace_dma_info)
        self.num_analyzer_samples = 0
        self.samples = None
        self._cma_array = None
        self.frequency_mhz = 0
        # The Clocks object itself is stored; the sample clock is set
        # through its `fclk<N>_mhz` attributes in `setup()`.
        self.clk = Clocks
        self.xlnk = Xlnk()
        self._status = 'RESET'

    def __repr__(self):
        """Disambiguation of the object.

        Users can call `repr(object_name)` to display the object
        information.

        """
        parameter_list = [
            'num_analyzer_samples={}'.format(self.num_analyzer_samples),
            'frequency_mhz={}'.format(self.frequency_mhz),
        ]
        parameter_string = ", ".join(map(str, parameter_list))
        return '{}({})'.format(self.__class__.__name__, parameter_string)

    @property
    def status(self):
        """Return the analyzer's status.

        Returns
        -------
        str
            Indicating the current status of the analyzer; can be
            'RESET', 'READY', or 'RUNNING'.

        """
        return self._status

    def _release_buffer(self):
        """Free the contiguous trace buffer, if any, and forget it."""
        cma_array = getattr(self, '_cma_array', None)
        if cma_array is not None:
            cma_array.close()
        self._cma_array = None

    def setup(self, num_analyzer_samples=DEFAULT_NUM_TRACE_SAMPLES,
              frequency_mhz=DEFAULT_CLOCK_FREQUENCY_MHZ,
              fclk_index=3):
        """Configure the trace analyzer.

        This method validates the parameters, programs the selected FCLK
        with the sample frequency, and allocates the contiguous buffer
        that receives the trace. Any buffer left over from a previous
        `setup()` is released first.

        Note that the analyzer is always attached to the pins, so there
        is no need to use any method like 'connect()'. In short, once the
        analyzer has been setup, it is connected as well.

        Note
        ----
        The buffer allocated here holds exactly `num_analyzer_samples`
        samples. NOTE(review): other generators are described as
        capturing a leading dummy sample; confirm whether that applies
        to the PS controlled analyzer.

        Parameters
        ----------
        num_analyzer_samples : int
            The number of samples to be analyzed.
        frequency_mhz: float
            The frequency of the captured samples, in MHz.
        fclk_index : int
            The index of the fclk controlled by clock management object.

        Raises
        ------
        ValueError
            If the number of samples or the frequency is out of range.

        """
        # Validate everything before touching any state, so a rejected
        # call leaves the analyzer exactly as it was.
        if not 1 <= num_analyzer_samples <= MAX_NUM_TRACE_SAMPLES:
            raise ValueError('Number of samples should be in '
                             '[1, {}]'.format(MAX_NUM_TRACE_SAMPLES))
        if not MIN_CLOCK_FREQUENCY_MHZ <= frequency_mhz <= \
                MAX_CLOCK_FREQUENCY_MHZ:
            raise ValueError("Clock frequency out of range "
                             "[{}, {}]".format(MIN_CLOCK_FREQUENCY_MHZ,
                                               MAX_CLOCK_FREQUENCY_MHZ))

        self.num_analyzer_samples = num_analyzer_samples
        setattr(self.clk, "fclk{}_mhz".format(fclk_index), frequency_mhz)
        self.frequency_mhz = frequency_mhz

        # Do not orphan a buffer from an earlier setup() call.
        self._release_buffer()
        trace_byte_width = round(self.intf_spec['monitor_width'] / 8)
        self._cma_array = self.xlnk.cma_array(
            [1, self.num_analyzer_samples],
            dtype=BYTE_WIDTH_TO_NPTYPE[trace_byte_width])
        self._status = 'READY'

    def reset(self):
        """Reset the trace analyzer.

        This method will bring the trace analyzer from any state to
        'RESET' state. The trace buffer is released and its reference
        dropped, so it cannot be used or freed again afterwards.

        """
        if self._status == 'RUNNING':
            self.stop()

        self.samples = None
        self.num_analyzer_samples = 0
        self.frequency_mhz = 0
        self._release_buffer()
        self._status = 'RESET'

    def run(self):
        """Start the DMA to capture the traces.

        The receive channel is armed first, then the trace controller is
        given the sample count, a zero compare value, and a 1 followed by
        a 0 on its control register (presumably a start pulse; confirm
        against the trace controller IP).

        Return
        ------
        None

        """
        self.dma.recvchannel.transfer(self._cma_array)
        if self.intf_spec['monitor_width'] == 32:
            self.trace_control.write(TRACE_CNTRL_32_LENGTH,
                                     self.num_analyzer_samples)
            self.trace_control.write(TRACE_CNTRL_32_DATA_COMPARE, 0)
            self.trace_control.write(TRACE_CNTRL_32_ADDR_AP_CTRL, 1)
            self.trace_control.write(TRACE_CNTRL_32_ADDR_AP_CTRL, 0)
        else:
            self.trace_control.write(TRACE_CNTRL_64_LENGTH,
                                     self.num_analyzer_samples)
            self.trace_control.write(TRACE_CNTRL_64_DATA_COMPARE_MSW, 0)
            self.trace_control.write(TRACE_CNTRL_64_DATA_COMPARE_LSW, 0)
            self.trace_control.write(TRACE_CNTRL_64_ADDR_AP_CTRL, 1)
            self.trace_control.write(TRACE_CNTRL_64_ADDR_AP_CTRL, 0)

        self._status = 'RUNNING'

    def stop(self):
        """Stop the DMA after capture is done.

        This waits for the receive channel to finish and moves the
        analyzer back to the 'READY' state.

        Return
        ------
        None

        """
        self.dma.recvchannel.wait()
        self._status = 'READY'

    def __del__(self):
        """Destructor for trace buffer object.

        Safe to run on a partially constructed object (for example when
        `__init__` raised before the buffer attribute existed).

        Returns
        -------
        None

        """
        self._release_buffer()

    def analyze(self, steps):
        """Analyze the captured pattern.

        This function will process the captured pattern and put the
        pattern into a Wavedrom compatible format.

        The data output is of format:

        [{'name': '', 'pin': 'D1', 'wave': '1...0.....'},
         {'name': '', 'pin': 'D2', 'wave': '0.1..01.01'}]

        Note the all the lanes should have the same number of samples.
        All the pins are assumed to be tri-stated and traceable.

        The PS controlled trace analyzer has no `step()` support;
        `steps` only limits how many of the captured samples, counted
        from the start of the buffer, are analyzed.

        Parameters
        ----------
        steps : int
            Number of samples to analyze. A value 0 means to analyze all
            the valid samples.

        Returns
        -------
        list
            A list of dictionaries, each dictionary consisting the pin
            number, and the waveform pattern in string format.

        Raises
        ------
        RuntimeError
            If no trace buffer has been allocated by `setup()`.
        ValueError
            If `steps` is negative or larger than the number of samples
            configured in `setup()`.

        """
        if self._cma_array is None:
            raise RuntimeError('No trace buffer available; '
                               'call setup() before analyze().')
        if not 0 <= steps <= self.num_analyzer_samples:
            raise ValueError('Number of steps should be in '
                             '[0, {}]'.format(self.num_analyzer_samples))

        io_pins = get_tri_state_pins(self.intf_spec['traceable_io_pins'],
                                     self.intf_spec['traceable_tri_states'])

        num_valid_samples = steps if steps else self.num_analyzer_samples
        monitor_width = self.intf_spec['monitor_width']
        trace_byte_width = round(monitor_width / 8)
        data_type = '>i{}'.format(trace_byte_width)
        self.samples = np.zeros(num_valid_samples, dtype=data_type)
        # The buffer has shape (1, N): flatten it and keep only the
        # requested prefix so a partial analysis does not fail to
        # broadcast into `samples`.
        captured = np.asarray(self._cma_array).reshape(-1)
        np.copyto(self.samples, captured[:num_valid_samples])

        # Samples are stored big-endian, so after unpacking each row runs
        # from the most to the least significant bit; transposing and
        # reversing makes lane index k hold bit k of every sample.
        temp_bytes = np.frombuffer(self.samples, dtype=np.uint8)
        bit_array = np.unpackbits(temp_bytes)
        temp_lanes = bit_array.reshape(num_valid_samples,
                                       monitor_width).T[::-1]

        wavelanes = list()
        for pin_label in io_pins:
            lane_index = self.intf_spec['traceable_io_pins'][pin_label]
            temp_lane = temp_lanes[lane_index]
            bitstring = ''.join(temp_lane.astype(str).tolist())
            wave = bitstring_to_wave(bitstring)
            wavelanes.append({'name': '', 'pin': pin_label, 'wave': wave})

        return wavelanes