示例#1
0
  def __init__(self, param_str):
    """Open the FPGA, build the runtime object and allocate I/O buffers.

    Args:
      param_str: string holding a Python expression (taken from the
        prototxt) that evaluates to the argument mapping passed to
        ``xdnn_io.make_dict_args``.

    Raises:
      Exception: if ``xdnn.createHandle`` returns a non-zero status.
    """
    self.input_mean_value_ = 128.0
    self.input_scale_ = 1.0

    # SECURITY NOTE(review): eval() executes arbitrary code contained in
    # param_str. This is only acceptable while the prototxt is trusted;
    # consider ast.literal_eval if the parameter string is a plain literal.
    param_dict = eval(param_str)  # Get args from prototxt

    self._args = xdnn_io.make_dict_args(param_dict)
    # Bryan hack to determine number of PEs in FPGA
    self._numPE = self._args["batch_sz"]

    # Establish FPGA Communication, Load bitstream
    ret, handles = xdnn.createHandle(self._args["xclbin"], "kernelSxdnn_0")
    if ret != 0:
      raise Exception("Failed to open FPGA handle.")

    self._args["scaleB"] = 1
    self._args["PE"] = -1

    # Instantiate runtime interface object
    self._fpgaRT = xdnn.XDNNFPGAOp(handles, self._args)

    self._parser = xdnn.CompilerJsonParser(self._args["netcfg"])

    self._indictnames = self._parser.getInputs()
    self._outdictnames = self._parser.getOutputs()

    # Pair every tensor name with its own shape straight from the mapping.
    # This avoids indexing a map() result and calling .itervalues(), both of
    # which only work on Python 2, and it no longer depends on two separate
    # getInputs()/getOutputs() calls yielding the same ordering.
    self._indict = {
        name: np.empty(tuple(shape), dtype=np.float32)
        for name, shape in self._indictnames.items()
    }
    self._outdict = {
        name: np.empty(tuple(shape), dtype=np.float32)
        for name, shape in self._outdictnames.items()
    }
示例#2
0
    def initialize(self, args):
        """Prepare the node's state and, on the master, enter the run loop.

        Args:
            args: mapping that provides ``'netcfg'``, which is handed to
                ``xdnn.CompilerJsonParser``.
        """
        self._xsIn = xstream.Base()

        compiler_json = xdnn.CompilerJsonParser(args['netcfg'])
        self._compJson = compiler_json
        # Only the first input shape listed by the compiler JSON is kept.
        self._firstInputShape = next(itervalues(compiler_json.getInputs()))

        # A mean array is deliberately not built here. In theory it would
        # avoid broadcasting, but it is harder to pass in from the recipe.
        # If that performance boost is ever needed, an HWC-ordered float32
        # array (HWC being the native layout coming out of jpeg decode)
        # filled with args['img_mean'] could be hacked in at this point.

        if not self.master:
            return

        print("Pre is starting loop")
        self.run()
示例#3
0
    def initialize(self, args):
        """Load the Caffe net, size the output buffers and enter the run loop.

        Args:
            args: mapping that provides ``'deploymodel'``, ``'caffemodel'``
                and ``'batch_sz'``; it is also passed to ``bias_selector``.

        ``self._args`` is read for ``'netcfg'`` and ``'batch_sz'`` and is
        updated with ``'net_h'`` / ``'net_w'`` taken from the net's ``data``
        blob.
        """
        self.numProcessed = 0
        self.startTime = timeit.default_timer()

        self.net = caffe.Net(args['deploymodel'], args['caffemodel'],
                             caffe.TEST)
        # Output buffer: batch dimension from args, remaining dimensions
        # copied from the 'layer31-conv' blob.
        conv_tail_shape = self.net.blobs['layer31-conv'].data.shape[1:]
        self.netOut = np.empty((args['batch_sz'], ) + conv_tail_shape,
                               dtype=np.float32)
        self.biases = bias_selector(args)

        data_shape = self.net.blobs['data'].data.shape
        self._args['net_h'] = data_shape[2]
        self._args['net_w'] = data_shape[3]

        compiler_json = xdnn.CompilerJsonParser(self._args['netcfg'])
        self.fpgaOutputShapes = list(itervalues(compiler_json.getOutputs()))
        # Overwrite the leading dimension of every FPGA output shape with
        # the configured batch size (the shape lists are edited in place).
        for shape in self.fpgaOutputShapes:
            shape[0] = self._args['batch_sz']

        # indices for unpacking concatenated arrays to individual array:
        # a running sum of the element count of each output shape.
        offsets = [0]
        self.buf_indices = offsets
        for shape in self.fpgaOutputShapes:
            offsets.append(offsets[-1] + np.prod(shape))

        print("Post is starting loop")
        self.run()
示例#4
0
def run(args=None):
  """Run the pre-process -> FPGA -> post-process pipeline over a set of images.

  Args:
    args: optional argument mapping. When falsy, the command line is parsed
      with ``xdnn_io.default_parser_args()`` extended by ``--numprepproc``,
      ``--numstream``, ``--deviceID``, ``--benchmarkmode`` and ``--profile``.

  The function starts a pool of pre-processing workers, one FPGA process and
  one post-processing process, waits for the FPGA and post-processing
  processes to finish, and prints timing figures when ``args['profile']`` is
  set. When ``args['perpetual']`` is set the image list is re-submitted
  forever and the function never returns.
  """
  if not args:
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--numprepproc', type=int, default=1,
                        help='number of parallel processes used to decode and quantize images')
    parser.add_argument('--numstream', type=int, default=16,
                        help='number of FPGA streams')
    parser.add_argument('--deviceID', type=int, default=0,
                        help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
    parser.add_argument('--benchmarkmode', type=int, default=0,
                        help='bypass pre/post processing for benchmarking')
    parser.add_argument('--profile', action='store_true',
                        help='Print average latencies for preproc/exec/postproc')

    args = xdnn_io.make_dict_args(parser.parse_args())

  batch_sz = args['batch_sz']
  fpgaOutputs = []

  compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

  input_shapes = list(compilerJSONObj.getInputs().values())
  output_shapes = list(compilerJSONObj.getOutputs().values())

  # Force the leading dimension of every input/output shape to the batch
  # size (the shape lists returned by the parser are edited in place).
  for shape in input_shapes:
    shape[0] = batch_sz
  for shape in output_shapes:
    shape[0] = batch_sz

  num_shared_slots = args['numstream']

  # shared memory from preprocessing to fpga forward
  shared_trans_arrs = SharedMemoryQueue(
      "trans",
      num_shared_slots * (args['numprepproc'] * batch_sz),
      input_shapes + [(batch_sz, 4)])

  # shared memory from fpga forward to postprocessing
  shared_output_arrs = SharedMemoryQueue(
      "output", num_shared_slots, output_shapes + [(batch_sz, 4)])

  # Form list of images to chunks of batch_sz
  img_paths = xdnn_io.getFilePaths(args['images'])
  imgids = list(range(len(img_paths)))
  imgid_chunks = [imgids[i:i + batch_sz]
                  for i in range(0, len(img_paths), batch_sz)]

  # Start all processes
  p = mp.Pool(initializer=init_pre_process,
              initargs=(args, img_paths, input_shapes, shared_trans_arrs),
              processes=args['numprepproc'])

  xdnnProc = mp.Process(target=fpga_process,
                        args=(args, len(imgid_chunks), compilerJSONObj,
                              shared_trans_arrs, shared_output_arrs))
  postProc = mp.Process(target=post_process,
                        args=(args, img_paths, fpgaOutputs, output_shapes,
                              shared_output_arrs))
  xdnnProc.start()
  postProc.start()

  t1 = timeit.default_timer()
  if args['perpetual']:
    # Re-submit the whole image list forever; each pass waits for the
    # previous one to drain before queuing the next.
    while True:
      p.map_async(run_pre_process, imgid_chunks).wait()
  else:
    # Fire and forget: completion is observed through the FPGA and
    # post-processing processes joined below.
    # NOTE(review): the AsyncResult is discarded, so an exception raised in
    # a pre-processing worker is never surfaced here.
    p.map_async(run_pre_process, imgid_chunks)

  xdnnProc.join()
  postProc.join()

  p.close()
  p.join()
  t2 = timeit.default_timer()
  total_t = t2 - t1
  if args['profile']:
    print("Total time taken: {} s\n Total images: {}\nAverage FPS: {}".format(
        total_t, len(img_paths), len(img_paths) / total_t))
示例#5
0
def run(args=None):
    """Build and run the ``yolo_v2`` prep -> fpga -> post graph.

    Args:
        args: optional argument mapping. When falsy, the command line is
            parsed (default xdnn_io arguments, the yolo arguments, plus
            ``--startxstream``, ``--servermode``, ``--deploymodel`` and
            ``--caffemodel``) and a default ``'preprocseq'`` is installed.

    Regardless of what was passed in, ``args['startxstream']`` is forced to
    True and ``args['servermode']`` to False before the graph is built.
    When ``args['golden']`` is set, ``calc_detector_mAP`` is called after
    the run.
    """
    if not args:
        parser = yolo_parser_args(xdnn_io.default_parser_args())
        parser.add_argument('--startxstream', default=True,
                            action='store_true',
                            help='automatically start obj store server')
        parser.add_argument('--servermode', default=False,
                            action='store_true',
                            help='accept images from another process')
        parser.add_argument("--deploymodel", type=str, default='',
                            help='Original prototxt')
        parser.add_argument("--caffemodel", type=str, default='',
                            help='Original caffemodel')

        args = xdnn_io.make_dict_args(parser.parse_args())
        args['preprocseq'] = [
            ('resize', (224, 224)),
            ('meansub', [104.007, 116.669, 122.679]),
            ('chtranspose', (2, 0, 1)),
        ]

    # Accuracy measurement and visualisation both need labels and a place
    # to store detections.
    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args['results_dir'], \
            "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    # These two switches are overridden unconditionally.
    args['startxstream'] = True
    args['servermode'] = False

    args['timerQ'] = Queue()

    # Network height/width come from the first input shape in the compiler
    # JSON (indices 2 and 3).
    compiler_json = xdnn.CompilerJsonParser(args['netcfg'])
    first_shape = next(itervalues(compiler_json.getInputs()))
    args['net_h'], args['net_w'] = first_shape[2], first_shape[3]

    # start object store
    # (make sure to 'pip install pyarrow')
    # The reference is held in a local for the rest of the function.
    xserver = xstream.Server() if args['startxstream'] else None

    graph = grapher.Graph("yolo_v2")
    stages = (
        ("prep", yolov2_pre.Node),
        ("fpga", yolov2_fpga.Node),
        ("post", yolov2_post.Node),
    )
    for stage_name, node_cls in stages:
        graph.node(stage_name, node_cls, args)

    edges = (
        ("START", None, "prep"),
        ("prep", "prep", "fpga"),
        ("fpga", "fpga", "post"),
        ("DONE", "post", "fpga"),
        ("DONE", "post", None),
    )
    for edge in edges:
        graph.edge(*edge)

    if args['servermode']:
        print("Serving %s -> %s" % (graph._in[0], graph._out[0]))
        graph.serve()
    else:
        graph.serve(background=True)
        img_paths = xdnn_io.getFilePaths(args['images'])

        reqProc = mp.Process(
            target=request_process,
            args=(args, img_paths, graph._in[0], graph._out[0]),
        )

        started = timeit.default_timer()
        reqProc.start()
        reqProc.join()
        graph.stop(kill=False)
        # The end timestamp is read from the timer queue.
        finished = args['timerQ'].get()
        full_time = finished - started

        args['timerQ'].close()

        print("Total time : {}s for {} images".format(full_time,
                                                      len(img_paths)))
        print("Average FPS : {} imgs/sec".format(len(img_paths) / full_time))

    # mAP calculation
    if args['golden']:
        print(flush=True)
        print("Computing mAP score  : ", flush=True)
        print("Class names are  : {} ".format(labels), flush=True)
        mAP = calc_detector_mAP(args['results_dir'], args['golden'],
                                len(labels), labels,
                                args['prob_threshold'], args['mapiouthresh'],
                                args['points'])
        sys.stdout.flush()
示例#6
0
    def initialize(self, args):
        """Allocate buffers, start the helper workers/threads and enter the loop.

        Args:
            args: mapping that provides ``'netcfg'``, ``'numstream'``,
                ``'batch_sz'`` and ``'numprepproc'``; it is also passed
                whole to ``xdnn.XDNNFPGAOp``.

        Pipeline:
          1) ingest           - collect individual requests into 1 batch
                                for 1 stream
          2) ingest_worker(s) - copy individual object store blobs into
                                local buffers for 1 stream
          3) loop             - submit fpga job
          4) wait             - wait for fpga job
        """
        # make sure to set up subscribe sockets first, so we don't miss messages
        self.sub_0 = self.get_sub(0)
        self.sub_1 = self.get_sub(1)

        self._compJson = xdnn.CompilerJsonParser(args['netcfg'])
        self._fpgaRT = xdnn.XDNNFPGAOp(args)

        self._numStreams = args['numstream']
        # Twice as many buffers as streams: we learn that *a* stream/buffer
        # completed, but not exactly which one was freed, so doubling the
        # buffers ensures live streams/buffers are never clobbered.
        self._numStreamBuffers = self._numStreams * 2
        self._numStreamsActive = 0
        self._currStreamIdx = 0
        self._bsz = args['batch_sz']

        self._firstInputShape = next(itervalues(self._compJson.getInputs()))
        outputShapes = list(itervalues(self._compJson.getOutputs()))
        outputNames = list(iterkeys(self._compJson.getOutputs()))

        # One shared float array (sized by the first input shape) and one
        # list of per-output numpy arrays for each of bsz * numStreamBuffers
        # slots.
        slot_count = self._bsz * self._numStreamBuffers
        input_elems = np.prod(tuple(self._firstInputShape)).tolist()
        self._inputBuffers = [
            mp.Array(ctypes.c_float, input_elems) for _ in range(slot_count)
        ]
        self._outputBuffers = [
            [
                np.empty((self._bsz, ) + tuple(shape[1:]),
                         dtype=np.float32,
                         order='C') for shape in outputShapes
            ]
            for _ in range(slot_count)
        ]

        queue_depth = len(self._inputBuffers)
        self._qingest = mp.Queue(maxsize=queue_depth)
        self._qfpga = mp.Queue(maxsize=queue_depth)

        # spawn ingest_workers to copy remote buffers to local buffer
        self._ingestWorkers = []
        worker_args = (
            self._qingest,
            self._qfpga,
            self._firstInputShape,
            self._inputBuffers,
        )
        for _ in range(args['numprepproc']):
            worker = mp.Process(target=ingest_worker, args=worker_args)
            worker.start()
            self._ingestWorkers.append(worker)

        # ingest thread dispatches incoming work to ingest_workers
        self._ingestThread = threading.Thread(
            target=self.ingest, args=(self._qingest, self._qfpga))
        self._ingestThread.start()

        # wait thread collects completed FPGA results and sends forward
        self._qwait = mp.Queue(maxsize=len(self._inputBuffers))
        self._waitThread = threading.Thread(
            target=self.wait, args=(self._qwait, outputNames))
        self._waitThread.start()

        print("Starting FPGA loop")

        self.run()
示例#7
0
      sys.stdout.flush()

# Register the YOLO pre/post-processing classes with mp_classify at import time.
mp_classify.register_pre(YoloPreProcess)
mp_classify.register_post(YoloPostProcess)

# Script entry point: parse arguments, then derive shapes from the compiler JSON.
if __name__ == '__main__':
  # Default xdnn_io arguments extended with the yolo-specific ones.
  parser = xdnn_io.default_parser_args()
  parser = yolo_parser_args(parser)
  args = parser.parse_args()
  args = xdnn_io.make_dict_args(args)

  # Accuracy measurement / visualisation need labels and a results directory.
  # NOTE(review): assert statements are stripped under `python -O`.
  if(args['golden'] or args['visualize']):
    assert args['labels'], "Provide --labels to compute mAP."
    assert args['results_dir'], "For accuracy measurements, provide --results_dir to save the detections."

  compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

  # Shape lists for every network input and output, in the parser's order.
  input_shapes = [v for k,v in compilerJSONObj.getInputs().items()]
  output_shapes = [v for k,v in compilerJSONObj.getOutputs().items()]

  # Overwrite the leading dimension of each output shape with the batch size
  # (input shapes are left as reported by the parser).
  for out_idx in range(len(output_shapes)):
      output_shapes[out_idx][0] = args['batch_sz']

  # Element count per shape.
  # NOTE(review): on Python 3 map() is lazy, so these are iterators, not lists.
  input_sizes  = map(lambda x: np.prod(x), input_shapes)
  output_sizes = map(lambda x: np.prod(x), output_shapes)

  # NOTE(review): the output uses index 2 as width and index 3 as height, while
  # the input below uses index 2 as height and index 3 as width - confirm which
  # is intended (only matters for non-square outputs).
  out_w = output_shapes[0][2]
  out_h = output_shapes[0][3]

  # Network input height/width taken from the first input shape.
  args['net_h'] = int(input_shapes[0][2])
  args['net_w'] = int(input_shapes[0][3])
示例#8
0
    cap = cv2.VideoCapture(args.videofile)
    if not cap.isOpened():
        raise ValueError("Couldn't read the video file {}".format(
            args.videofile))

    frame_q = mp.Queue()
    resize_q = mp.Queue()
    trans_q = mp.Queue()
    output_q = mp.Queue()
    face_q = mp.Queue()
    scale_q = mp.Queue()
    ready_fpga = mp.Queue()

    sharedInputArrs = []

    compilerJSONObj = xdnn.CompilerJsonParser(args.vitisrundir +
                                              '/compiler.json')

    input_shapes = [v for k, v in compilerJSONObj.getInputs().items()]
    output_shapes = [v for k, v in compilerJSONObj.getOutputs().items()]

    input_sizes = list(map(lambda x: np.prod(x), input_shapes))
    output_sizes = map(lambda x: np.prod(x), output_shapes)
    input_list = input_shapes[0]
    N, C, H, W = input_list[0], input_list[1], input_list[2], input_list[3]

    # shared memory from video capture to preprocessing
    shared_frame_arrs = SharedMemoryQueue("frame", num_shared_slots,
                                          [(H, W, C)])
    # shared memory from preprocessing to fpga forward
    shared_trans_arrs = SharedMemoryQueue("trans", num_shared_slots,
                                          [(H, W, C)] + input_shapes)
示例#9
0
    print('drawing boxes time: {0} seconds'.format(end_time - start_time))


# Script entry point: create the inter-process queues, read tensor shapes from
# the compiler JSON and allocate the first shared-memory queue.
if __name__ == '__main__':

    # One queue per pipeline hand-off.
    frame_q = mp.Queue()
    resize_q = mp.Queue()
    trans_q = mp.Queue()
    output_q = mp.Queue()
    face_q = mp.Queue()

    ready_fpga = mp.Queue()

    sharedInputArrs = []

    compilerJSONObj = xdnn.CompilerJsonParser('deploy.compiler.json')

    # .values() plus list(map(...)) yields real lists on both Python 2 and 3;
    # the previous .itervalues() / bare map() form only worked on Python 2.
    input_shapes = list(map(tuple, compilerJSONObj.getInputs().values()))
    output_shapes = list(map(tuple, compilerJSONObj.getOutputs().values()))

    # Element count per shape.
    input_sizes = list(map(np.prod, input_shapes))
    output_sizes = list(map(np.prod, output_shapes))

    # Function-call form prints the same text on Python 2 for a single
    # argument and is valid syntax on Python 3.
    print(input_shapes)
    print(output_shapes)

    # shared memory from video capture to preprocessing
    shared_frame_arrs = SharedMemoryQueue("frame", num_shared_slots,
                                          [(320, 320, 3)])