def stream(cfg,
           classes_file,
           weights,
           socket_ip,
           socket_port,
           image_size=128,
           confidence_threshold=0.6,
           nms_thres=0.5):
    """Detect risky objects on Kinect frames and stream them over TCP.

    Builds a Darknet model from ``cfg``/``weights``, connects a TCP socket to
    ``(socket_ip, socket_port)`` and then, for every frame yielded by
    ``LoadKinect``, runs detection + non-max suppression.  For each detection
    whose class name is in ``RISKY_CLASSES`` the box is drawn on the frame and
    one ``"<class>;<phi>;<z>"`` record is produced; the records of a frame are
    joined with ``|`` and sent as UTF-8 over the socket.  The annotated frame
    is displayed with matplotlib after every cycle.

    Args:
        cfg: Model configuration passed to ``Darknet``.
        classes_file: File passed to ``load_classes`` to obtain class names.
        weights: Weights file passed to ``load_darknet_weights``.
        socket_ip: Remote host to connect to.
        socket_port: Remote TCP port to connect to.
        image_size: Size passed to both ``Darknet`` and ``LoadKinect``.
        confidence_threshold: Confidence threshold for ``non_max_suppression``.
        nms_thres: NMS threshold for ``non_max_suppression``.

    Side effects:
        Rebinds the module-level ``sock`` to the newly connected socket.
    """
    global sock

    print('+ Initializing model')
    model = Darknet(cfg, image_size)
    print('+ Loading model')
    load_darknet_weights(model, weights)
    print('+ Fusing model')
    model.fuse()
    print('+ Loading model to CPU')
    model.to('cpu').eval()

    print('+ Loading webcam')
    cap = LoadKinect(img_size=image_size)

    print('+ Loading classes')
    classes = load_classes(classes_file)
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in classes]

    print('+ Connecting to remote socket')
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((socket_ip, socket_port))

    print('+ Enumerating cam')
    for _path, img, im0, _vid_cap in cap:
        t = time.time()

        print('+ Loading image to CPU')
        img = torch.from_numpy(img).unsqueeze(0).to('cpu')
        pred, _ = model(img)

        print('+ Detecting objects')
        det = non_max_suppression(pred, confidence_threshold, nms_thres)[0]

        if det is not None and len(det) > 0:
            print('+ Rescaling model')
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4],
                                      im0.shape).round()

            print('+ Reading depth')
            depth = get_depth()
            depth_swap = np.swapaxes(depth, 0, 1)

            # One value per stripe of the swapped depth image: the element at
            # sorted position 100 of that stripe.
            depth_strip1d = np.array(
                [np.sort(stripe)[100] for stripe in depth_swap]
            ).astype(np.uint8)

            # NOTE(review): depth_strip2d and depth_edge2d below are never
            # read afterwards in this function; they look like leftovers of a
            # debug visualisation. Kept so the per-frame behavior (including
            # any exception raised here) is unchanged.
            depth_strip2d_swap = np.array(
                [np.ones(depth_swap.shape[1]) * value
                 for value in depth_strip1d]
            ).astype(np.uint8)
            depth_strip2d = np.swapaxes(depth_strip2d_swap, 0, 1)

            # First pass: toggle-style edge marker over all but the last
            # stripe, driven by the "< 230" threshold on the strip value.
            depth_edge1d = np.zeros(depth_strip1d.shape)
            state = False
            for idx in range(len(depth_edge1d) - 1):
                state = bool(not state and depth_strip1d[idx] < 230)
                depth_edge1d[idx] = not state

            # Second pass: when the value changes after a run shorter than 10
            # samples, the current sample is forced back to the previous
            # state.
            state = False
            state_cnt = 0
            for idx in range(len(depth_edge1d) - 1):
                if depth_edge1d[idx] == state:
                    state_cnt += 1
                else:
                    if state_cnt < 10:
                        # NOTE(review): `r` is unused -- every iteration
                        # writes to index `idx`, not `r`. Possibly meant to
                        # back-fill the short run; confirm intent.
                        for r in range(max(0, idx - 10), idx):
                            depth_edge1d[idx] = state
                    state_cnt = 0
                    state = depth_edge1d[idx]

            depth_edge1d = depth_edge1d * 255
            depth_edge2d_swap = np.array(
                [np.ones(100) * value for value in depth_edge1d]
            ).astype(np.uint8)
            depth_edge2d = np.swapaxes(depth_edge2d_swap, 0, 1)

            # Collect (class name, x, z) for every risky detection.
            detected = []
            for *coordinates, conf, cls_conf, cls in det:
                class_name = classes[int(cls)]
                if class_name not in RISKY_CLASSES:
                    continue

                label = '%s %.2f' % (class_name, conf)
                plot_one_box(coordinates, im0, label=label,
                             color=colors[int(cls)])
                print(f"+ Detected {class_name}")

                x_avg_depth = np.mean(
                    depth[coordinates[0] - 5:coordinates[0] + 5])
                y_avg_depth = np.mean(
                    depth[coordinates[1] - 5:coordinates[1] + 5])
                z = np.average(np.array([x_avg_depth, y_avg_depth]))
                detected.append((class_name, coordinates[0], z))

            # Map the first box coordinate to an angle: -FOV/2 at the left
            # image border, +FOV/2 at the right one.
            width = im0.shape[1]
            records = []
            for class_name, x, z in detected:
                phi = (x / width * 2 - 1) * (CAMERA_FOV / 2)
                records.append(f"{class_name};{phi};{z}")

            # sendall: plain send() may transmit only part of the buffer on a
            # stream socket, which would truncate the record list.
            sock.sendall('|'.join(records).encode('utf-8'))

        print('+ Cycle took %.3fs' % (time.time() - t))
        plt.imshow(bgr_to_rgb(im0))
        plt.show(block=False)
        plt.pause(.001)
def train(cfg,
          data_cfg,
          img_size=416,
          resume=False,
          epochs=100,
          batch_size=16,
          accumulated_batches=1,
          multi_scale=False,
          freeze_backbone=True,
          var=0,
          weight_path="weights/rainy",
          result="result.txt",
          ckpt=10):
    """Train a Darknet model and checkpoint it after every epoch.

    Args:
        cfg: Model configuration path; its suffix/prefix also selects which
            pretrained backbone (if any) is loaded when not resuming.
        data_cfg: Data configuration parsed with ``parse_data_cfg``; its
            ``'train'`` entry is the training set path.
        img_size: Network input size (forced to 608 when ``multi_scale``).
        resume: Resume model/optimizer state from ``<weight_path>latest.pt``.
        epochs: Number of epochs to run in this call.
        batch_size: Batch size for the loader and for ``test.test``.
        accumulated_batches: Number of batches whose gradients are
            accumulated before each optimizer step.
        multi_scale: Forwarded to ``LoadImagesAndLabels``.
        freeze_backbone: Toggle ``requires_grad`` on layers below ``cutoff``
            each epoch (see review note in the loop).
        var: Unused by this function; the model is always called with
            ``var=0``.
        weight_path: String *prefix* for every weight file. It is
            concatenated directly (``weight_path + 'latest.pt'``), so include
            a trailing separator if a directory is intended.
        result: Text file that gets one appended line per epoch.
        ckpt: A backup checkpoint is written every ``ckpt`` epochs.
    """
    import shutil  # local: portable replacement for shelling out to `cp`

    weights = weight_path
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()

    if multi_scale:  # pass maximum multi_scale size
        img_size = 608
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    train_path = parse_data_cfg(data_cfg)['train']

    # Initialize model
    model = Darknet(cfg, img_size)

    # Get dataloader
    dataloader = LoadImagesAndLabels(train_path, batch_size, img_size,
                                     multi_scale=multi_scale, augment=True)

    lr0 = 0.001
    cutoff = 10  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')

    if resume:
        checkpoint = torch.load(latest, map_location='cpu')

        # Load weights to resume from
        model.load_state_dict(checkpoint['model'])
        model.to(device).train()

        # Set optimizer
        optimizer = torch.optim.SGD(
            filter(lambda x: x.requires_grad, model.parameters()),
            lr=lr0, momentum=.9)

        start_epoch = checkpoint['epoch'] + 1
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_loss = checkpoint['best_loss']

        del checkpoint  # current, saved
    else:
        # Initialize model with backbone (optional)
        if cfg.endswith('yolov3.cfg'):
            load_darknet_weights(model, weights + 'darknet53.conv.74')
            cutoff = 75
        elif cfg.endswith('yolov3-tiny.cfg'):
            load_darknet_weights(model, weights + 'yolov3-tiny.conv.15')
            cutoff = 15
        elif cfg.startswith('cfg/bdd100k'):
            # Transfer learning: start from full YOLOv3 weights and
            # re-initialise only the tensors whose shapes differ.
            print("Apply transfer learning for bdd100k cfg")
            tmp_model = Darknet('cfg/yolov3.cfg', img_size)
            load_darknet_weights(tmp_model, "weights/yolov3.weights")
            pretrained_dict = tmp_model.state_dict()

            for k, v in model.state_dict().items():
                if v.shape == pretrained_dict[k].shape:
                    continue
                fresh = torch.empty(v.shape)
                pretrained_dict[k] = fresh

                key_parts = k.split(".")
                layer_kind = key_parts[2]
                if layer_kind.startswith("conv"):
                    nn.init.normal_(fresh, 0.0, 0.03)
                elif (layer_kind.startswith("batch_norm")
                      and key_parts[3] == "weight"):
                    nn.init.normal_(fresh, 1.0, 0.03)
                elif (layer_kind.startswith("batch_norm")
                      and key_parts[3] == "bias"):
                    nn.init.constant_(fresh, 0.0)
                else:
                    # Was `nn.init_normal_`, which does not exist and raised
                    # AttributeError. mean/std are passed as Python floats.
                    nn.init.normal_(fresh,
                                    torch.mean(v).item(),
                                    torch.std(v).item())
                print(k, v.shape)

            model.load_state_dict(pretrained_dict)
            del tmp_model

            # freeze_layer
            cutoff = 10
            model.freeze_layers(cutoff)

        model.to(device).train()

        # Set optimizer
        optimizer = torch.optim.SGD(
            filter(lambda x: x.requires_grad, model.parameters()),
            lr=lr0, momentum=.9)

    t0 = time.time()
    model_info(model)
    n_burnin = min(round(dataloader.nB / 5), 1000)  # burn-in batches
    n_batches = len(dataloader)

    for epoch in range(1, epochs + 1):
        epoch += start_epoch

        print(('%8s%12s' + '%10s' * 7) %
              ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total',
               'nTargets', 'time'))

        # Manual LR schedule: drop by 10x after epoch 50.
        lr = lr0 / 10 if epoch > 50 else lr0
        for g in optimizer.param_groups:
            g['lr'] = lr

        # NOTE(review): epochs are 1-based here (range(1, ...) + start_epoch),
        # so `epoch == 0` is never true. As written, this block re-enables
        # gradients on every layer below `cutoff` each epoch and the SGD
        # burn-in further down never runs. Left unchanged to avoid silently
        # altering training dynamics -- confirm intended behavior.
        if freeze_backbone:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < cutoff
                    p.requires_grad = False if (epoch == 0) else True

        ui = -1
        rloss = defaultdict(float)  # running loss
        optimizer.zero_grad()
        s = None

        for i, (imgs, targets, _, _, _) in enumerate(dataloader):
            if sum(len(x) for x in targets) < 1:  # no targets: skip batch
                continue

            # SGD burn-in (see review note above: currently unreachable)
            if epoch == 0 and i <= n_burnin:
                lr = lr0 * (i / n_burnin)**4
                for g in optimizer.param_groups:
                    g['lr'] = lr

            # Compute loss, compute gradient, update parameters
            loss = model(imgs.to(device), targets, var=0)
            loss.backward()

            # accumulate gradient for x batches before optimizing
            if (i + 1) % accumulated_batches == 0 or i == n_batches - 1:
                optimizer.step()
                optimizer.zero_grad()

            # Running epoch-means of tracked metrics
            ui += 1
            for key, val in model.losses.items():
                rloss[key] = (rloss[key] * ui + val) / (ui + 1)

            s = ('%8s%12s' + '%10.3g' * 7) % (
                '%g/%g' % (epoch, epochs + start_epoch),
                '%g/%g' % (i, n_batches - 1),
                rloss['xy'], rloss['wh'], rloss['conf'], rloss['cls'],
                rloss['loss'], model.losses['nT'], time.time() - t0)
            t0 = time.time()
            print(s)

        if ui < 0:
            # Every batch was skipped (or the loader was empty): there is no
            # running loss to divide and no progress line to log. Previously
            # this crashed with ZeroDivisionError / NameError.
            print('No batch with targets in epoch %g; skipping checkpoint.'
                  % epoch)
            continue

        # Update best loss
        loss_per_target = rloss['loss'] / rloss['nT']
        if loss_per_target < best_loss:
            best_loss = loss_per_target

        # Save latest checkpoint
        checkpoint = {
            'epoch': epoch,
            'best_loss': best_loss,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(checkpoint, latest)

        # Save best checkpoint
        if best_loss == loss_per_target:
            shutil.copyfile(latest, best)

        # Save backup weights every `ckpt` epochs (optional)
        if epoch > 0 and epoch % ckpt == 0:
            shutil.copyfile(latest, weights + 'backup{}.pt'.format(epoch))

        # Calculate mAP
        with torch.no_grad():
            mAP, R, P = test.test(cfg, data_cfg, weights=latest,
                                  batch_size=batch_size, img_size=img_size)

        # Write epoch results
        with open(result, 'a') as fh:
            fh.write(s + '%11.3g' * 3 % (mAP, P, R) + '\n')
def app():
    """Run the combined depth + RGB analysis loop and stream the results.

    Loads the Darknet model and class list into module-level globals
    (``image_size``, ``model``, ``classes``, ``colors``, ``sock``), connects
    a TCP socket to the hard-coded remote endpoint and then loops:

    1. ``analyse_depth()`` and ``analyse_rgb()`` each return a dict with the
       keys ``"raw"``, ``"done"`` and ``"objects"``.
    2. ``"<rgb objects>|<depth objects>"`` is sent as UTF-8 over the socket.
    3. The raw and processed images of both sources are tiled into one
       window named ``LineWarn``.

    The loop ends when ESC (key code 27) is pressed in the window. The socket
    is closed when the loop exits or an exception propagates.
    """
    global image_size, model, classes, colors, sock

    cfg = 'ml-data/yolov3.cfg'
    image_size = 320
    weights = 'ml-data/weights/yolov3.weights'
    classes_file = 'ml-data/classes.txt'
    socket_ip = '10.10.10.1'
    # socket_ip = '127.0.0.1'
    socket_port = 1337

    print('+ Initializing model')
    model = Darknet(cfg, image_size)
    print('+ Loading model')
    load_darknet_weights(model, weights)
    print('+ Fusing model')
    model.fuse()
    print('+ Loading model to CPU')
    model.to('cpu').eval()

    print('+ Loading classes')
    classes = load_classes(classes_file)
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in classes]

    print('+ Connecting to remote socket')
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect((socket_ip, socket_port))

        while True:
            # Depth
            depth_result = analyse_depth()
            depth_raw = depth_result["raw"]
            depth_done = depth_result["done"]
            depth_objects = depth_result["objects"]

            # RGB
            rgb_result = analyse_rgb()
            rgb_raw = rgb_result["raw"]
            rgb_done = rgb_result["done"]
            rgb_objects = rgb_result["objects"]

            print("FRAME [D]: " + depth_objects)
            print("FRAME [C]: " + rgb_objects)

            # sendall: plain send() may write only part of the buffer on a
            # stream socket and would silently truncate the frame message.
            sock.sendall(f"{rgb_objects}|{depth_objects}".encode('utf-8'))
            time.sleep(0.01)

            # Plot: [depth_raw | depth_done] stacked above
            # [rgb_raw | rgb_done], separated by 5-pixel black bars.
            vbar = np.zeros((depth_raw.shape[0], 5, 3)).astype(np.uint8)
            depthbar = np.concatenate((depth_raw, vbar, depth_done), axis=1)
            rgbbar = np.concatenate((rgb_raw, vbar, rgb_done), axis=1)
            hbar = np.zeros((5, depthbar.shape[1], 3)).astype(np.uint8)
            cv2.imshow('LineWarn',
                       np.concatenate((depthbar, hbar, rgbbar), axis=0))

            if cv2.waitKey(10) == 27:  # ESC
                break
    finally:
        # Release the connection instead of leaking it on exit or error.
        sock.close()
def train(
        cfg,
        data_cfg,
        weights_from="",
        weights_to="",
        save_every=10,
        img_size=(1088, 608),
        resume=False,
        epochs=100,
        batch_size=16,
        accumulated_batches=1,
        freeze_backbone=False,
        opt=None,
):
    """Train a Darknet model on a ``JointDataset`` with CUDA + DataParallel.

    Args:
        cfg: Model configuration path; copied into the run directory and used
            to pick an optional pretrained backbone.
        data_cfg: JSON file providing the ``'train'`` and ``'root'`` entries.
        weights_from: Directory holding ``latest.pt`` (when resuming) or the
            pretrained backbone files.
        weights_to: Parent directory; a ``run<dd_mm_HH_MM>`` sub-directory
            (UTC) is created inside it for checkpoints and config copies.
        save_every: Write an extra optimizer-free checkpoint every
            ``save_every`` epochs (except epoch 0).
        img_size: Image size forwarded to ``JointDataset``.
        resume: Load model/optimizer state from ``weights_from/latest.pt``.
        epochs: Number of epochs to run in this call.
        batch_size: Loader batch size.
        accumulated_batches: Batches to accumulate before an optimizer step.
        freeze_backbone: Freeze layers below ``cutoff`` during epoch 0.
        opt: Options object; this function reads ``num_workers``, ``lr``,
            ``epochs``, ``unfreeze_bn`` and ``print_interval`` from it, so it
            must not be left as ``None``.
    """
    # The function starts
    NUM_WORKERS = opt.num_workers

    # Run directory name "run<dd>_<mm>_<HH>_<MM>" (UTC). Same text the
    # original produced by slicing "%Y-%d-%m %H:%M:%S" and replacing the
    # separators with underscores.
    timme = strftime("%d_%m_%H_%M", gmtime())
    weights_to = osp.join(weights_to, 'run' + timme)
    mkdir_if_missing(weights_to)
    mkdir_if_missing(weights_to + '/cfg/')

    if resume:
        latest_resume = osp.join(weights_from, 'latest.pt')

    torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run. `with` guarantees the handle is closed even when the
    # JSON is malformed and json.load raises.
    with open(data_cfg) as f:
        data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']

    transforms = T.Compose([T.ToTensor()])

    # Get dataloader
    dataset = JointDataset(dataset_root, trainset_paths, img_size,
                           augment=True, transforms=transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=NUM_WORKERS,
                                             pin_memory=True,
                                             drop_last=True,
                                             collate_fn=collate_fn)

    # Initialize model
    model = Darknet(cfg, dataset.nID)

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0

    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')

        # Load weights to resume from
        model.load_state_dict(checkpoint['model'])
        model.cuda().train()

        # Set optimizer
        optimizer = torch.optim.SGD(
            filter(lambda x: x.requires_grad, model.parameters()),
            lr=opt.lr, momentum=.9)

        start_epoch = checkpoint['epoch'] + 1
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])

        del checkpoint  # current, saved
    else:
        # Initialize model with backbone (optional)
        if cfg.endswith('yolov3.cfg'):
            load_darknet_weights(model,
                                 osp.join(weights_from, 'darknet53.conv.74'))
            cutoff = 75
        elif cfg.endswith('yolov3-tiny.cfg'):
            load_darknet_weights(model,
                                 osp.join(weights_from, 'yolov3-tiny.conv.15'))
            cutoff = 15

        model.cuda().train()

        # Set optimizer
        optimizer = torch.optim.SGD(
            filter(lambda x: x.requires_grad, model.parameters()),
            lr=opt.lr, momentum=.9, weight_decay=1e-4)

    model = torch.nn.DataParallel(model)

    # Set scheduler
    # NOTE(review): milestones use opt.epochs while the loop below uses the
    # `epochs` argument -- confirm the two are always equal.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[int(0.5 * opt.epochs), int(0.75 * opt.epochs)],
        gamma=0.1)

    # An important trick for detection: freeze bn during fine-tuning
    if not opt.unfreeze_bn:
        for name, p in model.named_parameters():
            p.requires_grad = 'batch_norm' not in name

    # model_info(model)
    t0 = time.time()
    n_batches = len(dataloader)
    burnin = min(1000, n_batches)  # SGD burn-in length (loop-invariant)

    for epoch in range(epochs):
        epoch += start_epoch

        logger.info(
            ('%8s%12s' + '%10s' * 6) %
            ('Epoch', 'Batch', 'box', 'conf', 'id', 'total', 'nTargets',
             'time'))

        # Freeze darknet53.conv.74 for first epoch
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[2]) < cutoff:  # if layer < cutoff
                    p.requires_grad = epoch != 0

        ui = -1
        rloss = defaultdict(float)  # running loss

        # training schedule
        optimizer.zero_grad()

        for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader):
            if sum(len(x) for x in targets) < 1:  # no targets: skip batch
                continue

            # SGD burn-in: ramp the LR up as (i / burnin)^4 in epoch 0
            if epoch == 0 and i <= burnin:
                lr = opt.lr * (i / burnin)**4
                for g in optimizer.param_groups:
                    g['lr'] = lr

            # Compute loss, compute gradient, update parameters
            loss, components = model(imgs.cuda(), targets.cuda(),
                                     targets_len.cuda())
            # Average the per-replica outputs returned through DataParallel.
            components = torch.mean(components.view(-1, 5), dim=0)
            loss = torch.mean(loss)
            loss.backward()

            # accumulate gradient for x batches before optimizing
            if (i + 1) % accumulated_batches == 0 or i == n_batches - 1:
                optimizer.step()
                optimizer.zero_grad()

            # Running epoch-means of tracked metrics
            ui += 1
            for ii, key in enumerate(model.module.loss_names):
                rloss[key] = (rloss[key] * ui + components[ii]) / (ui + 1)

            # rloss holds the running means, restarted every epoch
            s = ('%8s%12s' + '%10.3g' * 6) % (
                '%g/%g' % (epoch, epochs - 1),
                '%g/%g' % (i, n_batches - 1),
                rloss['box'], rloss['conf'], rloss['id'], rloss['loss'],
                rloss['nT'], time.time() - t0)
            t0 = time.time()
            if i % opt.print_interval == 0:
                logger.info(s)

        # Save latest checkpoint
        checkpoint = {
            'epoch': epoch,
            'model': model.module.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        copyfile(cfg, weights_to + '/cfg/yolo3.cfg')
        copyfile(data_cfg, weights_to + '/cfg/ccmcpe.json')

        latest = osp.join(weights_to, 'latest.pt')
        torch.save(checkpoint, latest)

        if epoch % save_every == 0 and epoch != 0:
            # making the checkpoint lite
            checkpoint["optimizer"] = []
            torch.save(
                checkpoint,
                osp.join(weights_to, "weights_epoch_" + str(epoch) + ".pt"))

        # Calculate mAP (disabled in the original source; kept for reference)
        # if epoch % opt.test_interval == 0:
        #     with torch.no_grad():
        #         mAP, R, P = test.test(cfg, data_cfg, weights=latest,
        #                               batch_size=batch_size,
        #                               img_size=img_size, print_interval=40,
        #                               nID=dataset.nID)
        #         test.test_emb(cfg, data_cfg, weights=latest,
        #                       batch_size=batch_size, img_size=img_size,
        #                       print_interval=40, nID=dataset.nID)

        # Call scheduler.step() after optimizer.step() with pytorch > 1.1.0
        scheduler.step()
def run(
        act_dtype=ng.int16,
        weight_dtype=ng.int8,
        bias_dtype=ng.int32,
        scale_dtype=ng.int8,
        disable_fusion=False,
        conv2d_par_ich=1,
        conv2d_par_och=1,
        conv2d_par_col=1,
        conv2d_par_row=1,
        conv2d_concur_och=None,
        conv2d_stationary='filter',
        pool_par=1,
        elem_par=1,
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        onnx_filename='yolov3-tiny.onnx',
        weight_filename='yolov3-tiny.npy',
        verilog_filename=None,
        sim_filename=None,
        # simtype=None,  # no RTL simulation
        # simtype='iverilog',
        simtype='verilator',
        cfg_filename='yolov3-tiny.cfg',
        weights_filename='yolov3-tiny.weights',
        model_path='yolov3'):
    """Export YOLOv3-tiny to ONNX, build it with NNgen and simulate the RTL.

    Steps, in order: load the PyTorch Darknet model from ``model_path``,
    export it to ONNX, import the ONNX graph into NNgen, quantize it, assign
    hardware parallelism attributes, evaluate the NNgen dataflow in software
    against the PyTorch model, generate IP-XACT, save the quantized weights
    and finally run an RTL simulation that compares the hardware outputs
    with the software-evaluated ones.

    Args:
        act_dtype, weight_dtype, bias_dtype, scale_dtype: Default NNgen
            dtypes passed to ``ng.from_onnx``.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: Attributes applied to every
            ``ng.conv2d`` operator.
        pool_par: ``par`` attribute for pooling operators.
        elem_par: ``par`` attribute for element-wise operators.
        chunk_size: Chunk size for ``ng.export_ndarray`` and the alignment
            unit for the simulation memory map.
        axi_datawidth: AXI data width for the generated design and the
            memory model.
        silent: Forwarded to ``ng.to_ipxact``.
        onnx_filename: Where the exported ONNX model is written.
        weight_filename: Where the quantized parameters are saved (np.save).
        verilog_filename: If not None, the test bench is written there.
        sim_filename: Simulator output file; defaults to
            ``<this script's basename>.out``.
        simtype: Simulator name. ``None`` terminates the process with
            ``sys.exit()`` right after the weights are saved.
        cfg_filename, weights_filename: Darknet cfg/weights; downloaded when
            the file does not exist.
        model_path: Directory containing the PyTorch ``models`` module; it is
            prepended to ``sys.path``.

    Returns:
        The simulator output text. With ``simtype == 'verilator'`` a last
        line starting with ``'-'`` is stripped.

    Raises:
        FileNotFoundError: If ``model_path`` is not an existing directory.
    """
    # input mean and standard deviation
    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    img_size = (416, 416)
    act_shape = (1, img_size[0], img_size[1], 3)

    # pytorch model
    model_url = "https://github.com/ultralytics/yolov3"
    if not os.path.isdir(model_path):
        raise FileNotFoundError(
            "Download the YOLOv3 model using Pytorch, such as "
            "'%s'. Then extract it, and rename it as '%s'" %
            (model_url, model_path))

    # Darknet model configuration and pretrained weights.
    # Use the raw-content URL: the github.com/.../blob/... address serves the
    # HTML viewer page, so urlretrieve would save HTML instead of the cfg.
    cfg_url = ("https://raw.githubusercontent.com/pjreddie/darknet/"
               "master/cfg/yolov3-tiny.cfg")
    if not os.path.isfile(cfg_filename):
        urllib.request.urlretrieve(cfg_url, cfg_filename)

    weights_url = "https://pjreddie.com/media/files/yolov3-tiny.weights"
    if not os.path.isfile(weights_filename):
        urllib.request.urlretrieve(weights_url, weights_filename)

    sys.path.insert(0, model_path)
    import models
    models.ONNX_EXPORT = True
    model = models.Darknet(cfg_filename, img_size).to('cpu')
    models.load_darknet_weights(model, weights_filename)

    # Pytorch to ONNX
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['scores', 'boxes']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    dtypes = {}
    shapes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(
         onnx_filename,
         value_dtypes=dtypes,
         value_shapes=shapes,
         default_placeholder_dtype=act_dtype,
         default_variable_dtype=weight_dtype,
         default_constant_dtype=weight_dtype,
         default_operator_dtype=act_dtype,
         default_scale_dtype=scale_dtype,
         default_bias_dtype=bias_dtype,
         disable_fusion=disable_fusion,
         verbose=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}
    input_means = {'act': imagenet_mean * act_scale_factor}
    input_stds = {'act': imagenet_std * act_scale_factor}

    ng.quantize(outputs, input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    pool_types = (ng.avg_pool, ng.max_pool,
                  ng.avg_pool_serial, ng.max_pool_serial)
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, pool_types):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow
    #     as a software
    # --------------------

    act = placeholders['act']
    outs = (outputs['scores'], outputs['boxes'])

    # verification data
    img = np.array(
        PIL.Image.open('car416x416.png').convert('RGB')).astype(np.float32)
    img = img.reshape([1] + list(img.shape))

    img = img / 255
    img = (img - imagenet_mean) / imagenet_std

    # execution on pytorch
    model_input = img

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_rslts = model(torch.from_numpy(model_input))
    model_outs = [rslt.detach().numpy() for rslt in model_rslts]
    model_outs = [
        (np.transpose(model_out, act.perm)
         if act.perm is not None and len(model_out.shape) == len(act.shape)
         else model_out)
        for model_out in model_outs
    ]
    scaled_model_outs = [
        model_out * out.scale_factor
        for model_out, out in zip(model_outs, outs)
    ]

    # software-based verification
    act_limit = 1.0 * (2**(act.dtype.width - 1) - 1)
    vact = img * act_scale_factor
    vact = np.clip(vact, -act_limit, act_limit)
    vact = np.round(vact).astype(np.int64)

    # compare outputs of hidden layers
    # dict.fromkeys removes duplicates while keeping first-seen order
    # (replaces sorted(set(x), key=x.index)).
    leaky_relu_ops = list(dict.fromkeys(
        v for v in operators.values()
        if (isinstance(v, ng.conv2d)
            and isinstance(v.act_func, ng.leaky_relu_base))))
    conv2d_ops = list(dict.fromkeys(
        v for v in operators.values()
        if (isinstance(v, ng.conv2d) and v.act_func is None)))

    # only 1st output
    sub_ops = leaky_relu_ops[:9] + conv2d_ops[:1]
    sub_outs = ng.eval(sub_ops, act=vact)
    sub_outs = [sub_out.transpose([0, 3, 1, 2]) for sub_out in sub_outs]
    sub_scale_factors = [sub_op.scale_factor for sub_op in sub_ops]

    model.eval()
    torch_input = torch.from_numpy(model_input)

    # Outputs of the first `end` modules for each listed prefix length:
    # all Conv2d-LeakyReLU layers before YOLOLayer.
    prefix_ends = (1, 3, 5, 7, 9, 11, 13, 14, 15, 16)
    mouts = [
        nn.Sequential(*model.module_list[:end])(torch_input).detach().numpy()
        for end in prefix_ends
    ]

    scaled_mouts = [
        mout * scale_factor
        for mout, scale_factor in zip(mouts, sub_scale_factors)
    ]

    # The error/correlation statistics below are not consumed by the rest of
    # the function; they are available for inspection in a debugger.
    sub_mean_square_errors = [
        np.sum((sub_out - mout)**2) / sub_out.size
        for mout, sub_out in zip(scaled_mouts, sub_outs)
    ]
    sub_corrcoefs = [
        np.corrcoef(mout.reshape([-1]), sub_out.reshape([-1]))
        for mout, sub_out in zip(mouts, sub_outs)
    ]

    # compare prediction results
    vouts = ng.eval(outs, act=vact)

    mean_square_errors = [
        np.sum((vout - scaled_model_out)**2) / vout.size
        for vout, scaled_model_out in zip(vouts, scaled_model_outs)
    ]
    corrcoefs = [
        np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))
        for model_out, vout in zip(model_outs, vouts)
    ]

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description
    #     (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen(outs, 'yolov3tiny', silent=silent,
    #                         config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as
    # to_veriloggen)
    targ = ng.to_ipxact(outs, 'yolov3tiny', silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog(outs, 'yolov3tiny', silent=silent,
    #                     config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Save the quantized weights
    # --------------------

    param_data = ng.export_ndarray(outs, chunk_size)
    param_bytes = len(param_data)

    np.save(weight_filename, param_data)

    # --------------------
    # (7) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    def align_up(addr):
        # Round an address up to the next multiple of chunk_size.
        return int(math.ceil(addr / chunk_size)) * chunk_size

    variable_addr = align_up(act.addr + act.memory_size)
    check0_addr = align_up(variable_addr + param_bytes)
    check1_addr = align_up(check0_addr + outs[0].memory_size)
    tmp_addr = align_up(check1_addr + outs[1].memory_size)

    memimg_datawidth = 32
    # mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)],
    #                dtype=np.int64)
    # NOTE(review): the image holds 32-bit words (memimg_datawidth) but is
    # allocated as int16, unlike the int64 variant kept above -- confirm
    # that packed words cannot overflow.
    mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)],
                   dtype=np.int16)
    mem = mem + [100]

    words_per_beat = int(math.ceil(axi_datawidth / act_dtype.width))

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(words_per_beat, conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vouts[0], memimg_datawidth,
                   act_dtype.width, check0_addr,
                   max(words_per_beat, conv2d_par_och))
    axi.set_memory(mem, vouts[1], memimg_datawidth,
                   act_dtype.width, check1_addr,
                   max(words_per_beat, conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if sim_filename is None:
        sim_filename = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + sim_filename

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Thread body handed to vthread.Thread below. Its statements are kept
    # exactly as in the original so the control sequence does not change.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(outs[0].shape[0]):
            for x in range(outs[0].shape[1]):
                orig = memory.read_word(bat * outs[0].aligned_shape[1] + x,
                                        outs[0].addr, act_dtype.width)
                check = memory.read_word(bat * outs[0].aligned_shape[1] + x,
                                         check0_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x,
                          ') orig: ', orig, ' check: ', check)
                    ok = False

        for bat in range(outs[1].shape[0]):
            for x in range(outs[1].shape[1]):
                orig = memory.read_word(bat * outs[1].aligned_shape[1] + x,
                                        outs[1].addr, act_dtype.width)
                check = memory.read_word(bat * outs[1].aligned_shape[1] + x,
                                         check1_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x,
                          ') orig: ', orig, ' check: ', check)
                    ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if verilog_filename is not None:
        m.to_verilog(verilog_filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=sim_filename)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt