def controller_bench(space, num_layers, device=torch.device('cpu'),
                     skip=True, epochs=200):
    """Run the controller against a synthetic target and plot the rewards.

    A ``ctrl.Agent`` is created with a fixed learning rate (0.2) and batch
    size (5).  For each of ``epochs`` epochs, ``batch_size`` rollouts are
    sampled, scored with ``get_reward`` against the value returned by
    ``get_target`` and handed back to the agent through ``store_rollout``.
    The best rollout seen so far is printed after every epoch, and the full
    reward history is passed to ``plot`` once all epochs are done.

    Args:
        space: search space forwarded to ``ctrl.Agent`` and ``get_target``.
        num_layers: layer count forwarded to ``ctrl.Agent`` and
            ``get_target``.
        device: device forwarded to ``ctrl.Agent``.
        skip: skip flag forwarded to ``ctrl.Agent`` and ``get_target``.
        epochs: number of outer iterations to run.
    """
    lr = 0.2
    batch_size = 5

    started_at = time.time()
    agent = ctrl.Agent(space, num_layers, batch_size,
                       lr=lr, device=device, skip=skip)
    target = get_target(space, num_layers, skip)

    # Sentinel lower than any reward the benchmark is expected to produce.
    best_reward = -100000
    best_rollout, best_paras = [], []
    reward_history = []

    for epoch in range(epochs):
        for _ in range(batch_size):
            rollout, paras = agent.rollout()
            print(agent.agent.ema)
            # Only the quantization half of the split is used for scoring.
            _, quan_paras = utility.split_paras(paras)
            reward = get_reward(rollout, quan_paras, target)
            reward_history.append(reward)
            if reward > best_reward:
                best_reward, best_rollout, best_paras = reward, rollout, paras
            print(f"action: {rollout}, reward: {reward}")
            agent.store_rollout(rollout, reward)
        # E = agent.train_step()   (explicit training step left disabled)
        # NOTE(review): the per-epoch summary is assumed to belong to the
        # outer loop -- confirm against the original indentation.
        print(f"epoch {epoch}")
        print(f"best rollout {best_rollout}, "
              f"best architecture: {best_paras}, "
              f"best reward: {best_reward}")

    print(f"elasped time is {time.time() - started_at}")
    print(f"target: {target}")
    plot(reward_history)
'filter_width': 3, 'num_filters': 36, 'pool_size': 1, 'act_num_int_bits': 3, 'act_num_frac_bits': 1, 'weight_num_int_bits': 2, 'weight_num_frac_bits': 4 }, { 'filter_height': 5, 'filter_width': 5, 'num_filters': 24, 'pool_size': 1, 'act_num_int_bits': 3, 'act_num_frac_bits': 3, 'weight_num_int_bits': 3, 'weight_num_frac_bits': 4 }, { 'filter_height': 1, 'filter_width': 1, 'num_filters': 24, 'pool_size': 1, 'act_num_int_bits': 2, 'act_num_frac_bits': 5, 'weight_num_int_bits': 0, 'weight_num_frac_bits': 6 }] arch_paras, quan_paras = utility.split_paras(paras) fpga_model = FPGAModel(30000, 1000, arch_paras, quan_paras) print(fpga_model.get_info())
def sync_search(device, dir='experiment'):
    """Jointly explore the architecture and quantization spaces.

    For each of ``args.episodes`` episodes the controller agent samples a
    rollout, the resulting parameters are split into architecture and
    quantization parts, and an ``FPGAModel`` is built from them.  When
    ``fpga_model.validate()`` succeeds, a child model is created and trained
    through ``backend.fit`` and the second value it returns is used as the
    reward; otherwise the reward is 0.  Every episode is logged and appended
    as one row to ``<filepath>.csv``.

    Args:
        device: device passed to ``data.get_data`` and ``child.get_model``
            (the controller agent itself is always created on the CPU).
        dir: root directory for the experiment output; a sub-directory named
            after ``args.rLUT`` and ``args.rThroughput`` is created inside it.
    """
    dir = os.path.join(
        dir, f"rLut={args.rLUT}, rThroughput={args.rThroughput}")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir, exist_ok=True)
    filepath = os.path.join(dir, f"joint ({args.episodes} episodes)")
    logger = get_logger(filepath)

    total_time = 0
    best_samples = BestSamples(5)
    # The context manager guarantees the CSV is flushed and closed even if
    # training of some child network raises part-way through the search.
    with open(filepath + '.csv', mode='w+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        logger.info("INFORMATION")
        logger.info(f"mode: \t\t\t\t\t {'joint'}")
        logger.info(f"dataset: \t\t\t\t {args.dataset}")
        logger.info(f"number of child network layers: \t {args.layers}")
        logger.info(f"include stride: \t\t\t {not args.no_stride}")
        logger.info(f"include pooling: \t\t\t {not args.no_pooling}")
        logger.info(f"skip connection: \t\t\t {args.skip}")
        logger.info(f"required # LUTs: \t\t\t {args.rLUT}")
        logger.info(f"required throughput: \t\t\t {args.rThroughput}")
        logger.info(f"Assumed frequency: \t\t\t {CLOCK_FREQUENCY}")
        logger.info(f"training epochs: \t\t\t {args.epochs}")
        logger.info(f"data augmentation: \t\t\t {args.augment}")
        logger.info(f"batch size: \t\t\t\t {args.batch_size}")
        logger.info(f"controller learning rate: \t\t {args.learning_rate}")
        logger.info(f"architecture episodes: \t\t\t {args.episodes}")
        logger.info(f"using multi gpus: \t\t\t {args.multi_gpu}")
        logger.info("architecture space: ")
        for name, value in ARCH_SPACE.items():
            logger.info(name + f": \t\t\t\t {value}")
        logger.info("quantization space: ")
        for name, value in QUAN_SPACE.items():
            logger.info(name + f": \t\t\t {value}")

        agent = Agent({**ARCH_SPACE, **QUAN_SPACE},
                      args.layers,
                      lr=args.learning_rate,
                      device=torch.device('cpu'),
                      skip=args.skip)
        train_data, val_data = data.get_data(args.dataset,
                                             device,
                                             shuffle=True,
                                             batch_size=args.batch_size,
                                             augment=args.augment)
        input_shape, num_classes = data.get_info(args.dataset)
        writer.writerow(
            ["ID"]
            + ["Layer {}".format(i) for i in range(args.layers)]
            + ["Accuracy"]
            + ["Partition (Tn, Tm)", "Partition (#LUTs)",
               "Partition (#cycles)", "Total LUT", "Total Throughput"]
            + ["Time"])

        logger.info('=' * 50
                    + "Start exploring architecture & quantization space"
                    + '=' * 50)
        # child_id is 1-based and doubles as the number of finished episodes.
        for child_id in range(1, args.episodes + 1):
            logger.info('-' * 130)
            start = time.time()
            rollout, paras = agent.rollout()
            logger.info(
                "Sample Architecture ID: {}, Sampled actions: {}".format(
                    child_id, rollout))
            arch_paras, quan_paras = utility.split_paras(paras)
            fpga_model = FPGAModel(rLUT=args.rLUT,
                                   rThroughput=args.rThroughput,
                                   arch_paras=arch_paras,
                                   quan_paras=quan_paras)
            if fpga_model.validate():
                model, optimizer = child.get_model(input_shape,
                                                   arch_paras,
                                                   num_classes,
                                                   device,
                                                   multi_gpu=args.multi_gpu,
                                                   do_bn=False)
                _, reward = backend.fit(model,
                                        optimizer,
                                        train_data,
                                        val_data,
                                        quan_paras=quan_paras,
                                        epochs=args.epochs,
                                        verbosity=args.verbosity)
            else:
                # Designs rejected by the FPGA model are not trained at all.
                reward = 0
            agent.store_rollout(rollout, reward)
            ep_time = time.time() - start
            total_time += ep_time
            best_samples.register(child_id, rollout, reward)
            writer.writerow(
                [child_id]
                + [str(paras[i]) for i in range(args.layers)]
                + [reward]
                + list(fpga_model.get_info())
                + [ep_time])
            logger.info(f"Reward: {reward}, "
                        + f"Elasped time: {ep_time}, "
                        + f"Average time: {total_time/child_id}")
            logger.info(f"Best Reward: {best_samples.reward_list[0]}, "
                        + f"ID: {best_samples.id_list[0]}, "
                        + f"Rollout: {best_samples.rollout_list[0]}")

    logger.info('=' * 50
                + "Architecture & quantization sapce exploration finished"
                + '=' * 50)
    logger.info(f"Total elasped time: {total_time}")
    logger.info(f"Best samples: {best_samples}")
def _run_tune_pass(model, batches, *fit_args, no_grad=False, bar_width=30,
                   **fit_kwargs):
    """Feed every batch of ``batches`` to ``backend.batch_fit`` once.

    A one-line progress bar with the running loss and accuracy is redrawn
    after each batch.

    Args:
        model: model handed to ``backend.batch_fit``.
        batches: sized iterable yielding ``(input_batch, label_batch)``.
        *fit_args: extra positional arguments for ``backend.batch_fit``
            (the optimizer, for a training pass).
        no_grad: when true, each batch is processed under ``torch.no_grad()``.
        bar_width: width of the progress bar in characters.
        **fit_kwargs: extra keyword arguments for ``backend.batch_fit``
            (e.g. ``quan_paras``).

    Returns:
        ``(accuracy, loss)`` accumulated over the whole pass, both divided by
        the number of samples seen; ``(0.0, 0.0)`` if ``batches`` is empty.
    """
    running_loss, running_correction, running_total = 0, 0, 0
    accuracy, loss = 0.0, 0.0
    total_batches = len(batches)
    start = time.time()
    for num_batches, (input_batch, label_batch) in enumerate(batches, 1):
        if no_grad:
            with torch.no_grad():
                batch_loss, batch_correction = backend.batch_fit(
                    model, input_batch, label_batch, *fit_args, **fit_kwargs)
        else:
            batch_loss, batch_correction = backend.batch_fit(
                model, input_batch, label_batch, *fit_args, **fit_kwargs)
        end = time.time()
        running_loss += batch_loss
        running_correction += batch_correction
        running_total += input_batch.size(0)
        accuracy = running_correction / running_total
        loss = running_loss / running_total
        progress = num_batches / total_batches
        filled = math.ceil(bar_width * progress)
        # '\r' keeps redrawing the same line until the last batch is done.
        print('|' + '=' * (filled - 1) + '>'
              + ' ' * (bar_width - filled) + '|'
              + f"{progress:4.1%}-{end-start:4.2f}s"
              + f"\t loss: {loss:.5}, acc: {accuracy:6.3%}  ",
              end=('\r' if progress < 1 else '\n'))
    return accuracy, loss


def tune(paras=None, dataset='CIFAR10'):
    """Train one fixed architecture and record per-epoch accuracies.

    The parameters are split into architecture and quantization parts, a
    model is built from the architecture part and trained for
    ``args.epochs`` epochs.  After each epoch the model is evaluated on the
    validation data, and evaluated a second time with ``quan_paras`` when the
    split produced quantization parameters.  One row per epoch is written to
    ``tune.csv`` and a summary (best accuracies, parameter count and size) is
    printed at the end.

    Args:
        paras: per-layer parameter list passed to ``utility.split_paras``;
            ``None`` (the default) is treated as an empty list.
        dataset: dataset name passed to ``data.get_info`` / ``data.get_data``.
    """
    # ``None`` replaces the former mutable ``[]`` default argument.
    if paras is None:
        paras = []
    arch_paras, quan_paras = utility.split_paras(paras)
    input_shape, num_classes = data.get_info(dataset)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, val_data = data.get_data(
        name=dataset, device=device,
        shuffle=True, batch_size=args.batch_size, augment=args.augment)
    model, _ = get_model(
        input_shape, arch_paras, num_classes,
        device=device,
        multi_gpu=args.multi_gpu,
        do_bn=args.do_bn)
    optimizer, lr_schedule = get_optimizer(args.optimizer, model)
    best_acc = 0
    best_quan_acc = 0
    bar_width = 30
    # The context manager closes tune.csv even if an epoch raises.
    with open('tune.csv', mode='w+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Epoch', 'train acc', 'val acc', 'quan acc'])
        for epoch in range(1, args.epochs + 1):
            epoch_lr = lr_schedule(optimizer, epoch)
            print('-' * 80)
            print(f"Epoch {epoch} \t LR: {epoch_lr}"
                  + f"\t Best Acc: {best_acc:6.3%}"
                  + (f"\t quantized: {best_quan_acc:6.3%}"
                     if quan_paras is not None else ''))

            print("Training ...")
            model.train()
            train_acc, _ = _run_tune_pass(
                model, train_data, optimizer, bar_width=bar_width)

            print("Training finished, start evaluating ...")
            model.eval()
            val_acc, _ = _run_tune_pass(
                model, val_data, no_grad=True, bar_width=bar_width)
            if val_acc > best_acc:
                best_acc = val_acc

            # Written to the CSV as-is when no quantization is requested.
            quan_acc = 'N/A'
            if quan_paras is not None:
                print("Start evaluating with quantization ...")
                quan_acc, _ = _run_tune_pass(
                    model, val_data, no_grad=True, bar_width=bar_width,
                    quan_paras=quan_paras)
                if quan_acc > best_quan_acc:
                    best_quan_acc = quan_acc
            writer.writerow([str(epoch), train_acc, val_acc, quan_acc])

    print(f"Finished tuning ... final accuracy: {best_acc:6.3%}, "
          + f"quantized accuracy :{best_quan_acc:6.3%}")
    para_num, para_size = compute_parameter_number(model.graph, quan_paras)
    print(f"Total number of parameters: {para_num}")
    print(f"Total parameter size: {para_size if para_size > 0 else 'N/A'}")
def sync_search(device, dir='experiment'):
    """Jointly explore the architecture and quantization spaces.

    For each of ``args.episodes`` episodes the controller agent samples a
    rollout, the resulting parameters are split into architecture and
    quantization parts, and an ``FPGAModel`` is built from them.  When
    ``fpga_model.validate()`` succeeds, a child model is created and trained
    through ``backend.fit``; the second value it returns (``val_acc``) is
    used as the reward, otherwise the reward is 0.  Every episode is logged,
    appended as one row to ``<filepath>.csv`` and reported to a
    ``SummaryWriter`` rooted at ``filepath``.

    Args:
        device: device passed to ``data.get_data``, ``child.get_model`` and
            ``utility.get_sample_input`` (the controller agent itself is
            always created on the CPU).
        dir: root directory for the experiment output; a sub-directory named
            after ``args.rLUT`` and ``args.rThroughput`` is created inside it.
    """
    from contextlib import closing

    dir = os.path.join(
        dir,
        utility.cleanText(f"rLut-{args.rLUT}_rThroughput-{args.rThroughput}"))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir, exist_ok=True)
    filepath = os.path.join(
        dir, utility.cleanText(f"joint_{args.episodes}-episodes"))
    logger = utility.get_logger(filepath)

    total_time = 0
    best_reward = float('-inf')
    best_samples = BestSamples(5)
    # Both sinks are closed (tb_writer first, then the CSV) even when an
    # episode raises, so partial results are not lost.
    with open(filepath + '.csv', mode='w+', newline='') as csvfile, \
            closing(SummaryWriter(filepath)) as tb_writer:
        writer = csv.writer(csvfile)
        logger.info("INFORMATION")
        logger.info(f"mode: \t\t\t\t\t {'joint'}")
        logger.info(f"dataset: \t\t\t\t {args.dataset}")
        logger.info(f"number of child network layers: \t {args.layers}")
        logger.info(f"seed: \t\t\t\t {args.seed}")
        logger.info(f"gpu: \t\t\t\t {args.gpu}")
        logger.info(f"include batchnorm: \t\t\t {args.batchnorm}")
        logger.info(f"include stride: \t\t\t {not args.no_stride}")
        logger.info(f"include pooling: \t\t\t {not args.no_pooling}")
        logger.info(f"skip connection: \t\t\t {args.skip}")
        logger.info(f"required # LUTs: \t\t\t {args.rLUT}")
        logger.info(f"required throughput: \t\t\t {args.rThroughput}")
        logger.info(f"Assumed frequency: \t\t\t {CLOCK_FREQUENCY}")
        logger.info(f"training epochs: \t\t\t {args.epochs}")
        logger.info(f"data augmentation: \t\t\t {args.augment}")
        logger.info(f"batch size: \t\t\t\t {args.batch_size}")
        # This line used to be emitted twice; once is enough.
        logger.info(f"controller learning rate: \t\t {args.learning_rate}")
        logger.info(f"architecture episodes: \t\t\t {args.episodes}")
        logger.info(f"using multi gpus: \t\t\t {args.multi_gpu}")
        logger.info("architecture space: ")
        for name, value in ARCH_SPACE.items():
            logger.info(name + f": \t\t\t\t {value}")
        logger.info("quantization space: ")
        for name, value in QUAN_SPACE.items():
            logger.info(name + f": \t\t\t {value}")

        agent = Agent({**ARCH_SPACE, **QUAN_SPACE},
                      args.layers,
                      lr=args.learning_rate,
                      device=torch.device('cpu'),
                      skip=args.skip)
        train_data, val_data = data.get_data(args.dataset,
                                             device,
                                             shuffle=True,
                                             batch_size=args.batch_size,
                                             augment=args.augment)
        input_shape, num_classes = data.get_info(args.dataset)
        ## (3,32,32) -> (1,3,32,32) add batch dimension
        sample_input = utility.get_sample_input(device, input_shape)
        writer.writerow(
            ["ID"]
            + ["Layer {}".format(i) for i in range(args.layers)]
            + ["Accuracy"]
            + ["Partition (Tn, Tm)", "Partition (#LUTs)",
               "Partition (#cycles)", "Total LUT", "Total Throughput"]
            + ["Time"])

        logger.info('=' * 50
                    + "Start exploring architecture & quantization space"
                    + '=' * 50)
        # arch_id is 1-based and doubles as the number of finished episodes.
        for arch_id in range(1, args.episodes + 1):
            logger.info('-' * 130)
            start = time.time()
            rollout, paras = agent.rollout()
            logger.info(
                "Sample Architecture ID: {}, Sampled actions: {}".format(
                    arch_id, rollout))
            arch_paras, quan_paras = utility.split_paras(paras)
            fpga_model = FPGAModel(rLUT=args.rLUT,
                                   rThroughput=args.rThroughput,
                                   arch_paras=arch_paras,
                                   quan_paras=quan_paras)
            # Reset per episode so a rejected design can never reuse (or
            # miss) the model object of an earlier episode further down.
            model = None
            if fpga_model.validate():
                model, optimizer = child.get_model(input_shape,
                                                   arch_paras,
                                                   num_classes,
                                                   device,
                                                   multi_gpu=args.multi_gpu,
                                                   do_bn=args.batchnorm)
                if args.verbosity > 1:
                    print(model)
                    torchsummary.summary(model, input_shape)
                if args.adapt:
                    num_w = utility.get_net_param(model)
                    macs = utility.get_net_macs(model, sample_input)
                    tb_writer.add_scalar('num_param', num_w, arch_id)
                    tb_writer.add_scalar('macs', macs, arch_id)
                    if args.verbosity > 1:
                        print(f"# of param: {num_w}, macs: {macs}")
                _, val_acc = backend.fit(model,
                                         optimizer,
                                         train_data,
                                         val_data,
                                         quan_paras=quan_paras,
                                         epochs=args.epochs,
                                         verbosity=args.verbosity)
            else:
                # Designs rejected by the FPGA model are not trained at all.
                val_acc = 0

            ## TODO: how to make arch_reward function with macs and latency?
            # Until that is decided the reward is val_acc whether or not
            # args.adapt is set.
            arch_reward = val_acc

            agent.store_rollout(rollout, arch_reward)
            ep_time = time.time() - start
            total_time += ep_time
            best_samples.register(arch_id, rollout, arch_reward)
            tb_writer.add_scalar('val_acc', val_acc, arch_id)
            tb_writer.add_scalar('arch_reward', arch_reward, arch_id)
            if arch_reward > best_reward:
                best_reward = arch_reward
                tb_writer.add_scalar('best_reward', best_reward, arch_id)
                # A rejected first sample has reward 0 > -inf but no model;
                # previously this raised NameError.
                if model is not None:
                    tb_writer.add_graph(model.eval(), (sample_input, ))
            writer.writerow(
                [arch_id]
                + [str(paras[i]) for i in range(args.layers)]
                + [arch_reward]
                + list(fpga_model.get_info())
                + [ep_time])
            logger.info(f"Reward: {arch_reward}, "
                        + f"Elasped time: {ep_time}, "
                        + f"Average time: {total_time/arch_id}")
            logger.info(f"Best Reward: {best_samples.reward_list[0]}, "
                        + f"ID: {best_samples.id_list[0]}, "
                        + f"Rollout: {best_samples.rollout_list[0]}")

    logger.info('=' * 50
                + "Architecture & quantization sapce exploration finished"
                + '=' * 50)
    logger.info(f"Total elasped time: {total_time}")
    logger.info(f"Best samples: {best_samples}")