Пример #1
0
def controller_bench(space,
                     num_layers,
                     device=torch.device('cpu'),
                     skip=True,
                     epochs=200):
    """Sample rollouts from a controller agent and report the best one seen.

    A ``ctrl.Agent`` is built over ``space``; on every epoch five rollouts
    are drawn, scored by ``get_reward`` against the target returned by
    ``get_target``, and stored back into the agent. Progress is printed to
    stdout and the complete reward history is passed to ``plot`` at the end.

    Args:
        space: search space forwarded to ``ctrl.Agent`` and ``get_target``.
        num_layers: layer count forwarded to both of those calls.
        device: torch device handed to the agent.
        skip: skip flag forwarded to the agent and to ``get_target``.
        epochs: number of outer iterations to run.

    Returns:
        None; results are only printed and plotted.
    """
    learning_rate = 0.2
    rollouts_per_epoch = 5
    top_reward = -100000
    top_rollout, top_paras = [], []
    rewards = []

    began = time.time()
    agent = ctrl.Agent(space,
                       num_layers,
                       rollouts_per_epoch,
                       lr=learning_rate,
                       device=device,
                       skip=skip)
    target = get_target(space, num_layers, skip)

    for epoch in range(epochs):
        for _ in range(rollouts_per_epoch):
            actions, sampled = agent.rollout()
            print(agent.agent.ema)
            # Only the quantization half of the split is scored here.
            _, quan_paras = utility.split_paras(sampled)
            score = get_reward(actions, quan_paras, target)
            rewards.append(score)
            if score > top_reward:
                top_reward, top_rollout, top_paras = score, actions, sampled
            print("action: {}, reward: {}".format(actions, score))
            agent.store_rollout(actions, score)
        # NOTE: the agent's train_step() is not called in this benchmark;
        # rollouts are only stored.
        print("epoch {}".format(epoch))
        summary = (f"best rollout {top_rollout}, "
                   f"best architecture: {top_paras}, "
                   f"best reward: {top_reward}")
        print(summary)

    print("elasped time is {}".format(time.time() - began))
    print("target: {}".format(target))
    plot(rewards)
Пример #2
0
        'filter_width': 3,
        'num_filters': 36,
        'pool_size': 1,
        'act_num_int_bits': 3,
        'act_num_frac_bits': 1,
        'weight_num_int_bits': 2,
        'weight_num_frac_bits': 4
    }, {
        'filter_height': 5,
        'filter_width': 5,
        'num_filters': 24,
        'pool_size': 1,
        'act_num_int_bits': 3,
        'act_num_frac_bits': 3,
        'weight_num_int_bits': 3,
        'weight_num_frac_bits': 4
    }, {
        'filter_height': 1,
        'filter_width': 1,
        'num_filters': 24,
        'pool_size': 1,
        'act_num_int_bits': 2,
        'act_num_frac_bits': 5,
        'weight_num_int_bits': 0,
        'weight_num_frac_bits': 6
    }]
    arch_paras, quan_paras = utility.split_paras(paras)

    fpga_model = FPGAModel(30000, 1000, arch_paras, quan_paras)
    print(fpga_model.get_info())
Пример #3
0
def sync_search(device, dir='experiment'):
    """Jointly explore the architecture and quantization spaces.

    For each of ``args.episodes`` episodes a rollout is sampled from the
    agent and split into architecture and quantization parameters, which
    are then checked with ``FPGAModel.validate()``. Valid samples are built
    with ``child.get_model`` and trained with ``backend.fit``, whose second
    return value becomes the reward; invalid samples get a reward of 0.
    Every episode is appended to a CSV file and logged.

    Args:
        device: device passed to ``data.get_data`` and ``child.get_model``
            (the controller agent itself is created on the CPU).
        dir: base output directory. A sub-directory named after
            ``args.rLUT`` / ``args.rThroughput`` is created beneath it and
            receives the log and CSV files.

    Returns:
        None.
    """
    dir = os.path.join(dir,
                       f"rLut={args.rLUT}, rThroughput={args.rThroughput}")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir, exist_ok=True)
    filepath = os.path.join(dir, f"joint ({args.episodes} episodes)")
    logger = get_logger(filepath)

    # The context manager guarantees the CSV is flushed and closed even when
    # an episode raises part-way through the search.
    with open(filepath + '.csv', mode='w+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        logger.info("INFORMATION")
        logger.info("mode: \t\t\t\t\t joint")
        logger.info(f"dataset: \t\t\t\t {args.dataset}")
        logger.info(f"number of child network layers: \t {args.layers}")
        logger.info(f"include stride: \t\t\t {not args.no_stride}")
        logger.info(f"include pooling: \t\t\t {not args.no_pooling}")
        logger.info(f"skip connection: \t\t\t {args.skip}")
        logger.info(f"required # LUTs: \t\t\t {args.rLUT}")
        logger.info(f"required throughput: \t\t\t {args.rThroughput}")
        logger.info(f"Assumed frequency: \t\t\t {CLOCK_FREQUENCY}")
        logger.info(f"training epochs: \t\t\t {args.epochs}")
        logger.info(f"data augmentation: \t\t\t {args.augment}")
        logger.info(f"batch size: \t\t\t\t {args.batch_size}")
        logger.info(f"controller learning rate: \t\t {args.learning_rate}")
        logger.info(f"architecture episodes: \t\t\t {args.episodes}")
        logger.info(f"using multi gpus: \t\t\t {args.multi_gpu}")
        logger.info("architecture space: ")
        for name, value in ARCH_SPACE.items():
            logger.info(name + f": \t\t\t\t {value}")
        logger.info("quantization space: ")
        for name, value in QUAN_SPACE.items():
            logger.info(name + f": \t\t\t {value}")

        # The controller samples from the union of both search spaces.
        agent = Agent({
            **ARCH_SPACE,
            **QUAN_SPACE
        },
                      args.layers,
                      lr=args.learning_rate,
                      device=torch.device('cpu'),
                      skip=args.skip)
        train_data, val_data = data.get_data(args.dataset,
                                             device,
                                             shuffle=True,
                                             batch_size=args.batch_size,
                                             augment=args.augment)
        input_shape, num_classes = data.get_info(args.dataset)
        writer.writerow(["ID"] +
                        ["Layer {}".format(i)
                         for i in range(args.layers)] + ["Accuracy"] + [
                             "Partition (Tn, Tm)", "Partition (#LUTs)",
                             "Partition (#cycles)", "Total LUT",
                             "Total Throughput"
                         ] + ["Time"])
        child_id, total_time = 0, 0
        logger.info('=' * 50 +
                    "Start exploring architecture & quantization space" +
                    '=' * 50)
        best_samples = BestSamples(5)
        for e in range(args.episodes):
            logger.info('-' * 130)
            child_id += 1
            start = time.time()
            rollout, paras = agent.rollout()
            logger.info(
                "Sample Architecture ID: {}, Sampled actions: {}".format(
                    child_id, rollout))
            arch_paras, quan_paras = utility.split_paras(paras)
            fpga_model = FPGAModel(rLUT=args.rLUT,
                                   rThroughput=args.rThroughput,
                                   arch_paras=arch_paras,
                                   quan_paras=quan_paras)
            if fpga_model.validate():
                model, optimizer = child.get_model(input_shape,
                                                   arch_paras,
                                                   num_classes,
                                                   device,
                                                   multi_gpu=args.multi_gpu,
                                                   do_bn=False)
                _, reward = backend.fit(model,
                                        optimizer,
                                        train_data,
                                        val_data,
                                        quan_paras=quan_paras,
                                        epochs=args.epochs,
                                        verbosity=args.verbosity)
            else:
                # Samples rejected by the FPGA model are not trained.
                reward = 0
            agent.store_rollout(rollout, reward)
            ep_time = time.time() - start
            total_time += ep_time
            best_samples.register(child_id, rollout, reward)
            writer.writerow([child_id] +
                            [str(paras[i]) for i in range(args.layers)] +
                            [reward] + list(fpga_model.get_info()) +
                            [ep_time])
            logger.info(f"Reward: {reward}, " +
                        f"Elasped time: {ep_time}, " +
                        f"Average time: {total_time/(e+1)}")
            logger.info(f"Best Reward: {best_samples.reward_list[0]}, " +
                        f"ID: {best_samples.id_list[0]}, " +
                        f"Rollout: {best_samples.rollout_list[0]}")
        logger.info('=' * 50 +
                    "Architecture & quantization sapce exploration finished" +
                    '=' * 50)
        logger.info(f"Total elasped time: {total_time}")
        logger.info(f"Best samples: {best_samples}")
Пример #4
0
def _run_pass(batches, step, bar_width=30):
    """Feed every batch through ``step`` while drawing a console progress bar.

    Args:
        batches: sized iterable yielding ``(input_batch, label_batch)``
            pairs; ``input_batch.size(0)`` is used as the sample count.
        step: callable ``step(input_batch, label_batch)`` returning
            ``(batch_loss, batch_correction)``.
        bar_width: width of the progress bar in characters.

    Returns:
        ``(loss, accuracy)``: the accumulated loss and the accumulated
        correction count, each divided by the number of samples seen.
        ``(0.0, 0.0)`` when ``batches`` yields nothing.
    """
    total_loss, total_correct, total_seen = 0, 0, 0
    mean_loss, accuracy = 0.0, 0.0
    num_batches = len(batches)
    began = time.time()
    for done, (input_batch, label_batch) in enumerate(batches, 1):
        batch_loss, batch_correct = step(input_batch, label_batch)
        elapsed = time.time() - began
        total_loss += batch_loss
        total_correct += batch_correct
        total_seen += input_batch.size(0)
        accuracy = total_correct / total_seen
        mean_loss = total_loss / total_seen
        fraction = done / num_batches
        filled = math.ceil(bar_width * fraction)
        # '\r' redraws the bar in place until the final batch ends the line.
        print('|' + '=' * (filled - 1) + '>' + ' ' * (bar_width - filled) +
              '|' + f"{fraction:4.1%}-{elapsed:4.2f}s" +
              f"\t loss: {mean_loss:.5}, acc: {accuracy:6.3%}  ",
              end=('\r' if fraction < 1 else '\n'))
    return mean_loss, accuracy


def tune(paras=[], dataset='CIFAR10'):
    """Train one child network and record per-epoch accuracies.

    ``paras`` is split into architecture and quantization parameters; a
    model is built from the architecture part and trained for
    ``args.epochs`` epochs. After each epoch the model is evaluated on the
    validation data, and evaluated again with ``quan_paras`` when that is
    not ``None``. One row per epoch is written to ``tune.csv`` in the
    current working directory, and a summary is printed at the end.

    Args:
        paras: parameters handed to ``utility.split_paras``.
        dataset: dataset name passed to ``data.get_info`` / ``data.get_data``.

    Returns:
        None.
    """
    arch_paras, quan_paras = utility.split_paras(paras)
    input_shape, num_classes = data.get_info(dataset)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, val_data = data.get_data(
        name=dataset, device=device,
        shuffle=True, batch_size=args.batch_size, augment=args.augment)
    model, _ = get_model(
        input_shape, arch_paras, num_classes,
        device=device,
        multi_gpu=args.multi_gpu,
        do_bn=args.do_bn)
    optimizer, lr_schedule = get_optimizer(args.optimizer, model)

    def train_step(input_batch, label_batch):
        return backend.batch_fit(model, input_batch, label_batch, optimizer)

    def eval_step(input_batch, label_batch):
        with torch.no_grad():
            return backend.batch_fit(model, input_batch, label_batch)

    def quan_step(input_batch, label_batch):
        with torch.no_grad():
            return backend.batch_fit(
                model, input_batch, label_batch, quan_paras=quan_paras)

    best_acc = 0
    best_quan_acc = 0
    # The original never closed this file; the context manager makes sure
    # the rows are flushed and the handle released, even on error.
    with open('tune.csv', mode='w+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Epoch', 'train acc', 'val acc', 'quan acc'])
        for epoch in range(1, args.epochs+1):
            epoch_lr = lr_schedule(optimizer, epoch)
            print('-' * 80)
            print(f"Epoch {epoch} \t LR: {epoch_lr}" +
                  f"\t Best Acc: {best_acc:6.3%}" +
                  (f"\t quantized: {best_quan_acc:6.3%}"
                   if quan_paras is not None else ''))
            print("Training ...")
            model.train()
            _, train_acc = _run_pass(train_data, train_step)

            print("Training finished, start evaluating ...")
            model.eval()
            _, val_acc = _run_pass(val_data, eval_step)
            if val_acc > best_acc:
                best_acc = val_acc

            # 'N/A' is what lands in the CSV when no quantization is used.
            quan_acc = 'N/A'
            if quan_paras is not None:
                print("Start evaluating with quantization ...")
                _, quan_acc = _run_pass(val_data, quan_step)
                if quan_acc > best_quan_acc:
                    best_quan_acc = quan_acc
            writer.writerow([str(epoch), train_acc, val_acc, quan_acc])

    print(f"Finished tuning ... final accuracy: {best_acc:6.3%}, " +
          f"quantized accuracy :{best_quan_acc:6.3%}")
    para_num, para_size = compute_parameter_number(model.graph, quan_paras)
    print(f"Total number of parameters: {para_num}")
    print(f"Total parameter size: {para_size if para_size > 0 else 'N/A'}")
def _log_joint_search_settings(logger):
    """Write the run configuration and both search spaces to ``logger``."""
    logger.info("INFORMATION")
    logger.info("mode: \t\t\t\t\t joint")
    logger.info(f"dataset: \t\t\t\t {args.dataset}")
    logger.info(f"number of child network layers: \t {args.layers}")
    logger.info(f"seed: \t\t\t\t {args.seed}")
    logger.info(f"gpu: \t\t\t\t {args.gpu}")
    logger.info(f"include batchnorm: \t\t\t {args.batchnorm}")
    logger.info(f"include stride: \t\t\t {not args.no_stride}")
    logger.info(f"include pooling: \t\t\t {not args.no_pooling}")
    logger.info(f"skip connection: \t\t\t {args.skip}")
    logger.info(f"required # LUTs: \t\t\t {args.rLUT}")
    logger.info(f"required throughput: \t\t\t {args.rThroughput}")
    logger.info(f"Assumed frequency: \t\t\t {CLOCK_FREQUENCY}")
    logger.info(f"training epochs: \t\t\t {args.epochs}")
    logger.info(f"data augmentation: \t\t\t {args.augment}")
    logger.info(f"batch size: \t\t\t\t {args.batch_size}")
    # The original logged this line twice; once is enough.
    logger.info(f"controller learning rate: \t\t {args.learning_rate}")
    logger.info(f"architecture episodes: \t\t\t {args.episodes}")
    logger.info(f"using multi gpus: \t\t\t {args.multi_gpu}")
    logger.info("architecture space: ")
    for name, value in ARCH_SPACE.items():
        logger.info(name + f": \t\t\t\t {value}")
    logger.info("quantization space: ")
    for name, value in QUAN_SPACE.items():
        logger.info(name + f": \t\t\t {value}")


def _run_joint_search(device, logger, writer, tb_writer):
    """Run all search episodes, reporting to the given output sinks.

    Args:
        device: device passed to ``data.get_data``, ``child.get_model`` and
            ``utility.get_sample_input``.
        logger: logger receiving the per-episode messages.
        writer: ``csv.writer`` receiving one row per episode.
        tb_writer: summary writer receiving scalars and model graphs.
    """
    # The controller samples from the union of both search spaces.
    agent = Agent({
        **ARCH_SPACE,
        **QUAN_SPACE
    },
                  args.layers,
                  lr=args.learning_rate,
                  device=torch.device('cpu'),
                  skip=args.skip)

    train_data, val_data = data.get_data(args.dataset,
                                         device,
                                         shuffle=True,
                                         batch_size=args.batch_size,
                                         augment=args.augment)

    input_shape, num_classes = data.get_info(args.dataset)
    # Per the original note: adds a batch dimension,
    # e.g. (3,32,32) -> (1,3,32,32).
    sample_input = utility.get_sample_input(device, input_shape)

    writer.writerow(["ID"] +
                    ["Layer {}".format(i)
                     for i in range(args.layers)] + ["Accuracy"] + [
                         "Partition (Tn, Tm)", "Partition (#LUTs)",
                         "Partition (#cycles)", "Total LUT", "Total Throughput"
                     ] + ["Time"])

    arch_id, total_time = 0, 0
    best_reward = float('-inf')

    logger.info('=' * 50 +
                "Start exploring architecture & quantization space" + '=' * 50)
    best_samples = BestSamples(5)

    for e in range(args.episodes):
        logger.info('-' * 130)
        arch_id += 1
        start = time.time()
        rollout, paras = agent.rollout()
        logger.info("Sample Architecture ID: {}, Sampled actions: {}".format(
            arch_id, rollout))
        arch_paras, quan_paras = utility.split_paras(paras)

        fpga_model = FPGAModel(rLUT=args.rLUT,
                               rThroughput=args.rThroughput,
                               arch_paras=arch_paras,
                               quan_paras=quan_paras)

        # Reset every episode so a rejected sample can never reuse (or trip
        # over the absence of) a model built in an earlier episode.
        model = None
        if fpga_model.validate():
            model, optimizer = child.get_model(input_shape,
                                               arch_paras,
                                               num_classes,
                                               device,
                                               multi_gpu=args.multi_gpu,
                                               do_bn=args.batchnorm)

            if args.verbosity > 1:
                print(model)
                torchsummary.summary(model, input_shape)

            if args.adapt:
                num_w = utility.get_net_param(model)
                macs = utility.get_net_macs(model, sample_input)
                tb_writer.add_scalar('num_param', num_w, arch_id)
                tb_writer.add_scalar('macs', macs, arch_id)
                if args.verbosity > 1:
                    print(f"# of param: {num_w}, macs: {macs}")

            _, val_acc = backend.fit(model,
                                     optimizer,
                                     train_data,
                                     val_data,
                                     quan_paras=quan_paras,
                                     epochs=args.epochs,
                                     verbosity=args.verbosity)
        else:
            # Samples rejected by the FPGA model are not trained.
            val_acc = 0

        # TODO: when args.adapt is set, fold macs and latency into the
        # reward; for now the reward is the validation accuracy either way.
        arch_reward = val_acc

        agent.store_rollout(rollout, arch_reward)
        ep_time = time.time() - start
        total_time += ep_time
        best_samples.register(arch_id, rollout, arch_reward)

        tb_writer.add_scalar('val_acc', val_acc, arch_id)
        tb_writer.add_scalar('arch_reward', arch_reward, arch_id)

        if arch_reward > best_reward:
            best_reward = arch_reward
            tb_writer.add_scalar('best_reward', best_reward, arch_id)
            # A rejected sample can still be the best so far (0 > -inf on
            # the first episode) but has no network to export.
            if model is not None:
                tb_writer.add_graph(model.eval(), (sample_input, ))

        writer.writerow([arch_id] +
                        [str(paras[i])
                         for i in range(args.layers)] + [arch_reward] +
                        list(fpga_model.get_info()) + [ep_time])
        logger.info(f"Reward: {arch_reward}, " + f"Elasped time: {ep_time}, " +
                    f"Average time: {total_time/(e+1)}")
        logger.info(f"Best Reward: {best_samples.reward_list[0]}, " +
                    f"ID: {best_samples.id_list[0]}, " +
                    f"Rollout: {best_samples.rollout_list[0]}")
    logger.info('=' * 50 +
                "Architecture & quantization sapce exploration finished" +
                '=' * 50)
    logger.info(f"Total elasped time: {total_time}")
    logger.info(f"Best samples: {best_samples}")


def sync_search(device, dir='experiment'):
    """Jointly explore the architecture and quantization spaces.

    Creates the output directory, opens the log, CSV and summary writers,
    logs the configuration and runs ``args.episodes`` search episodes. Each
    episode samples a rollout, checks it with ``FPGAModel.validate()``,
    trains valid samples with ``backend.fit`` and uses the returned
    validation accuracy (0 for rejected samples) as the controller reward.

    Args:
        device: device used for the data, the child models and the sample
            input (the controller agent itself is created on the CPU).
        dir: base output directory. A sub-directory named after
            ``args.rLUT`` / ``args.rThroughput`` is created beneath it.

    Returns:
        None.
    """
    dir = os.path.join(
        dir,
        utility.cleanText(f"rLut-{args.rLUT}_rThroughput-{args.rThroughput}"))
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir, exist_ok=True)
    filepath = os.path.join(
        dir, utility.cleanText(f"joint_{args.episodes}-episodes"))
    logger = utility.get_logger(filepath)
    # Both sinks are released even if an episode raises: the summary writer
    # first (finally), then the CSV file (with), matching the original order.
    with open(filepath + '.csv', mode='w+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        tb_writer = SummaryWriter(filepath)
        try:
            _log_joint_search_settings(logger)
            _run_joint_search(device, logger, writer, tb_writer)
        finally:
            tb_writer.close()