def eval_pnorm(p):
    """Calibrate the model with an Lp-norm clipping policy.

    Returns a ``(clipping_point, calibration_loss)`` pair for the given
    norm order *p*. Reads ``args``, ``custom_resnet``, ``custom_inception``
    and ``replacement_factory`` from the enclosing scope.
    """
    args.qtype = 'lp_norm'
    args.lp = p

    # Re-seed every RNG source so each p value is evaluated under
    # identical data order and initialization.
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    model_wrapper = CnnModel(args.arch, custom_resnet, custom_inception,
                             args.pretrained, args.dataset, args.gpu_ids,
                             args.datapath, batch_size=args.batch_size,
                             shuffle=True, workers=args.workers,
                             print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size,
                             cal_set_size=args.cal_set_size, args=args)

    # Collect layer names to quantize; first/last layers stay unquantized.
    selected = []
    if args.bit_weights is not None:
        convs = [n for n, m in model_wrapper.model.named_modules()
                 if isinstance(m, nn.Conv2d)]
        selected += convs[1:-1]
    if args.bit_act is not None:
        relus = [n for n, m in model_wrapper.model.named_modules()
                 if isinstance(m, nn.ReLU)]
        selected += relus[1:-1]
        if 'mobilenet' in args.arch:
            relu6s = [n for n, m in model_wrapper.model.named_modules()
                      if isinstance(m, nn.ReLU6)]
            selected += relu6s[1:-1]

    quantizer = ModelQuantizer(model_wrapper.model, args, selected,
                               replacement_factory)
    calib_loss = model_wrapper.evaluate_calibration()
    clipping = quantizer.get_clipping()

    # Drop references so GPU memory can be reclaimed before the next run.
    del model_wrapper
    del quantizer

    return clipping, calib_loss
# Example #2 (snippet separator from the scraped listing; original score: 0)
    def eval_pnorm(p):
        """Run Lp-norm calibration for the given norm order *p*.

        Returns a ``(clipping_point, calibration_loss, validation_accuracy)``
        triple. Reads ``args``, ``custom_resnet``, ``custom_inception``,
        ``layers`` and ``replacement_factory`` from the enclosing scope.
        """
        args.qtype = 'lp_norm'
        args.lp = p

        # Reset every RNG so each p value sees identical conditions.
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        model_wrapper = CnnModel(
            args.arch, custom_resnet, custom_inception, args.pretrained,
            args.dataset, args.gpu_ids, args.datapath,
            batch_size=args.batch_size, shuffle=True, workers=args.workers,
            print_freq=args.print_freq, cal_batch_size=args.cal_batch_size,
            cal_set_size=args.cal_set_size, args=args)

        quantizer = ModelQuantizer(model_wrapper.model, args, layers,
                                   replacement_factory)
        calib_loss = model_wrapper.evaluate_calibration()
        clipping = quantizer.get_clipping()

        # Full validation pass with the calibrated clipping in place.
        top1 = model_wrapper.validate()

        # Drop references so GPU memory can be reclaimed.
        del model_wrapper
        del quantizer

        return clipping, calib_loss, top1
def main_worker(args, ml_logger):
    """Build, optionally quantize, train and evaluate a model.

    Full quantization-aware training loop: model construction, optional
    checkpoint resume, post-training quantization setup via
    ``ModelQuantizer``, per-epoch train/validate cycles and checkpointing.
    Metrics are reported through ``ml_logger``; the best top-1 accuracy is
    tracked in the module-level ``best_acc1``.
    """
    global best_acc1

    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))

    if args.log_stats:
        from utils.stats_trucker import StatsTrucker as ST
        # Instantiated for its side effect, keyed by the bit-width tag;
        # presumably a singleton — see utils.stats_trucker.
        ST("W{}A{}".format(args.bit_weights, args.bit_act))

    # Model construction: custom ResNet/Inception variants when requested,
    # otherwise the stock torchvision model.
    if 'resnet' in args.arch and args.custom_resnet:
        model = custom_resnet(arch=args.arch,
                              pretrained=args.pretrained,
                              depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)
    else:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=args.pretrained)

    # First GPU id is the primary device.
    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True

    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            # best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            # Normalize keys (e.g. strip DataParallel prefixes) so they
            # match this single-model state dict.
            checkpoint['state_dict'] = {
                normalize_module_name(k): v
                for k, v in checkpoint['state_dict'].items()
            }
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features,
                                                   args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)

    default_transform = {
        'train': get_transform(args.dataset, augment=True),
        'eval': get_transform(args.dataset, augment=False)
    }

    val_data = get_dataset(args.dataset, 'val', default_transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)

    train_data = get_dataset(args.dataset, 'train', default_transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               drop_last=True)

    # TODO: replace this call by initialization on small subset of training data
    # TODO: enable for activations
    # validate(val_loader, model, criterion, args, device)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    lr_scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    mq = None
    if args.quantize:
        if args.bn_folding:
            print(
                "Applying batch-norm folding ahead of post-training quantization"
            )
            from utils.absorb_bn import search_absorbe_bn
            search_absorbe_bn(model)

        all_convs = [
            n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
        ]
        # all_convs = [l for l in all_convs if 'downsample' not in l]
        all_relu = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU)
        ]
        all_relu6 = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)
        ]
        # First/last activations and the first conv are left unquantized.
        layers = all_relu[1:-1] + all_relu6[1:-1] + all_convs[1:]
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapper,
            nn.ReLU6: ActivationModuleWrapper,
            nn.Conv2d: ParameterModuleWrapper
        }
        mq = ModelQuantizer(
            model, args, layers, replacement_factory,
            OptimizerBridge(optimizer,
                            settings={
                                'algo': 'SGD',
                                'dataset': args.dataset
                            }))

        if args.resume:
            # Load quantization parameters from state dict
            # NOTE(review): 'checkpoint' is only bound above when the resume
            # file exists; a missing file would raise NameError here.
            mq.load_state_dict(checkpoint['state_dict'])

        mq.log_quantizer_state(ml_logger, -1)

        if args.model_freeze:
            mq.freeze()

    if args.evaluate:
        if args.log_stats:
            # Per-layer weight statistics: mean, variance, skewness, kurtosis.
            # NOTE(review): 'all_convs' is defined only under args.quantize;
            # this branch assumes --quantize accompanies --log-stats.
            mean = []
            var = []
            skew = []
            kurt = []
            for n, p in model.named_parameters():
                if n.replace('.weight', '') in all_convs[1:]:
                    mu = p.mean()
                    std = p.std()
                    mean.append((n, mu.item()))
                    var.append((n, (std**2).item()))
                    skew.append((n, torch.mean(((p - mu) / std)**3).item()))
                    kurt.append((n, torch.mean(((p - mu) / std)**4).item()))
            for i in range(len(mean)):
                ml_logger.log_metric(mean[i][0] + '.mean', mean[i][1])
                ml_logger.log_metric(var[i][0] + '.var', var[i][1])
                ml_logger.log_metric(skew[i][0] + '.skewness', skew[i][1])
                ml_logger.log_metric(kurt[i][0] + '.kurtosis', kurt[i][1])

            ml_logger.log_metric('weight_mean', np.mean([s[1] for s in mean]))
            ml_logger.log_metric('weight_var', np.mean([s[1] for s in var]))
            ml_logger.log_metric('weight_skewness',
                                 np.mean([s[1] for s in skew]))
            ml_logger.log_metric('weight_kurtosis',
                                 np.mean([s[1] for s in kurt]))

        acc = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc)
        if args.log_stats:
            stats = ST().get_stats()
            for s in stats:
                ml_logger.log_metric(s, np.mean(stats[s]))
        return

    # evaluate on validation set
    acc1 = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc1, -1)

    # evaluate with k-means quantization
    # if args.model_freeze:
    # with mq.disable():
    #     acc1_nq = validate(val_loader, model, criterion, args, device)
    #     ml_logger.log_metric('Val Acc1 fp32', acc1_nq, -1)

    for epoch in range(0, args.epochs):
        # train for one epoch
        print('Timestamp Start epoch: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        train(train_loader, model, criterion, optimizer, epoch, args, device,
              ml_logger, val_loader, mq)
        print('Timestamp End epoch: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))

        if not args.lr_freeze:
            lr_scheduler.step()

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc1, step='auto')

        # evaluate with k-means quantization
        # if args.model_freeze:
        # with mq.quantization_method('kmeans'):
        #     acc1_kmeans = validate(val_loader, model, criterion, args, device)
        #     ml_logger.log_metric('Val Acc1 kmeans', acc1_kmeans, epoch)

        # with mq.disable():
        #     acc1_nq = validate(val_loader, model, criterion, args, device)
        #     ml_logger.log_metric('Val Acc1 fp32', acc1_nq,  step='auto')

        if args.quantize:
            mq.log_quantizer_state(ml_logger, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Unwrap DataParallel before saving so keys are prefix-free.
        save_checkpoint(
            {
                'epoch':
                epoch + 1,
                'arch':
                args.arch,
                'state_dict':
                model.state_dict()
                if len(args.gpu_ids) == 1 else model.module.state_dict(),
                'best_acc1':
                best_acc1,
                'optimizer':
                optimizer.state_dict(),
            }, is_best)
# Example #4 (snippet separator from the scraped listing; original score: 0)
def main(args, ml_logger):
    """Probe separability of the quantization-loss surface.

    Calibrates the model under max-static, L2-norm and L3-norm clipping
    policies to obtain three reference clipping points, then runs
    ``separability_index`` on the model loss around the max point,
    logging gamma/T statistics to ``ml_logger``.
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    inf_model = CnnModel(args.arch,
                         args.custom_resnet,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)

    # Layer selection: skip the first and last occurrence of each type.
    all_layers = []
    if args.bit_weights is not None:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.Conv2d)
        ][1:-1]
    if args.bit_act is not None:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU)
        ][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU6)
        ][1:-1]

    replacement_factory = {
        nn.ReLU: ActivationModuleWrapperPost,
        nn.ReLU6: ActivationModuleWrapperPost,
        nn.Conv2d: ParameterModuleWrapperPost
    }
    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)

    loss = inf_model.evaluate_calibration()
    print("loss: {:.4f}".format(loss.item()))
    ml_logger.log_metric('loss', loss.item(), step='auto')

    # get clipping values
    p_max = mq.get_clipping()
    # print(init)

    # Rebuild the model/quantizer under L2-norm clipping for a second
    # reference point.
    args.qtype = 'l2_norm'
    inf_model = CnnModel(args.arch,
                         args.custom_resnet,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)
    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)
    loss = inf_model.evaluate_calibration()
    print("loss l2: {:.4f}".format(loss.item()))
    p_l2 = mq.get_clipping()

    # Third reference point: L3-norm clipping.
    args.qtype = 'l3_norm'
    inf_model = CnnModel(args.arch,
                         args.custom_resnet,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)
    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)
    loss = inf_model.evaluate_calibration()
    # Bug fix: this message previously reported "l2" for the L3 calibration.
    print("loss l3: {:.4f}".format(loss.item()))
    p_l3 = mq.get_clipping()

    # gamma_avg = 0
    # T_avg = 0
    num_iter = args.num_iter
    n = args.num_points

    def status_callback(i, gamma, T, f_max):
        """Log per-iteration separability statistics to ml_logger."""
        T = T.item()
        gamma = gamma.item()
        f_max = f_max.item()

        print("gamma^2: {}, T: {}, max: {}".format(gamma, T, f_max))
        ml_logger.log_metric('gamma', gamma, step='auto')
        ml_logger.log_metric('T', T, step='auto')
        ml_logger.log_metric('f_max', f_max, step='auto')
        T_norm = T / np.sqrt(i + 1)
        ml_logger.log_metric('T_norm', T_norm, step='auto')
        gamma_norm = gamma / f_max**2
        ml_logger.log_metric('gamma_norm', gamma_norm, step='auto')

    gamma_, T_, f_max = separability_index(
        lambda x: model_func(x, p_max, inf_model, mq, p_l2, p_l3),
        len(p_max),
        n,
        num_iter,
        gpu=True,
        status_callback=status_callback)

    # Normalize: gamma by the squared function maximum, T by sqrt(iteration).
    gamma_norm = np.mean(np.array(gamma_) / f_max.item()**2)
    T_norm = np.mean(np.array(T_) / np.sqrt(np.arange(1, num_iter + 1)))

    print("gamma^2 norm: {}, T norm: {}".format(gamma_norm, T_norm))
    ml_logger.log_metric('gamma_tot', gamma_norm, step='auto')
    ml_logger.log_metric('T_tot', T_norm, step='auto')
def main(args, ml_logger):
    """Post-training quantization with clipping-scale optimization.

    Pipeline: (1) calibrate with max-static clipping; (2) evaluate L2,
    L2.5 and L3 norm clipping via ``eval_pnorm``; (3) fit a quadratic to
    the three accuracies and pick an interpolated optimal p; (4) refine
    the best clipping point with Powell local search and then one pass of
    coordinate descent; (5) pickle all clipping points/losses/accuracies.
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if args.tag is not None:
        ml_logger.mlflow.log_param('tag', args.tag)

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch,
                         custom_resnet,
                         custom_inception,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)

    # Layer selection: skip first/last occurrence of each module type.
    layers = []
    # TODO: make it more generic
    if args.bit_weights is not None:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.Conv2d)
        ][1:-1]
    if args.bit_act is not None:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU)
        ][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU6)
        ][1:-1]

    replacement_factory = {
        nn.ReLU: ActivationModuleWrapperPost,
        nn.ReLU6: ActivationModuleWrapperPost,
        nn.Conv2d: ParameterModuleWrapperPost
    }

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
    loss = inf_model.evaluate_calibration()

    # evaluate
    max_acc = inf_model.validate()
    max_point = mq.get_clipping()
    ml_logger.log_metric('Loss max', loss.item(), step='auto')
    ml_logger.log_metric('Acc max', max_acc, step='auto')
    # 'data' accumulates every result for the final pickle dump.
    data = {'max': {'alpha': max_point.cpu().numpy(), 'loss': loss.item()}}
    print("max loss: {:.4f}, max_acc: {:.4f}".format(loss.item(), max_acc))

    def eval_pnorm(p):
        """Calibrate with Lp-norm clipping; return (point, loss, acc)."""
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(
            args.arch,
            custom_resnet,
            custom_inception,
            args.pretrained,
            args.dataset,
            args.gpu_ids,
            args.datapath,
            batch_size=args.batch_size,
            shuffle=True,
            workers=args.workers,
            print_freq=args.print_freq,
            cal_batch_size=args.cal_batch_size,
            cal_set_size=args.cal_set_size,
            args=args,
        )

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        # evaluate
        acc = inf_model.validate()

        del inf_model
        del mq

        return point, loss, acc

    # Free the max-static model before the per-p evaluations.
    del inf_model
    del mq

    print("Evaluate L2 norm optimization")
    l2_point, l2_loss, l2_acc = eval_pnorm(2.)
    print("loss l2: {:.4f}".format(l2_loss.item()))
    ml_logger.log_metric('Loss l2', l2_loss.item(), step='auto')
    ml_logger.log_metric('Acc l2', l2_acc, step='auto')
    data['l2'] = {
        'alpha': l2_point.cpu().numpy(),
        'loss': l2_loss.item(),
        'acc': l2_acc
    }

    print("Evaluate L2.5 norm optimization")
    l25_point, l25_loss, l25_acc = eval_pnorm(2.5)
    print("loss l2.5: {:.4f}".format(l25_loss.item()))
    ml_logger.log_metric('Loss l2.5', l25_loss.item(), step='auto')
    ml_logger.log_metric('Acc l2.5', l25_acc, step='auto')
    data['l2.5'] = {
        'alpha': l25_point.cpu().numpy(),
        'loss': l25_loss.item(),
        'acc': l25_acc
    }

    print("Evaluate L3 norm optimization")
    l3_point, l3_loss, l3_acc = eval_pnorm(3.)
    print("loss l3: {:.4f}".format(l3_loss.item()))
    ml_logger.log_metric('Loss l3', l3_loss.item(), step='auto')
    ml_logger.log_metric('Acc l3', l3_acc, step='auto')
    data['l3'] = {
        'alpha': l3_point.cpu().numpy(),
        'loss': l3_loss.item(),
        'acc': l3_acc
    }

    # Interpolate optimal p
    # Fit a quadratic accuracy(p) through p in {2, 2.5, 3} and take the
    # argmax over a [1, 5] grid.
    xp = np.linspace(1, 5, 50)
    z = np.polyfit([2, 2.5, 3], [l2_acc, l25_acc, l3_acc], 2)
    y = np.poly1d(z)
    p_intr = xp[np.argmax(y(xp))]
    print("p intr: {:.2f}".format(p_intr))
    ml_logger.log_metric('p intr', p_intr, step='auto')

    args.qtype = 'lp_norm'
    args.lp = p_intr
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    inf_model = CnnModel(args.arch,
                         custom_resnet,
                         custom_inception,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)

    if args.bn_folding:
        print(
            "Applying batch-norm folding ahead of post-training quantization")
        # pdb.set_trace()
        from utils.absorb_bn import search_absorbe_bn
        search_absorbe_bn(inf_model.model)
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    # Evaluate with optimal p
    lp_loss = inf_model.evaluate_calibration()
    lp_point = mq.get_clipping()
    # evaluate
    lp_acc = inf_model.validate()

    print("loss p intr: {:.4f}".format(lp_loss.item()))
    ml_logger.log_metric('Loss p intr', lp_loss.item(), step='auto')
    ml_logger.log_metric('Acc p intr', lp_acc, step='auto')

    # Module-level state shared with evaluate_calibration_clipped (defined
    # elsewhere in this file); tracks the best loss seen so far.
    global _eval_count, _min_loss
    _min_loss = lp_loss.item()

    # Start local search from the clipping point with the best accuracy.
    idx = np.argmax([l2_acc, l25_acc, l3_acc, lp_acc])
    init = [l2_point, l25_point, l3_point, lp_point][idx]

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        """Per-iteration callback: log loss and accuracy at point x."""
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method),
                             loss.item(),
                             step='auto')

        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method),
                             acc,
                             step='auto')

    # NOTE(review): min_method is forced to Powell here, so the CD branch
    # of the conditional below is never taken for this first minimize.
    args.min_method = "Powell"
    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(
        lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
        init.cpu().numpy(),
        method=method,
        options=min_options,
        callback=local_search_callback)

    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method),
                         loss.item(),
                         step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
    data['powell'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}

    print("Starting coordinate descent")
    args.min_method = "CD"
    min_options[
        'maxiter'] = 1  # Perform only one iteration of coordinate descent to avoid divergence
    _iter = count(0)
    # NOTE(review): redundant — _eval_count was already declared global above.
    global _eval_count
    _eval_count = count(0)
    _min_loss = lp_loss.item()
    mq.set_clipping(init, inf_model.device)
    # Run coordinate descent for comparison
    method = coord_descent
    res = opt.minimize(
        lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
        init.cpu().numpy(),
        method=method,
        options=min_options,
        callback=local_search_callback)

    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format("CD"), loss.item(), step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format("CD"), acc, step='auto')
    data['cd'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}

    # save scales
    f_name = "scales_{}_W{}A{}.pkl".format(args.arch, args.bit_weights,
                                           args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
# Example #6 (snippet separator from the scraped listing; original score: 0)
def main(args):
    """Sweep a 2-D grid of clipping values for two layers and pickle it.

    Calibrates with max-static clipping, then evaluates the calibration
    loss on an ``n x n`` grid of clipping pairs for the first two selected
    layers, plus the Lp-optimal points for p in {1, 1.5, 2, 2.5, 3}, and
    dumps everything for offline loss-surface plotting.
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    inf_model = CnnModel(args.arch,
                         custom_resnet,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)

    # Candidate layers: skip first/last occurrence of each module type.
    all_layers = []
    if args.bit_weights is not None:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.Conv2d)
        ][1:-1]
    if args.bit_act is not None:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU)
        ][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU6)
        ][1:-1]

    # Only the first two candidate layers are swept.
    id1 = 0
    id2 = 1
    layers = [all_layers[id1], all_layers[id2]]
    replacement_factory = {
        nn.ReLU: ActivationModuleWrapperPost,
        nn.ReLU6: ActivationModuleWrapperPost,
        nn.Conv2d: ParameterModuleWrapperPost
    }

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    loss = inf_model.evaluate_calibration()
    print("loss: {:.4f}".format(loss.item()))
    max_point = mq.get_clipping()

    # Grid of clipping pairs from 0.01 up to the max-static clipping.
    n = args.grid_resolution
    x = np.linspace(0.01, max_point[0].item(), n)
    y = np.linspace(0.01, max_point[1].item(), n)
    X, Y = np.meshgrid(x, y)
    Z = np.empty((n, n))
    for i, x_ in enumerate(tqdm(x)):
        for j, y_ in enumerate(y):
            # set clip value to qwrappers
            scales = np.array([X[i, j], Y[i, j]])
            mq.set_clipping(scales, inf_model.device)

            # evaluate with clipping
            loss = inf_model.evaluate_calibration()
            Z[i][j] = loss.item()

    # NOTE(review): 'loss' at this point is the calibration loss of the
    # LAST grid cell, not of the max clipping point — confirm this is the
    # intended third component of max_point before relying on it.
    max_point = np.concatenate([max_point.cpu().numpy(), loss.cpu().numpy()])

    def eval_pnorm(p):
        """Calibrate with Lp-norm clipping; return [clip0, clip1, loss]."""
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(args.arch,
                             custom_resnet,
                             args.pretrained,
                             args.dataset,
                             args.gpu_ids,
                             args.datapath,
                             batch_size=args.batch_size,
                             shuffle=True,
                             workers=args.workers,
                             print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size,
                             cal_set_size=args.cal_set_size,
                             args=args)

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()
        point = np.concatenate([point.cpu().numpy(), loss.cpu().numpy()])

        del inf_model
        del mq
        return point

    # Free the grid-sweep model before the per-p evaluations.
    del inf_model
    del mq
    l1_point = eval_pnorm(1.)
    print("loss l1: {:.4f}".format(l1_point[2]))

    l1_5_point = eval_pnorm(1.5)
    print("loss l1.5: {:.4f}".format(l1_5_point[2]))

    l2_point = eval_pnorm(2.)
    print("loss l2: {:.4f}".format(l2_point[2]))

    l2_5_point = eval_pnorm(2.5)
    print("loss l2.5: {:.4f}".format(l2_5_point[2]))

    l3_point = eval_pnorm(3.)
    print("loss l3: {:.4f}".format(l3_point[2]))

    # Pickle the surface and all reference points for offline plotting.
    f_name = "{}_l{}l{}_W{}A{}.pkl".format(args.arch, id1, id2,
                                           args.bit_weights, args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    data = {
        'X': X,
        'Y': Y,
        'Z': Z,
        'max_point': max_point,
        'l1_point': l1_point,
        'l1.5_point': l1_5_point,
        'l2_point': l2_point,
        'l2.5_point': l2_5_point,
        'l3_point': l3_point
    }
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
def main(args, ml_logger):
    """Trace the calibration-loss curve along a line segment of clipping scales.

    Builds a max-static quantized model for a baseline calibration loss, then
    evaluates the loss at ``k + 1`` clipping points interpolated between 0.8x
    and 1.5x of a hard-coded optimum, and pickles the resulting curve to
    ``data/quadratic_loss_<arch>_W<bits>A<bits>.pkl``.

    Args:
        args: parsed command-line namespace (seed, arch, dataset, bit widths,
            calibration settings, ...). Mutated in place (``qtype``,
            ``bcorr_w``, ``lp``).
        ml_logger: experiment logger exposing ``mlflow`` and ``log_metric``.
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if args.tag is not None:
        ml_logger.mlflow.log_param('tag', args.tag)

    # Disable weight bias-correction for the max-static baseline.
    # NOTE(review): enable_bcorr is never read again in this function, so
    # bias correction stays disabled for the whole run -- confirm intended.
    enable_bcorr = False
    if args.bcorr_w:
        args.bcorr_w = False
        enable_bcorr = True

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                         batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    # Collect module names to quantize; the [1:-1] slice keeps the first and
    # last layer of each kind out of the quantized set.
    layers = []
    # TODO: make it more generic
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}

    # Baseline: quantize with max-static (absolute-max) clipping.
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
    maxabs_loss = inf_model.evaluate_calibration()
    print("max loss: {:.4f}".format(maxabs_loss.item()))
    max_point = mq.get_clipping()
    ml_logger.log_metric('Loss max', maxabs_loss.item(), step='auto')

    # evaluate
    # Full validation is intentionally skipped; accuracy is hard-coded to 0.
    maxabs_acc = 0#inf_model.validate()
    ml_logger.log_metric('Acc maxabs', maxabs_acc, step='auto')
    # NOTE(review): this dict is rebound near the end of the function before
    # being saved, so the max-static entry assembled here is never persisted.
    data = {'max': {'alpha': max_point.cpu().numpy(), 'loss': maxabs_loss.item(), 'acc': maxabs_acc}}

    # Free the baseline model/quantizer before building further models.
    del inf_model
    del mq

    def eval_pnorm(p):
        """Quantize a fresh model with Lp-norm clipping for the given *p* and
        return ``(clipping_point, calibration_loss, validation_accuracy)``.

        NOTE(review): currently unused -- only referenced by the
        commented-out experiments below.
        """
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                             batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        # evaluate
        acc = inf_model.validate()

        del inf_model
        del mq

        return point, loss, acc

    def eval_pnorm_on_calibration(p):
        """Same as :func:`eval_pnorm` but without the (slow) validation pass;
        returns ``(clipping_point, calibration_loss)``.

        NOTE(review): currently unused -- only referenced by the
        commented-out experiments below.
        """
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                             batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        del inf_model
        del mq

        return point, loss

    # l2_point, l2_loss = eval_pnorm_on_calibration(2)
    # print("loss l2: {:.4f}".format(l2_loss.item()))
    # ml_logger.log_metric('Loss l2', l2_loss.item(), step='auto')
    # # l4_point, l4_loss = eval_pnorm_on_calibration(4)
    # print("loss l4: {:.4f}".format(l4_loss.item()))
    # ml_logger.log_metric('Loss l4', l4_loss.item(), step='auto')

    # args.qtype = 'lp_norm'
    # args.lp = p_intr
    # Fix the seed
    # NOTE(review): unlike the seeding blocks above, np.random is NOT
    # re-seeded here -- confirm whether that is intentional.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                         batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    # Hard-coded clipping optimum (presumably one scale per quantized layer
    # of a specific arch/bit-width run) -- TODO confirm provenance.
    opt_point = np.array([0.42811054, 1.27721779, 0.53149996, 1.51492159, 0.91115569,
       1.17987683, 1.13352566, 1.5227828 , 0.67026185, 0.75535328,
       0.54173654, 0.70824616, 0.44899457, 1.25257411, 0.68778409])
    # Sweep from 0.8x to 1.5x of the optimum in k equal steps.
    start_point = 0.8*opt_point
    end_point = 1.5*opt_point
    k = 100
    step = (end_point - start_point) / k
    print("start")
    print(start_point)
    print("end")
    print(end_point)

    losses = []
    points = []
    for i in range(k+1):
        point = start_point + i * step
        mq.set_clipping(point, inf_model.device)
        loss = inf_model.evaluate_calibration()
        losses.append(loss.item())
        points.append(point)
        print("({}: loss) - {}".format(i, loss.item()))

    data = {'opt': opt_point, 'points': points, 'loss': losses}

    # save scales
    # NOTE(review): plain open/close -- a `with` block would guarantee the
    # handle is closed if pickle.dump raises.
    f_name = "quadratic_loss_{}_W{}A{}.pkl".format(args.arch, args.bit_weights, args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
# Example #8
# 0
def main(args):
    """Sample the calibration loss along a conjugate search direction.

    Sweeps clipping configurations on the line through two hard-coded
    clipping vectors ``p1`` and ``p2`` (direction ``p1 - p2``), evaluating
    the calibration loss at each point, and pickles the curve to
    ``data/<arch>_W<bits>A<bits>_loss_conjugate_dir.pkl``.

    Args:
        args: parsed command-line namespace. Mutated in place
            (``qtype``, ``lp``).
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    replacement_factory = {
        nn.ReLU: ActivationModuleWrapperPost,
        nn.ReLU6: ActivationModuleWrapperPost,
        nn.Conv2d: ParameterModuleWrapperPost
    }

    def eval_pnorm(p):
        """Quantize a fresh model with Lp-norm clipping for *p* and return
        ``(clipping_point, calibration_loss)``.

        NOTE(review): defined but never called in this function -- the sweep
        below uses the hard-coded p1/p2 vectors instead.
        """
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch,
                             custom_resnet,
                             custom_inception,
                             args.pretrained,
                             args.dataset,
                             args.gpu_ids,
                             args.datapath,
                             batch_size=args.batch_size,
                             shuffle=True,
                             workers=args.workers,
                             print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size,
                             cal_set_size=args.cal_set_size,
                             args=args)

        # The [1:-1] slice keeps the first/last layer of each kind unquantized.
        all_layers = []
        if args.bit_weights is not None:
            all_layers += [
                n for n, m in inf_model.model.named_modules()
                if isinstance(m, nn.Conv2d)
            ][1:-1]
        if args.bit_act is not None:
            all_layers += [
                n for n, m in inf_model.model.named_modules()
                if isinstance(m, nn.ReLU)
            ][1:-1]
        if args.bit_act is not None and 'mobilenet' in args.arch:
            all_layers += [
                n for n, m in inf_model.model.named_modules()
                if isinstance(m, nn.ReLU6)
            ][1:-1]

        mq = ModelQuantizer(inf_model.model, args, all_layers,
                            replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        del inf_model
        del mq

        return point, loss

    # Re-fix the seed so model construction below sees the same RNG state.
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch,
                         custom_resnet,
                         custom_inception,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)

    all_layers = []
    if args.bit_weights is not None:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.Conv2d)
        ][1:-1]
    if args.bit_act is not None:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU)
        ][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU6)
        ][1:-1]

    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)

    # Hard-coded clipping vectors (presumably one scale per quantized layer
    # from two previous optimization runs) -- TODO confirm provenance.
    p1 = torch.tensor([
        0.7677084, 1.7640269, 0.80914754, 2.044024, 0.87229156, 1.2659631,
        0.78454655, 1.3018194, 0.7894693, 0.92967707, 0.5754433, 0.9115604,
        0.5689196, 1.2382566, 0.601773
    ])
    p2 = torch.tensor([
        0.8135005, 1.7248632, 0.8009758, 2.005755, 0.83956134, 1.2431265,
        0.7720454, 1.3013302, 0.76733077, 0.96402454, 0.5914314, 0.9579072,
        0.56543064, 1.2535284, 0.6261679
    ])

    # Walk along direction (p1 - p2): with k=50 the points range from
    # p1 - 10*step (i=0) to p1 + 10*step (i=50) in 0.4*step increments.
    k = 50
    step = p1 - p2
    losses = []
    points = []
    for i in range(k + 1):
        point = p1 + 0.4 * i * step - 10 * step
        mq.set_clipping(point, inf_model.device)
        loss = inf_model.evaluate_calibration()
        losses.append(loss.item())
        points.append(point.cpu().numpy())
        print("({}: loss) - {}".format(i, loss.item()))

    # NOTE(review): plain open/close -- a `with` block would guarantee the
    # handle is closed if pickle.dump raises.
    f_name = "{}_W{}A{}_loss_conjugate_dir.pkl".format(args.arch,
                                                       args.bit_weights,
                                                       args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    data = {'start': p1.cpu().numpy(), 'loss': losses, 'points': points}
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
def main(args):
    """Sample the calibration loss on the segment between the L2-norm and
    L4.5-norm optimal clipping points.

    Calls ``eval_pnorm`` at p=2 and p=4.5 to obtain the segment end points,
    then evaluates ``k + 1`` equally spaced clipping configurations between
    them, recording the calibration loss at each, and pickles the curve to
    ``data/<arch>_W<bits>A<bits>_loss_vs_clipping.pkl``.

    Args:
        args: parsed command-line namespace. Mutated in place
            (``qtype``, ``lp``).
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}

    def eval_pnorm(p):
        """Quantize a fresh model with Lp-norm clipping for *p* and return
        ``(clipping_point, calibration_loss)``."""
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed so every evaluation sees identical calibration batches.
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                             batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        # The [1:-1] slice keeps the first/last layer of each kind unquantized.
        all_layers = []
        if args.bit_weights is not None:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
        if args.bit_act is not None:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
        if args.bit_act is not None and 'mobilenet' in args.arch:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

        mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        # Release GPU memory before the caller builds the next model.
        del inf_model
        del mq

        return point, loss

    # Re-fix the seed so the sweep model sees the same RNG state.
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    all_layers = []
    if args.bit_weights is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)

    # End points of the sweep: optimal clipping under the L2 and L4.5 norms.
    start_point, start_loss = eval_pnorm(2)
    end_point, end_loss = eval_pnorm(4.5)
    k = 50
    step = (end_point - start_point) / k

    print("start")
    print(start_point)
    print("end")
    print(end_point)
    losses = []
    points = []
    for i in range(k+1):
        point = start_point + i * step
        mq.set_clipping(point, inf_model.device)
        loss = inf_model.evaluate_calibration()
        losses.append(loss.item())
        points.append(point.cpu().numpy())
        print("({}: loss) - {}".format(i, loss.item()))

    f_name = "{}_W{}A{}_loss_vs_clipping.pkl".format(args.arch, args.bit_weights, args.bit_act)
    data = {'start': start_point.cpu().numpy(), 'end': end_point.cpu().numpy(), 'loss': losses, 'points': points}
    # Context manager guarantees the file is closed even if pickling fails
    # (the original plain open/close leaked the handle on error).
    with open(os.path.join(proj_root_dir, 'data', f_name), 'wb') as f:
        pickle.dump(data, f)
    print("Data saved to {}".format(f_name))
def main_worker(args, ml_logger):
    """Build a model (optionally quantized post-training), resume it from a
    checkpoint if requested, and log its top-1 validation accuracy.

    Args:
        args: parsed command-line namespace (arch, gpu_ids, resume path,
            quantization flags, ...).
        ml_logger: experiment logger exposing ``log_metric``.
    """
    global best_acc1

    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))

    # create model
    # Pick the model source: custom implementations for resnet/inception,
    # otherwise torchvision's model zoo.
    if 'resnet' in args.arch and args.custom_resnet:
        model = custom_resnet(arch=args.arch,
                              pretrained=args.pretrained,
                              depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)

    elif args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # All work happens on the first listed GPU; DataParallel below may fan
    # out to the rest.
    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True

    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            # mq = ModelQuantizer(model, args)
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features,
                                                   args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)

    # Inception nets expect 299x299 inputs; other archs use the default size.
    val_data = get_dataset(
        args.dataset,
        'val',
        get_transform(args.dataset,
                      augment=False,
                      scale_size=299 if 'inception' in args.arch else None,
                      input_size=299 if 'inception' in args.arch else None),
        datasets_path=args.datapath)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=args.shuffle,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    # Layer-range to quantize: custom inception skips its first three modules
    # (presumably the stem -- confirm), other archs skip only the first one;
    # the last layer is always excluded.
    if 'inception' in args.arch and args.custom_inception:
        first = 3
        last = -1
    else:
        first = 1
        last = -1
    if args.quantize:
        all_convs = [
            n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
        ]
        all_relu = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU)
        ]
        all_relu6 = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)
        ]
        layers = all_relu[first:last] + all_relu6[first:last] + all_convs[
            first:last]
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapperPost,
            nn.ReLU6: ActivationModuleWrapperPost,
            nn.Conv2d: ParameterModuleWrapperPost
        }
        # ModelQuantizer wraps the listed modules in place.
        mq = ModelQuantizer(model, args, layers, replacement_factory)
        mq.log_quantizer_state(ml_logger, -1)

    acc = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc, step='auto')
# Example #11
# 0
def main_worker(args, ml_logger):
    """Quantization-aware training worker.

    Builds the model (optionally resuming from a checkpoint and applying
    batch-norm folding plus quantization wrappers), then trains for
    ``args.epochs`` epochs with SGD + StepLR, validating after each epoch and
    checkpointing the best top-1 accuracy.

    Args:
        args: parsed command-line namespace (arch, resume path, quantization
            and kurtosis-regularization flags, optimizer hyper-parameters).
        ml_logger: experiment logger exposing ``log_metric``.
    """
    global best_acc1
    datatime_str = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    suf_name = "_" + args.experiment

    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))

    if args.log_stats:
        from utils.stats_trucker import StatsTrucker as ST
        ST("W{}A{}".format(args.bit_weights, args.bit_act))

    # Model source: custom resnet/inception implementations, else torchvision.
    if 'resnet' in args.arch and args.custom_resnet:
        # pdb.set_trace()
        model = custom_resnet(arch=args.arch,
                              pretrained=args.pretrained,
                              depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)
    else:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=args.pretrained)

    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True

    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)

    # optionally resume from a checkpoint
    # Bug fix: `checkpoint` is only bound when the file exists; track it
    # explicitly so the quantizer-state restore below can be skipped safely
    # instead of raising NameError on a missing checkpoint file.
    checkpoint = None
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            # best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            checkpoint['state_dict'] = {
                normalize_module_name(k): v
                for k, v in checkpoint['state_dict'].items()
            }
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features,
                                                   args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)

    default_transform = {
        'train': get_transform(args.dataset, augment=True),
        'eval': get_transform(args.dataset, augment=False)
    }

    val_data = get_dataset(args.dataset, 'val', default_transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)

    train_data = get_dataset(args.dataset, 'train', default_transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               drop_last=True)

    # TODO: replace this call by initialization on small subset of training data
    # TODO: enable for activations
    # validate(val_loader, model, criterion, args, device)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    lr_scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    # pdb.set_trace()
    mq = None
    if args.quantize:
        if args.bn_folding:
            print(
                "Applying batch-norm folding ahead of post-training quantization"
            )
            from utils.absorb_bn import search_absorbe_bn
            search_absorbe_bn(model)

        all_convs = [
            n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
        ]
        # all_convs = [l for l in all_convs if 'downsample' not in l]
        all_relu = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU)
        ]
        all_relu6 = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)
        ]
        # NOTE: convs use [1:] here (only the first conv is excluded) while
        # activations use [1:-1] -- confirm this asymmetry is intentional.
        layers = all_relu[1:-1] + all_relu6[1:-1] + all_convs[1:]
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapper,
            nn.ReLU6: ActivationModuleWrapper,
            nn.Conv2d: ParameterModuleWrapper
        }
        mq = ModelQuantizer(
            model, args, layers, replacement_factory,
            OptimizerBridge(optimizer,
                            settings={
                                'algo': 'SGD',
                                'dataset': args.dataset
                            }))

        if checkpoint is not None:
            # Load quantization parameters from state dict
            mq.load_state_dict(checkpoint['state_dict'])

        mq.log_quantizer_state(ml_logger, -1)

        if args.model_freeze:
            mq.freeze()

    # pdb.set_trace()
    if args.evaluate:
        acc = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc)
        return

    # evaluate on validation set
    acc1 = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc1, -1)

    # evaluate with k-means quantization
    # if args.model_freeze:
    # with mq.disable():
    #     acc1_nq = validate(val_loader, model, criterion, args, device)
    #     ml_logger.log_metric('Val Acc1 fp32', acc1_nq, -1)

    # pdb.set_trace()
    # Kurtosis regularization on weights tensors
    # Collect the weight tensors to hook for kurtosis regularization during
    # training; 'all' expands to every conv weight except the first.
    weight_to_hook = {}
    if args.w_kurtosis:
        if args.weight_name[0] == 'all':
            all_convs = [
                n.replace(".wrapped_module", "") + '.weight'
                for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
            ]
            weight_name = all_convs[1:]
            if args.remove_weight_name:
                for rm_name in args.remove_weight_name:
                    weight_name.remove(rm_name)
        else:
            weight_name = args.weight_name
        for name in weight_name:
            # pdb.set_trace()
            curr_param = fine_weight_tensor_by_name(model, name)
            # if not curr_param:
            #     name = 'float_' + name # QAT name
            #     curr_param = fine_weight_tensor_by_name(self.model, name)
            # if curr_param is not None:
            weight_to_hook[name] = curr_param

    for epoch in range(0, args.epochs):
        # train for one epoch
        print('Timestamp Start epoch: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        train(train_loader, model, criterion, optimizer, epoch, args, device,
              ml_logger, val_loader, mq, weight_to_hook)
        print('Timestamp End epoch: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))

        if not args.lr_freeze:
            lr_scheduler.step()

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc1, step='auto')

        # evaluate with k-means quantization
        # if args.model_freeze:
        # with mq.quantization_method('kmeans'):
        #     acc1_kmeans = validate(val_loader, model, criterion, args, device)
        #     ml_logger.log_metric('Val Acc1 kmeans', acc1_kmeans, epoch)

        # with mq.disable():
        #     acc1_nq = validate(val_loader, model, criterion, args, device)
        #     ml_logger.log_metric('Val Acc1 fp32', acc1_nq,  step='auto')

        if args.quantize:
            mq.log_quantizer_state(ml_logger, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        save_checkpoint(
            {
                'epoch':
                epoch + 1,
                'arch':
                args.arch,
                'state_dict':
                model.state_dict()
                if len(args.gpu_ids) == 1 else model.module.state_dict(),
                'best_acc1':
                best_acc1,
                'optimizer':
                optimizer.state_dict(),
            },
            is_best,
            datatime_str=datatime_str,
            suf_name=suf_name)
# Example #12
# 0
def main_ratio(args, ml_logger):
    """Search quantization clipping scales, then sweep a 1-D ratio grid.

    Pipeline:
      1. Quantize with max/static clipping and record calibration loss and
         validation accuracy as the baseline.
      2. Evaluate Lp-norm clipping for p in {2, 2.5, 3}; fit a quadratic to
         the accuracies and evaluate the interpolated optimum p.
      3. Refine the best initialization with Powell and coordinate-descent
         local search over the per-layer clipping scales.
      4. Multiply the best clipping point by ratios in
         [args.min_ratio, args.max_ratio] and record calibration loss and
         top-1 accuracy at each ratio; pickle the resulting curves.

    Args:
        args: parsed command-line namespace (arch, seed, bit widths,
            grid_resolution, min_ratio/max_ratio, experiment, ...).
        ml_logger: experiment logger exposing log_metric(name, value, step).

    Side effects: mutates args (qtype, lp, min_method), logs metrics, and
    writes a pickle file under os.getcwd()/args.experiment.
    """
    # Fix the seed so calibration batches are identical across evaluations.
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    curr_best_acc = 0
    curr_best_scale_point = None

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch,
                         custom_resnet,
                         custom_inception,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)

    if args.bn_folding:
        print(
            "Applying batch-norm folding ahead of post-training quantization")
        from utils.absorb_bn import search_absorbe_bn
        search_absorbe_bn(inf_model.model)

    layers = []
    # TODO: make it more generic
    # [1:-1] keeps the first and last quantizable layers in full precision.
    if args.bit_weights is not None:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.Conv2d)
        ][1:-1]
    if args.bit_act is not None:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU)
        ][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU6)
        ][1:-1]

    replacement_factory = {
        nn.ReLU: ActivationModuleWrapperPost,
        nn.ReLU6: ActivationModuleWrapperPost,
        nn.Conv2d: ParameterModuleWrapperPost
    }

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
    loss = inf_model.evaluate_calibration()

    # Baseline: max/static clipping.
    max_acc = inf_model.validate()
    max_point = mq.get_clipping()
    if max_acc > curr_best_acc:
        curr_best_acc = max_acc
        curr_best_scale_point = max_point
    ml_logger.log_metric('Loss max', loss.item(), step='auto')
    ml_logger.log_metric('Acc max', max_acc, step='auto')
    data = {'max': {'alpha': max_point.cpu().numpy(), 'loss': loss.item()}}
    print("max loss: {:.4f}, max_acc: {:.4f}".format(loss.item(), max_acc))

    def eval_pnorm(p):
        """Quantize a fresh model with Lp-norm clipping (given p) and return
        (clipping point, calibration loss, validation accuracy)."""
        args.qtype = 'lp_norm'
        args.lp = p
        # Re-fix the seed so every p is evaluated on identical data order.
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(args.arch,
                             custom_resnet,
                             custom_inception,
                             args.pretrained,
                             args.dataset,
                             args.gpu_ids,
                             args.datapath,
                             batch_size=args.batch_size,
                             shuffle=True,
                             workers=args.workers,
                             print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size,
                             cal_set_size=args.cal_set_size,
                             args=args)

        if args.bn_folding:
            print(
                "Applying batch-norm folding ahead of post-training quantization"
            )
            from utils.absorb_bn import search_absorbe_bn
            search_absorbe_bn(inf_model.model)
        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        # evaluate
        acc = inf_model.validate()

        # Release the model before the next rebuild to keep GPU memory down.
        del inf_model
        del mq

        return point, loss, acc

    del inf_model
    del mq

    l2_point, l2_loss, l2_acc = eval_pnorm(2.)
    print("loss l2: {:.4f}".format(l2_loss.item()))
    ml_logger.log_metric('Loss l2', l2_loss.item(), step='auto')
    ml_logger.log_metric('Acc l2', l2_acc, step='auto')
    data['l2'] = {
        'alpha': l2_point.cpu().numpy(),
        'loss': l2_loss.item(),
        'acc': l2_acc
    }
    if l2_acc > curr_best_acc:
        curr_best_acc = l2_acc
        curr_best_scale_point = l2_point

    l25_point, l25_loss, l25_acc = eval_pnorm(2.5)
    print("loss l2.5: {:.4f}".format(l25_loss.item()))
    ml_logger.log_metric('Loss l2.5', l25_loss.item(), step='auto')
    ml_logger.log_metric('Acc l2.5', l25_acc, step='auto')
    data['l2.5'] = {
        'alpha': l25_point.cpu().numpy(),
        'loss': l25_loss.item(),
        'acc': l25_acc
    }
    if l25_acc > curr_best_acc:
        curr_best_acc = l25_acc
        curr_best_scale_point = l25_point

    l3_point, l3_loss, l3_acc = eval_pnorm(3.)
    print("loss l3: {:.4f}".format(l3_loss.item()))
    ml_logger.log_metric('Loss l3', l3_loss.item(), step='auto')
    ml_logger.log_metric('Acc l3', l3_acc, step='auto')
    data['l3'] = {
        'alpha': l3_point.cpu().numpy(),
        'loss': l3_loss.item(),
        'acc': l3_acc
    }
    if l3_acc > curr_best_acc:
        curr_best_acc = l3_acc
        curr_best_scale_point = l3_point

    # Interpolate optimal p: fit a quadratic through the three measured
    # accuracies and take the argmax over a dense grid in [1, 5].
    xp = np.linspace(1, 5, 50)
    z = np.polyfit([2, 2.5, 3], [l2_acc, l25_acc, l3_acc], 2)
    y = np.poly1d(z)
    p_intr = xp[np.argmax(y(xp))]
    print("p intr: {:.2f}".format(p_intr))
    ml_logger.log_metric('p intr', p_intr, step='auto')

    args.qtype = 'lp_norm'
    args.lp = p_intr
    # Fix the seed again before building the final model.
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    inf_model = CnnModel(args.arch,
                         custom_resnet,
                         custom_inception,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)

    if args.bn_folding:
        print(
            "Applying batch-norm folding ahead of post-training quantization")
        from utils.absorb_bn import search_absorbe_bn
        search_absorbe_bn(inf_model.model)
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    # Evaluate with optimal p
    lp_loss = inf_model.evaluate_calibration()
    lp_point = mq.get_clipping()
    # evaluate
    lp_acc = inf_model.validate()

    print("loss p intr: {:.4f}".format(lp_loss.item()))
    ml_logger.log_metric('Loss p intr', lp_loss.item(), step='auto')
    ml_logger.log_metric('Acc p intr', lp_acc, step='auto')
    if lp_acc > curr_best_acc:
        curr_best_acc = lp_acc
        curr_best_scale_point = lp_point

    # Module-level state consumed by evaluate_calibration_clipped.
    global _eval_count, _min_loss
    _min_loss = lp_loss.item()

    # Start local search from the most accurate initialization found so far.
    idx = np.argmax([l2_acc, l25_acc, l3_acc, lp_acc])
    init = [l2_point, l25_point, l3_point, lp_point][idx]

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        """Per-iteration hook: apply candidate scales, log loss and accuracy."""
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method),
                             loss.item(),
                             step='auto')

        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method),
                             acc,
                             step='auto')

    args.min_method = "Powell"
    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(
        lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
        init.cpu().numpy(),
        method=method,
        options=min_options,
        callback=local_search_callback)

    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method),
                         loss.item(),
                         step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
    data['powell'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}
    if acc > curr_best_acc:
        curr_best_acc = acc
        curr_best_scale_point = scales

    print("Starting coordinate descent")
    args.min_method = "CD"
    _iter = count(0)
    # _eval_count was already declared global above; just reset it.
    _eval_count = count(0)
    _min_loss = lp_loss.item()
    mq.set_clipping(init, inf_model.device)
    # Run coordinate descent for comparison
    method = coord_descent
    res = opt.minimize(
        lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
        init.cpu().numpy(),
        method=method,
        options=min_options,
        callback=local_search_callback)

    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format("CD"), loss.item(), step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format("CD"), acc, step='auto')
    data['cd'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}
    if acc > curr_best_acc:
        curr_best_acc = acc
        curr_best_scale_point = scales

    # The best point may be a torch tensor (from mq.get_clipping) or a numpy
    # array (from res.x); normalize to a numpy array on the host.
    if torch.is_tensor(curr_best_scale_point):
        curr_best_scale_point = curr_best_scale_point.cpu().numpy()
    best_point = np.concatenate(
        [np.asarray(curr_best_scale_point), [curr_best_acc]])
    print("**** START LOSS GENERATION ****")
    print("best point:" + str(best_point))
    best_point_values = best_point[:-1]
    mq.set_clipping(best_point_values, inf_model.device)
    loss = inf_model.evaluate_calibration()
    # evaluate
    top1 = inf_model.validate()
    print("best point: loss, top1: {:.4f}, {}".format(loss.item(), top1))

    n = args.grid_resolution

    min_ratio = args.min_ratio  # 0.8
    max_ratio = args.max_ratio  # 1.2

    x = np.linspace(min_ratio, max_ratio, n)

    loss_best = loss
    Z_loss = np.empty(n)
    Z_top1 = np.empty(n)
    for i, x_ in enumerate(tqdm(x)):
        # Scale the whole clipping point by the current ratio and re-measure.
        scales_ratio = x_
        mq.set_clipping((best_point_values * scales_ratio), inf_model.device)

        if scales_ratio == 1.0:
            print(best_point_values * scales_ratio)
        # evaluate with clipping
        loss = inf_model.evaluate_calibration()
        Z_loss[i] = loss.item()
        Z_top1[i] = inf_model.validate()

        str1 = "[x, loss, top1] = [{}, {}, {}]".format(x[i], Z_loss[i],
                                                       Z_top1[i])
        print(str1)

    best_point_ratio = [1.0, loss_best.cpu().numpy()]
    print("best_point_ratio: " + str(best_point_ratio))
    print("best point values: " + str(best_point_values))

    f_name = "loss_generation_lapq_{}_W{}A{}.pkl".format(
        args.arch, 'ALL', None)
    dir_fullname = os.path.join(os.getcwd(), args.experiment)
    if not os.path.exists(dir_fullname):
        os.makedirs(dir_fullname)
    f = open(os.path.join(dir_fullname, f_name), 'wb')
    data = {
        'X': x,
        'Z_loss': Z_loss,
        'Z_top1': Z_top1,
        'best_point_ratio': best_point_ratio,
        'best_point': best_point_values
    }
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
# ---- Example #13 ----
def main(args, ml_logger):
    """Quantize a NeuMF recommender and locally optimize clipping scales.

    Builds the NCF model, wraps its Embedding/Linear/ReLU modules for
    quantization, initializes clipping from a calibration pass, then runs
    scipy local search (args.min_method) over the clipping values and
    reports HR before and after.

    Args:
        args: parsed command-line namespace (seed, data path, load_ckp,
            min_method, maxiter/maxfev, ...).
        ml_logger: experiment logger exposing log_metric(name, value, step).
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Create model
    # NOTE(review): user/item counts and MLP sizes are hard-coded for the
    # ml-20m-based NCF setup — confirm against the checkpoint being loaded.
    model = NeuMF(2197225,
                  855776,
                  mf_dim=64,
                  mf_reg=0.,
                  mlp_layer_sizes=[256, 256, 128, 64],
                  mlp_layer_regs=[0. for i in [256, 256, 128, 64]])

    print(model)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        model.device = torch.device('cuda:{}'.format(0))

    if args.load_ckp:
        ckp = torch.load(args.load_ckp)
        model.load_state_dict(ckp)

    # Collect names of every quantizable submodule type.
    all_embeding = [
        n for n, m in model.named_modules() if isinstance(m, nn.Embedding)
    ]
    all_linear = [
        n for n, m in model.named_modules() if isinstance(m, nn.Linear)
    ]
    all_relu = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU)]
    all_relu6 = [
        n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)
    ]
    layers = all_relu + all_relu6 + all_linear + all_embeding
    # Map each module type to the wrapper that replaces it in the model.
    replacement_factory = {
        nn.ReLU: ActivationModuleWrapperPost,
        nn.ReLU6: ActivationModuleWrapperPost,
        nn.Linear: ParameterModuleWrapperPost,
        nn.Embedding: ActivationModuleWrapperPost
    }
    mq = ModelQuantizer(model, args, layers, replacement_factory)
    # mq.log_quantizer_state(ml_logger, -1)

    # Load the evaluation set and the calibration set used by the optimizer.
    test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user = data_loader(
        args.data)
    data = NcfData(test_users, test_items, dup_mask, real_indices, K,
                   samples_per_user, num_user)
    cal_data = CalibrationSet('ml-20mx16x32/cal_set').cuda()
    cal_data.split(batch_size=10000)

    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    criterion = criterion.cuda()

    print("init_method: {}, qtype {}".format(args.init_method, args.qtype))
    # evaluate to initialize dynamic clipping
    loss = evaluate_calibration(model, cal_data, criterion)
    print("Initial loss: {:.4f}".format(loss))

    # get clipping values
    init = get_clipping(mq)

    # evaluate
    hr, ndcg = validate(model, data)
    ml_logger.log_metric('HR init', hr, step='auto')

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        """Per-iteration hook: print calibration loss for candidate scales x."""
        it = next(_iter)
        loss = run_inference_on_calibration(x, model, mq, cal_data, criterion)
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss))

    res = opt.minimize(lambda scales: run_inference_on_calibration(
        scales, model, mq, cal_data, criterion),
                       np.array(init),
                       method=args.min_method,
                       options=min_options,
                       callback=local_search_callback)

    print(res)
    scales = res.x
    # Apply the optimized scales and re-measure hit rate.
    set_clipping(mq, scales, model.device)
    # evaluate
    hr, ndcg = validate(model, data)
    ml_logger.log_metric('HR Powell', hr, step='auto')
def main(args, ml_logger):
    """Quantize a CNN and locally optimize clipping scales with Powell.

    Builds the model, wraps Conv2d/ReLU(6) layers for quantization,
    initializes clipping (optionally randomized when
    args.init_method == 'random'), then minimizes calibration loss over
    the clipping scales and logs loss/accuracy throughout.

    Args:
        args: parsed command-line namespace (arch, seed, bit widths,
            init_method, maxiter/maxfev, ...).
        ml_logger: experiment logger exposing log_metric(name, value, step).
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if args.tag is not None:
        ml_logger.mlflow.log_param('tag', args.tag)

    # Temporarily disable weight bias-correction during the search.
    # NOTE(review): enable_bcorr is never read afterwards (re-enabling is
    # commented out below) — presumably intentional; confirm.
    enable_bcorr = False
    if args.bcorr_w:
        args.bcorr_w = False
        enable_bcorr = True

    if args.init_method == 'random':
        args.qtype = 'max_static'

    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch,
                         custom_resnet,
                         custom_inception,
                         args.pretrained,
                         args.dataset,
                         args.gpu_ids,
                         args.datapath,
                         batch_size=args.batch_size,
                         shuffle=True,
                         workers=args.workers,
                         print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size,
                         args=args)

    layers = []
    # TODO: make it more generic
    # [1:-1] keeps the first and last quantizable layers in full precision.
    if args.bit_weights is not None:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.Conv2d)
        ][1:-1]
    if args.bit_act is not None:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU)
        ][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [
            n for n, m in inf_model.model.named_modules()
            if isinstance(m, nn.ReLU6)
        ][1:-1]

    # Map each module type to the wrapper that replaces it in the model.
    replacement_factory = {
        nn.ReLU: ActivationModuleWrapperPost,
        nn.ReLU6: ActivationModuleWrapperPost,
        nn.Conv2d: ParameterModuleWrapperPost
    }

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
    init_loss = inf_model.evaluate_calibration()

    # Optionally overwrite each clipping value with a uniform sample in
    # [0, current value] to test robustness to initialization.
    if args.init_method == 'random':
        clip = mq.get_clipping()
        for i, c in enumerate(clip.cpu()):
            clip[i] = np.random.uniform(0, c)
        print("Randomize initial clipping")
        print(clip)
        mq.set_clipping(clip, inf_model.device)
        init_loss = inf_model.evaluate_calibration()

    print("init loss: {:.4f}".format(init_loss.item()))
    ml_logger.log_metric('Init loss', init_loss.item(), step='auto')

    acc = inf_model.validate()
    ml_logger.log_metric('Acc init', acc, step='auto')

    init = mq.get_clipping()

    # Module-level state consumed by evaluate_calibration_clipped.
    global _eval_count, _min_loss
    _min_loss = init_loss.item()

    # if enable_bcorr:
    #     args.bcorr_w = True
    # inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
    #                      batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
    #                      cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)
    #
    # mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        """Per-iteration hook: apply candidate scales, log loss and accuracy."""
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method),
                             loss.item(),
                             step='auto')

        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method),
                             acc,
                             step='auto')

    # Force Powell regardless of the command-line choice.
    args.min_method = "Powell"
    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(
        lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
        init.cpu().numpy(),
        method=method,
        options=min_options,
        callback=local_search_callback)

    print(res)

    scales = res.x
    # Apply the optimized scales and report final loss/accuracy.
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method),
                         loss.item(),
                         step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
# ---- Example #15 ----
def main(args, ml_logger):
    """LAPQ-style pipeline: sweep p for Lp-norm clipping, refine with Powell.

    Measures calibration loss for p in linspace(2, 4, 10), fits a quadratic
    to find the loss-minimizing p, initializes clipping at that p, runs
    Powell local search over the scales, and pickles the resulting scales.

    Args:
        args: parsed command-line namespace (arch, seed, bit widths,
            maxiter/maxfev, ...).
        ml_logger: experiment logger exposing log_metric(name, value, step).
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if args.tag is not None:
        ml_logger.mlflow.log_param('tag', args.tag)

    # enable_bcorr = False
    # if args.bcorr_w:
    #     args.bcorr_w = False
    #     enable_bcorr = True

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                         batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    layers = []
    # TODO: make it more generic
    # Inception's stem has extra layers to keep in full precision, hence
    # the different slice bounds.
    if 'inception' in args.arch and args.custom_inception:
        first = 3
        last = -1
    else:
        first = 1
        last = -1
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][first:last]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][first:last]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][first:last]

    # Map each module type to the wrapper that replaces it in the model.
    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
    maxabs_loss = inf_model.evaluate_calibration()
    print("max loss: {:.4f}".format(maxabs_loss.item()))
    max_point = mq.get_clipping()
    ml_logger.log_metric('Loss max', maxabs_loss.item(), step='auto')

    # evaluate
    # Full validation is skipped here (hard-coded 0) to save time.
    maxabs_acc = 0#inf_model.validate()
    ml_logger.log_metric('Acc maxabs', maxabs_acc, step='auto')
    data = {'max': {'alpha': max_point.cpu().numpy(), 'loss': maxabs_loss.item(), 'acc': maxabs_acc}}

    del inf_model
    del mq

    def eval_pnorm(p):
        """Quantize a fresh model with Lp-norm clipping and return
        (clipping point, calibration loss, validation accuracy)."""
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                             batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        # evaluate
        acc = inf_model.validate()

        del inf_model
        del mq

        return point, loss, acc

    def eval_pnorm_on_calibration(p):
        """Like eval_pnorm but skips validation; returns (point, loss) only.
        Used for the cheap p-sweep below."""
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                             batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        del inf_model
        del mq

        return point, loss

    # Sweep p and record calibration loss at each value.
    ps = np.linspace(2, 4, 10)
    losses = []
    for p in tqdm(ps):
        point, loss = eval_pnorm_on_calibration(p)
        losses.append(loss.item())
        print("(p, loss) - ({}, {})".format(p, loss.item()))

    # Interpolate optimal p
    # Fit loss(p) with a quadratic; its derivative's root is the stationary
    # point. NOTE(review): assumes the fit is convex so the root is a
    # minimum — confirm on new architectures.
    z = np.polyfit(ps, losses, 2)
    y = np.poly1d(z)
    p_intr = y.deriv().roots[0]
    # loss_opt = y(p_intr)
    print("p intr: {:.2f}".format(p_intr))
    ml_logger.log_metric('p intr', p_intr, step='auto')

    lp_point, lp_loss, lp_acc = eval_pnorm(p_intr)

    print("loss p intr: {:.4f}".format(lp_loss.item()))
    print("acc p intr: {:.4f}".format(lp_acc))
    ml_logger.log_metric('Init loss', lp_loss.item(), step='auto')
    ml_logger.log_metric('Acc init', lp_acc, step='auto')

    # Module-level state consumed by evaluate_calibration_clipped.
    global _eval_count, _min_loss
    _min_loss = lp_loss.item()

    # loss_best = np.min(losses)
    # if loss_best < lp_loss:
    #     p_intr = ps[np.argmin(losses)]
    #     print("p best: {:.2f}".format(p_intr))
    #     ml_logger.log_metric('p best', p_intr, step='auto')
    #     lp_point, lp_loss, lp_acc = eval_pnorm(p_intr)
    #     print("loss p best: {:.4f}".format(lp_loss.item()))
    #     print("acc p best: {:.4f}".format(lp_acc))
    #     ml_logger.log_metric('Loss p best', lp_loss.item(), step='auto')
    #     ml_logger.log_metric('Acc p best', lp_acc, step='auto')

    # idx = np.argmin([maxabs_loss, lp_loss])
    # init = [max_point, lp_point][idx]

    init = lp_point

    args.qtype = 'lp_norm'
    args.lp = p_intr
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # if enable_bcorr:
    #     args.bcorr_w = True
    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                         batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        """Per-iteration hook: apply candidate scales, log loss and accuracy."""
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')

    # Force Powell regardless of the command-line choice.
    args.min_method = "Powell"
    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq), init.cpu().numpy(),
                       method=method, options=min_options, callback=local_search_callback)

    print(res)

    scales = res.x
    # Apply the optimized scales and report final loss/accuracy.
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
    data['powell'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}

    # save scales
    f_name = "scales_{}_W{}A{}.pkl".format(args.arch, args.bit_weights, args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
# ---- Example #16 ----
def main():
    """Load a NeuMF model (optionally quantization-wrapped) and report
    HR@K / NDCG@K on the NCF evaluation data."""
    args = parse_args()

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Use CUDA only when requested and actually present.
    use_cuda = torch.cuda.is_available() and not args.no_cuda

    mlp_sizes = [256, 256, 128, 64]
    model = NeuMF(2197225,
                  855776,
                  mf_dim=64,
                  mf_reg=0.,
                  mlp_layer_sizes=mlp_sizes,
                  mlp_layer_regs=[0. for _ in mlp_sizes])

    print(model)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()

    if args.load_ckp:
        model.load_state_dict(torch.load(args.load_ckp))

    if args.quantize:
        def names_of(module_type):
            # Names of every submodule of the given type.
            return [
                n for n, m in model.named_modules()
                if isinstance(m, module_type)
            ]

        all_embeding = names_of(nn.Embedding)
        all_linear = names_of(nn.Linear)
        all_relu = names_of(nn.ReLU)
        all_relu6 = names_of(nn.ReLU6)
        # layers = all_relu + all_relu6 + all_linear
        layers = all_embeding  # quantize only the embedding tables
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapperPost,
            nn.ReLU6: ActivationModuleWrapperPost,
            nn.Linear: ParameterModuleWrapperPost,
            nn.Embedding: ActivationModuleWrapperPost
        }
        mq = ModelQuantizer(model, args, layers, replacement_factory)
        # mq.log_quantizer_state(ml_logger, -1)

    (test_users, test_items, dup_mask, real_indices, K, samples_per_user,
     num_user) = data_loader(args.data)
    data = NcfData(test_users, test_items, dup_mask, real_indices, K,
                   samples_per_user, num_user)

    hr, ndcg = val(model, data)
    print('')
    print('')
    print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=K, hit_rate=hr, ndcg=ndcg))
# ---- Example #17 ----
def main(args, ml_logger):
    """Calibrate and optimize per-layer clipping scales for a quantized CNN.

    Builds the model, wraps the selected Conv2d/ReLU(6) layers with
    quantization wrappers, initializes clipping scales ('dynamic', 'static'
    or 'random' per ``args.init_method``), then minimizes the calibration
    loss over the scales via ``opt.minimize`` (or the project's
    ``coord_descent`` when ``args.min_method == 'CD'``), logging loss and
    accuracy to ``ml_logger`` along the way.
    """
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    # Force deterministic cuDNN kernels so calibration runs are repeatable.
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    inf_model = CnnModel(args.arch, args.custom_resnet, args.custom_inception,args.pretrained, args.dataset, args.gpu_ids, args.datapath,
                         batch_size=args.batch_size, shuffle=True, workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    # Select the layers to quantize by module type.  The [1:-1] slices drop
    # the first and last layer of each kind (commonly kept at full precision).
    layers = []
    # TODO: make it more generic
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    # Map each module type to the wrapper that replaces it during quantization.
    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
    # mq.log_quantizer_state(ml_logger, -1)

    print("init_method: {}, qtype {}".format(args.init_method, args.qtype))
    # initialize scales
    if args.init_method == 'dynamic':
        # evaluate to initialize dynamic clipping
        loss = inf_model.evaluate_calibration()
        print("Initial loss: {:.4f}".format(loss.item()))

        # get clipping values
        init = mq.get_clipping()
    else:
        if args.init_method == 'static':
            # One identical scale (args.siv) per quantized layer.
            init = np.array([args.siv] * len(layers))
        elif args.init_method == 'random':
            init = np.random.uniform(0.5, 1., size=len(layers))  # TODO: pass range by argument
        else:
            raise RuntimeError("Invalid argument init_method {}".format(args.init_method))

        # set clip value to qwrappers
        mq.set_clipping(init, inf_model.device)
        print("scales initialization: {}".format(str(init)))

        # evaluate with clipping
        loss = inf_model.evaluate_calibration()
        print("Initial loss: {:.4f}".format(loss.item()))

    # NOTE(review): 'Loss init' contains no '{}' placeholder, so the
    # .format(args.min_method) call is a no-op — the metric is always named
    # 'Loss init'.  Confirm whether 'Loss init {}' was intended.
    ml_logger.log_metric('Loss init'.format(args.min_method), loss.item(), step='auto')

    # Seed the module-level best-loss tracker shared with the objective
    # function (presumably read/updated by evaluate_calibration_clipped,
    # defined elsewhere in this file — verify).
    global _min_loss
    _min_loss = loss.item()

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc init', acc, step='auto')

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    # Iteration counter for the optimizer callback below.
    _iter = count(0)

    def local_search_callback(x):
        # Progress hook invoked by the optimizer after each iteration:
        # apply the candidate scales, re-measure calibration loss and
        # validation accuracy, and log both.
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')

    method = coord_descent if args.min_method == 'CD' else args.min_method
    # NOTE(review): init.cpu().numpy() assumes init is a torch tensor, which
    # holds for the 'dynamic' path (mq.get_clipping()); the 'static'/'random'
    # paths assign numpy arrays, which have no .cpu() and would raise
    # AttributeError here.  Confirm and guard if those paths are used.
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq), init.cpu().numpy(),
                       method=method, options=min_options, callback=local_search_callback)

    print(res)

    # Apply the optimized scales and report the final loss/accuracy.
    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')