def get_data():
    """Build and return the dataset configured in ``settings``.

    Resolves ``settings.DATASET_DIR`` to an absolute path and forwards the
    remaining dataset settings straight to ``data.get_dataset``.
    """
    return data.get_dataset(
        os.path.abspath(settings.DATASET_DIR),
        settings.DATASET,
        settings.TEMPLES,
        settings.SPLIT,
        settings.BATCH_SIZE,
        settings.BUFFER_SIZE,
    )
def load_data(self):
    """Load the dataset named by ``self.args`` and cache its statistics.

    Side effects: stores ``num_features``, ``num_classes`` and
    ``avg_num_nodes`` (ceil of the mean node count) on ``self.args`` and
    prints a one-line summary.

    Returns:
        The loaded dataset object.
    """
    dataset = get_dataset(self.args.data, normalize=self.args.normalize)
    # Record per-dataset statistics on the shared args namespace.
    self.args.num_features = dataset.num_features
    self.args.num_classes = dataset.num_classes
    # Average node count per graph, rounded up.
    node_counts = [graph.num_nodes for graph in dataset]
    self.args.avg_num_nodes = np.ceil(np.mean(node_counts))
    print('# %s: [FEATURES]-%d [NUM_CLASSES]-%d [AVG_NODES]-%d' %
          (dataset, self.args.num_features, self.args.num_classes,
           self.args.avg_num_nodes))
    return dataset
def graft_block(args):
    """Block-wise knowledge grafting: train student VGG blocks one at a time.

    For each graftable block, the matching student block (plus 1x1 channel
    adaption convs in both directions) is trained on a small few-shot subset
    (``args.num_per_class`` images per class), evaluated by grafting into the
    teacher, and the final-epoch weights are saved per block.

    NOTE(review): ``blocks_s_len`` passed to ``train_epoch`` is not defined in
    this function — presumably a module-level global; confirm.
    NOTE(review): ``test(teacher, ...)`` is used to score each block —
    presumably ``train_epoch`` grafts the student block into ``teacher``
    in place; confirm.
    """
    os.makedirs('log', exist_ok=True)
    global logger
    logger = Logger('log/graft_block_{}_{}_num_per_class_{}.txt'.format(
        args.dataset,
        time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()),
        args.num_per_class))
    # Teacher/student VGG configurations, split into graftable blocks.
    cfg_t = cfgs['vgg16']
    cfg_s = cfgs['vgg16-graft']
    cfg_blocks_t = split_block(cfg_t)
    cfg_blocks_s = split_block(cfg_s)
    num_block = len(block_graft_ids)
    # ---------------------- Adaption ----------------------
    # 1x1 convs translating teacher channel widths to student widths...
    adaptions_t2s = [
        nn.Conv2d(cfg_blocks_t[block_graft_ids[i]][-2],
                  cfg_blocks_s[block_graft_ids[i]][-2],
                  kernel_size=1).cuda() for i in range(0, num_block - 1)
    ]
    for m in adaptions_t2s:
        init_conv(m)
    # ...and back (student -> teacher), so a student block can be spliced
    # between teacher blocks.
    adaptions_s2t = [
        nn.Conv2d(cfg_blocks_s[block_graft_ids[i]][-2],
                  cfg_blocks_t[block_graft_ids[i]][-2],
                  kernel_size=1).cuda() for i in range(0, num_block - 1)
    ]
    for m in adaptions_s2t:
        init_conv(m)
    # ---------------------- Network ----------------------
    teacher = vgg_stock(cfg_t, args.dataset, args.num_class)
    student = vgg_bw(cfg_s, True, args.dataset, args.num_class)
    params_t = torch.load(args.ckpt)
    teacher.cuda().eval()
    teacher.load_state_dict(params_t)
    # Seed the student's first three feature layers from the teacher.
    params_s = {}
    for key in params_t.keys():
        key_split = key.split('.')
        if key_split[0] == 'features' and \
           key_split[1] in ['0', '1', '2']:
            params_s[key] = params_t[key]
    student.cuda().train()
    student.load_state_dict(params_s, strict=False)
    # Trainable student units: one per graft point, plus the classifier head.
    blocks_s = [student.features[i] for i in block_graft_ids[:-1]]
    blocks_s += [nn.Sequential(nn.Flatten().cuda(), student.classifier)]
    # ---------------------- Optimizer ----------------------
    # Separate Adam optimizer (with its own lr) per block / adaption conv.
    optimizers_s = [
        optim.Adam(blocks_s[i].parameters(), lr=args.lrs_s[i])
        for i in range(0, num_block)
    ]
    optimizers_adapt_t2s = [
        optim.Adam(adaptions_t2s[i].parameters(), lr=args.lrs_adapt_t2s[i])
        for i in range(0, num_block - 1)
    ]
    optimizers_adapt_s2t = [
        optim.Adam(adaptions_s2t[i].parameters(), lr=args.lrs_adapt_s2t[i])
        for i in range(0, num_block - 1)
    ]
    # ---------------------- Datasets ----------------------
    # Few-shot training subset; full test set for evaluation.
    if args.dataset == 'CIFAR10':
        train_loader = DataLoader(CIFAR10Few(args.data_path,
                                             args.num_per_class,
                                             transform=get_transformer(
                                                 args.dataset, cropsize=32,
                                                 crop_padding=4, hflip=True)),
                                  batch_size=args.batch_size,
                                  num_workers=4,
                                  shuffle=True)
    elif args.dataset == 'CIFAR100':
        train_loader = DataLoader(CIFAR100Few(args.data_path,
                                              args.num_per_class,
                                              transform=get_transformer(
                                                  args.dataset, cropsize=32,
                                                  crop_padding=4, hflip=True)),
                                  batch_size=args.batch_size,
                                  num_workers=4,
                                  shuffle=True)
    test_loader = DataLoader(get_dataset(args, train_flag=False),
                             batch_size=256,
                             num_workers=4,
                             shuffle=False)
    # ---------------------- Training ----------------------
    os.makedirs('./ckpt/student', exist_ok=True)
    params_s_best = OrderedDict()
    for block_id in range(len(blocks_s)):
        best_accuarcy = 0.0
        for epoch in range(args.num_epoch[block_id]):
            if logger:
                logger.write('Epoch', epoch)
            loss_value = train_epoch(
                args, teacher, blocks_s, blocks_s_len,
                [adaptions_t2s, adaptions_s2t], block_id, train_loader,
                [optimizers_s, optimizers_adapt_t2s, optimizers_adapt_s2t])
            accuracy = test(teacher, test_loader)
            if best_accuarcy < accuracy:
                best_accuarcy = accuracy
            # Weights are snapshotted at the *last* epoch only (not the best
            # epoch) — the best accuracy above is tracked for logging.
            if epoch == args.num_epoch[block_id] - 1:
                block_warp = warp_block(blocks_s, block_id, adaptions_t2s,
                                        adaptions_s2t)
                params_s_best['block-{}'.format(block_id)] \
                    = block_warp.cpu().state_dict().copy()  # deep copy !!!
            if logger:
                logger.write('Accuracy-B{}'.format(block_id), accuracy)
    # Re-graft each saved block into the teacher and report final accuracy.
    for block_id in range(len(blocks_s)):
        block = warp_block(blocks_s, block_id, adaptions_t2s, adaptions_s2t)
        block.load_state_dict(params_s_best['block-{}'.format(block_id)])
        block.cuda()
        teacher.set_scion(block, block_graft_ids[block_id], 1)
        accuracy = test(teacher, test_loader)
        if logger:
            logger.write('Test-Best-Accuracy-B{}'.format(block_id), accuracy)
    if logger:
        logger.close()
    # Persist all per-block snapshots in one checkpoint file.
    with open('ckpt/student/vgg16-student-graft-block-{}-{}perclass.pth'.format(
            args.dataset, args.num_per_class), 'bw') as f:
        torch.save(params_s_best, f)
def run():
    """Fine-tune a GG-CNN grasp network loaded from a hard-coded checkpoint.

    Sets up output dirs and tensorboard, builds train/val loaders from a
    percentage split of the dataset, loads a *full pickled model* (not a
    state dict) from disk, then trains for ``args.epochs + 1`` epochs,
    logging losses and IoU and snapshotting periodically / on improvement.

    NOTE(review): the checkpoint path is hard-coded below; ``ggcnn`` from
    ``get_network`` is fetched but unused because the model is loaded whole.
    """
    args = parse_args()
    # Vis window
    if args.vis:
        cv2.namedWindow('Display', cv2.WINDOW_NORMAL)
    # Set-up output directories
    dt = datetime.datetime.now().strftime('%y%m%d_%H%M')
    net_desc = '{}_{}'.format(dt, '_'.join(args.description.split()))
    save_folder = os.path.join(args.outdir, net_desc)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    tb = tensorboardX.SummaryWriter(os.path.join(args.logdir, net_desc))
    # Load Dataset
    logging.info('Loading {} Dataset...'.format(args.dataset.title()))
    Dataset = get_dataset(args.dataset)
    # Train on [0, split), validate on [split, 1] of the dataset.
    train_dataset = Dataset(args.dataset_path,
                            start=0.0,
                            end=args.split,
                            ds_rotate=args.ds_rotate,
                            random_rotate=True,
                            random_zoom=True,
                            include_depth=args.use_depth,
                            include_rgb=args.use_rgb)
    train_data = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.num_workers)
    val_dataset = Dataset(args.dataset_path,
                          start=args.split,
                          end=1.0,
                          ds_rotate=args.ds_rotate,
                          random_rotate=True,
                          random_zoom=True,
                          include_depth=args.use_depth,
                          include_rgb=args.use_rgb)
    val_data = torch.utils.data.DataLoader(val_dataset,
                                           batch_size=1,
                                           shuffle=False,
                                           num_workers=args.num_workers)
    logging.info('Done')
    # Load the network
    logging.info('Loading Network...')
    # Channel count follows the chosen input modalities (depth and/or RGB).
    input_channels = 1 * args.use_depth + 3 * args.use_rgb
    ggcnn = get_network(args.network)
    # net = ggcnn(input_channels=input_channels)
    print(torch.cuda.is_available())
    print(torch.cuda.device_count())
    device = torch.device("cuda:0")
    # net = torch.load("./ggcnn_weights_cornell/ggcnn_epoch_23_cornell",map_location=device)
    # net = torch.load("output/models2/cnn3/epoch_50_iou_0.49",map_location=device)
    # NOTE(review): hard-coded checkpoint — should come from a CLI argument.
    net = torch.load("output/models2/211209_2216_/epoch_49_iou_0.22",
                     map_location=device)
    # net = net.to(device)
    optimizer = optim.Adam(net.parameters())
    logging.info('Done')
    # Print model architecture: once to stdout, once into arch.txt
    # (summary() prints, so stdout is temporarily redirected to the file).
    summary(net, (input_channels, 300, 300))
    f = open(os.path.join(save_folder, 'arch.txt'), 'w')
    sys.stdout = f
    summary(net, (input_channels, 300, 300))
    sys.stdout = sys.__stdout__
    f.close()
    # torch.load(os.path.join(save_folder,"epoch_10_iou_0.00_statedict.pt"))
    best_iou = 0.0
    # Note: runs args.epochs + 1 iterations (epoch 0 .. args.epochs).
    for epoch in range(args.epochs + 1):
        logging.info('Beginning Epoch {:02d}'.format(epoch))
        train_results = train(epoch, net, device, train_data, optimizer,
                              args.batches_per_epoch, vis=args.vis)
        # Log training losses to tensorboard
        tb.add_scalar('loss/train_loss', train_results['loss'], epoch)
        for n, l in train_results['losses'].items():
            tb.add_scalar('train_loss/' + n, l, epoch)
        # Run Validation
        logging.info('Validating...')
        test_results = validate(net, device, val_data, args.val_batches)
        logging.info('%d/%d = %f' %
                     (test_results['correct'],
                      test_results['correct'] + test_results['failed'],
                      test_results['correct'] /
                      (test_results['correct'] + test_results['failed'])))
        # Log validation results to tensorboard
        tb.add_scalar(
            'loss/IOU', test_results['correct'] /
            (test_results['correct'] + test_results['failed']), epoch)
        tb.add_scalar('loss/val_loss', test_results['loss'], epoch)
        for n, l in test_results['losses'].items():
            tb.add_scalar('val_loss/' + n, l, epoch)
        # Save best performing network (plus snapshots at epoch 0 and every
        # 10th epoch); both the pickled model and a state dict are written.
        iou = test_results['correct'] / (test_results['correct'] +
                                         test_results['failed'])
        if iou > best_iou or epoch == 0 or (epoch % 10) == 0:
            torch.save(
                net,
                os.path.join(save_folder,
                             'epoch_%02d_iou_%0.2f' % (epoch, iou)))
            torch.save(
                net.state_dict(),
                os.path.join(
                    save_folder,
                    'epoch_%02d_iou_%0.2f_statedict.pt' % (epoch, iou)))
        if iou > best_iou:
            best_iou = iou
def run():
    """Train a grasp network with an index-sampler train/validation split.

    Parses CLI args, creates the output folder (dumping the args as JSON),
    configures file+console logging, splits one dataset instance into
    train/val via ``SubsetRandomSampler``, builds the network and optimizer
    (adam or sgd), then trains for ``args.epochs`` epochs, logging losses
    and IoU to tensorboard and saving model snapshots.
    """
    args = parse_args()

    # Set-up output directories
    dt = datetime.datetime.now().strftime('%y%m%d_%H%M')
    net_desc = '{}_{}'.format(dt, '_'.join(args.description.split()))
    save_folder = os.path.join(args.logdir, net_desc)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    tb = tensorboardX.SummaryWriter(save_folder)

    # Save commandline args alongside the run for reproducibility.
    if args is not None:
        params_path = os.path.join(save_folder, 'commandline_args.json')
        with open(params_path, 'w') as f:
            json.dump(vars(args), f)

    # Initialize logging: verbose format to file, compact format to console.
    logging.root.handlers = []
    logging.basicConfig(
        level=logging.INFO,
        filename="{0}/{1}.log".format(save_folder, 'log'),
        format=
        '[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s',
        datefmt='%H:%M:%S')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    # Get the compute device
    device = get_device(args.force_cpu)

    # Load Dataset
    logging.info('Loading {} Dataset...'.format(args.dataset.title()))
    Dataset = get_dataset(args.dataset)
    dataset = Dataset(args.dataset_path,
                      ds_rotate=args.ds_rotate,
                      random_rotate=True,
                      random_zoom=True,
                      include_depth=args.use_depth,
                      include_rgb=args.use_rgb)
    logging.info('Dataset size is {}'.format(dataset.length))

    # Creating data indices for training and validation splits; indices are
    # only shuffled when --ds-shuffle is set (seeded for reproducibility).
    indices = list(range(dataset.length))
    split = int(np.floor(args.split * dataset.length))
    if args.ds_shuffle:
        np.random.seed(args.random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[:split], indices[split:]
    logging.info('Training size: {}'.format(len(train_indices)))
    logging.info('Validation size: {}'.format(len(val_indices)))

    # Creating data samplers and loaders (both loaders share one dataset).
    train_sampler = torch.utils.data.sampler.SubsetRandomSampler(train_indices)
    val_sampler = torch.utils.data.sampler.SubsetRandomSampler(val_indices)
    train_data = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             num_workers=args.num_workers,
                                             sampler=train_sampler)
    val_data = torch.utils.data.DataLoader(dataset,
                                           batch_size=1,
                                           num_workers=args.num_workers,
                                           sampler=val_sampler)
    logging.info('Done')

    # Load the network
    logging.info('Loading Network...')
    # Channel count follows the chosen input modalities (depth and/or RGB).
    input_channels = 1 * args.use_depth + 3 * args.use_rgb
    network = get_network(args.network)
    net = network(input_channels=input_channels,
                  dropout=args.use_dropout,
                  prob=args.dropout_prob,
                  channel_size=args.channel_size)
    net = net.to(device)
    logging.info('Done')

    if args.optim.lower() == 'adam':
        optimizer = optim.Adam(net.parameters())
    elif args.optim.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    else:
        raise NotImplementedError('Optimizer {} is not implemented'.format(
            args.optim))

    # Print model architecture: once to stdout, once into arch.txt
    # (summary() prints, so stdout is temporarily redirected to the file).
    summary(net, (input_channels, 224, 224))
    f = open(os.path.join(save_folder, 'arch.txt'), 'w')
    sys.stdout = f
    summary(net, (input_channels, 224, 224))
    sys.stdout = sys.__stdout__
    f.close()

    best_iou = 0.0
    for epoch in range(args.epochs):
        logging.info('Beginning Epoch {:02d}'.format(epoch))
        train_results = train(epoch,
                              net,
                              device,
                              train_data,
                              optimizer,
                              args.batches_per_epoch,
                              vis=args.vis)

        # Log training losses to tensorboard
        tb.add_scalar('loss/train_loss', train_results['loss'], epoch)
        for n, l in train_results['losses'].items():
            tb.add_scalar('train_loss/' + n, l, epoch)

        # Run Validation
        logging.info('Validating...')
        test_results = validate(net, device, val_data)
        logging.info('%d/%d = %f' %
                     (test_results['correct'],
                      test_results['correct'] + test_results['failed'],
                      test_results['correct'] /
                      (test_results['correct'] + test_results['failed'])))

        # Log validation results to tensorboard
        tb.add_scalar(
            'loss/IOU', test_results['correct'] /
            (test_results['correct'] + test_results['failed']), epoch)
        tb.add_scalar('loss/val_loss', test_results['loss'], epoch)
        for n, l in test_results['losses'].items():
            tb.add_scalar('val_loss/' + n, l, epoch)

        # Save on improvement, plus periodic snapshots (epoch 0, every 10th).
        iou = test_results['correct'] / (test_results['correct'] +
                                         test_results['failed'])
        if iou > best_iou or epoch == 0 or (epoch % 10) == 0:
            torch.save(
                net,
                os.path.join(save_folder,
                             'epoch_%02d_iou_%0.2f' % (epoch, iou)))
        # BUGFIX: advance best_iou only on a genuine improvement. Previously
        # it was assigned inside the save branch, so a periodic snapshot
        # (epoch 0 / every 10th epoch) with a *lower* IoU reset the best
        # score downward and later inferior models were treated as "best".
        if iou > best_iou:
            best_iou = iou
def run(args, save_folder, log_folder):
    """Train a GG-CNN grasp network.

    Args:
        args: parsed CLI namespace (dataset, network and training options).
        save_folder: directory for model snapshots and ``arch.txt``.
        log_folder: directory for tensorboard event files.

    Builds train/val loaders from a percentage split of the dataset, trains
    for ``args.epochs`` epochs, logs losses/IoU to tensorboard, and saves
    both pickled models and state dicts on improvement or periodically.
    """
    tb = tensorboardX.SummaryWriter(log_folder)

    # Load Dataset
    logging.info('Loading {} Dataset...'.format(args.dataset.title()))
    Dataset = get_dataset(args.dataset)

    # Train on [0, split), validate on [split, 1] of the dataset.
    train_dataset = Dataset(args.dataset_path, start=0.0, end=args.split,
                            ds_rotate=args.ds_rotate,
                            random_rotate=True, random_zoom=True,
                            include_depth=args.use_depth,
                            include_rgb=args.use_rgb)
    train_data = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers
    )
    val_dataset = Dataset(args.dataset_path, start=args.split, end=1.0,
                          ds_rotate=args.ds_rotate,
                          random_rotate=True, random_zoom=True,
                          include_depth=args.use_depth,
                          include_rgb=args.use_rgb)
    val_data = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=args.num_workers
    )
    logging.info('Done')

    # Load the network
    logging.info('Loading Network...')
    # Channel count follows the chosen input modalities (depth and/or RGB).
    input_channels = 1*args.use_depth + 3*args.use_rgb
    ggcnn = get_network(args.network)
    net = ggcnn(input_channels=input_channels)
    device = torch.device("cuda:0")
    net = net.to(device)
    optimizer = optim.Adam(net.parameters())
    logging.info('Done')

    # Print model architecture: once to stdout, once into arch.txt
    # (summary() prints, so stdout is temporarily redirected to the file).
    summary(net, (input_channels, 300, 300))
    f = open(os.path.join(save_folder, 'arch.txt'), 'w')
    sys.stdout = f
    summary(net, (input_channels, 300, 300))
    sys.stdout = sys.__stdout__
    f.close()

    best_iou = 0.0
    for epoch in range(args.epochs):
        logging.info('Beginning Epoch {:02d}'.format(epoch))
        train_results = train(epoch, net, device, train_data, optimizer,
                              args.batches_per_epoch, vis=args.vis)

        # Log training losses to tensorboard
        tb.add_scalar('loss/train_loss', train_results['loss'], epoch)
        for n, l in train_results['losses'].items():
            tb.add_scalar('train_loss/' + n, l, epoch)

        # Run Validation
        logging.info('Validating...')
        test_results = validate(net, device, val_data, args.val_batches)
        logging.info('%d/%d = %f' % (test_results['correct'],
                                     test_results['correct'] + test_results['failed'],
                                     test_results['correct']/(test_results['correct']+test_results['failed'])))

        # Log validation results to tensorboard
        tb.add_scalar('loss/IOU',
                      test_results['correct'] /
                      (test_results['correct'] + test_results['failed']),
                      epoch)
        tb.add_scalar('loss/val_loss', test_results['loss'], epoch)
        for n, l in test_results['losses'].items():
            tb.add_scalar('val_loss/' + n, l, epoch)

        # Save on improvement, plus periodic snapshots (epoch 0, every 10th);
        # both the pickled model and a state dict are written.
        iou = test_results['correct'] / (test_results['correct'] +
                                         test_results['failed'])
        if iou > best_iou or epoch == 0 or (epoch % 10) == 0:
            torch.save(net, os.path.join(save_folder,
                                         'epoch_%02d_iou_%0.2f' % (epoch, iou)))
            torch.save(net.state_dict(), os.path.join(
                save_folder,
                'epoch_%02d_iou_%0.2f_statedict.pt' % (epoch, iou)))
        # BUGFIX: advance best_iou only on a genuine improvement. Previously
        # it was assigned inside the save branch, so a periodic snapshot with
        # a *lower* IoU reset the best score downward.
        if iou > best_iou:
            best_iou = iou
time = datetime.now().strftime('%Y%m%d-%H%M%S') temples = [str(x) for x in settings.TEMPLES] temples = ''.join(temples) resolution = f'{settings.IMG_WIDTH}x{settings.IMG_HEIGHT}' log_name = f'\\{settings.MODEL}\\{settings.DATASET}\\' log_name += f'{settings.NORM_TYPE}_norm\\t{temples}-{resolution}-buffer{settings.BUFFER_SIZE}-' + \ f'batch{settings.BATCH_SIZE}-e{settings.EPOCHS}\\{time}' log_dir = os.path.abspath(settings.LOG_DIR) + log_name # --- dataset --- dataset_dir = os.path.abspath(settings.DATASET_DIR) train, val = data.get_dataset( dataset_dir, settings.DATASET, settings.TEMPLES, settings.SPLIT, settings.BATCH_SIZE, settings.BUFFER_SIZE, ) # for testing purposes # x = y = tf.random.normal((5, settings.IMG_HEIGHT, settings.IMG_WIDTH, 3)) # x = tf.data.Dataset.from_tensor_slices(x).batch(1) # y = tf.data.Dataset.from_tensor_slices(y).batch(1) # train = val = tf.data.Dataset.zip((x, y)) # --- model --- model = builder.get_model( settings.MODEL, settings.DATASET, (settings.IMG_HEIGHT, settings.IMG_WIDTH, 3),
raise ValueError( '--jacquard-output can not be used with data augmentation.') return args if __name__ == '__main__': args = parse_args() # Load Network net = torch.load(args.network) device = torch.device("cuda:0") # Load Dataset logging.info('Loading {} Dataset...'.format(args.dataset.title())) Dataset = get_dataset(args.dataset) test_dataset = Dataset(args.dataset_path, start=args.split, end=1.0, ds_rotate=args.ds_rotate, random_rotate=args.augment, random_zoom=args.augment, include_depth=args.use_depth, include_rgb=args.use_rgb) test_data = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=args.num_workers) logging.info('Done') results = {'correct': 0, 'failed': 0}
def test_whole_net(args):
    """Assemble the fully grafted student network and report test accuracy.

    Rebuilds the same block/adaption structure as training, loads the saved
    whole-network checkpoint into the wrapped blocks, prepends the student's
    first three (teacher-initialized) feature layers, and evaluates.

    NOTE(review): loads ``...graft-net-...`` while ``graft_block`` saves
    ``...graft-block-...`` — presumably a separate whole-net fine-tuning step
    writes this file; confirm.
    NOTE(review): the adaption convs here are freshly constructed; their
    weights are expected to come from the loaded state dict.
    """
    cfg_t = cfgs['vgg16']
    cfg_s = cfgs['vgg16-graft']
    cfg_blocks_t = split_block(cfg_t)
    cfg_blocks_s = split_block(cfg_s)
    num_block = len(block_graft_ids)
    # ---------------------- Network ----------------------
    teacher = vgg_stock(cfg_t, args.dataset, args.num_class)
    params_t = torch.load(args.ckpt)
    teacher.cuda().eval()
    teacher.load_state_dict(params_t)
    # 1x1 channel-adaption convs (teacher->student and student->teacher),
    # mirroring the structure used during grafting.
    adaptions_t2s = [
        nn.Conv2d(cfg_blocks_t[block_graft_ids[i]][-2],
                  cfg_blocks_s[block_graft_ids[i]][-2],
                  kernel_size=1).cuda() for i in range(0, num_block - 1)
    ]
    adaptions_s2t = [
        nn.Conv2d(cfg_blocks_s[block_graft_ids[i]][-2],
                  cfg_blocks_t[block_graft_ids[i]][-2],
                  kernel_size=1).cuda() for i in range(0, num_block - 1)
    ]
    cfg_s = cfgs['vgg16-graft']
    student = vgg_bw(cfg_s, True, args.dataset, args.num_class)
    student.cuda()
    # Seed the student's first three feature layers from the teacher.
    params_s = {}
    for key in params_t.keys():
        key_split = key.split('.')
        if key_split[0] == 'features' and \
           key_split[1] in ['0', '1', '2']:
            params_s[key] = params_t[key]
    student.load_state_dict(params_s, strict=False)
    # Student units: one per graft point, plus the classifier head.
    blocks_s = [student.features[i] for i in block_graft_ids[:-1]]
    blocks_s += [nn.Sequential(nn.Flatten().cuda(), student.classifier)]
    # Wrap every block (with its adaptions) and chain them into one network.
    blocks = []
    for block_id in range(num_block):
        blocks.append(
            warp_block(blocks_s, block_id, adaptions_t2s,
                       adaptions_s2t).cuda())
    block = nn.Sequential(*blocks)
    block.load_state_dict(
        torch.load('ckpt/student/vgg16-student-graft-net-{}-{}perclass.pth'
                   .format(args.dataset, args.num_per_class))
    )
    test_loader = DataLoader(get_dataset(args, train_flag=False),
                             batch_size=args.batch_size,
                             num_workers=4,
                             shuffle=False)
    # Full network = first three student feature layers + grafted chain.
    block = nn.Sequential(student.features[:3], block)
    print('Test Accuracy: ', test(block, test_loader))
def main_worker(args, ml_logger):
    """Train (optionally quantized) image classifier with kurtosis options.

    Builds the model (custom resnet/inception or a stock torchvision arch),
    optionally resumes from a checkpoint, sets up data loaders and SGD with a
    step LR schedule, optionally wraps layers with a ``ModelQuantizer``,
    registers weight tensors for kurtosis regularization, then runs the
    epoch loop with validation and checkpointing.

    Side effects: mutates the module-level ``best_acc1``.
    """
    global best_acc1
    datatime_str = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    suf_name = "_" + args.experiment
    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))
    if args.log_stats:
        from utils.stats_trucker import StatsTrucker as ST
        ST("W{}A{}".format(args.bit_weights, args.bit_act))
    # ---- Model construction -------------------------------------------------
    if 'resnet' in args.arch and args.custom_resnet:
        # pdb.set_trace()
        model = custom_resnet(arch=args.arch,
                              pretrained=args.pretrained,
                              depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)
    else:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=args.pretrained)
    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True
    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            # best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            # Strip DataParallel-style prefixes so keys match this model.
            checkpoint['state_dict'] = {
                normalize_module_name(k): v
                for k, v in checkpoint['state_dict'].items()
            }
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features,
                                                   args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)
    # ---- Data ---------------------------------------------------------------
    default_transform = {
        'train': get_transform(args.dataset, augment=True),
        'eval': get_transform(args.dataset, augment=False)
    }
    val_data = get_dataset(args.dataset, 'val', default_transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.workers,
                                             pin_memory=True)
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    train_data = get_dataset(args.dataset, 'train',
                             default_transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               drop_last=True)
    # TODO: replace this call by initialization on small subset of training data
    # TODO: enable for activations
    # validate(val_loader, model, criterion, args, device)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    lr_scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)
    # pdb.set_trace()
    # ---- Quantization setup -------------------------------------------------
    mq = None
    if args.quantize:
        if args.bn_folding:
            print(
                "Applying batch-norm folding ahead of post-training quantization"
            )
            from utils.absorb_bn import search_absorbe_bn
            search_absorbe_bn(model)
        all_convs = [
            n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
        ]
        # all_convs = [l for l in all_convs if 'downsample' not in l]
        all_relu = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU)
        ]
        all_relu6 = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)
        ]
        # Skip first/last activations and the first conv (kept full precision).
        layers = all_relu[1:-1] + all_relu6[1:-1] + all_convs[1:]
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapper,
            nn.ReLU6: ActivationModuleWrapper,
            nn.Conv2d: ParameterModuleWrapper
        }
        mq = ModelQuantizer(
            model, args, layers, replacement_factory,
            OptimizerBridge(optimizer,
                            settings={
                                'algo': 'SGD',
                                'dataset': args.dataset
                            }))
        if args.resume:
            # Load quantization parameters from state dict.
            # NOTE(review): 'checkpoint' only exists when the resume file was
            # found above — an invalid --resume path would raise here; confirm.
            mq.load_state_dict(checkpoint['state_dict'])
        mq.log_quantizer_state(ml_logger, -1)
        # NOTE(review): collapsed source is ambiguous here; model_freeze is
        # read as part of the quantize branch since mq is None otherwise.
        if args.model_freeze:
            mq.freeze()
    # pdb.set_trace()
    if args.evaluate:
        acc = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc)
        return
    # evaluate on validation set (baseline before training)
    acc1 = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc1, -1)
    # evaluate with k-means quantization
    # if args.model_freeze:
    #     with mq.disable():
    #         acc1_nq = validate(val_loader, model, criterion, args, device)
    #     ml_logger.log_metric('Val Acc1 fp32', acc1_nq, -1)
    # pdb.set_trace()
    # ---- Kurtosis regularization on weights tensors -------------------------
    weight_to_hook = {}
    if args.w_kurtosis:
        if args.weight_name[0] == 'all':
            all_convs = [
                n.replace(".wrapped_module", "") + '.weight'
                for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
            ]
            weight_name = all_convs[1:]
            if args.remove_weight_name:
                for rm_name in args.remove_weight_name:
                    weight_name.remove(rm_name)
        else:
            weight_name = args.weight_name
        for name in weight_name:
            # pdb.set_trace()
            curr_param = fine_weight_tensor_by_name(model, name)
            # if not curr_param:
            #     name = 'float_' + name  # QAT name
            #     curr_param = fine_weight_tensor_by_name(self.model, name)
            # if curr_param is not None:
            weight_to_hook[name] = curr_param
    # ---- Epoch loop ---------------------------------------------------------
    for epoch in range(0, args.epochs):
        # train for one epoch
        print('Timestamp Start epoch: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        train(train_loader, model, criterion, optimizer, epoch, args, device,
              ml_logger, val_loader, mq, weight_to_hook)
        print('Timestamp End epoch: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        if not args.lr_freeze:
            lr_scheduler.step()
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc1, step='auto')
        # evaluate with k-means quantization
        # if args.model_freeze:
        #     with mq.quantization_method('kmeans'):
        #         acc1_kmeans = validate(val_loader, model, criterion, args, device)
        #         ml_logger.log_metric('Val Acc1 kmeans', acc1_kmeans, epoch)
        #     with mq.disable():
        #         acc1_nq = validate(val_loader, model, criterion, args, device)
        #         ml_logger.log_metric('Val Acc1 fp32', acc1_nq, step='auto')
        if args.quantize:
            mq.log_quantizer_state(ml_logger, epoch)
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                # Unwrap DataParallel before saving so keys stay canonical.
                'state_dict': model.state_dict()
                if len(args.gpu_ids) == 1 else model.module.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            datatime_str=datatime_str,
            suf_name=suf_name)
else: print('==> Load pretrained model form', args.pretrained, '...') pretrained_model = torch.load(args.pretrained) # best_acc = pretrained_model['best_acc'] model.load_state_dict(pretrained_model['state_dict']) # Setup dataset - transformation, dataloader default_transform = { 'train': get_transform(args.dataset, input_size=args.input_size, augment=True), 'eval': get_transform(args.dataset, input_size=args.input_size, augment=False) } transform = getattr(model, 'input_transform', default_transform) test_data = get_dataset(args.dataset, 'val', transform['eval']) testloader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) criterion = nn.CrossEntropyLoss() model.to(device) #.half() # uncomment for FP16 #model = torch.nn.DataParallel(model) [test_acc, test_loss] = test() print("Testing accuracy: ", test_acc) #sparsity_validate(model)
if args.mvm: model = model_mvm model.to(device) #.half() # uncomment for FP16 model = torch.nn.DataParallel(model) # Setup dataset - transformation, dataloader default_transform = { 'train': get_transform(args.dataset, input_size=args.input_size, augment=True), 'eval': get_transform(args.dataset, input_size=args.input_size, augment=False) } transform = getattr(model, 'input_transform', default_transform) train_data = get_dataset(args.dataset, 'train', transform['train']) trainloader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True, worker_init_fn=_init_fn) test_data = get_dataset(args.dataset, 'val', transform['eval']) testloader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, worker_init_fn=_init_fn)
def __init__(self, arch, use_custom_resnet, use_custom_inception, pretrained,
             dataset, gpu_ids, datapath, batch_size, shuffle, workers,
             print_freq, cal_batch_size, cal_set_size, args):
    """Build a model-evaluation harness: model, criterion and data loaders.

    Constructs the requested architecture (custom resnet/inception or a stock
    torchvision model), optionally resumes weights from ``args.resume``,
    wraps in DataParallel for multi-GPU, optionally folds batch norms, and
    prepares validation and calibration loaders over the 'val' split.

    NOTE(review): ``use_custom_inception`` is used only locally and never
    stored on ``self``, unlike ``use_custom_resnet`` — confirm intentional.
    """
    self.arch = arch
    self.use_custom_resnet = use_custom_resnet
    self.pretrained = pretrained
    self.dataset = dataset
    self.gpu_ids = gpu_ids
    self.datapath = datapath
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.workers = workers
    self.print_freq = print_freq
    self.cal_batch_size = cal_batch_size
    self.cal_set_size = cal_set_size  # TODO: pass it as cmd line argument
    # create model
    if 'resnet' in arch and use_custom_resnet:
        model = custom_resnet(arch=arch,
                              pretrained=pretrained,
                              depth=arch2depth(arch),
                              dataset=dataset)
    elif 'inception_v3' in arch and use_custom_inception:
        model = custom_inception(pretrained=pretrained)
    else:
        print("=> using pre-trained model '{}'".format(arch))
        model = models.__dict__[arch](pretrained=pretrained)
    self.device = torch.device('cuda:{}'.format(gpu_ids[0]))
    torch.cuda.set_device(gpu_ids[0])
    model = model.to(self.device)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, self.device)
            args.start_epoch = checkpoint['epoch']
            # Strip DataParallel-style prefixes so keys match this model.
            checkpoint['state_dict'] = {
                normalize_module_name(k): v
                for k, v in checkpoint['state_dict'].items()
            }
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    if len(gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if arch.startswith('alexnet') or arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features, gpu_ids)
        else:
            model = torch.nn.DataParallel(model, gpu_ids)
    self.model = model
    if args.bn_folding:
        print(
            "Applying batch-norm folding ahead of post-training quantization"
        )
        from utils.absorb_bn import search_absorbe_bn
        search_absorbe_bn(model)
    # define loss function (criterion) and optimizer
    self.criterion = torch.nn.CrossEntropyLoss().to(self.device)
    # Inception expects 299x299 inputs; other archs use the dataset default.
    val_data = get_dataset(
        dataset, 'val',
        get_transform(dataset,
                      augment=False,
                      scale_size=299 if 'inception' in arch else None,
                      input_size=299 if 'inception' in arch else None),
        datasets_path=datapath)
    self.val_loader = torch.utils.data.DataLoader(val_data,
                                                  batch_size=batch_size,
                                                  shuffle=shuffle,
                                                  num_workers=workers,
                                                  pin_memory=True)
    # Calibration loader shares the validation data, with its own batch size.
    self.cal_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=self.cal_batch_size,
        shuffle=shuffle,
        num_workers=workers,
        pin_memory=True)
def main():
    """Train the DFCNN CTC speech model with MindSpore.

    Reads all settings from the module-level ``config`` dict, optionally
    resumes from a previous run's config log, builds the network, optimizer,
    and callback stack (checkpointing, logging, step evaluation, loss
    monitoring, optional MindInsight summary), then runs ``model.train``.
    """
    set_seed(1)
    date = time.strftime("%Y%m%d%H%M%S", time.localtime())
    print(f'* Preparing to train model {date}')
    # ************** configuration ****************
    # - training setting
    resume = config['resume']
    if config['mode'] == 'PYNATIVE':
        mode = context.PYNATIVE_MODE
    else:
        mode = context.GRAPH_MODE
    device = config['device']
    device_id = config['device_id']
    dataset_sink_mode = config['dataset_sink_mode']
    # use in dataset
    div = 8
    # setting bias and padding
    if resume:
        # When resuming, architecture-affecting options come from the saved
        # run's config so the rebuilt net matches the checkpoint.
        print('* Resuming model...')
        resume_config_log = config['resume_config_log']
        resume_config = get_eval_config(resume_config_log)
        if 'best_ckpt' in resume_config.keys():
            resume_model_path = resume_config['best_ckpt']
        else:
            resume_model_path = resume_config['latest_model']
            print('* [WARNING] Not using the best model, but latest saved model instead.')
        has_bias = resume_config['has_bias']
        use_dropout = resume_config['use_dropout']
        pad_mode = resume_config['pad_mode']
        if pad_mode == 'pad':
            padding = resume_config['padding']
        elif pad_mode == 'same':
            padding = 0
        else:
            raise ValueError(f"invalid pad mode: {pad_mode}!")
        best_acc = resume_config['best_acc']
        best_ckpt = resume_config['best_ckpt']
        print('* The best accuracy in dev dataset for the current resumed model is {:.2f}%'.format(best_acc * 100))
    else:
        has_bias = config['has_bias']
        use_dropout = config['use_dropout']
        pad_mode = config['pad_mode']
        if pad_mode == 'pad':
            padding = config['padding']
        elif pad_mode == 'same':
            padding = 0
        else:
            raise ValueError(f"invalid pad mode: {pad_mode}!")
    # hyper-parameters
    if resume:
        batch_size = resume_config['batch_size']
        opt_type = resume_config['opt']
        use_dynamic_lr = resume_config['use_dynamic_lr']
        warmup_step = resume_config['warmup_step']
        warmup_ratio = resume_config['warmup_ratio']
    else:
        batch_size = config['batch_size']
        opt_type = config['opt']
        use_dynamic_lr = config['use_dynamic_lr']
        warmup_step = config['warmup_step']
        warmup_ratio = config['warmup_ratio']
    test_dev_batch_size = config['test_dev_batch_size']
    learning_rate = float(config['learning_rate'])
    epochs = config['epochs']
    loss_scale = config['loss_scale']
    # configuration of saving model checkpoint
    save_checkpoint_steps = config['save_checkpoint_steps']
    keep_checkpoint_max = config['keep_checkpoint_max']
    prefix = config['prefix'] + '_' + date
    model_dir = config['model_dir']
    # loss monitor
    loss_monitor_step = config['loss_monitor_step']
    # whether to use mindInsight summary
    use_summary = config['use_summary']
    # step_eval
    use_step_eval = config['use_step_eval']
    eval_step = config['eval_step']
    eval_epoch = config['eval_epoch']
    patience = config['patience']
    # eval in steps or epochs; eval_step == -1 means evaluate per-epoch.
    step_eval = True
    if eval_step == -1:
        step_eval = False
    # ************** end of configuration **************
    if device == 'GPU':
        context.set_context(mode=mode, device_target=device, device_id=device_id)
    elif device == 'Ascend':
        # ModelArts/OBS environment: sync data and output dirs via moxing.
        import moxing as mox
        from utils.const import DATA_PATH, MODEL_PATH, BEST_MODEL_PATH, LOG_PATH
        obs_datapath = config['obs_datapath']
        obs_saved_model = config['obs_saved_model']
        obs_best_model = config['obs_best_model']
        obs_log = config['obs_log']
        mox.file.copy_parallel(obs_datapath, DATA_PATH)
        mox.file.copy_parallel(MODEL_PATH, obs_saved_model)
        mox.file.copy_parallel(BEST_MODEL_PATH, obs_best_model)
        mox.file.copy_parallel(LOG_PATH, obs_log)
        context.set_context(mode=mode, device_target=device)
        # MindInsight summary is disabled on Ascend/ModelArts runs.
        use_summary = False
    # callbacks function
    callbacks = []
    # data
    train_loader, idx2label, label2idx = get_dataset(batch_size=batch_size,
                                                     phase='train',
                                                     test_dev_batch_size=test_dev_batch_size,
                                                     div=div,
                                                     num_parallel_workers=4)
    # eval_step == 0 means "once per epoch", expressed in steps.
    if eval_step == 0:
        eval_step = train_loader.get_dataset_size()
    # network
    net = DFCNN(num_classes=len(label2idx),
                padding=padding,
                pad_mode=pad_mode,
                has_bias=has_bias,
                use_dropout=use_dropout)
    # Criterion
    criterion = CTCLoss()
    # resume
    if resume:
        print("* Loading parameters...")
        param_dict = load_checkpoint(resume_model_path)
        # load the parameter into net
        load_param_into_net(net, param_dict)
        print(f'* Parameters loading from {resume_model_path} succeeded!')
    net.set_train(True)
    net.set_grad(True)
    # lr schedule
    if use_dynamic_lr:
        dataset_size = train_loader.get_dataset_size()
        learning_rate = Tensor(dynamic_lr(base_lr=learning_rate,
                                          warmup_step=warmup_step,
                                          warmup_ratio=warmup_ratio,
                                          epochs=epochs,
                                          steps_per_epoch=dataset_size),
                               mstype.float32)
        print('* Using dynamic learning rate, which will be set up as :', learning_rate.asnumpy())
    # optim
    if opt_type == 'adam':
        # NOTE(review): eps=10e-8 is 1e-7, not the usual Adam default 1e-8 —
        # confirm this value is intentional.
        opt = nn.Adam(net.trainable_params(),
                      learning_rate=learning_rate,
                      beta1=0.9,
                      beta2=0.999,
                      weight_decay=0.0,
                      eps=10e-8)
    elif opt_type == 'rms':
        opt = nn.RMSProp(params=net.trainable_params(),
                         centered=True,
                         learning_rate=learning_rate,
                         momentum=0.9,
                         loss_scale=loss_scale)
    elif opt_type == 'sgd':
        opt = nn.SGD(params=net.trainable_params(), learning_rate=learning_rate)
    else:
        raise ValueError(f"optimizer: {opt_type} is not supported for now!")
    if resume:
        # load the parameter into optimizer
        load_param_into_net(opt, param_dict)
    # save_model
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix=prefix, directory=model_dir, config=config_ck)
    # logger
    the_logger = logger(config, date)
    log = Logging(logger=the_logger, model_ckpt=ckpt_cb)
    callbacks.append(ckpt_cb)
    callbacks.append(log)
    # Wrap net with the CTC loss and a loss-scaled train-one-step cell.
    net = WithLossCell(net, criterion)
    scaling_sens = Tensor(np.full((1), loss_scale), dtype=mstype.float32)
    net = DFCNNCTCTrainOneStepWithLossScaleCell(net, opt, scaling_sens)
    net.set_train(True)
    model = Model(net)
    if use_step_eval:
        # step evaluation (also handles early stopping via `patience`,
        # presumably — verify against StepAccInfo's implementation)
        step_eval = StepAccInfo(model=model,
                                name=prefix,
                                div=div,
                                test_dev_batch_size=test_dev_batch_size,
                                step_eval=step_eval,
                                eval_step=eval_step,
                                eval_epoch=eval_epoch,
                                logger=the_logger,
                                patience=patience,
                                dataset_size=train_loader.get_dataset_size())
        callbacks.append(step_eval)
    # loss monitor
    loss_monitor = LossMonitor(loss_monitor_step)
    callbacks.append(loss_monitor)
    if use_summary:
        summary_dir = os.path.join(SUMMARY_DIR, date)
        if not os.path.exists(summary_dir):
            os.mkdir(summary_dir)
        # mindInsight
        summary_collector = SummaryCollector(summary_dir=summary_dir,
                                             collect_freq=1,
                                             max_file_size=4 * 1024 ** 3)
        callbacks.append(summary_collector)
    if resume:
        # Seed the logger with the resumed run's best accuracy/checkpoint.
        the_logger.update_acc_ckpt(best_acc, best_ckpt)
    print(f'* Start training...')
    model.train(epochs,
                train_loader,
                callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)
def graft_net(args):
    """Fine-tune progressively longer prefixes of grafted student blocks.

    Loads the pre-grafted per-block student weights saved by graft_block,
    then for each prefix length trains the composed block sequence against
    the teacher and logs/saves the best-performing result.

    Returns:
        Best accuracy observed for the final (longest) prefix.
    """
    global logger_net
    logger_net = Logger('log/graft_net_{}_{}_{}perclass.txt'.\
        format(args.dataset,
               time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()),
               args.num_per_class))
    # ---------------------- Datasets ----------------------
    # Few-shot training set: args.num_per_class images per class.
    if args.dataset == 'CIFAR10':
        train_loader = DataLoader(CIFAR10Few(args.data_path,
                                             args.num_per_class,
                                             transform=get_transformer(
                                                 args.dataset, cropsize=32,
                                                 crop_padding=4, hflip=True)),
                                  batch_size=args.batch_size,
                                  num_workers=4,
                                  shuffle=True)
    elif args.dataset == 'CIFAR100':
        train_loader = DataLoader(CIFAR100Few(args.data_path,
                                              args.num_per_class,
                                              transform=get_transformer(
                                                  args.dataset, cropsize=32,
                                                  crop_padding=4, hflip=True)),
                                  batch_size=args.batch_size,
                                  num_workers=4,
                                  shuffle=True)
    test_loader = DataLoader(get_dataset(args, train_flag=False),
                             batch_size=args.batch_size,
                             num_workers=4,
                             shuffle=False)
    cfg_t = cfgs['vgg16']
    cfg_s = cfgs['vgg16-graft']
    cfg_blocks_t = split_block(cfg_t)
    cfg_blocks_s = split_block(cfg_s)
    num_block = len(block_graft_ids)
    # ---------------------- Adaption ----------------------
    # 1x1 convs matching channel counts between teacher and student blocks.
    adaptions_t2s = [
        nn.Conv2d(cfg_blocks_t[block_graft_ids[i]][-2],
                  cfg_blocks_s[block_graft_ids[i]][-2],
                  kernel_size=1).cuda() for i in range(0, num_block - 1)
    ]
    adaptions_s2t = [
        nn.Conv2d(cfg_blocks_s[block_graft_ids[i]][-2],
                  cfg_blocks_t[block_graft_ids[i]][-2],
                  kernel_size=1).cuda() for i in range(0, num_block - 1)
    ]
    # ---------------------- Teacher ----------------------
    teacher = vgg_stock(cfg_t, args.dataset, args.num_class)
    params_t = torch.load(args.ckpt)
    teacher.cuda().eval()
    teacher.load_state_dict(params_t)
    # ---------------------- Blocks ----------------------
    # Copy the teacher's first conv stage ('features.0/1/2') into the student.
    params_s = {}
    for key in params_t.keys():
        key_split = key.split('.')
        if key_split[0] == 'features' and \
                key_split[1] in ['0', '1', '2']:
            params_s[key] = params_t[key]
    student = vgg_bw(cfg_s, True, args.dataset, args.num_class)
    student.cuda().train()
    student.load_state_dict(params_s, strict=False)
    blocks_s = [student.features[i] for i in block_graft_ids[:-1]]
    blocks_s += [nn.Sequential(nn.Flatten().cuda(), student.classifier)]
    blocks = []
    for block_id in range(num_block):
        blocks.append(
            warp_block(blocks_s, block_id, adaptions_t2s, adaptions_s2t).cuda())
    # Load per-block weights produced by the graft_block stage.
    params = torch.load('ckpt/student/vgg16-student-graft-block-{}-{}perclass.pth'.\
        format(args.dataset, args.num_per_class))
    for block_id in range(num_block):
        blocks[block_id].load_state_dict(params['block-{}'.format(block_id)])
    # Progressively graft longer prefixes: blocks[:2], blocks[:3], ...
    for i in range(num_block - 1):
        block = nn.Sequential(*blocks[:(i + 2)])
        optimizer = optim.Adam(block.parameters(), lr=0.0001)
        scion_len = sum(blocks_s_len[:(i + 2)])
        accuracy_best_block = 0.0
        params_best_save = None
        for epoch in range(args.num_epoch[i]):
            if logger_net:
                logger_net.write('Epoch', epoch)
            loss_value = train_epoch(args, teacher, block, scion_len,
                                     train_loader, optimizer)
            # NOTE(review): accuracy is measured via test(teacher, ...);
            # presumably test() routes activations through the grafted block
            # internally — verify, otherwise this measures the teacher only.
            accuracy = test(teacher, test_loader)
            if accuracy > accuracy_best_block:
                accuracy_best_block = accuracy
                params_tmp = block.cpu().state_dict()
                params_best_save = params_tmp.copy()
                block.cuda()
            # On the very last epoch of the last prefix, restore best weights.
            if epoch == (args.num_epoch[i] - 1) and \
                    i == (num_block - 2):
                block.load_state_dict(params_best_save)
            if logger_net:
                logger_net.write('Accuracy-length-{}'.format(scion_len), accuracy)
        if logger_net:
            logger_net.write('Student Best Accuracy', accuracy_best_block)
    # Save the final grafted network (placement after the loop assumed from
    # the loop-invariant filename — the final file is the same either way).
    with open('ckpt/student/vgg16-student-graft-net-{}-{}perclass.pth'\
            .format(args.dataset, args.num_per_class), 'wb') as f:
        torch.save(block.state_dict(), f)
    if logger_net:
        logger_net.close()
    return accuracy_best_block
def run():
    """Train a grasp-prediction network (GG-CNN style).

    Parses CLI args, builds train/validation splits of the chosen dataset,
    trains for ``args.epochs`` epochs, logs losses to TensorBoard, and saves
    model snapshots into a timestamped folder under ``args.outdir``.

    Side effects: creates output directories, writes ``arch.txt``, TensorBoard
    event files, and ``epoch_XX_iou_Y.YY`` model files.
    """
    args = parse_args()

    # Set-up output directories
    dt = datetime.datetime.now().strftime('%y%m%d_%H%M')
    net_desc = '{}_{}'.format(dt, '_'.join(args.description.split()))

    save_folder = os.path.join(args.outdir, net_desc)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    tb = tensorboardX.SummaryWriter(os.path.join(args.logdir, net_desc))

    # Load Dataset
    logging.info('Loading {} Dataset...'.format(args.dataset.title()))
    Dataset = get_dataset(args.dataset)

    train_dataset = Dataset(args.dataset_path, start=0.0, end=args.split,
                            ds_rotate=args.ds_rotate,
                            random_rotate=True, random_zoom=True,
                            include_depth=args.use_depth,
                            include_rgb=args.use_rgb)
    train_data = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers)
    val_dataset = Dataset(args.dataset_path, start=args.split, end=1.0,
                          ds_rotate=args.ds_rotate,
                          random_rotate=False, random_zoom=False,
                          include_depth=args.use_depth,
                          include_rgb=args.use_rgb)
    val_data = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=args.num_workers)
    logging.info('Done')

    # Load the network
    logging.info('Loading Network...')
    input_channels = 1 * args.use_depth + 3 * args.use_rgb
    ggcnn = get_network(args.network)
    net = ggcnn(input_channels=input_channels)
    device = torch.device("cpu")
    # device = torch.device("cuda:0")
    net = net.to(device)
    optimizer = optim.Adam(net.parameters())
    logging.info('Done')

    # Print model architecture to the console and capture a copy in arch.txt.
    summary(net, (input_channels, 200, 200))
    with open(os.path.join(save_folder, 'arch.txt'), 'w') as f:
        sys.stdout = f
        try:
            summary(net, (input_channels, 200, 200))
        finally:
            # FIX: restore stdout even if summary() raises; previously an
            # exception here left sys.stdout redirected to the file.
            sys.stdout = sys.__stdout__

    # The tracked metric here is validation *loss* (lower is better), despite
    # the historical 'iou' naming kept for filename compatibility.
    best_iou = 1000.0

    for epoch in range(args.epochs):
        logging.info('Beginning Epoch {:02d}'.format(epoch))
        train_results = train(epoch, net, device, train_data, optimizer,
                              args.batches_per_epoch, vis=args.vis)

        # Log training losses to tensorboard
        tb.add_scalar('loss/train_loss', train_results['loss'], epoch)
        for n, l in train_results['losses'].items():
            tb.add_scalar('train_loss/' + n, l, epoch)

        # Run Validation
        logging.info('Validating...')
        test_results = validate(net, device, val_data, args.val_batches)

        # Log validation results to tensorboard
        tb.add_scalar('loss/val_loss', test_results['loss'], epoch)
        for n, l in test_results['losses'].items():
            tb.add_scalar('val_loss/' + n, l, epoch)

        iou = test_results['loss']

        # Save when this epoch improved, plus always on epoch 0 and every
        # 10th epoch as periodic snapshots.
        if iou < best_iou or epoch == 0 or (epoch % 10) == 0:
            torch.save(
                net,
                os.path.join(save_folder,
                             'epoch_%02d_iou_%0.2f' % (epoch, iou)))

        # FIX: update the best metric only on genuine improvement. Previously
        # best_iou was overwritten by the unconditional epoch-0/every-10th
        # saves too, so it drifted away from the true best value.
        if iou < best_iou:
            best_iou = iou
def main_worker(args, ml_logger):
    """Post-training-quantization evaluation worker.

    Builds the model, optionally resumes a checkpoint, wraps layers with
    quantization wrappers when ``args.quantize`` is set, and logs validation
    top-1 accuracy to ``ml_logger``.

    Args:
        args: parsed CLI namespace (arch, gpu_ids, resume, quantize, ...).
        ml_logger: experiment logger exposing ``log_metric``.
    """
    global best_acc1
    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))
    # create model
    if 'resnet' in args.arch and args.custom_resnet:
        model = custom_resnet(arch=args.arch,
                              pretrained=args.pretrained,
                              depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)
    elif args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True
    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            # mq = ModelQuantizer(model, args)
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features, args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)
    # Inception models expect 299x299 inputs; others use the transform default.
    val_data = get_dataset(
        args.dataset, 'val',
        get_transform(args.dataset,
                      augment=False,
                      scale_size=299 if 'inception' in args.arch else None,
                      input_size=299 if 'inception' in args.arch else None),
        datasets_path=args.datapath)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=args.shuffle,
                                             num_workers=args.workers,
                                             pin_memory=True)
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    # Slice bounds that skip the first/last layers when selecting layers to
    # quantize (custom inception skips its first three modules).
    if 'inception' in args.arch and args.custom_inception:
        first = 3
        last = -1
    else:
        first = 1
        last = -1
    if args.quantize:
        all_convs = [
            n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
        ]
        all_relu = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU)
        ]
        all_relu6 = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)
        ]
        layers = all_relu[first:last] + all_relu6[first:last] + all_convs[
            first:last]
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapperPost,
            nn.ReLU6: ActivationModuleWrapperPost,
            nn.Conv2d: ParameterModuleWrapperPost
        }
        mq = ModelQuantizer(model, args, layers, replacement_factory)
        mq.log_quantizer_state(ml_logger, -1)
    acc = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc, step='auto')
def main_worker(args, ml_logger):
    """Quantization-aware training/evaluation worker.

    Builds the model, optionally resumes a checkpoint (normalizing
    DataParallel key names), installs quantization wrappers when
    ``args.quantize`` is set, then either evaluates once (``args.evaluate``,
    optionally logging weight statistics) or trains for ``args.epochs``
    epochs with per-epoch validation and checkpointing.
    """
    global best_acc1
    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))
    if args.log_stats:
        # Instantiating the (singleton, presumably) stats tracker up front.
        from utils.stats_trucker import StatsTrucker as ST
        ST("W{}A{}".format(args.bit_weights, args.bit_act))
    if 'resnet' in args.arch and args.custom_resnet:
        model = custom_resnet(arch=args.arch,
                              pretrained=args.pretrained,
                              depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)
    else:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=args.pretrained)
    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True
    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            # best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            # Strip DataParallel 'module.' prefixes; strict=False tolerates
            # the quantization-wrapper keys that don't exist yet.
            checkpoint['state_dict'] = {
                normalize_module_name(k): v
                for k, v in checkpoint['state_dict'].items()
            }
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features, args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)
    default_transform = {
        'train': get_transform(args.dataset, augment=True),
        'eval': get_transform(args.dataset, augment=False)
    }
    val_data = get_dataset(args.dataset, 'val', default_transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.workers,
                                             pin_memory=True)
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    train_data = get_dataset(args.dataset, 'train', default_transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               drop_last=True)
    # TODO: replace this call by initialization on small subset of training data
    # TODO: enable for activations
    # validate(val_loader, model, criterion, args, device)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    lr_scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)
    mq = None
    if args.quantize:
        if args.bn_folding:
            print(
                "Applying batch-norm folding ahead of post-training quantization"
            )
            from utils.absorb_bn import search_absorbe_bn
            search_absorbe_bn(model)
        all_convs = [
            n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
        ]
        # all_convs = [l for l in all_convs if 'downsample' not in l]
        all_relu = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU)
        ]
        all_relu6 = [
            n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)
        ]
        # Skip the first activation/conv (and last activation) from quantization.
        layers = all_relu[1:-1] + all_relu6[1:-1] + all_convs[1:]
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapper,
            nn.ReLU6: ActivationModuleWrapper,
            nn.Conv2d: ParameterModuleWrapper
        }
        mq = ModelQuantizer(
            model, args, layers, replacement_factory,
            OptimizerBridge(optimizer,
                            settings={
                                'algo': 'SGD',
                                'dataset': args.dataset
                            }))
        if args.resume:
            # Load quantization parameters from state dict
            mq.load_state_dict(checkpoint['state_dict'])
        mq.log_quantizer_state(ml_logger, -1)
        if args.model_freeze:
            mq.freeze()
    if args.evaluate:
        if args.log_stats:
            # NOTE(review): all_convs is only defined inside the
            # args.quantize branch — evaluate+log_stats without quantize
            # would raise NameError here; confirm the intended arg coupling.
            mean = []
            var = []
            skew = []
            kurt = []
            # Per-layer weight moments for the quantized conv layers.
            for n, p in model.named_parameters():
                if n.replace('.weight', '') in all_convs[1:]:
                    mu = p.mean()
                    std = p.std()
                    mean.append((n, mu.item()))
                    var.append((n, (std**2).item()))
                    skew.append((n, torch.mean(((p - mu) / std)**3).item()))
                    kurt.append((n, torch.mean(((p - mu) / std)**4).item()))
            for i in range(len(mean)):
                ml_logger.log_metric(mean[i][0] + '.mean', mean[i][1])
                ml_logger.log_metric(var[i][0] + '.var', var[i][1])
                ml_logger.log_metric(skew[i][0] + '.skewness', skew[i][1])
                ml_logger.log_metric(kurt[i][0] + '.kurtosis', kurt[i][1])
            ml_logger.log_metric('weight_mean', np.mean([s[1] for s in mean]))
            ml_logger.log_metric('weight_var', np.mean([s[1] for s in var]))
            ml_logger.log_metric('weight_skewness', np.mean([s[1] for s in skew]))
            ml_logger.log_metric('weight_kurtosis', np.mean([s[1] for s in kurt]))
        acc = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc)
        if args.log_stats:
            stats = ST().get_stats()
            for s in stats:
                ml_logger.log_metric(s, np.mean(stats[s]))
        return
    # evaluate on validation set
    acc1 = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc1, -1)
    # evaluate with k-means quantization
    # if args.model_freeze:
    #     with mq.disable():
    #         acc1_nq = validate(val_loader, model, criterion, args, device)
    #         ml_logger.log_metric('Val Acc1 fp32', acc1_nq, -1)
    for epoch in range(0, args.epochs):
        # train for one epoch
        print('Timestamp Start epoch: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        train(train_loader, model, criterion, optimizer, epoch, args, device,
              ml_logger, val_loader, mq)
        print('Timestamp End epoch: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        if not args.lr_freeze:
            lr_scheduler.step()
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc1, step='auto')
        # evaluate with k-means quantization
        # if args.model_freeze:
        #     with mq.quantization_method('kmeans'):
        #         acc1_kmeans = validate(val_loader, model, criterion, args, device)
        #         ml_logger.log_metric('Val Acc1 kmeans', acc1_kmeans, epoch)
        #     with mq.disable():
        #         acc1_nq = validate(val_loader, model, criterion, args, device)
        #         ml_logger.log_metric('Val Acc1 fp32', acc1_nq, step='auto')
        if args.quantize:
            mq.log_quantizer_state(ml_logger, epoch)
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict()
                if len(args.gpu_ids) == 1 else model.module.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
def test():
    """Evaluate the trained DFCNN CTC model on the test set.

    Reads settings from the module-level ``config`` dict, locates the
    checkpoint via the run's eval-config log (preferring the best checkpoint),
    decodes predictions with CTC, and prints word accuracy computed from edit
    distance over up to ``config['test_dataset_size']`` samples (negative
    value means the whole set).
    """
    if config['mode'] == 'PYNATIVE':
        mode = context.PYNATIVE_MODE
    else:
        mode = context.GRAPH_MODE
    device = config['device']
    device_id = config['device_id']
    if device == 'Ascend':
        # ModelArts/OBS environment: create local dirs and sync via moxing.
        import moxing as mox
        from utils.const import DATA_PATH, MODEL_PATH, BEST_MODEL_PATH, LOG_PATH
        obs_datapath = config['obs_datapath']
        obs_saved_model = config['obs_saved_model']
        obs_best_model = config['obs_best_model']
        obs_log = config['obs_log']
        if not os.path.exists(MODEL_PATH):
            os.mkdir(MODEL_PATH)
        if not os.path.exists(BEST_MODEL_PATH):
            os.mkdir(BEST_MODEL_PATH)
        if not os.path.exists(LOG_PATH):
            os.mkdir(LOG_PATH)
        mox.file.copy_parallel(obs_datapath, DATA_PATH)
        mox.file.copy_parallel(MODEL_PATH, obs_saved_model)
        mox.file.copy_parallel(BEST_MODEL_PATH, obs_best_model)
        mox.file.copy_parallel(LOG_PATH, obs_log)
    test_dev_batch_size = config['test_dev_batch_size']
    eval_config_log = config['log_to_eval']
    data_num = config['test_dataset_size']
    eval_config = get_eval_config(eval_config_log)
    # - use in dataset
    div = 8
    # Prefer the best checkpoint; fall back to the latest saved model.
    if 'best_ckpt' in eval_config.keys():
        eval_model_path = eval_config['best_ckpt']
        if device == 'Ascend':
            import moxing as mox
            from utils.const import BEST_MODEL_PATH
            eval_model_filename = eval_model_path.split('/')[-1]
            obs_best_model = config['obs_best_model']
            mox.file.copy_parallel(obs_best_model + eval_model_filename,
                                   eval_model_path)
    else:
        eval_model_path = eval_config['latest_model']
        if device == 'Ascend':
            import moxing as mox
            from utils.const import BEST_MODEL_PATH
            eval_model_filename = eval_model_path.split('/')[-1]
            obs_saved_model = config['obs_saved_model']
            mox.file.copy_parallel(obs_saved_model + eval_model_filename,
                                   eval_model_path)
        print(
            '* [WARNING] Not using the best model, but latest saved model instead.'
        )
    # - bias
    has_bias = eval_config['has_bias']
    use_dropout = eval_config['use_dropout']
    # - pad
    pad_mode = eval_config['pad_mode']
    if pad_mode == 'pad':
        padding = eval_config['padding']
    elif pad_mode == 'same':
        padding = 0
    else:
        raise ValueError(f"invalid pad mode: {pad_mode}!")
    if 'best_acc' in eval_config.keys():
        best_acc = eval_config['best_acc']
        print('* Best accuracy for the dev dataset is : {:.2f}%'.format(
            best_acc * 100))
    if device == 'GPU':
        context.set_context(mode=mode, device_target=device, device_id=device_id)
    elif device == 'Ascend':
        context.set_context(mode=mode, device_target=device)
    # data
    test_loader, idx2label, label2idx = get_dataset(
        phase='test',
        test_dev_batch_size=test_dev_batch_size,
        div=div,
        num_parallel_workers=4)
    net = DFCNN(num_classes=len(label2idx),
                padding=padding,
                pad_mode=pad_mode,
                has_bias=has_bias,
                use_dropout=use_dropout)
    # loads param
    param_dict = load_checkpoint(eval_model_path)
    load_param_into_net(net, param_dict)
    print('* params loaded!')
    net.set_train(False)
    converter = CTCLabelConverter(label2idx=label2idx,
                                  idx2label=idx2label,
                                  batch_size=test_dev_batch_size)
    words_num = 0
    word_error_num = 0
    # `limit` counts evaluated samples; data_num < 0 disables the cap.
    limit = 0
    for data in test_loader.create_tuple_iterator():
        if limit > data_num and not data_num < 0:
            break
        img_batch, label_indices, label_batch, sequence_length, lab_len = data
        img_tensor = Tensor(img_batch, mstype.float32)
        model_predict = net(img_tensor)
        pred_str = converter.ctc_decoder(model_predict)
        label_str = converter.decode_label(label_batch, lab_len)
        for pred, lab in zip(pred_str, label_str):
            if limit > data_num and not data_num < 0:
                break
            words_n = len(lab)
            words_num += words_n
            # get edit distance (capped at the label length so one sample
            # cannot contribute more errors than it has words)
            edit_distance = get_edit_distance(lab, pred)
            if edit_distance <= words_n:
                word_error_num += edit_distance
            else:
                word_error_num += words_n
            limit += 1
    if data_num > 0:
        size = str(data_num)
    else:
        size = 'all'
    print('* [Test result] For {} datas, the accuracy is: {:.2f}%'.format(
        size, ((1 - word_error_num / words_num) * 100)))