def main():
    # set GPU ID
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True

    # check save path
    save_path = args.save_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # make dataloader
    train_loader, test_loader = dataset.get_loader(args)

    # set model
    if args.model == 'res':
        model = resnet.ResNet18().cuda()

    # set criterion
    criterion = nn.CrossEntropyLoss().cuda()

    # set optimizer (default: sgd)
    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001,
                          nesterov=True)

    # set scheduler
    scheduler = MultiStepLR(optimizer, milestones=[80, 120], gamma=0.1)

    # make logger
    train_logger = utils.Logger(os.path.join(save_path, 'train.log'))
    test_logger = utils.Logger(os.path.join(save_path, 'test.log'))

    # forgetting-event history, one slot per training example
    forgetting_history = utils.Forgetting_Events(
        data_size=len(train_loader.dataset))

    # start training
    for epoch in range(1, args.epochs + 1):
        train(train_loader, model, criterion, optimizer, epoch,
              forgetting_history, train_logger)
        validate(test_loader, model, criterion, epoch, test_logger, 'test')

        # step the scheduler after the optimizer updates (PyTorch >= 1.1 order)
        scheduler.step()

        # save the model at the final epoch
        if epoch == int(args.epochs):
            torch.save(
                model.state_dict(),
                os.path.join(save_path, '{0}_{1}.pth'.format('model', epoch)))

    # finish training
    torch.save(forgetting_history,
               os.path.join(save_path, 'train_forgetting.pth'))

    # draw plot
    plot_curves.draw_plot(save_path)

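
# utils.Forgetting_Events is not shown in this excerpt. Below is a minimal,
# hypothetical sketch of such a tracker, assuming the definition from Toneva
# et al.: a forgetting event occurs when an example that was classified
# correctly on the previous pass is misclassified on the current one. The
# class and method names here are illustrative, not the repo's actual API.
import torch

class ForgettingEventsSketch:
    def __init__(self, data_size):
        self.prev_correct = torch.zeros(data_size, dtype=torch.bool)
        self.num_forgotten = torch.zeros(data_size, dtype=torch.long)

    def update(self, indices, preds, targets):
        # indices: dataset indices of the current batch (LongTensor)
        correct = preds.eq(targets).cpu()
        forgotten = self.prev_correct[indices] & ~correct
        self.num_forgotten[indices] += forgotten.long()
        self.prev_correct[indices] = correct
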
def Modellist(model_name, num_classes=1000, use_attention=None):
    if model_name == 'vgg11':
        return vgg.VGG11(num_classes, use_attention=use_attention)
    elif model_name == 'vgg13':
        return vgg.VGG13(num_classes, use_attention=use_attention)
    elif model_name == 'vgg16':
        return vgg.VGG16(num_classes, use_attention=use_attention)
    elif model_name == 'vgg19':
        return vgg.VGG19(num_classes, use_attention=use_attention)
    elif model_name == 'resnet18':
        return resnet.ResNet18(num_classes, use_attention=use_attention)
    elif model_name == 'resnet34':
        return resnet.ResNet34(num_classes, use_attention=use_attention)
    elif model_name == 'resnet50':
        return resnet.ResNet50(num_classes, use_attention=use_attention)
    elif model_name == 'resnet101':
        return resnet.ResNet101(num_classes, use_attention=use_attention)
    elif model_name == 'resnet152':
        return resnet.ResNet152(num_classes, use_attention=use_attention)
    elif model_name == 'densenet121':
        return densenet.DenseNet121(num_classes, use_attention=use_attention)
    elif model_name == 'densenet169':
        return densenet.DenseNet169(num_classes, use_attention=use_attention)
    elif model_name == 'densenet201':
        return densenet.DenseNet201(num_classes, use_attention=use_attention)
    elif model_name == 'densenet161':
        return densenet.DenseNet161(num_classes, use_attention=use_attention)
    elif model_name == 'mobilenetv3_small':
        return mobilenetv3.mobilenetv3_small(num_classes)
    elif model_name == 'mobilenetv3_large':
        return mobilenetv3.mobilenetv3_large(num_classes)
    elif model_name == 'efficientnet_b0':
        return efficientnet.efficientnet_b0(num_classes, use_attention=use_attention)
    elif model_name == 'efficientnet_b1':
        return efficientnet.efficientnet_b1(num_classes, use_attention=use_attention)
    elif model_name == 'efficientnet_b2':
        return efficientnet.efficientnet_b2(num_classes, use_attention=use_attention)
    elif model_name == 'efficientnet_b3':
        return efficientnet.efficientnet_b3(num_classes, use_attention=use_attention)
    elif model_name == 'efficientnet_b4':
        return efficientnet.efficientnet_b4(num_classes, use_attention=use_attention)
    elif model_name == 'efficientnet_b5':
        return efficientnet.efficientnet_b5(num_classes, use_attention=use_attention)
    elif model_name == 'efficientnet_b6':
        return efficientnet.efficientnet_b6(num_classes, use_attention=use_attention)
    else:
        raise ValueError("Unknown model_name: {}".format(model_name))

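
# Example use of the factory above (argument values are illustrative; the
# available attention modes depend on the repo's model implementations):
#
#   model = Modellist('resnet50', num_classes=100, use_attention=True)
#
# Note that the mobilenetv3 variants take no use_attention argument, which is
# why they are special-cased in the dispatch.
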
def main(rank, world_size):
    init_process(rank, world_size)

    # make dataloader
    train_loader, test_loader = dataset.get_loader(args, rank, world_size)

    # set model
    if args.model == 'res':
        model = resnet.ResNet18()
    model = model.cuda(rank)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = DDP(model, device_ids=[rank])
    cudnn.benchmark = True

    # set criterion
    criterion = nn.CrossEntropyLoss().cuda(rank)

    # set optimizer (default: sgd)
    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001,
                          nesterov=True)

    # set scheduler
    scheduler = MultiStepLR(optimizer, milestones=[60, 80], gamma=0.1)

    # make logger
    train_logger = utils.Logger(os.path.join(args.save_path, 'train.log'))
    test_logger = utils.Logger(os.path.join(args.save_path, 'test.log'))

    # start training
    for epoch in range(1, args.epochs + 1):
        train(train_loader, model, criterion, optimizer, epoch, train_logger)
        validate(test_loader, model, criterion, epoch, test_logger, 'test')

        # every rank holds its own optimizer/scheduler, so all ranks must step
        # (stepping only on rank 0 would let learning rates drift apart);
        # step after the optimizer updates, per the PyTorch >= 1.1 order
        scheduler.step()

        # save the model at the final epoch (rank 0 only); unwrap .module so
        # the state_dict keys carry no DDP prefix
        if dist.get_rank() == 0 and epoch == int(args.epochs):
            torch.save(
                model.module.state_dict(),
                os.path.join(args.save_path,
                             '{0}_{1}.pth'.format('model', epoch)))

    # finish training: draw plot on rank 0
    if dist.get_rank() == 0:
        plot_curves.draw_plot(args.save_path)
    dist.destroy_process_group()

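
# init_process is called above but not defined in this excerpt. A minimal
# sketch of a typical DDP process-group setup (the NCCL backend, rendezvous
# address, and function name are assumptions, not the repo's actual helper):
import os
import torch
import torch.distributed as dist

def init_process_sketch(rank, world_size, backend='nccl'):
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group(backend, rank=rank, world_size=world_size)
    # bind this process to its GPU before any CUDA work happens
    torch.cuda.set_device(rank)

# A main() like the one above is typically launched with:
#   torch.multiprocessing.spawn(main, args=(world_size,), nprocs=world_size)
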
def __init__(self, model, num_workers, lr, job_name):
    self.lock = threading.Lock()
    self.logger = Logger(
        job_name=job_name,
        file_dir=f"./measurement/logs/{job_name}_ps.log").logger
    self.cm_t1_start = np.zeros(num_workers)
    self.future_model = torch.futures.Future()
    self.batch_update_size = num_workers
    self.curr_update_size = 0
    self.stop_flag = False

    if model == 'resnet20':
        self.model = resnet3.resnet20()
    elif model == 'resnet56':
        self.model = resnet3.resnet56()
    elif model == 'resnet18':
        self.model = resnet.ResNet18()
    elif model == 'resnet50':
        self.model = resnet.ResNet50()
    elif model == 'vgg13':
        self.model = vgg.VGG13()
    elif model == 'vgg16':
        self.model = vgg.VGG16()
    elif model == 'densenet121':
        self.model = densenet.DenseNet121()
    elif model == 'alexnet':
        self.model = alexnet.AlexNet()
    elif model == 'googlenet':
        self.model = googlenet.GoogLeNet()
    elif model == 'mobilenet':
        self.model = mobilenetv2.MobileNetV2()
    else:
        raise ValueError(f"Unknown model: {model}")

    self.lr = lr
    # pre-allocate gradient buffers so worker gradients can be accumulated
    for p in self.model.parameters():
        p.grad = torch.zeros_like(p)
    self.optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=0.9)

    self.info_socketm = znet.SocketMsger.tcp_connect(DORKER0_IP, INFO_PORT)
    self.info_socketm.send("PS")
    self.info_socketm.send(
        f"1.0\n/home/ubuntu/measurement/logs/{job_name}_info0.log\n{job_name}")
    self.ps_launched_lock = threading.Lock()
    self.ps_launched = False

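
# The pre-zeroed gradients, future_model, and curr_update_size above match the
# batched parameter-server pattern from the PyTorch RPC tutorial. A sketch of
# the update method such a server typically pairs with (assumed shape; this
# repo's actual method may differ):
import threading
import torch
import torch.distributed.rpc as rpc

class _BatchUpdatePSSketch:
    """Shell showing only the update path; attributes mirror __init__ above."""

    @staticmethod
    @rpc.functions.async_execution
    def update_and_fetch_model(ps_rref, grads):
        self = ps_rref.local_value()
        with self.lock:
            # accumulate this worker's gradients into the pre-allocated buffers
            for p, g in zip(self.model.parameters(), grads):
                p.grad += g
            self.curr_update_size += 1
            fut = self.future_model
            if self.curr_update_size >= self.batch_update_size:
                # all workers reported: average, step, and publish the new model
                for p in self.model.parameters():
                    p.grad /= self.batch_update_size
                self.optimizer.step()
                self.optimizer.zero_grad(set_to_none=False)  # keep the buffers
                self.curr_update_size = 0
                fut.set_result(self.model)
                self.future_model = torch.futures.Future()
        return fut
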
"""Based on the model_version, determine model/optimizer and KD training mode WideResNet and DenseNet were trained on multi-GPU; need to specify a dummy nn.DataParallel module to correctly load the model parameters """ if "distill" in params.model_version: student_model_load_start = time.time() # train a 5-layer CNN or a 18-layer ResNet with knowledge distillation if params.model_version == "cnn_distill": model = net.Net(params).cuda() if params.cuda else net.Net(params) optimizer = optim.Adam(model.parameters(), lr=params.learning_rate) # fetch loss function and metrics definition in model files loss_fn_kd = net.loss_fn_kd metrics = net.metrics elif params.model_version == 'resnet18_distill': model = resnet.ResNet18().cuda() if params.cuda else resnet.ResNet18() optimizer = optim.SGD(model.parameters(), lr=params.learning_rate, momentum=0.9, weight_decay=5e-4) # fetch loss function and metrics definition in model files loss_fn_kd = net.loss_fn_kd metrics = resnet.metrics student_model_load_time = time.time() - student_model_load_start logging.info("student_model_load_time: {}".format(student_model_load_time)) """ Specify the pre-trained teacher models for knowledge distillation Important note: wrn/densenet/resnext/preresnet were pre-trained models using multi-GPU, therefore need to call "nn.DaraParallel" to correctly load the model weights Trying to run on CPU will then trigger errors (too time-consuming anyway)! """
def count_parameters(model):
    # number of trainable parameters
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


if __name__ == '__main__':
    model_size = 0
    args = parser.parse_args()

    cnn_dir = 'experiments/cnn_distill'
    json_path = os.path.join(cnn_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    if args.model == "resnet18":
        model = resnet.ResNet18()
        model_checkpoint = 'experiments/base_resnet18/best.pth.tar'
    elif args.model == "wrn":
        model = wrn.wrn(depth=28, num_classes=10, widen_factor=10,
                        dropRate=0.3)
        model_checkpoint = 'experiments/base_wrn/best.pth.tar'
    elif args.model == "distill_resnext":
        model = resnet.ResNet18()
        model_checkpoint = 'experiments/resnet18_distill/resnext_teacher/best.pth.tar'
    elif args.model == "distill_densenet":
        model = resnet.ResNet18()

"""Based on the model_version, determine model/optimizer and KD training mode WideResNet and DenseNet were trained on multi-GPU; need to specify a dummy nn.DataParallel module to correctly load the model parameters """ if "distill" in params.model_version: # train a 5-layer CNN or a 18-layer ResNet with knowledge distillation if params.model_version == "cnn_distill": model = net.Net(params).cuda() if params.cuda else net.Net(params) optimizer = optim.Adam(model.parameters(), lr=params.learning_rate) # fetch loss function and metrics definition in model files loss_fn_kd = net.loss_fn_kd metrics = net.metrics elif params.model_version == 'resnet18_distill': model = resnet.ResNet18().cuda( ) if params.cuda else resnet.ResNet18() optimizer = optim.SGD(model.parameters(), lr=params.learning_rate, momentum=0.9, weight_decay=5e-4) # fetch loss function and metrics definition in model files loss_fn_kd = net.loss_fn_kd metrics = resnet.metrics """ Specify the pre-trained teacher models for knowledge distillation Important note: wrn/densenet/resnext/preresnet were pre-trained models using multi-GPU, therefore need to call "nn.DaraParallel" to correctly load the model weights Trying to run on CPU will then trigger errors (too time-consuming anyway)! """ teacher_model = get_vgg()
def main():
    # load the parameters from the json file
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # set the random seed for reproducible experiments
    random.seed(230)
    torch.manual_seed(230)
    np.random.seed(230)
    torch.cuda.manual_seed(230)
    warnings.filterwarnings("ignore")

    # set the logger
    utils.set_logger(os.path.join(args.model_dir, 'train.log'))

    # create the input data pipeline
    logging.info("Loading the datasets...")

    # fetch dataloaders, considering full-set vs. sub-set scenarios
    if params.subset_percent < 1.0:
        train_dl = data_loader.fetch_subset_dataloader('train', params)
    else:
        train_dl = data_loader.fetch_dataloader('train', params)
    dev_dl = data_loader.fetch_dataloader('dev', params)
    logging.info("- done.")

    """
    Load student and teacher models
    """
    if "distill" in params.model_version:
        # specify the student model
        if params.model_version == "cnn_distill":  # 5-layer plain CNN
            print("Student model: {}".format(params.model_version))
            model = net.Net(params).cuda()
        elif params.model_version == "shufflenet_v2_distill":
            print("Student model: {}".format(params.model_version))
            model = shufflenet.shufflenetv2(class_num=args.num_class).cuda()
        elif params.model_version == "mobilenet_v2_distill":
            print("Student model: {}".format(params.model_version))
            model = mobilenet.mobilenetv2(class_num=args.num_class).cuda()
        elif params.model_version == 'resnet18_distill':
            print("Student model: {}".format(params.model_version))
            model = resnet.ResNet18(num_classes=args.num_class).cuda()
        elif params.model_version == 'resnet50_distill':
            print("Student model: {}".format(params.model_version))
            model = resnet.ResNet50(num_classes=args.num_class).cuda()
        elif params.model_version == "alexnet_distill":
            print("Student model: {}".format(params.model_version))
            model = alexnet.alexnet(num_classes=args.num_class).cuda()
        elif params.model_version == "vgg19_distill":
            print("Student model: {}".format(params.model_version))
            model = models.vgg19_bn(num_classes=args.num_class).cuda()
        elif params.model_version == "googlenet_distill":
            print("Student model: {}".format(params.model_version))
            model = googlenet.GoogleNet(num_class=args.num_class).cuda()
        elif params.model_version == "resnext29_distill":
            print("Student model: {}".format(params.model_version))
            model = resnext.CifarResNeXt(cardinality=8,
                                         depth=29,
                                         num_classes=args.num_class).cuda()
        elif params.model_version == "densenet121_distill":
            print("Student model: {}".format(params.model_version))
            model = densenet.densenet121(num_class=args.num_class).cuda()

        # optimizer, with the learning rate scaled by batch size
        if params.model_version == "cnn_distill":
            optimizer = optim.Adam(model.parameters(),
                                   lr=params.learning_rate *
                                   (params.batch_size / 128))
        else:
            optimizer = optim.SGD(model.parameters(),
                                  lr=params.learning_rate *
                                  (params.batch_size / 128),
                                  momentum=0.9,
                                  weight_decay=5e-4)

        iter_per_epoch = len(train_dl)
        # warm up the learning rate over the first args.warm epoch(s)
        warmup_scheduler = utils.WarmUpLR(optimizer, iter_per_epoch * args.warm)

        # specify the loss function
        if args.self_training:
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>self training>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
            loss_fn_kd = loss_kd_self
        else:
            loss_fn_kd = loss_kd

        """
        Specify the pre-trained teacher models for knowledge distillation.
        Checkpoints can be obtained by regular training or by downloading our
        pretrained models. For a model pretrained on multiple GPUs, use
        nn.DataParallel to correctly load the model weights.
""" if params.teacher == "resnet18": print("Teacher model: {}".format(params.teacher)) teacher_model = resnet.ResNet18(num_classes=args.num_class) teacher_checkpoint = 'experiments/pretrained_teacher_models/base_resnet18/best.pth.tar' if args.pt_teacher: # poorly-trained teacher for Defective KD experiments teacher_checkpoint = 'experiments/pretrained_teacher_models/base_resnet18/0.pth.tar' teacher_model = teacher_model.cuda() elif params.teacher == "alexnet": print("Teacher model: {}".format(params.teacher)) teacher_model = alexnet.alexnet(num_classes=args.num_class) teacher_checkpoint = 'experiments/pretrained_teacher_models/base_alexnet/best.pth.tar' teacher_model = teacher_model.cuda() elif params.teacher == "googlenet": print("Teacher model: {}".format(params.teacher)) teacher_model = googlenet.GoogleNet(num_class=args.num_class) teacher_checkpoint = 'experiments/pretrained_teacher_models/base_googlenet/best.pth.tar' teacher_model = teacher_model.cuda() elif params.teacher == "vgg19": print("Teacher model: {}".format(params.teacher)) teacher_model = models.vgg19_bn(num_classes=args.num_class) teacher_checkpoint = 'experiments/pretrained_teacher_models/base_vgg19/best.pth.tar' teacher_model = teacher_model.cuda() elif params.teacher == "resnet50": print("Teacher model: {}".format(params.teacher)) teacher_model = resnet.ResNet50(num_classes=args.num_class).cuda() teacher_checkpoint = 'experiments/pretrained_teacher_models/base_resnet50/best.pth.tar' if args.pt_teacher: # poorly-trained teacher for Defective KD experiments teacher_checkpoint = 'experiments/pretrained_teacher_models/base_resnet50/50.pth.tar' elif params.teacher == "resnet101": print("Teacher model: {}".format(params.teacher)) teacher_model = resnet.ResNet101(num_classes=args.num_class).cuda() teacher_checkpoint = 'experiments/pretrained_teacher_models/base_resnet101/best.pth.tar' teacher_model = teacher_model.cuda() elif params.teacher == "densenet121": print("Teacher model: {}".format(params.teacher)) teacher_model = densenet.densenet121( num_class=args.num_class).cuda() teacher_checkpoint = 'experiments/pretrained_teacher_models/base_densenet121/best.pth.tar' # teacher_model = nn.DataParallel(teacher_model).cuda() elif params.teacher == "resnext29": print("Teacher model: {}".format(params.teacher)) teacher_model = resnext.CifarResNeXt( cardinality=8, depth=29, num_classes=args.num_class).cuda() teacher_checkpoint = 'experiments/pretrained_teacher_models/base_resnext29/best.pth.tar' if args.pt_teacher: # poorly-trained teacher for Defective KD experiments teacher_checkpoint = 'experiments/pretrained_teacher_models/base_resnext29/50.pth.tar' teacher_model = nn.DataParallel(teacher_model).cuda() elif params.teacher == "mobilenet_v2": print("Teacher model: {}".format(params.teacher)) teacher_model = mobilenet.mobilenetv2( class_num=args.num_class).cuda() teacher_checkpoint = 'experiments/pretrained_teacher_models/base_mobilenet_v2/best.pth.tar' elif params.teacher == "shufflenet_v2": print("Teacher model: {}".format(params.teacher)) teacher_model = shufflenet.shufflenetv2( class_num=args.num_class).cuda() teacher_checkpoint = 'experiments/pretrained_teacher_models/base_shufflenet_v2/best.pth.tar' utils.load_checkpoint(teacher_checkpoint, teacher_model) # Train the model with KD logging.info("Starting training for {} epoch(s)".format( params.num_epochs)) train_and_evaluate_kd(model, teacher_model, train_dl, dev_dl, optimizer, loss_fn_kd, warmup_scheduler, params, args, args.restore_file) # non-KD mode: regular training to 
    else:
        print("Train base model")
        if params.model_version == "cnn":
            model = net.Net(params).cuda()
        elif params.model_version == "mobilenet_v2":
            print("model: {}".format(params.model_version))
            model = mobilenet.mobilenetv2(class_num=args.num_class).cuda()
        elif params.model_version == "shufflenet_v2":
            print("model: {}".format(params.model_version))
            model = shufflenet.shufflenetv2(class_num=args.num_class).cuda()
        elif params.model_version == "alexnet":
            print("model: {}".format(params.model_version))
            model = alexnet.alexnet(num_classes=args.num_class).cuda()
        elif params.model_version == "vgg19":
            print("model: {}".format(params.model_version))
            model = models.vgg19_bn(num_classes=args.num_class).cuda()
        elif params.model_version == "googlenet":
            print("model: {}".format(params.model_version))
            model = googlenet.GoogleNet(num_class=args.num_class).cuda()
        elif params.model_version == "densenet121":
            print("model: {}".format(params.model_version))
            model = densenet.densenet121(num_class=args.num_class).cuda()
        elif params.model_version == "resnet18":
            model = resnet.ResNet18(num_classes=args.num_class).cuda()
        elif params.model_version == "resnet50":
            model = resnet.ResNet50(num_classes=args.num_class).cuda()
        elif params.model_version == "resnet101":
            model = resnet.ResNet101(num_classes=args.num_class).cuda()
        elif params.model_version == "resnet152":
            model = resnet.ResNet152(num_classes=args.num_class).cuda()
        elif params.model_version == "resnext29":
            model = resnext.CifarResNeXt(cardinality=8,
                                         depth=29,
                                         num_classes=args.num_class).cuda()
            # model = nn.DataParallel(model).cuda()

        if args.regularization:
            print(">>>>>>>>>>>>>>>>>>>>>>>>Loss of Regularization>>>>>>>>>>>>>>>>>>>>>>>>")
            loss_fn = loss_kd_regularization
        elif args.label_smoothing:
            print(">>>>>>>>>>>>>>>>>>>>>>>>Label Smoothing>>>>>>>>>>>>>>>>>>>>>>>>")
            loss_fn = loss_label_smoothing
        else:
            print(">>>>>>>>>>>>>>>>>>>>>>>>Normal Training>>>>>>>>>>>>>>>>>>>>>>>>")
            loss_fn = nn.CrossEntropyLoss()

        if args.double_training:  # double training, compared to self-KD
            print(">>>>>>>>>>>>>>>>>>>>>>>>Double Training>>>>>>>>>>>>>>>>>>>>>>>>")
            checkpoint = ('experiments/pretrained_teacher_models/base_' +
                          str(params.model_version) + '/best.pth.tar')
            utils.load_checkpoint(checkpoint, model)

        if params.model_version == "cnn":
            optimizer = optim.Adam(model.parameters(),
                                   lr=params.learning_rate *
                                   (params.batch_size / 128))
        else:
            optimizer = optim.SGD(model.parameters(),
                                  lr=params.learning_rate *
                                  (params.batch_size / 128),
                                  momentum=0.9,
                                  weight_decay=5e-4)

        iter_per_epoch = len(train_dl)
        warmup_scheduler = utils.WarmUpLR(optimizer, iter_per_epoch * args.warm)

        # train the model
        logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
        train_and_evaluate(model, train_dl, dev_dl, optimizer, loss_fn, params,
                           args.model_dir, warmup_scheduler, args,
                           args.restore_file)

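
# utils.WarmUpLR is constructed with iter_per_epoch * args.warm above but not
# shown. A sketch of a typical per-batch linear warmup scheduler (an assumed
# implementation; step it once per training iteration during the warmup phase):
from torch.optim.lr_scheduler import _LRScheduler

class WarmUpLRSketch(_LRScheduler):
    def __init__(self, optimizer, total_iters, last_epoch=-1):
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # ramp each base_lr linearly from ~0 to its full value over total_iters
        return [base_lr * self.last_epoch / (self.total_iters + 1e-8)
                for base_lr in self.base_lrs]
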