last_layer_optimizer = torch.optim.Adam(last_layer_optimizer_specs)

# weighting of different training losses
from settings import coefs
# number of training epochs, number of warm epochs, push start epoch, push epochs
from settings import num_train_epochs, num_warm_epochs, push_start, push_epochs

# train the model
log('start training')
import copy

for epoch in range(num_train_epochs):
    log('epoch: \t{0}'.format(epoch))

    if epoch < num_warm_epochs:
        tnt.warm_only(model=ppnet_multi, log=log)
        _ = tnt.train(model=ppnet_multi, dataloader=train_loader,
                      optimizer=warm_optimizer, class_specific=class_specific,
                      coefs=coefs, log=log)
    else:
        tnt.joint(model=ppnet_multi, log=log)
        joint_lr_scheduler.step()
        _ = tnt.train(model=ppnet_multi, dataloader=train_loader,
                      optimizer=joint_optimizer, class_specific=class_specific,
                      coefs=coefs, log=log)
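# For context, a minimal sketch of what the tnt.warm_only / tnt.joint mode
# switches typically do in ProtoPNet-style code (an assumption here, since the
# tnt module is defined elsewhere): they toggle requires_grad per sub-module,
# so the warm stage trains only the add-on layers and prototype vectors while
# the pretrained backbone stays frozen, and the joint stage unfreezes all of it.
def warm_only_sketch(model, log=print):
    for p in model.module.features.parameters():
        p.requires_grad = False          # freeze the pretrained backbone
    for p in model.module.add_on_layers.parameters():
        p.requires_grad = True           # train the 1x1 add-on convolutions
    model.module.prototype_vectors.requires_grad = True
    log('\twarm')

def joint_sketch(model, log=print):
    for p in model.module.parameters():
        p.requires_grad = True           # backbone, add-ons, and prototypes
    log('\tjoint')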
last_checkpoint = step
push_model_state_epoch = None

# if epoch < config.push_start and ppnet.mil_pooling == 'gated_attention':
#     ppnet.mil_pooling = 'average'
#     print('\tattention disabled')

# training loop as a state machine.
while True:
    print('step: {}, mode: {}, epoch: {}, iteration: {}'.format(
        step, mode.name, epoch, iteration))

    if mode == TrainMode.WARM:
        write_mode(TrainMode.WARM, log_writer, step)
        warm_only(model=ppnet)
        train(model=ppnet, dataloader=train_loader, optimizer=warm_optimizer,
              config=config, log_writer=log_writer, step=step,
              weighting_attention=args.weighting_attention)
        warm_lr_scheduler.step()
        accu = valid(model=ppnet, dataloader=valid_loader, config=config,
                     log_writer=log_writer, step=step,
                     weighting_attention=args.weighting_attention)
        push_model_state_epoch = None
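# The loop above dispatches on a TrainMode value (it has a .name attribute,
# so it is an Enum). A plausible definition, assumed here because the enum is
# declared elsewhere, mirrors the ProtoPNet training stages; after each stage
# the driver assigns the next member to `mode` and the `while True` loop
# re-dispatches on it.
from enum import Enum

class TrainMode(Enum):
    WARM = 'warm'            # add-on layers + prototypes only
    JOINT = 'joint'          # backbone trained as well
    PUSH = 'push'            # project prototypes onto training patches
    LAST_ONLY = 'last_only'  # optimize only the last layer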
def main(args):
    with open("./configs/{}/{}_{}_{}.yaml".format(
            args.net, args.dataset, args.backbone, args.mode)) as fp:
        cfg = yaml.safe_load(fp)
    NET_ARGS = cfg['NET_ARGS']
    DATA_ARGS = cfg['DATA_ARGS']
    EXP_ARGS = cfg['EXP_ARGS']

    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    model_dir = os.path.join('./saved_models/', args.dataset, args.backbone,
                             args.net, args.mode)
    makedir(model_dir)
    log, logclose = create_logger(log_filename=os.path.join(
        model_dir, 'train_logger_{}.txt'.format(
            datetime.datetime.now().strftime("%H:%M:%S"))))
    img_dir = os.path.join(model_dir, 'img')
    makedir(img_dir)
    weight_matrix_filename = 'outputL_weights'
    prototype_img_filename_prefix = 'prototype-img'
    prototype_self_act_filename_prefix = 'prototype-self-act'
    proto_bound_boxes_filename_prefix = 'bb'
    log(pformat(cfg))

    # ---------------------------------------- Get DataLoaders ----------------------------------------------
    normalize = transforms.Normalize(mean=NET_ARGS['mean'], std=NET_ARGS['std'])
    train_transforms = transforms.Compose([
        transforms.Resize(size=(DATA_ARGS['img_size'], DATA_ARGS['img_size'])),
        transforms.ToTensor(),
        normalize,
    ])
    train_dataset = datasets.ImageFolder(DATA_ARGS['train_dir'], train_transforms)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=EXP_ARGS['train_batch_size'],
        shuffle=True, num_workers=4, pin_memory=False)

    # push set: same images, but unnormalized so prototypes visualize correctly
    train_push_dataset = datasets.ImageFolder(
        DATA_ARGS['train_push_dir'],
        transforms.Compose([
            transforms.Resize(size=(DATA_ARGS['img_size'], DATA_ARGS['img_size'])),
            transforms.ToTensor(),
        ]))
    train_push_loader = torch.utils.data.DataLoader(
        train_push_dataset, batch_size=EXP_ARGS['train_push_batch_size'],
        shuffle=False, num_workers=4, pin_memory=False)

    test_dataset = datasets.ImageFolder(
        DATA_ARGS['test_dir'],
        transforms.Compose([
            transforms.Resize(size=(DATA_ARGS['img_size'], DATA_ARGS['img_size'])),
            transforms.ToTensor(),
            normalize,
        ]))
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=EXP_ARGS['test_batch_size'],
        shuffle=False, num_workers=4, pin_memory=False)

    log('training set size: {0}'.format(len(train_loader.dataset)))
    log('push set size: {0}'.format(len(train_push_loader.dataset)))
    log('test set size: {0}'.format(len(test_loader.dataset)))
    log('batch size: {0}'.format(EXP_ARGS['train_batch_size']))

    # ------------------------------------ Model and Optimizer ----------------------------------------------
    ppnet = model_AttProto.construct_PPNet(
        base_architecture=NET_ARGS['base_architecture'],
        pretrained=True,
        img_size=DATA_ARGS['img_size'],
        prototype_shape=NET_ARGS['prototype_shape'],
        num_classes=DATA_ARGS['num_classes'],
        prototype_activation_function=NET_ARGS['prototype_activation_function'],
        add_on_layers_type=NET_ARGS['add_on_layers_type'],
        att_version=NET_ARGS['ATT_VERSION'])
    ppnet = ppnet.cuda()
    ppnet_multi = torch.nn.DataParallel(ppnet)
    class_specific = True

    if EXP_ARGS['RESUME']['iS_RESUME']:
        ppnet = torch.load(EXP_ARGS['RESUME']['PATH'])
        log("Resumed from model: {}".format(EXP_ARGS['RESUME']['PATH']))
        ppnet_multi = torch.nn.DataParallel(ppnet)
        accu = tnt.test(model=ppnet_multi, dataloader=test_loader,
                        class_specific=True, log=log, EXP_ARGS=EXP_ARGS)
        log("\nInit Accuracy {:.2f}\n\n".format(accu))
    ppnet_multi = torch.nn.DataParallel(ppnet)

    warm_optimizer_lrs = EXP_ARGS['OPTIMIZER']['warm_optimizer_lrs']
    warm_optimizer_specs = [
        {'params': ppnet.add_on_layers.parameters(),
         'lr': warm_optimizer_lrs['add_on_layers'],
         'weight_decay': 1e-3},
        {'params': ppnet.prototype_vectors,
         'lr': warm_optimizer_lrs['prototype_vectors']},
        {'params': ppnet.att_layer.parameters(),
         'lr': warm_optimizer_lrs['att_layer'],
         'weight_decay': 1e-3},
    ]
    warm_optimizer = torch.optim.Adam(warm_optimizer_specs)
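    # Each dict in the spec lists is an independent Adam parameter group: the
    # add-on and attention layers get weight decay, while the prototype
    # vectors deliberately do not (decay would pull the prototypes toward the
    # origin rather than regularize the fit). Passing the single tensor
    # ppnet.prototype_vectors as 'params' is fine: torch.optim wraps a lone
    # Tensor in a list when the parameter group is added.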
    joint_optimizer_lrs = EXP_ARGS['OPTIMIZER']['joint_optimizer_lrs']
    joint_optimizer_specs = [
        {'params': ppnet.features.parameters(),
         'lr': joint_optimizer_lrs['features'],
         'weight_decay': 1e-3},
        {'params': ppnet.add_on_layers.parameters(),
         'lr': joint_optimizer_lrs['add_on_layers'],
         'weight_decay': 1e-3},
        {'params': ppnet.prototype_vectors,
         'lr': joint_optimizer_lrs['prototype_vectors']},
        {'params': ppnet.att_layer.parameters(),
         'lr': joint_optimizer_lrs['att_layer'],
         'weight_decay': 1e-3},
    ]
    joint_optimizer = torch.optim.Adam(joint_optimizer_specs)
    joint_lr_scheduler = torch.optim.lr_scheduler.StepLR(
        joint_optimizer,
        step_size=int(joint_optimizer_lrs['joint_lr_step_size']),
        gamma=0.1)

    push_epochs = [i for i in range(EXP_ARGS['num_train_epochs']) if i % 10 == 0]

    log('\n\n------------------------ Start Training ----------------------------\n\n')
    max_acc = 0.0
    max_acc_epoch = 0
    max_acc_iter = 0
    target_accu = 0.1

    for epoch in range(EXP_ARGS['start_epoch'], EXP_ARGS['num_train_epochs']):
        log('------------------------- Epoch: {} -------------------------------------'.format(epoch))

        if epoch < EXP_ARGS['num_warm_epochs']:
            tnt.warm_only(model=ppnet_multi, log=log)
            _ = tnt.train(model=ppnet_multi, dataloader=train_loader,
                          optimizer=warm_optimizer,
                          class_specific=class_specific,
                          coefs=EXP_ARGS['LOSS']['loss_coefs_warm'],
                          log=log, EXP_ARGS=EXP_ARGS)
        else:
            tnt.joint(model=ppnet_multi, log=log)
            joint_lr_scheduler.step()
            _ = tnt.train(model=ppnet_multi, dataloader=train_loader,
                          optimizer=joint_optimizer,
                          class_specific=class_specific,
                          coefs=EXP_ARGS['LOSS']['loss_coefs_joint'],
                          log=log, EXP_ARGS=EXP_ARGS)

        accu = tnt.test(model=ppnet_multi, dataloader=test_loader,
                        class_specific=class_specific, log=log,
                        EXP_ARGS=EXP_ARGS)
        if accu > max_acc:
            max_acc = accu
            max_acc_iter = 0
            max_acc_epoch = epoch
            save.save_model_w_condition(model=ppnet, model_dir=model_dir,
                                        model_name='', accu=accu,
                                        target_accu=target_accu, log=log,
                                        best=True,
                                        stage='prepush_{}'.format(epoch))
        log("\nBest Accuracy {:.2f} at epoch {} and iter {}\n\n".format(
            max_acc, max_acc_epoch, max_acc_iter))

        if epoch >= EXP_ARGS['push_start'] and epoch in push_epochs:
            save.save_model_w_condition(model=ppnet, model_dir=model_dir,
                                        model_name='', accu=accu,
                                        target_accu=target_accu, log=log,
                                        best=True,
                                        stage='prepushfinal_{}'.format(epoch))
            log('\n------------------------- Push Prototypes -----------------------------')
            push.push_prototypes(
                train_push_loader,
                prototype_network_parallel=ppnet_multi,
                class_specific=class_specific,
                preprocess_input_function=preprocess_input_function,
                prototype_layer_stride=1,
                root_dir_for_saving_prototypes=img_dir,
                epoch_number=epoch,
                prototype_img_filename_prefix=prototype_img_filename_prefix,
                prototype_self_act_filename_prefix=prototype_self_act_filename_prefix,
                proto_bound_boxes_filename_prefix=proto_bound_boxes_filename_prefix,
                save_prototype_class_identity=True,
                log=log)
            accu = tnt.test(model=ppnet_multi, dataloader=test_loader,
                            class_specific=class_specific, log=log,
                            EXP_ARGS=EXP_ARGS)
            save.save_model_w_condition(model=ppnet, model_dir=model_dir,
                                        model_name='', accu=accu,
                                        target_accu=target_accu, log=log,
                                        best=True,
                                        stage='push_{}'.format(epoch))

            last_layer_optimizer_specs = [{
                'params': ppnet.last_layer.parameters(),
                'lr': EXP_ARGS['OPTIMIZER']['last_layer_optimizer_lrs']['last_layer_optimizer_lr']
            }]
            last_layer_optimizer = torch.optim.Adam(last_layer_optimizer_specs)
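            # Each push snaps the prototypes to real training patches, so the
            # class scores need re-fitting: a fresh Adam over only
            # ppnet.last_layer is built per push epoch (which also resets
            # optimizer state), matching the ProtoPNet recipe of a last-layer
            # stage with the backbone, add-ons, and prototypes frozen.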
            last_lr_lr_scheduler = torch.optim.lr_scheduler.StepLR(
                last_layer_optimizer,
                step_size=EXP_ARGS['OPTIMIZER']['last_layer_optimizer_lrs']['last_lr_step_size'],
                gamma=0.1)

            log('\n------------------------- Last Layer Training -----------------------------------')
            if NET_ARGS['prototype_activation_function'] != 'linear':
                tnt.last_only(model=ppnet_multi, log=log)
                max_acc_post, max_acc_post_iter, max_acc_post_epoch = 0, 0, epoch
                for i in range(EXP_ARGS['OPTIMIZER']['last_layer_optimizer_lrs']
                               ['last_layer_optimizer_iters']):
                    log('Last layer optimization, Iteration: {0}'.format(i))
                    _ = tnt.train(model=ppnet_multi, dataloader=train_loader,
                                  optimizer=last_layer_optimizer,
                                  class_specific=class_specific,
                                  coefs=EXP_ARGS['LOSS']['loss_coefs_joint'],
                                  log=log, EXP_ARGS=EXP_ARGS)
                    last_lr_lr_scheduler.step()
                    accu = tnt.test(model=ppnet_multi, dataloader=test_loader,
                                    class_specific=class_specific, log=log,
                                    EXP_ARGS=EXP_ARGS)
                    if accu > max_acc_post:
                        max_acc_post = accu
                        max_acc_post_iter = i
                        max_acc_post_epoch = epoch
                        save.save_model_w_condition(
                            model=ppnet, model_dir=model_dir, model_name='',
                            accu=accu, target_accu=0.70, log=log, best=True,
                            stage='postpush_{}'.format(epoch))
                    log("Best Accuracy - PostPush {:.2f} at epoch {} and iter {}".format(
                        max_acc_post, max_acc_post_epoch, max_acc_post_iter))
                save.save_model_w_condition(
                    model=ppnet, model_dir=model_dir, model_name='',
                    accu=accu, target_accu=0.70, log=log, best=True,
                    stage='postpushfinal_{}'.format(epoch))

    logclose()
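# For reference, a sketch of the conditional checkpoint helper called
# throughout main(). This is inferred purely from the call sites; the real
# save.save_model_w_condition may differ. The assumed behavior: persist the
# whole model only when accuracy clears target_accu, tagging the filename
# with the stage and the accuracy.
import os
import torch

def save_model_w_condition_sketch(model, model_dir, model_name, accu,
                                  target_accu, log=print, best=False, stage=''):
    if accu > target_accu:
        path = os.path.join(model_dir,
                            '{}{}_{:.4f}.pth'.format(model_name, stage, accu))
        torch.save(model, path)  # saves the full nn.Module object, not just state_dict
        log('\tsaved model with accuracy {:.4f} to {}'.format(accu, path))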