def train_dino(args):
    """Run DINO training from setup to the final epoch.

    The function initialises distributed mode and random seeds, builds the
    dataset and loader, the student/teacher networks, the loss, the optimizer
    and the per-step schedules, optionally resumes from
    ``<args.output_dir>/checkpoint.pth`` and then trains from the restored
    epoch up to ``args.epochs``. After every epoch a checkpoint is saved and
    one JSON line is appended to ``<args.output_dir>/log.txt``.

    Args:
        args: Run configuration object (it must support ``vars(args)``).
            Attributes read here include ``seed``, ``data_path``,
            ``batch_size_per_gpu``, ``num_workers``, ``arch``, ``patch_size``,
            ``out_dim``, ``optimizer``, ``use_fp16``, ``lr``, ``min_lr``,
            ``epochs``, ``warmup_epochs``, ``weight_decay``,
            ``weight_decay_end``, ``momentum_teacher``, ``output_dir``,
            ``saveckp_freq`` and ``gpu``.

    Raises:
        ValueError: If ``args.arch`` is found neither in ``vits`` nor in
            ``torchvision_models``, or if ``args.optimizer`` is not one of
            ``"adamw"``, ``"sgd"`` or ``"lars"``.
    """
    utils.init_distributed_mode(args)
    utils.fix_random_seeds(args.seed)
    print("git:\n {}\n".format(utils.get_sha()))
    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
    cudnn.benchmark = True

    # ============ preparing data ... ============
    transform = DataAugmentationDINO(
        args.global_crops_scale,
        args.local_crops_scale,
        args.local_crops_number,
    )
    #dataset = datasets.ImageFolder(args.data_path, transform=transform)
    # NOTE(review): get_transform is imported but not used below; kept in case
    # the import is relied on for its side effects - confirm and remove.
    from sen12ms import get_transform
    # NOTE(review): "tansform_coord" looks like a misspelling, but it has to
    # match the keyword declared by AllSen12MSDataset - verify against it.
    dataset = AllSen12MSDataset(
        args.data_path,
        "train",
        transform=transform,
        tansform_coord=None,
        classes=None,
        seasons=None,
        split_by_region=True,
        download=False,
    )
    sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    print(f"Data loaded: there are {len(dataset)} images.")

    # ============ building student and teacher networks ... ============
    # if the network is a vision transformer (i.e. deit_tiny, deit_small, vit_base)
    if args.arch in vits.__dict__:
        student = vits.__dict__[args.arch](
            patch_size=args.patch_size,
            drop_path_rate=0.1,  # stochastic depth
        )
        teacher = vits.__dict__[args.arch](patch_size=args.patch_size)
        embed_dim = student.embed_dim
        student = utils.replace_input_layer(student, inchannels=13)
        teacher = utils.replace_input_layer(teacher, inchannels=13)
    # otherwise, we check if the architecture is in torchvision models
    elif args.arch in torchvision_models.__dict__:
        # NOTE(review): unlike the ViT branch, the input layer is not replaced
        # with a 13-channel one here - confirm this branch is usable with the
        # dataset above.
        student = torchvision_models.__dict__[args.arch]()
        teacher = torchvision_models.__dict__[args.arch]()
        embed_dim = student.fc.weight.shape[1]
    else:
        # Fail here with a clear message instead of continuing and hitting an
        # unbound `student` a few lines further down.
        raise ValueError(f"Unknown architecture: {args.arch}")

    # multi-crop wrapper handles forward with inputs of different resolutions
    student = utils.MultiCropWrapper(
        student,
        DINOHead(
            embed_dim,
            args.out_dim,
            use_bn=args.use_bn_in_head,
            norm_last_layer=args.norm_last_layer,
        ),
    )
    teacher = utils.MultiCropWrapper(
        teacher,
        DINOHead(embed_dim, args.out_dim, args.use_bn_in_head),
    )
    # move networks to gpu
    student, teacher = student.cuda(), teacher.cuda()
    # synchronize batch norms (if any)
    if utils.has_batchnorms(student):
        student = nn.SyncBatchNorm.convert_sync_batchnorm(student)
        teacher = nn.SyncBatchNorm.convert_sync_batchnorm(teacher)

        # we need DDP wrapper to have synchro batch norms working...
        teacher = nn.parallel.DistributedDataParallel(teacher, device_ids=[args.gpu])
        teacher_without_ddp = teacher.module
    else:
        # teacher_without_ddp and teacher are the same thing
        teacher_without_ddp = teacher
    student = nn.parallel.DistributedDataParallel(student, device_ids=[args.gpu])
    # teacher and student start with the same weights
    teacher_without_ddp.load_state_dict(student.module.state_dict())
    # there is no backpropagation through the teacher, so no need for gradients
    for p in teacher.parameters():
        p.requires_grad = False
    print(f"Student and Teacher are built: they are both {args.arch} network.")

    # ============ preparing loss ... ============
    dino_loss = DINOLoss(
        args.out_dim,
        args.local_crops_number + 2,  # total number of crops = 2 global crops + local_crops_number
        args.warmup_teacher_temp,
        args.teacher_temp,
        args.warmup_teacher_temp_epochs,
        args.epochs,
    ).cuda()

    # ============ preparing optimizer ... ============
    params_groups = utils.get_params_groups(student)
    if args.optimizer == "adamw":
        optimizer = torch.optim.AdamW(params_groups)  # to use with ViTs
    elif args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params_groups, lr=0, momentum=0.9)  # lr is set by scheduler
    elif args.optimizer == "lars":
        optimizer = utils.LARS(params_groups)  # to use with convnet and large batches
    else:
        # Without this branch `optimizer` would stay unbound and the failure
        # would only surface at the checkpoint-restore call below.
        raise ValueError(f"Unknown optimizer: {args.optimizer}")
    # for mixed precision training
    fp16_scaler = None
    if args.use_fp16:
        fp16_scaler = torch.cuda.amp.GradScaler()

    # ============ init schedulers ... ============
    steps_per_epoch = len(data_loader)
    lr_schedule = utils.cosine_scheduler(
        args.lr * (args.batch_size_per_gpu * utils.get_world_size()) / 256.,  # linear scaling rule
        args.min_lr,
        args.epochs,
        steps_per_epoch,
        warmup_epochs=args.warmup_epochs,
    )
    wd_schedule = utils.cosine_scheduler(
        args.weight_decay,
        args.weight_decay_end,
        args.epochs,
        steps_per_epoch,
    )
    # momentum parameter is increased to 1. during training with a cosine schedule
    momentum_schedule = utils.cosine_scheduler(args.momentum_teacher, 1,
                                               args.epochs, steps_per_epoch)
    print("Loss, optimizer and schedulers ready.")

    # ============ optionally resume training ... ============
    # `to_restore` is handed over as run_variables and read back afterwards;
    # the epoch stays 0 unless the restore call overwrites it.
    to_restore = {"epoch": 0}
    utils.restart_from_checkpoint(
        os.path.join(args.output_dir, "checkpoint.pth"),
        run_variables=to_restore,
        student=student,
        teacher=teacher,
        optimizer=optimizer,
        fp16_scaler=fp16_scaler,
        dino_loss=dino_loss,
    )
    start_epoch = to_restore["epoch"]

    start_time = time.time()
    print("Starting DINO training !")
    for epoch in range(start_epoch, args.epochs):
        data_loader.sampler.set_epoch(epoch)

        # ============ training one epoch of DINO ... ============
        train_stats = train_one_epoch(student, teacher, teacher_without_ddp, dino_loss,
                                      data_loader, optimizer, lr_schedule, wd_schedule,
                                      momentum_schedule, epoch, fp16_scaler, args)

        # ============ writing logs ... ============
        save_dict = {
            'student': student.state_dict(),
            'teacher': teacher.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch + 1,  # epoch to resume from
            'args': args,
            'dino_loss': dino_loss.state_dict(),
        }
        if fp16_scaler is not None:
            save_dict['fp16_scaler'] = fp16_scaler.state_dict()
        # The rolling checkpoint is overwritten after every epoch.
        utils.save_on_master(save_dict, os.path.join(args.output_dir, 'checkpoint.pth'))
        # An extra numbered snapshot is kept every `saveckp_freq` epochs
        # (skipped entirely when the frequency is falsy).
        if args.saveckp_freq and epoch % args.saveckp_freq == 0:
            utils.save_on_master(save_dict, os.path.join(args.output_dir, f'checkpoint{epoch:04}.pth'))
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch}
        if utils.is_main_process():
            with (Path(args.output_dir) / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def __init__(self, student, teacher, length, val_loader, embed_dim, args):
    """Set up the student/teacher pair, optimizer, schedules and augmentations.

    Args:
        student: Backbone wrapped as the trainable network.
        teacher: Backbone that receives the student's weights and is frozen.
        length: Step count before accounting for accumulation and devices;
            it is divided by ``args.accumulate * torch.cuda.device_count()``
            (rounded up) to get the per-epoch length given to the schedulers.
            Presumably the number of batches per epoch - confirm with caller.
        val_loader: Stored unchanged on ``self.val_loader``.
        embed_dim: Forwarded to ``NetWrapper`` for both networks.
        args: Configuration object; ``ratio``, ``arch``, ``optimizer``,
            ``accumulate``, ``batch_size_per_gpu``, ``lr``, ``min_lr``,
            ``epochs``, ``warmup_epochs``, ``weight_decay``,
            ``weight_decay_end`` and ``momentum_teacher`` are read here.
    """
    super().__init__()
    # self.save_hyperparameters()
    self.ratio = args.ratio

    # The teacher is initialised from the student and excluded from backprop.
    teacher.load_state_dict(student.state_dict())
    self.student = NetWrapper(student, embed_dim, args, True)
    self.teacher = NetWrapper(teacher, embed_dim, args)
    student_projector_tail = self.student.projector[-1]
    self.teacher.projector.load_state_dict(student_projector_tail.state_dict())
    for weight in self.teacher.parameters():
        weight.requires_grad = False
    print(
        f"Student and Teacher are built: they are both {args.arch} network."
    )

    # ============ preparing optimizer ... ============
    param_groups = utils.get_params_groups(self.student)
    optimizer_name = args.optimizer
    if optimizer_name == "adamw":
        # to use with ViTs
        self.optimizer = torch.optim.AdamW(param_groups)
    elif optimizer_name == "sgd":
        # lr is set by scheduler
        self.optimizer = torch.optim.SGD(param_groups, lr=0, momentum=0.9)
    elif optimizer_name == "lars":
        # to use with convnet and large batches
        self.optimizer = utils.LARS(param_groups)

    # ============ init schedulers ... ============
    device_total = torch.cuda.device_count()
    length = math.ceil(length / (args.accumulate * device_total))
    # Samples consumed per optimizer step; lr and min_lr both follow the
    # linear scaling rule relative to a batch of 256.
    samples_per_step = args.accumulate * args.batch_size_per_gpu * device_total
    peak_lr = args.lr * samples_per_step / 256.
    floor_lr = args.min_lr * samples_per_step / 256.
    self.lr_schedule = utils.cosine_scheduler(
        peak_lr,
        floor_lr,
        args.epochs,
        length,
        warmup_epochs=args.warmup_epochs,
    )
    self.wd_schedule = utils.cosine_scheduler(
        args.weight_decay, args.weight_decay_end, args.epochs, length)
    # print(length)
    # momentum parameter is increased to 1. during training with a cosine schedule
    self.momentum_schedule = utils.cosine_scheduler(
        args.momentum_teacher, 1, args.epochs, length)
    print(f"Loss, optimizer and schedulers ready.")

    self.val_loader = val_loader

    # NOTE(review): image_size is not a parameter of this method; it is
    # presumably a module-level name - confirm it is defined.
    augmentation_stages = [
        RandomApply(T.ColorJitter(0.8, 0.8, 0.8, 0.2), p=0.3),
        T.RandomGrayscale(p=0.2),
        T.RandomHorizontalFlip(),
        RandomApply(T.GaussianBlur((3, 3), (1.0, 2.0)), p=0.2),
        T.RandomResizedCrop((image_size, image_size)),
        T.Normalize(mean=torch.tensor([0.485, 0.456, 0.406]),
                    std=torch.tensor([0.229, 0.224, 0.225])),
    ]
    self.aug1 = torch.nn.Sequential(*augmentation_stages)
    # Both views share the very same pipeline object.
    self.aug2 = self.aug1
def main(args, ds_init):
    """Fine-tune (or only evaluate) a classification model.

    Steps, in order: distributed and seed setup, dataset/sampler/loader
    construction, optional mixup, model creation, optional loading of a
    pretrained checkpoint (with relative-position-bias and position-embedding
    resizing), optional EMA model, optimizer creation (DeepSpeed or DDP path),
    LR and weight-decay schedules, criterion selection, optional resume via
    ``utils.auto_load_model``, then either a single evaluation
    (``args.eval``) or the epoch loop with checkpointing, evaluation and a
    JSON line appended to ``log.txt`` per epoch.

    Args:
        args: Run configuration object. It is also mutated here:
            ``nb_classes``, ``window_size``, ``patch_size`` and (when it is
            ``None``) ``weight_decay_end`` are assigned.
        ds_init: ``None``, or a callable invoked with ``args``, ``model``,
            ``model_parameters`` and ``dist_init_required`` when
            ``args.enable_deepspeed`` is set. Presumably
            ``deepspeed.initialize`` - confirm with the caller.
    """
    utils.init_distributed_mode(args)

    if ds_init is not None:
        utils.create_ds_config(args)

    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    # (the rank is added so every process gets a different seed)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)

    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    if args.disable_eval_during_finetuning:
        dataset_val = None
    else:
        dataset_val, _ = build_dataset(is_train=False, args=args)

    # The distributed branch is forced on; the `else` below is unreachable.
    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        sampler_train = torch.utils.data.DistributedSampler(
            dataset_train,
            num_replicas=num_tasks,
            rank=global_rank,
            shuffle=True)
        print("Sampler_train = %s" % str(sampler_train))
        if args.dist_eval:
            # NOTE(review): dataset_val is None when
            # args.disable_eval_during_finetuning is set, so len() would raise
            # here if both options are enabled together.
            if len(dataset_val) % num_tasks != 0:
                print(
                    'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    # Only rank 0 creates a TensorBoard logger.
    if global_rank == 0 and args.log_dir is not None:
        os.makedirs(args.log_dir, exist_ok=True)
        log_writer = utils.TensorboardLogger(log_dir=args.log_dir)
    else:
        log_writer = None

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    if dataset_val is not None:
        # Validation uses a batch 1.5x the training batch size.
        data_loader_val = torch.utils.data.DataLoader(
            dataset_val,
            sampler=sampler_val,
            batch_size=int(1.5 * args.batch_size),
            num_workers=args.num_workers,
            pin_memory=args.pin_mem,
            drop_last=False)
    else:
        data_loader_val = None

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        print("Mixup is activated!")
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        attn_drop_rate=args.attn_drop_rate,
        drop_block_rate=None,
        use_mean_pooling=args.use_mean_pooling,
        init_scale=args.init_scale,
        use_rel_pos_bias=args.rel_pos_bias,
        use_abs_pos_emb=args.abs_pos_emb,
        init_values=args.layer_scale_init_value,
    )

    patch_size = model.patch_embed.patch_size
    print("Patch size = %s" % str(patch_size))
    # Number of patches along each side of the input.
    args.window_size = (args.input_size // patch_size[0],
                        args.input_size // patch_size[1])
    args.patch_size = patch_size

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.finetune,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')

        print("Load ckpt from %s" % args.finetune)
        # args.model_key is a '|'-separated list of candidate keys; the first
        # one present in the checkpoint wins, otherwise the checkpoint itself
        # is treated as the state dict.
        checkpoint_model = None
        for model_key in args.model_key.split('|'):
            if model_key in checkpoint:
                checkpoint_model = checkpoint[model_key]
                print("Load state_dict by model_key = %s" % model_key)
                break
        if checkpoint_model is None:
            checkpoint_model = checkpoint
        state_dict = model.state_dict()
        # Drop the classification head when its shape differs from the model's.
        for k in ['head.weight', 'head.bias']:
            if k in checkpoint_model and checkpoint_model[
                    k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]

        if model.use_rel_pos_bias and "rel_pos_bias.relative_position_bias_table" in checkpoint_model:
            print(
                "Expand the shared relative position embedding to each transformer block. "
            )
            # Copy the single shared table into one entry per block, then
            # remove the shared key.
            num_layers = model.get_num_layers()
            rel_pos_bias = checkpoint_model[
                "rel_pos_bias.relative_position_bias_table"]
            for i in range(num_layers):
                checkpoint_model["blocks.%d.attn.relative_position_bias_table"
                                 % i] = rel_pos_bias.clone()

            checkpoint_model.pop("rel_pos_bias.relative_position_bias_table")

        # Iterate over a snapshot of the keys because entries are popped and
        # replaced inside the loop.
        all_keys = list(checkpoint_model.keys())
        for key in all_keys:
            if "relative_position_index" in key:
                checkpoint_model.pop(key)

            if "relative_position_bias_table" in key:
                rel_pos_bias = checkpoint_model[key]
                src_num_pos, num_attn_heads = rel_pos_bias.size()
                dst_num_pos, _ = model.state_dict()[key].size()
                dst_patch_shape = model.patch_embed.patch_shape
                # Only square patch grids are handled.
                if dst_patch_shape[0] != dst_patch_shape[1]:
                    raise NotImplementedError()
                # Rows of the target table beyond the (2h-1)*(2w-1) relative
                # offsets are carried over unchanged as "extra tokens".
                num_extra_tokens = dst_num_pos - (
                    dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1)
                src_size = int((src_num_pos - num_extra_tokens)**0.5)
                dst_size = int((dst_num_pos - num_extra_tokens)**0.5)
                if src_size != dst_size:
                    print("Position interpolate for %s from %dx%d to %dx%d" %
                          (key, src_size, src_size, dst_size, dst_size))
                    # NOTE(review): if num_extra_tokens were 0, `[:-0]` would
                    # yield an empty tensor - presumably never the case here.
                    extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
                    rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]

                    def geometric_progression(a, r, n):
                        # Sum of the first n terms of a geometric series.
                        return a * (1.0 - r**n) / (1.0 - r)

                    # Bisection on the ratio q so that src_size // 2
                    # geometrically spaced steps span dst_size // 2.
                    left, right = 1.01, 1.5
                    while right - left > 1e-6:
                        q = (left + right) / 2.0
                        gp = geometric_progression(1, q, src_size // 2)
                        if gp > dst_size // 2:
                            right = q
                        else:
                            left = q

                    # if q > 1.090307:
                    #     q = 1.090307

                    # Source sample coordinates: cumulative geometric steps,
                    # mirrored around 0.
                    dis = []
                    cur = 1
                    for i in range(src_size // 2):
                        dis.append(cur)
                        cur += q**(i + 1)

                    r_ids = [-_ for _ in reversed(dis)]

                    x = r_ids + [0] + dis
                    y = r_ids + [0] + dis

                    # Target coordinates: unit steps from -t to t.
                    t = dst_size // 2.0
                    dx = np.arange(-t, t + 0.1, 1.0)
                    dy = np.arange(-t, t + 0.1, 1.0)

                    print("Original positions = %s" % str(x))
                    print("Target positions = %s" % str(dx))

                    all_rel_pos_bias = []

                    # Each attention head's table is resampled separately.
                    for i in range(num_attn_heads):
                        z = rel_pos_bias[:, i].view(src_size,
                                                    src_size).float().numpy()
                        # NOTE(review): presumably scipy.interpolate;
                        # interp2d is deprecated and removed in SciPy 1.14 -
                        # confirm the pinned SciPy version.
                        f = interpolate.interp2d(x, y, z, kind='cubic')
                        all_rel_pos_bias.append(
                            torch.Tensor(f(dx, dy)).contiguous().view(
                                -1, 1).to(rel_pos_bias.device))

                    rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)

                    new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens),
                                                 dim=0)
                    checkpoint_model[key] = new_rel_pos_bias

        # interpolate position embedding
        if 'pos_embed' in checkpoint_model:
            pos_embed_checkpoint = checkpoint_model['pos_embed']
            embedding_size = pos_embed_checkpoint.shape[-1]
            num_patches = model.patch_embed.num_patches
            num_extra_tokens = model.pos_embed.shape[-2] - num_patches
            # height (== width) for the checkpoint position embedding
            orig_size = int(
                (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5)
            # height (== width) for the new position embedding
            new_size = int(num_patches**0.5)
            # class_token and dist_token are kept unchanged
            if orig_size != new_size:
                print("Position interpolate from %dx%d to %dx%d" %
                      (orig_size, orig_size, new_size, new_size))
                extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
                # only the position tokens are interpolated
                pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
                # Reshape to a channels-first 2-D grid for interpolation.
                pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
                                                embedding_size).permute(
                                                    0, 3, 1, 2)
                pos_tokens = torch.nn.functional.interpolate(
                    pos_tokens,
                    size=(new_size, new_size),
                    mode='bicubic',
                    align_corners=False)
                # Back to a flat token sequence with channels last.
                pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
                new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
                checkpoint_model['pos_embed'] = new_pos_embed

        utils.load_state_dict(model,
                              checkpoint_model,
                              prefix=args.model_prefix)
        # model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')
        print("Using EMA with decay = %.8f" % args.model_ema_decay)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)

    print("Model = %s" % str(model_without_ddp))
    print('number of params:', n_parameters)

    # Effective batch per optimizer step across all processes.
    total_batch_size = args.batch_size * args.update_freq * utils.get_world_size(
    )
    num_training_steps_per_epoch = len(dataset_train) // total_batch_size
    print("LR = %.8f" % args.lr)
    print("Batch size = %d" % total_batch_size)
    print("Update frequent = %d" % args.update_freq)
    print("Number of training examples = %d" % len(dataset_train))
    print("Number of training training per epoch = %d" %
          num_training_steps_per_epoch)

    num_layers = model_without_ddp.get_num_layers()
    if args.layer_decay < 1.0:
        # num_layers + 2 multipliers; the exponent shrinks with i, so the last
        # entry is layer_decay**0 == 1.
        assigner = LayerDecayValueAssigner(
            list(args.layer_decay**(num_layers + 1 - i)
                 for i in range(num_layers + 2)))
    else:
        assigner = None

    if assigner is not None:
        print("Assigned values = %s" % str(assigner.values))

    skip_weight_decay_list = model.no_weight_decay()
    if args.disable_weight_decay_on_rel_pos_bias:
        for i in range(num_layers):
            skip_weight_decay_list.add(
                "blocks.%d.attn.relative_position_bias_table" % i)

    if args.enable_deepspeed:
        # On this path ds_init returns the wrapped model and the optimizer;
        # no separate loss scaler is created.
        loss_scaler = None
        optimizer_params = get_parameter_groups(
            model, args.weight_decay, skip_weight_decay_list,
            assigner.get_layer_id if assigner is not None else None,
            assigner.get_scale if assigner is not None else None)
        model, optimizer, _, _ = ds_init(
            args=args,
            model=model,
            model_parameters=optimizer_params,
            dist_init_required=not args.distributed,
        )

        print("model.gradient_accumulation_steps() = %d" %
              model.gradient_accumulation_steps())
        assert model.gradient_accumulation_steps() == args.update_freq
    else:
        if args.distributed:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], find_unused_parameters=True)
            model_without_ddp = model.module

        optimizer = create_optimizer(args,
                                     model_without_ddp,
                                     skip_list=skip_weight_decay_list,
                                     get_num_layer=assigner.get_layer_id
                                     if assigner is not None else None,
                                     get_layer_scale=assigner.get_scale
                                     if assigner is not None else None)
        loss_scaler = NativeScaler()

    print("Use step level LR scheduler!")
    lr_schedule_values = utils.cosine_scheduler(
        args.lr,
        args.min_lr,
        args.epochs,
        num_training_steps_per_epoch,
        warmup_epochs=args.warmup_epochs,
        warmup_steps=args.warmup_steps,
    )
    # Without an explicit end value the weight decay stays constant.
    if args.weight_decay_end is None:
        args.weight_decay_end = args.weight_decay
    wd_schedule_values = utils.cosine_scheduler(args.weight_decay,
                                                args.weight_decay_end,
                                                args.epochs,
                                                num_training_steps_per_epoch)
    print("Max WD = %.7f, Min WD = %.7f" %
          (max(wd_schedule_values), min(wd_schedule_values)))

    if mixup_fn is not None:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing > 0.:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    print("criterion = %s" % str(criterion))

    utils.auto_load_model(args=args,
                          model=model,
                          model_without_ddp=model_without_ddp,
                          optimizer=optimizer,
                          loss_scaler=loss_scaler,
                          model_ema=model_ema)

    if args.eval:
        # Evaluation-only mode: report accuracy and terminate the process.
        # NOTE(review): data_loader_val and dataset_val are None when
        # args.disable_eval_during_finetuning is set - confirm the two flags
        # are never combined.
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        exit(0)

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)
        if log_writer is not None:
            log_writer.set_step(epoch * num_training_steps_per_epoch *
                                args.update_freq)
        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            model_ema,
            mixup_fn,
            log_writer=log_writer,
            start_steps=epoch * num_training_steps_per_epoch,
            lr_schedule_values=lr_schedule_values,
            wd_schedule_values=wd_schedule_values,
            num_training_steps_per_epoch=num_training_steps_per_epoch,
            update_freq=args.update_freq,
        )
        if args.output_dir and args.save_ckpt:
            # Save every save_ckpt_freq epochs and always after the last one.
            if (epoch +
                    1) % args.save_ckpt_freq == 0 or epoch + 1 == args.epochs:
                utils.save_model(args=args,
                                 model=model,
                                 model_without_ddp=model_without_ddp,
                                 optimizer=optimizer,
                                 loss_scaler=loss_scaler,
                                 epoch=epoch,
                                 model_ema=model_ema)
        if data_loader_val is not None:
            test_stats = evaluate(data_loader_val, model, device)
            print(
                f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
            )
            # A new best top-1 accuracy also triggers a "best" checkpoint.
            if max_accuracy < test_stats["acc1"]:
                max_accuracy = test_stats["acc1"]
                if args.output_dir and args.save_ckpt:
                    utils.save_model(args=args,
                                     model=model,
                                     model_without_ddp=model_without_ddp,
                                     optimizer=optimizer,
                                     loss_scaler=loss_scaler,
                                     epoch="best",
                                     model_ema=model_ema)

            print(f'Max accuracy: {max_accuracy:.2f}%')
            if log_writer is not None:
                log_writer.update(test_acc1=test_stats['acc1'],
                                  head="perf",
                                  step=epoch)
                log_writer.update(test_acc5=test_stats['acc5'],
                                  head="perf",
                                  step=epoch)
                log_writer.update(test_loss=test_stats['loss'],
                                  head="perf",
                                  step=epoch)

            log_stats = {
                **{f'train_{k}': v
                   for k, v in train_stats.items()},
                **{f'test_{k}': v
                   for k, v in test_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }
        else:
            # No validation loader: log training statistics only.
            log_stats = {
                **{f'train_{k}': v
                   for k, v in train_stats.items()},
                # **{f'test_{k}': v for k, v in test_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }

        if args.output_dir and utils.is_main_process():
            if log_writer is not None:
                log_writer.flush()
            with open(os.path.join(args.output_dir, "log.txt"),
                      mode="a",
                      encoding="utf-8") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Pre-train a model on the BEiT pretraining dataset.

    Sets up distributed mode and seeding, builds the model, the dataset, the
    discrete VAE and the training loader, creates the optimizer and the
    per-step LR / weight-decay schedules, resumes through
    ``utils.auto_load_model`` and then runs the epoch loop. Checkpoints are
    written every ``args.save_ckpt_freq`` epochs and after the last epoch, and
    one JSON line per epoch is appended to ``<args.output_dir>/log.txt``.

    Args:
        args: Run configuration object. ``window_size``, ``patch_size`` and
            (when it is ``None``) ``weight_decay_end`` are assigned on it.
    """
    utils.init_distributed_mode(args)

    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility; the rank offset gives every process
    # its own seed
    rank_seed = args.seed + utils.get_rank()
    torch.manual_seed(rank_seed)
    np.random.seed(rank_seed)
    # random.seed(seed)

    cudnn.benchmark = True

    model = get_model(args)
    patch_hw = model.patch_embed.patch_size
    print("Patch size = %s" % str(patch_hw))
    window_rows = args.input_size // patch_hw[0]
    window_cols = args.input_size // patch_hw[1]
    args.window_size = (window_rows, window_cols)
    args.patch_size = patch_hw

    # get dataset
    dataset_train = build_beit_pretraining_dataset(args)

    # prepare discrete vae
    d_vae = utils.create_d_vae(
        weight_path=args.discrete_vae_weight_path,
        d_vae_type=args.discrete_vae_type,
        device=device,
        image_size=args.second_input_size,
    )

    # The distributed sampler is used unconditionally, whatever
    # args.distributed says.
    world_size = utils.get_world_size()
    rank = utils.get_rank()
    steps_per_epoch = len(dataset_train) // args.batch_size // world_size
    sampler_train = torch.utils.data.DistributedSampler(
        dataset_train, num_replicas=world_size, rank=rank, shuffle=True)
    print("Sampler_train = %s" % str(sampler_train))

    # Only rank 0 gets a TensorBoard logger, and only when a log dir is set.
    log_writer = None
    if rank == 0 and args.log_dir is not None:
        os.makedirs(args.log_dir, exist_ok=True)
        log_writer = utils.TensorboardLogger(log_dir=args.log_dir)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    model.to(device)
    model_without_ddp = model
    n_parameters = sum(
        param.numel() for param in model.parameters() if param.requires_grad)

    print("Model = %s" % str(model_without_ddp))
    print('number of params:', n_parameters)

    total_batch_size = args.batch_size * utils.get_world_size()
    print("LR = %.8f" % args.lr)
    print("Batch size = %d" % total_batch_size)
    print("Number of training steps = %d" % steps_per_epoch)
    print("Number of training examples per epoch = %d" %
          (total_batch_size * steps_per_epoch))

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()

    print("Use step level LR & WD scheduler!")
    lr_schedule_values = utils.cosine_scheduler(
        args.lr,
        args.min_lr,
        args.epochs,
        steps_per_epoch,
        warmup_epochs=args.warmup_epochs,
        warmup_steps=args.warmup_steps,
    )
    # Without an explicit end value the weight decay stays constant.
    if args.weight_decay_end is None:
        args.weight_decay_end = args.weight_decay
    wd_schedule_values = utils.cosine_scheduler(
        args.weight_decay, args.weight_decay_end, args.epochs, steps_per_epoch)
    wd_high = max(wd_schedule_values)
    wd_low = min(wd_schedule_values)
    print("Max WD = %.7f, Min WD = %.7f" % (wd_high, wd_low))

    utils.auto_load_model(
        args=args,
        model=model,
        model_without_ddp=model_without_ddp,
        optimizer=optimizer,
        loss_scaler=loss_scaler,
    )

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        # Global step index at which this epoch starts.
        first_step = epoch * steps_per_epoch
        if log_writer is not None:
            log_writer.set_step(first_step)

        train_stats = train_one_epoch(
            model,
            d_vae,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            log_writer=log_writer,
            start_steps=first_step,
            lr_schedule_values=lr_schedule_values,
            wd_schedule_values=wd_schedule_values,
        )

        # Save every save_ckpt_freq epochs and always after the last one.
        if args.output_dir and ((epoch + 1) % args.save_ckpt_freq == 0
                                or epoch + 1 == args.epochs):
            utils.save_model(
                args=args,
                model=model,
                model_without_ddp=model_without_ddp,
                optimizer=optimizer,
                loss_scaler=loss_scaler,
                epoch=epoch,
            )

        log_stats = {f'train_{name}': value
                     for name, value in train_stats.items()}
        log_stats['epoch'] = epoch
        log_stats['n_parameters'] = n_parameters

        if args.output_dir and utils.is_main_process():
            if log_writer is not None:
                log_writer.flush()
            log_path = os.path.join(args.output_dir, "log.txt")
            with open(log_path, mode="a", encoding="utf-8") as f:
                f.write(json.dumps(log_stats) + "\n")

    elapsed_seconds = int(time.time() - start_time)
    total_time_str = str(datetime.timedelta(seconds=elapsed_seconds))
    print('Training time {}'.format(total_time_str))
def main():
    """Fine-tune a model on the flower dataset with a custom training loop.

    All settings are hard-coded below. The function builds the train and
    validation datasets, creates the model and loads pretrained weights from
    ``./convnext_tiny_1k_224.h5``, then trains for ``epochs`` epochs with SGD
    and a cosine learning-rate scheduler, writes loss/accuracy summaries for
    both splits, and saves the weights to ``./save_weights/model.ckpt``
    whenever the validation accuracy improves.

    Raises:
        FileNotFoundError: If the pretrained weights file does not exist.
    """
    data_root = "/data/flower_photos"  # get data root path

    # exist_ok avoids the check-then-create race of testing for the directory
    # first.
    os.makedirs("./save_weights", exist_ok=True)

    batch_size = 8
    epochs = 10
    num_classes = 5
    freeze_layers = False
    initial_lr = 0.005
    weight_decay = 5e-4

    log_dir = "./logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_writer = tf.summary.create_file_writer(os.path.join(log_dir, "train"))
    val_writer = tf.summary.create_file_writer(os.path.join(log_dir, "val"))

    # data generator with data augmentation
    train_ds, val_ds = generate_ds(data_root,
                                   batch_size=batch_size,
                                   val_rate=0.2)

    # create model
    model = create_model(num_classes=num_classes)
    model.build((1, 224, 224, 3))

    # Download the pretrained weights converted in advance by the author.
    # Link: https://pan.baidu.com/s/1MtYJ3FCAkiPwaMRKuyZN1Q  Password: 1cgp
    # load weights
    pre_weights_path = './convnext_tiny_1k_224.h5'
    # An explicit raise (rather than an assert) keeps this check active when
    # Python runs with -O.
    if not os.path.exists(pre_weights_path):
        raise FileNotFoundError("cannot find {}".format(pre_weights_path))
    model.load_weights(pre_weights_path, by_name=True, skip_mismatch=True)

    # freeze bottom layers
    if freeze_layers:
        for layer in model.layers:
            if "head" not in layer.name:
                layer.trainable = False
            else:
                print("training {}".format(layer.name))

    model.summary()

    # custom learning rate scheduler
    scheduler = cosine_scheduler(initial_lr,
                                 epochs,
                                 len(train_ds),
                                 train_writer=train_writer)

    # using keras low level api for training
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    optimizer = tf.keras.optimizers.SGD(learning_rate=initial_lr, momentum=0.9)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')

    val_loss = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='val_accuracy')

    # Variables whose name contains bias/gamma/beta are excluded from the L2
    # penalty. Compiled once here instead of inside the training step.
    no_decay_pattern = re.compile(r".*(bias|gamma|beta).*")

    @tf.function
    def train_step(train_images, train_labels):
        """Run one optimisation step and update the training metrics."""
        with tf.GradientTape() as tape:
            output = model(train_images, training=True)
            ce_loss = loss_object(train_labels, output)

            # l2 loss
            l2loss = weight_decay * tf.add_n([
                tf.nn.l2_loss(v)
                for v in model.trainable_variables
                if not no_decay_pattern.match(v.name)
            ])

            loss = ce_loss + l2loss

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Only the cross-entropy part is reported, not the L2 penalty.
        train_loss(ce_loss)
        train_accuracy(train_labels, output)

    @tf.function
    def val_step(val_images, val_labels):
        """Evaluate one batch and update the validation metrics."""
        output = model(val_images, training=False)
        loss = loss_object(val_labels, output)

        val_loss(loss)
        val_accuracy(val_labels, output)

    best_val_acc = 0.
    for epoch in range(epochs):
        train_loss.reset_states()  # clear history info
        train_accuracy.reset_states()  # clear history info
        val_loss.reset_states()  # clear history info
        val_accuracy.reset_states()  # clear history info

        # train
        train_bar = tqdm(train_ds, file=sys.stdout)
        for images, labels in train_bar:
            # update learning rate
            optimizer.learning_rate = next(scheduler)

            train_step(images, labels)

            # print train process
            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}, acc:{:.3f}, lr:{:.5f}".format(
                epoch + 1, epochs, train_loss.result(),
                train_accuracy.result(), optimizer.learning_rate.numpy())

        # validate
        val_bar = tqdm(val_ds, file=sys.stdout)
        for images, labels in val_bar:
            val_step(images, labels)

            # print val process
            val_bar.desc = "valid epoch[{}/{}] loss:{:.3f}, acc:{:.3f}".format(
                epoch + 1, epochs, val_loss.result(), val_accuracy.result())

        # writing training loss and acc
        with train_writer.as_default():
            tf.summary.scalar("loss", train_loss.result(), epoch)
            tf.summary.scalar("accuracy", train_accuracy.result(), epoch)

        # writing validation loss and acc
        with val_writer.as_default():
            tf.summary.scalar("loss", val_loss.result(), epoch)
            tf.summary.scalar("accuracy", val_accuracy.result(), epoch)

        # only save best weights
        epoch_val_acc = val_accuracy.result()
        if epoch_val_acc > best_val_acc:
            best_val_acc = epoch_val_acc
            save_name = "./save_weights/model.ckpt"
            model.save_weights(save_name, save_format="tf")