                       batch_size=batch_size, num_workers=num_workers, pin_memory=True,
                       drop_last=True, collate_fn=yolo_dataset_collate)

#------------------------------------#
#   Freeze part of the network for training
#------------------------------------#
if Freeze_Train:
    for param in model.backbone.parameters():
        param.requires_grad = False

for epoch in range(start_epoch, end_epoch):
    fit_one_epoch(model_train, model, yolo_loss, loss_history, optimizer, epoch,
                  epoch_step, epoch_step_val, gen, gen_val, end_epoch, Cuda)
    lr_scheduler.step()

if True:
    batch_size  = Unfreeze_batch_size
    lr          = Unfreeze_lr
    start_epoch = Freeze_Epoch
    end_epoch   = UnFreeze_Epoch

    epoch_step     = num_train // batch_size
    epoch_step_val = num_val // batch_size

    if epoch_step == 0 or epoch_step_val == 0:
        raise ValueError("The dataset is too small to train on. Please expand the dataset.")
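#--------------------------------------------------------------------------#
#   For reference only: a minimal sketch of what a collate function such as
#   yolo_dataset_collate typically does. The function name and the item
#   layout (image array, box array) below are assumptions, not this repo's
#   code. Boxes are kept as a per-image list because each image may contain
#   a different number of objects, so they cannot be stacked into one tensor.
#--------------------------------------------------------------------------#
import numpy as np
import torch

def example_yolo_dataset_collate(batch):
    # Stack the images into a single float tensor ...
    images = torch.from_numpy(np.array([image for image, _ in batch])).float()
    # ... but keep the variable-length box arrays as a list of tensors.
    bboxes = [torch.from_numpy(np.array(boxes)).float() for _, boxes in batch]
    return images, bboxes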
gen     = tf.data.Dataset.from_generator(partial(train_dataloader.generate),
                                         (tf.float32, tf.float32, tf.float32, tf.float32))
gen_val = tf.data.Dataset.from_generator(partial(val_dataloader.generate),
                                         (tf.float32, tf.float32, tf.float32, tf.float32))

gen     = gen.shuffle(buffer_size=batch_size).prefetch(buffer_size=batch_size)
gen_val = gen_val.shuffle(buffer_size=batch_size).prefetch(buffer_size=batch_size)

if ngpus_per_node > 1:
    gen     = strategy.experimental_distribute_dataset(gen)
    gen_val = strategy.experimental_distribute_dataset(gen_val)

UnFreeze_flag = True

lr = lr_scheduler_func(epoch)
K.set_value(optimizer.lr, lr)

fit_one_epoch(model_body, loss_history, optimizer, epoch, epoch_step, epoch_step_val,
              gen, gen_val, end_epoch, input_shape, anchors, anchors_mask, num_classes,
              label_smoothing, focal_loss, focal_alpha, focal_gamma, save_period,
              save_dir, strategy)

train_dataloader.on_epoch_end()
val_dataloader.on_epoch_end()
else:
    start_epoch = Init_Epoch
    end_epoch   = Freeze_Epoch if Freeze_Train else UnFreeze_Epoch

    if ngpus_per_node > 1:
        with strategy.scope():
            model.compile(optimizer=optimizer,
                          loss={'yolo_loss': lambda y_true, y_pred: y_pred})
    else:
        model.compile(optimizer=optimizer,
                      loss={'yolo_loss': lambda y_true, y_pred: y_pred})

#-------------------------------------------------------------------------------#
#   Training parameter setup
#   logging sets the save directory for TensorBoard
train_dataset = PSPnetDataset(train_lines, input_shape, num_classes, True, VOCdevkit_path)
val_dataset   = PSPnetDataset(val_lines, input_shape, num_classes, False, VOCdevkit_path)
gen           = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers,
                           pin_memory=True, drop_last=True, collate_fn=pspnet_dataset_collate)
gen_val       = DataLoader(val_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers,
                           pin_memory=True, drop_last=True, collate_fn=pspnet_dataset_collate)

#------------------------------------#
#   Freeze part of the network for training
#------------------------------------#
if Freeze_Train:
    for param in model.backbone.parameters():
        param.requires_grad = False

for epoch in range(start_epoch, end_epoch):
    fit_one_epoch(model_train, model, loss_history, optimizer, epoch,
                  epoch_step, epoch_step_val, gen, gen_val, end_epoch, Cuda,
                  dice_loss, focal_loss, cls_weights, aux_branch, num_classes)
    lr_scheduler.step()

if True:
    batch_size  = Unfreeze_batch_size
    lr          = Unfreeze_lr
    start_epoch = Freeze_Epoch
    end_epoch   = UnFreeze_Epoch

    epoch_step     = len(train_lines) // batch_size
    epoch_step_val = len(val_lines) // batch_size

    if epoch_step == 0 or epoch_step_val == 0:
        raise ValueError("The dataset is too small to train on. Please expand the dataset.")

    optimizer = optim.Adam(model_train.parameters(), lr)
nbs         = 64
Init_lr_fit = max(batch_size / nbs * Init_lr, 1e-4)
Min_lr_fit  = max(batch_size / nbs * Min_lr, 1e-6)

#---------------------------------------#
#   Build the learning rate decay schedule
#---------------------------------------#
lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)

for param in model.backbone.parameters():
    param.requires_grad = True

epoch_step     = num_train // batch_size
epoch_step_val = num_val // batch_size

if epoch_step == 0 or epoch_step_val == 0:
    raise ValueError("The dataset is too small to continue training. Please expand the dataset.")

gen     = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers,
                     pin_memory=True, drop_last=True, collate_fn=yolo_dataset_collate)
gen_val = DataLoader(val_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers,
                     pin_memory=True, drop_last=True, collate_fn=yolo_dataset_collate)

UnFreeze_flag = True

gen.dataset.epoch_now     = epoch
gen_val.dataset.epoch_now = epoch

set_optimizer_lr(optimizer, lr_scheduler_func, epoch)

fit_one_epoch(model_train, model, yolo_loss, loss_history, optimizer, epoch,
              epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, save_period)
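#--------------------------------------------------------------------------#
#   For reference only: one plausible shape for the two schedule helpers
#   used above, assuming get_lr_scheduler returns a function of the epoch
#   and set_optimizer_lr writes its value into every parameter group.
#   These sketches are assumptions, not this repo's implementation.
#--------------------------------------------------------------------------#
import math
from functools import partial

def example_cos_lr(init_lr, min_lr, total_epochs, epoch):
    # Cosine annealing from init_lr down to min_lr over total_epochs.
    return min_lr + 0.5 * (init_lr - min_lr) * (1 + math.cos(math.pi * epoch / total_epochs))

def example_get_lr_scheduler(lr_decay_type, init_lr, min_lr, total_epochs):
    # Only the cosine branch is sketched here.
    return partial(example_cos_lr, init_lr, min_lr, total_epochs)

def example_set_optimizer_lr(optimizer, lr_scheduler_func, epoch):
    lr = lr_scheduler_func(epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr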
epoch_step     = num_train // batch_size
epoch_step_val = num_val // batch_size

if epoch_step == 0 or epoch_step_val == 0:
    raise ValueError("The dataset is too small to continue training. Please expand the dataset.")

if distributed:
    batch_size = batch_size // ngpus_per_node

gen     = DataLoader(train_dataset, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers,
                     pin_memory=True, drop_last=True, collate_fn=ssd_dataset_collate, sampler=train_sampler)
gen_val = DataLoader(val_dataset, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers,
                     pin_memory=True, drop_last=True, collate_fn=ssd_dataset_collate, sampler=val_sampler)

UnFreeze_flag = True

if distributed:
    train_sampler.set_epoch(epoch)

set_optimizer_lr(optimizer, lr_scheduler_func, epoch)

fit_one_epoch(model_train, model, criterion, loss_history, eval_callback, optimizer, epoch,
              epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, fp16, scaler,
              save_period, save_dir, local_rank)

if distributed:
    dist.barrier()

if local_rank == 0:
    loss_history.writer.close()
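#--------------------------------------------------------------------------#
#   For reference only: the samplers used above are typically built with
#   torch.utils.data.distributed.DistributedSampler. A minimal sketch,
#   assuming torch.distributed has already been initialised; note that
#   DataLoader's own shuffle must be disabled when a sampler is supplied.
#--------------------------------------------------------------------------#
from torch.utils.data.distributed import DistributedSampler

if distributed:
    train_sampler = DistributedSampler(train_dataset, shuffle=True)
    val_sampler   = DistributedSampler(val_dataset, shuffle=False)
    shuffle       = False
else:
    train_sampler = None
    val_sampler   = None
    shuffle       = True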
gen_val = gen_val.shuffle(buffer_size=Batch_size).prefetch(buffer_size=Batch_size)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=Lr, decay_steps=epoch_step, decay_rate=0.94, staircase=True)

print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, Batch_size))

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

for epoch in range(Init_epoch, Freeze_epoch):
    fit_one_epoch(model, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Freeze_epoch)
else:
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=Lr), metrics=["binary_accuracy"])
    model.fit_generator(generator.generate(True),
                        steps_per_epoch=epoch_step,
                        validation_data=generator.generate(False),
                        validation_steps=epoch_step_val,
                        epochs=Freeze_epoch,
                        initial_epoch=Init_epoch,
                        callbacks=[checkpoint_period, reduce_lr, early_stopping, tensorboard])
                                 buffer_size=batch_size)
gen_val = gen_val.shuffle(buffer_size=batch_size).prefetch(buffer_size=batch_size)

if ngpus_per_node > 1:
    gen     = strategy.experimental_distribute_dataset(gen)
    gen_val = strategy.experimental_distribute_dataset(gen_val)

UnFreeze_flag = True

lr = lr_scheduler_func(epoch)
K.set_value(optimizer.lr, lr)

fit_one_epoch(model, multiloss, loss_history, eval_callback, optimizer, epoch,
              epoch_step, epoch_step_val, gen, gen_val, end_epoch, save_period,
              save_dir, strategy)

train_dataloader.on_epoch_end()
val_dataloader.on_epoch_end()
else:
    start_epoch = Init_Epoch
    end_epoch   = Freeze_Epoch if Freeze_Train else UnFreeze_Epoch

    if ngpus_per_node > 1:
        with strategy.scope():
            model.compile(optimizer=optimizer,
                          loss=MultiboxLoss(num_classes, neg_pos_ratio=3.0).compute_loss)
    else:
        model.compile(optimizer=optimizer,
#------------------------------------------------------#
#   Init_Epoch is the starting epoch
#   Epoch is the total number of training epochs
#   If you see OOM / out-of-GPU-memory errors, reduce Batch_size
#------------------------------------------------------#
if True:
    epoch_step = num_train // batch_size
    if epoch_step == 0:
        raise ValueError("The dataset is too small to train on. Please expand the dataset.")

    D_model.compile(loss="binary_crossentropy", optimizer=Adam(lr, 0.5, 0.999))

    # Freeze the discriminator inside the combined model so that only
    # the generator is updated by the adversarial loss.
    D_model.trainable = False
    noise = layers.Input(shape=(100,))
    img   = G_model(noise)
    valid = D_model(img)
    Combine_model = Model(noise, valid)
    Combine_model.compile(loss="binary_crossentropy", optimizer=Adam(lr, 0.5, 0.999))

    gen = DCganDataset(lines, input_shape, batch_size)

    for epoch in range(Init_epoch, Epoch):
        fit_one_epoch(G_model, D_model, Combine_model, epoch, epoch_step, gen,
                      Epoch, batch_size, save_interval)

        # Decay both learning rates by 1% per epoch.
        lr = K.get_value(Combine_model.optimizer.lr) * 0.99
        K.set_value(Combine_model.optimizer.lr, lr)

        lr = K.get_value(D_model.optimizer.lr) * 0.99
        K.set_value(D_model.optimizer.lr, lr)
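#--------------------------------------------------------------------------#
#   For reference only: a sketch of the adversarial step fit_one_epoch
#   plausibly performs with the models compiled above. Because D_model was
#   compiled before D_model.trainable = False, it still learns on its own
#   batches, while inside Combine_model the discriminator stays frozen.
#   The helper name and the exact batch handling are assumptions.
#--------------------------------------------------------------------------#
import numpy as np

def example_gan_step(G_model, D_model, Combine_model, real_images, batch_size):
    valid = np.ones((batch_size, 1))
    fake  = np.zeros((batch_size, 1))
    noise = np.random.normal(0, 1, (batch_size, 100))

    # Train the discriminator on real and generated images.
    gen_images  = G_model.predict(noise)
    d_loss_real = D_model.train_on_batch(real_images, valid)
    d_loss_fake = D_model.train_on_batch(gen_images, fake)

    # Train the generator (through the frozen discriminator) to have
    # its samples classified as real.
    g_loss = Combine_model.train_on_batch(noise, valid)
    return 0.5 * (d_loss_real + d_loss_fake), g_loss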
gen = tf.data.Dataset.from_generator(partial(gen.generate),
                                     (tf.float32, tf.float32, tf.float32, tf.float32))
gen = gen.shuffle(buffer_size=batch_size).prefetch(buffer_size=batch_size)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=learning_rate_base, decay_steps=epoch_size,
    decay_rate=0.92, staircase=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

for epoch in range(Init_epoch, Freeze_epoch):
    fit_one_epoch(model, loss_history, optimizer, epoch, epoch_size, gen, Freeze_epoch, cfg)
else:
    model.compile(loss={
                      'bbox_reg': box_smooth_l1(weights=cfg['loc_weight']),
                      'cls': conf_loss(),
                      'ldm_reg': ldm_smooth_l1()
                  },
                  optimizer=keras.optimizers.Adam(lr=learning_rate_base))
    model.fit_generator(generator=gen,
                        steps_per_epoch=epoch_size,
                        epochs=Freeze_epoch,
                        initial_epoch=Init_epoch,
                        use_multiprocessing=True if num_workers > 1 else False,
                                 buffer_size=batch_size)
gen_val = gen_val.shuffle(buffer_size=batch_size).prefetch(buffer_size=batch_size)

if ngpus_per_node > 1:
    gen     = strategy.experimental_distribute_dataset(gen)
    gen_val = strategy.experimental_distribute_dataset(gen_val)

UnFreeze_flag = True

lr = lr_scheduler_func(epoch)
K.set_value(optimizer.lr, lr)

fit_one_epoch(model, loss_history, optimizer, epoch, epoch_step, epoch_step_val,
              gen, gen_val, end_epoch, save_period, save_dir, strategy)

train_dataloader.on_epoch_end()
val_dataloader.on_epoch_end()
else:
    start_epoch = Init_Epoch
    end_epoch   = Freeze_Epoch if Freeze_Train else UnFreeze_Epoch

    if ngpus_per_node > 1:
        with strategy.scope():
            model.compile(optimizer=optimizer,
                          loss={'centernet_loss': lambda y_true, y_pred: y_pred})
    else:
        initial_learning_rate=lr, first_decay_steps=5 * epoch_step, t_mul=1.0, alpha=1e-2)
else:
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=lr, decay_steps=epoch_step, decay_rate=0.94, staircase=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

for epoch in range(start_epoch, end_epoch):
    fit_one_epoch(model_body, loss_history, optimizer, epoch, epoch_step, epoch_step_val,
                  gen, gen_val, end_epoch, input_shape, anchors, anchors_mask,
                  num_classes, label_smoothing)
else:
    model.compile(optimizer=Adam(lr=lr),
                  loss={'yolo_loss': lambda y_true, y_pred: y_pred})
    model.fit_generator(generator=train_dataloader,
                        steps_per_epoch=epoch_step,
                        validation_data=val_dataloader,
                        validation_steps=epoch_step_val,
                        epochs=end_epoch,
                        initial_epoch=start_epoch,
                     anchors, batch_size, num_classes, train=True).generate()
gen_val = FRCNNDatasets(val_lines, input_shape, anchors, batch_size,
                        num_classes, train=False).generate()

print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))

for epoch in range(start_epoch, end_epoch):
    fit_one_epoch(model_rpn, model_all, loss_history, callback, epoch,
                  epoch_step, epoch_step_val, gen, gen_val, end_epoch,
                  anchors, bbox_util, roi_helper)
    # Decay the learning rate of both models by 4% per epoch.
    lr = lr * 0.96
    K.set_value(model_rpn.optimizer.lr, lr)
    K.set_value(model_all.optimizer.lr, lr)

if Freeze_Train:
    # Unfreeze the frozen layers, except BatchNormalization layers.
    for i in range(freeze_layers):
        if type(model_all.layers[i]) != tf.keras.layers.BatchNormalization:
            model_all.layers[i].trainable = True

if True:
    batch_size  = Unfreeze_batch_size
    lr          = Unfreeze_lr
    start_epoch = Freeze_Epoch
    end_epoch   = UnFreeze_Epoch
val_dataloader.batch_size = batch_size

gen_enqueuer.stop()
gen_val_enqueuer.stop()

#---------------------------------------#
#   Build the multithreaded data loaders
#---------------------------------------#
gen_enqueuer = OrderedEnqueuer(train_dataloader,
                               use_multiprocessing=True if num_workers > 1 else False,
                               shuffle=True)
gen_val_enqueuer = OrderedEnqueuer(val_dataloader,
                                   use_multiprocessing=True if num_workers > 1 else False,
                                   shuffle=True)
gen_enqueuer.start(workers=num_workers, max_queue_size=10)
gen_val_enqueuer.start(workers=num_workers, max_queue_size=10)
gen     = gen_enqueuer.get()
gen_val = gen_val_enqueuer.get()

UnFreeze_flag = True

lr = lr_scheduler_func(epoch)
K.set_value(optimizer.lr, lr)

fit_one_epoch(model_rpn, model_all, model_all_body, loss_history, eval_callback,
              callback, epoch, epoch_step, epoch_step_val, gen, gen_val,
              UnFreeze_Epoch, anchors, bbox_util, roi_helper, save_period, save_dir)
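#--------------------------------------------------------------------------#
#   For reference only: OrderedEnqueuer.get() returns a generator that
#   yields batches indefinitely, so each epoch must be bounded by
#   epoch_step, and the worker processes should be stopped once training
#   finishes. A minimal consumption sketch under those assumptions:
#--------------------------------------------------------------------------#
gen = gen_enqueuer.get()
for iteration in range(epoch_step):
    batch = next(gen)   # whatever train_dataloader.__getitem__ returns
gen_enqueuer.stop()     # join the worker processes when done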