if ngpus_per_node > 1:
    # Use the distribution strategy when training on multiple GPUs
    # (condition truncated in the source; ngpus_per_node > 1 assumed).
    with strategy.scope():
        model.compile(optimizer=optimizer,
                      loss={'yolo_loss': lambda y_true, y_pred: y_pred})
else:
    model.compile(optimizer=optimizer,
                  loss={'yolo_loss': lambda y_true, y_pred: y_pred})

#-------------------------------------------------------------------------------#
#   Training parameter setup
#   logging         sets the directory where TensorBoard logs are saved
#   checkpoint      controls how weights are saved; period sets how many
#                   epochs pass between saves
#   lr_scheduler    sets how the learning rate decays
#   early_stopping  stops training automatically once val_loss has stopped
#                   improving for several epochs, i.e. the model has
#                   essentially converged
#-------------------------------------------------------------------------------#
time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d_%H_%M_%S')
log_dir = os.path.join(save_dir, "loss_" + str(time_str))
logging = TensorBoard(log_dir)
loss_history = LossHistory(log_dir)
checkpoint = ModelCheckpoint(
    os.path.join(save_dir, "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5"),
    monitor='val_loss', save_weights_only=True, save_best_only=False,
    period=save_period)
checkpoint_last = ModelCheckpoint(
    os.path.join(save_dir, "last_epoch_weights.h5"),
    monitor='val_loss', save_weights_only=True, save_best_only=False, period=1)
checkpoint_best = ModelCheckpoint(
    os.path.join(save_dir, "best_epoch_weights.h5"),
    monitor='val_loss', save_weights_only=True, save_best_only=True, period=1)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10,
                               verbose=1)
lr_scheduler = LearningRateScheduler(lr_scheduler_func, verbose=1)
callbacks = [logging, loss_history, checkpoint, checkpoint_last,
             checkpoint_best, lr_scheduler]

if start_epoch < end_epoch:
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(
        num_train, num_val, batch_size))
    model.fit(
        x=train_dataloader,
        steps_per_epoch=epoch_step,
        validation_data=val_dataloader,
        validation_steps=epoch_step_val,
        epochs=end_epoch,
        initial_epoch=start_epoch,
        callbacks=callbacks)
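# `lr_scheduler_func` is defined elsewhere in the repository. As a minimal
# sketch, a function compatible with Keras' LearningRateScheduler only needs
# to map the epoch index to a learning rate; the constants below are
# illustrative assumptions, not the project's real hyperparameters.
def lr_scheduler_func(epoch):
    init_lr, decay_rate = 1e-3, 0.94  # assumed base LR and per-epoch decay
    return init_lr * (decay_rate ** epoch)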
def main(train_list, val_list, model, exp, saved_model, batch_size,
         optimizer, nb_epochs, augment, max_lr, min_lr, loss_function,
         train_all, nb_frames, eager, params=None, **kwargs):
    print("Unused arguments:", kwargs)
    setname = train_list.split(os.sep)[0]

    # Timestamp to name experiment folder
    xptime = strftime("%Y-%m-%d_%Hh%Mm%Ss", gmtime())
    xp_folder = "experiments/%s-%s-%s_%s" % (setname, model, exp, xptime)

    # Make folder
    mkdir_p(xp_folder)
    mkdir_p(os.path.join(xp_folder, 'checkpoints'))
    mkdir_p(os.path.join(xp_folder, 'tb'))
    print("\nSaving experiment data to:", xp_folder)

    # Save command (as well as possible)
    with open(os.path.join(xp_folder, 'command.sh'), "w") as f:
        command = " ".join(sys.argv[:]) + "\n"
        f.write(command)

    # Save employed parameters for future reference
    if params is not None:
        write_params(os.path.join(xp_folder, 'params.json'), params)

    #############
    # Callbacks #
    #############

    # Helper: Save the model.
    ckpt_fmt = os.path.join(
        xp_folder, 'checkpoints',
        model + '-' + exp + '.{epoch:03d}-loss{val_loss:.3f}-acc{val_acc:.3f}.hdf5')
    checkpointer = ModelCheckpoint(filepath=ckpt_fmt,
                                   verbose=1,
                                   save_best_only=True,
                                   monitor='val_acc')

    # Helper: TensorBoard
    tb = HistoryKeeper(logdir=os.path.join(xp_folder),
                       keys=['val_acc', 'val_loss', 'train_time', 'val_time'])

    # Helper: Stop when we stop learning.
    # early_stopper = EarlyStopper(patience=15)

    # Helper: Terminate when finding a NaN loss
    nan_term = TerminateOnNaN()

    callbacks = [tb, checkpointer, nan_term]

    #############
    #  Loading  #
    #############
    if augment:
        augmenter = default_augmenter_vid(strip_size=4)
    else:
        augment = False
        augmenter = None

    # Dataset classes
    train_data = ArrayData(train_list, nb_frames=nb_frames,
                           augmenter=augmenter, eager=eager)
    val_data = ArrayData(val_list, nb_frames=nb_frames, augmenter=None,
                         eager=eager, encoder=train_data.get_encoder())

    # Saving encoder
    with open(os.path.join(xp_folder, 'encoder.pkl'), 'wb') as f:
        pickle.dump(train_data.get_encoder(), f)

    # Train loader
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                              last_batch='keep', num_workers=10)
    nb_samples = len(train_data)  # loader should provide the number of samples

    # Validation loader
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False,
                            last_batch='keep', num_workers=10)
    nb_validation = len(val_data)  # loader should provide the number of samples

    # Compute number of steps
    steps_per_epoch = math.ceil(nb_samples / batch_size)
    validation_steps = math.ceil(nb_validation / batch_size)

    # The model
    net = ResearchModels(train_data.nb_classes, model, saved_model,
                         input_shape=train_data.shape,
                         train_all=train_all).model

    # A little more verbosity
    print("************************************")
    if train_all:
        print("Train all layers.")
    print("Max lr:", max_lr, " Min lr:", min_lr)
    print("Batch size:", batch_size)
    print(nb_samples, "training samples,", steps_per_epoch, "steps per epoch")
    print(nb_validation, "validation samples,", validation_steps,
          "validation steps")
    print("Optimizer:", optimizer)
    if augment:
        print("Using data augmentation")
    else:
        print("WARNING: Not using data augmentation")
    print("************************************")

    ############################
    #  Loss and Optimization   #
    ############################
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': max_lr})

    if loss_function == 'categorical_crossentropy':
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    else:
        # Without this guard, loss_fn would be unbound below.
        raise ValueError("Unsupported loss function: %s" % loss_function)
    loss_fn.hybridize()

    ############
    # Training #
    ############
    progress_desc = "Super epoch %03d - acc %.3f - loss %.3f "
    acc = Accuracy()
    start_time = time()
    super_epoch_size = 250

    # Learning rate decay
    iteration = 1
    decay_alpha = 0.01 ** 0.25
    lr = max_lr

    for epoch in range(1, nb_epochs + 1):
        train_loss, val_loss = 0., 0.
        nb_batches = 0
        tic = time()
        acc.reset()

        start_training = time()
        t = tqdm(range(super_epoch_size), unit='epochs')
        for _ in t:
            for data, label in train_loader:
                # Learning rate decay
                if iteration % 10000 == 0:
                    lr *= decay_alpha
                    trainer.set_learning_rate(lr)
                    print("Learning rate updated to", lr)
                iteration += 1

                current_batch_size = data.shape[0]
                data = data.copyto(mx.gpu(0))
                label = label.copyto(mx.gpu(0))

                with autograd.record():
                    output = net(data)
                    loss = loss_fn(output, label)
                loss.backward()
                # print(mx.nd.log_softmax(output[0], axis=-1), label[0])

                # update parameters
                trainer.step(current_batch_size)

                # calculate training metrics
                train_loss += loss.mean().asscalar()
                # accuracy(output, label)
                acc.update(preds=output, labels=label)
                nb_batches += 1

            t.set_description(progress_desc %
                              (epoch, acc.get()[1], train_loss / nb_batches))

        train_time = time() - start_training
        train_loss /= steps_per_epoch * super_epoch_size
        train_acc = acc.get()[1]
        acc.reset()

        start_val = time()
        # calculate validation accuracy
        tval = tqdm(val_loader, leave=False, desc='Running validation',
                    unit='batch')
        for data, label in tval:
            data = data.copyto(mx.gpu(0))
            label = label.copyto(mx.gpu(0))

            # Compute outputs
            output = net(data)
            loss = loss_fn(output, label)

            # Compute metrics
            val_loss += loss.mean().asscalar()
            # val_acc += accuracy(output, label)
            acc.update(preds=output, labels=label)

        val_time = time() - start_val
        val_loss /= validation_steps
        val_acc = acc.get()[1]

        print("Epoch %d: loss %.3f, acc %.3f, val_loss %.3f, val_acc %.3f, in %.1f sec"
              % (epoch, train_loss, train_acc, val_loss, val_acc, time() - tic))
        print("--------------------------------------------------------------------------------")

        stop = False
        train_info = {'epoch': epoch,
                      'loss': train_loss,
                      'acc': train_acc,
                      'val_loss': val_loss,
                      'val_acc': val_acc,
                      'train_time': train_time,
                      'val_time': val_time}
        for cb in callbacks:
            if cb(net, train_info):
                stop = True
        if stop:
            break
        print()

    hours, rem = divmod(time() - start_time, 3600)
    days, hours = divmod(hours, 24)
    minutes, seconds = divmod(rem, 60)
    print("%d training epochs in %dd, %dh%dm%.2fs."
          % (nb_epochs, int(days), int(hours), int(minutes), seconds))
#------------------------------------------------------#
#   Load pretrained weights
#------------------------------------------------------#
model.load_weights(model_path, by_name=True, skip_mismatch=True)

#-------------------------------------------------------------------------------#
#   Training parameter setup
#   tensorboard        the TensorBoard log directory
#   checkpoint_period  controls how weights are saved; period sets how many
#                      epochs pass between saves
#   reduce_lr          sets how the learning rate decays
#   early_stopping     stops training automatically once val_loss has stopped
#                      improving for several epochs, i.e. the model has
#                      essentially converged
#-------------------------------------------------------------------------------#
tensorboard = TensorBoard(log_dir=log_dir)
checkpoint_period = ModelCheckpoint(
    os.path.join(log_dir, 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'),
    monitor='val_loss', save_weights_only=True, save_best_only=False, period=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3,
                              verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10,
                               verbose=1)

#----------------------------------------------------#
#   Ratio of training data to validation data.
#----------------------------------------------------#
train_ratio = 0.9
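# A minimal sketch of how `train_ratio` is typically consumed in these
# scripts, assuming the annotation file has already been read into `lines`
# (the variable names `lines`, `num_train`, `num_val` are assumptions based
# on the surrounding snippets, not verified against this repository):
num_train = int(len(lines) * train_ratio)
num_val = len(lines) - num_train
train_lines, val_lines = lines[:num_train], lines[num_train:]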
loader_test_reshape = DataLoader(dataset, batch_size=3, shuffle=False)

# OPTIM-LOSS
optimizer = Adam(params=net.parameters(), lr=0.002)
#optimizer = Adam(params=net.parameters(), lr=0.0008)
#lr_schedul = StepLR(optimizer, step_size=1000, gamma=0.5)
# This worked well on 300 images with lr 0.008 and 350 epochs
#lr_schedul = MultiStepLR(optimizer, milestones=[150, 200, 250, 300], gamma=0.5)
#lr_schedul = MultiStepLR(optimizer, milestones=[15, 20, 25, 30, 60], gamma=0.5)
lr_schedul = MultiStepLR(optimizer, milestones=[5, 10, 15, 20, 25], gamma=0.5)
#lr_schedul = StepLR(optimizer, step_size=100, gamma=1)

# nn.NLLLoss2d is deprecated; nn.NLLLoss handles (N, C, H, W) inputs the
# same way.
loss = nn.NLLLoss(weight=torch.FloatTensor(WEIGHTS)).cuda()

# model checkpointing
check = ModelCheckpoint()

batch_number = len(loader)
#num_epochs = 1000
num_epochs = 30
step_index = 0

widgets = [
    'Batch: ', progressbar.Counter(), '/',
    progressbar.FormatCustomText('%(total)s', {"total": batch_number}),
    ' ', progressbar.Bar(marker="-", left='[', right=']'),
    ' ', progressbar.ETA(),
    ' ', progressbar.DynamicMessage('loss'),
    ' ', progressbar.DynamicMessage("accuracy"),
    ' ', progressbar.DynamicMessage("epoch"),
]
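# For reference, progressbar2's DynamicMessage widgets are fed through
# keyword arguments on update(). A sketch of the wiring into a training
# loop; the metric values here are placeholders, not computed results:
bar = progressbar.ProgressBar(max_value=batch_number, widgets=widgets)
for i, (images, labels) in enumerate(loader):
    # ... forward pass, loss.backward(), optimizer.step() ...
    bar.update(i + 1, loss=0.0, accuracy=0.0, epoch=step_index)
bar.finish()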
anchors = Anchors(cfg, image_size=(cfg['train_image_size'],
                                   cfg['train_image_size'])).get_anchors()
bbox_util = BBoxUtility(anchors)

#-------------------------------------------------------------------------------#
#   Training parameter setup
#   logging         the TensorBoard log directory
#   checkpoint      controls how weights are saved; period sets how many
#                   epochs pass between saves
#   reduce_lr       sets how the learning rate decays
#   early_stopping  stops training automatically once the loss has stopped
#                   improving for several epochs, i.e. the model has
#                   essentially converged
#-------------------------------------------------------------------------------#
logging = TensorBoard(log_dir="logs/")
checkpoint = ModelCheckpoint('logs/ep{epoch:03d}-loss{loss:.3f}.h5',
                             monitor='loss', save_weights_only=True,
                             save_best_only=False, period=1)
reduce_lr = ExponentDecayScheduler(decay_rate=0.92, verbose=1)
early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=10,
                               verbose=1)
loss_history = LossHistory("logs/")

if Freeze_Train:
    for i in range(freeze_layers):
        model.layers[i].trainable = False
    print('Freeze the first {} layers of total {} layers.'.format(
        freeze_layers, len(model.layers)))
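# `ExponentDecayScheduler` is a project-specific callback rather than a
# Keras built-in. A minimal sketch of the idea on top of the standard
# Callback API, assuming the real class multiplies the learning rate by
# `decay_rate` at the end of every epoch:
import keras.backend as K
from keras.callbacks import Callback

class ExponentDecayScheduler(Callback):
    def __init__(self, decay_rate, verbose=0):
        super(ExponentDecayScheduler, self).__init__()
        self.decay_rate = decay_rate
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        # Read the current LR, decay it, and write it back.
        lr = K.get_value(self.model.optimizer.lr) * self.decay_rate
        K.set_value(self.model.optimizer.lr, lr)
        if self.verbose > 0:
            print('Setting learning rate to %.6f.' % lr)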
def fit_model(model, Lr, Batch_size, Init_Epoch, run_Epoch,
              warmup_proportion=0.1, min_scale=1e-2, max_objects=100):
    #-------------------------------------------------------------------------------#
    #   Training parameter setup
    #   logging         the TensorBoard log directory
    #   checkpoint      controls how weights are saved; period sets how many
    #                   epochs pass between saves
    #   reduce_lr       sets how the learning rate decays
    #   early_stopping  stops training automatically once val_loss has stopped
    #                   improving for several epochs, i.e. the model has
    #                   essentially converged
    #-------------------------------------------------------------------------------#
    logs = path + '/' + datetime.now().strftime("%Y%m%d-%H%M%S")
    logging = TensorBoard(log_dir=logs, profile_batch=(2, 5))
    loss_history = LossHistory(logs)
    checkpoint = ModelCheckpoint(
        path + '/ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss', save_weights_only=True, save_best_only=False,
        period=1)

    Epoch = Init_Epoch + run_Epoch
    train_dataloader = OneNetDatasets(lines[:num_train], input_shape,
                                      Batch_size, num_classes, train=True,
                                      max_objects=max_objects)
    val_dataloader = OneNetDatasets(lines[num_train:], input_shape,
                                    Batch_size, num_classes, train=False,
                                    max_objects=max_objects)

    print('Train on {} samples, val on {} samples, with batch size {}.'.format(
        num_train, num_val, Batch_size))
    # gen = Generator(Batch_size, lines[:num_train], lines[num_train:],
    #                 input_shape, num_classes, max_objects=max_objects)

    optimizer = tfa.optimizers.RectifiedAdam(
        learning_rate=Lr,
        total_steps=num_train // Batch_size * (Epoch - Init_Epoch),
        warmup_proportion=warmup_proportion,
        weight_decay=1e-4,
        min_lr=Lr * min_scale)

    # Each head already computes its own loss inside the graph, so the
    # compiled loss simply passes the prediction through.
    loss_list = {
        'cls': lambda y_true, y_pred: y_pred,
        'loc': lambda y_true, y_pred: y_pred,
        'giou': lambda y_true, y_pred: y_pred
    }
    loss_weights = [2, 5, 2]
    model.compile(loss=loss_list, loss_weights=loss_weights,
                  optimizer=optimizer)

    history = model.fit(train_dataloader,
                        steps_per_epoch=num_train // Batch_size,
                        validation_data=val_dataloader,
                        validation_steps=num_val // Batch_size,
                        epochs=Epoch,
                        verbose=1,
                        initial_epoch=Init_Epoch,
                        callbacks=[logging, checkpoint, loss_history])
    return history
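# Illustrative two-stage call (the values are examples, not tuned settings):
# train 50 epochs at a higher LR, then resume from epoch 50 at a lower LR.
# This relies on `model`, `path`, `lines`, `num_train`, `num_val`,
# `input_shape`, and `num_classes` existing at module level, as fit_model
# expects.
fit_model(model, Lr=1e-3, Batch_size=8, Init_Epoch=0, run_Epoch=50)
fit_model(model, Lr=1e-4, Batch_size=8, Init_Epoch=50, run_Epoch=50)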
model_dir+"%s-Best.h5" % model_prefix, monitor="val_acc", save_best_only=True, save_weights_only=True, base_model=model), reduce_lr, LearningRateScheduler(schedule=_cosine_anneal_schedule), snapshot, tensorboard, ] """ callback_list = [ ModelCheckpoint(model_dir + "%s-{epoch:02d}-{val_acc:.3f}.h5" % model_prefix, monitor="val_acc", save_best_only=False, save_weights_only=True, base_model=model), reduce_lr, tensorboard, ] # To train the model: history = parallel_model.fit_generator( train_gen, steps_per_epoch=(num_train_images // batch_size) + 1, epochs=num_epoch, validation_data=val_gen, validation_steps=(num_val_images // batch_size) + 1, max_queue_size=100, workers=8,
            params_path,
            args.save_video_length,
            interval=args.save_video_interval,
            env_params=env_params).callback
    elif args.checkpoint:
        if args.algo in ['multipolar-ppo2', 'ppo2']:
            interval = n_timesteps / hyperparams['n_steps'] / n_envs / 10
        elif args.algo in ['sac', 'multipolar-sac']:
            interval = n_timesteps / 10
        else:
            raise NotImplementedError()
        interval = int(interval)
        callback = ModelCheckpoint(save_path, interval).callback
    else:
        callback = None

    # Parse noise string for DDPG
    if args.algo == 'ddpg' and hyperparams.get('noise_type') is not None:
        noise_type = hyperparams['noise_type'].strip()
        noise_std = hyperparams['noise_std']
        n_actions = env.action_space.shape[0]
        if 'adaptive-param' in noise_type:
            hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
                initial_stddev=noise_std, desired_action_stddev=noise_std)
        elif 'normal' in noise_type:
            hyperparams['action_noise'] = NormalActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
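# Worked example of the PPO2 checkpoint-interval arithmetic above, with
# illustrative numbers: 2M total timesteps, n_steps=128, and 8 parallel
# environments give roughly ten checkpoints over the run, one every 195
# updates.
interval = int(2_000_000 / 128 / 8 / 10)  # -> 195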