'label_smoothing': label_smoothing })(loss_input) model = Model([model_body.input, *y_true], model_loss) #-------------------------------------------------------------------------------# # 训练参数的设置 # logging表示tensorboard的保存地址 # checkpoint用于设置权值保存的细节,period用于修改多少epoch保存一次 # reduce_lr用于设置学习率下降的方式 # early_stopping用于设定早停,val_loss多次不下降自动结束训练,表示模型基本收敛 #-------------------------------------------------------------------------------# logging = TensorBoard(log_dir=log_dir) checkpoint = ModelCheckpoint( log_dir + "/ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5", save_weights_only=True, save_best_only=False, period=1) early_stopping = EarlyStopping(min_delta=0, patience=10, verbose=1) #----------------------------------------------------------------------# # 验证集的划分在train.py代码里面进行 # 2007_test.txt和2007_val.txt里面没有内容是正常的。训练不会使用到。 # 当前划分方式下,验证集和训练集的比例为1:9 #----------------------------------------------------------------------# val_split = 0.1 with open(annotation_path) as f: lines = f.readlines() np.random.seed(10101) np.random.shuffle(lines) np.random.seed(None)
# 打开验证集 with open(os.path.join(dataset_path, "ImageSets/Segmentation/val.txt"), "r") as f: val_lines = f.readlines() loss = dice_loss_with_CE() if dice_loss else CE() #-------------------------------------------------------------------------------# # 训练参数的设置 # logging表示tensorboard的保存地址 # checkpoint用于设置权值保存的细节,period用于修改多少epoch保存一次 # reduce_lr用于设置学习率下降的方式 # early_stopping用于设定早停,val_loss多次不下降自动结束训练,patience:多少轮不发生变化才停止 #-------------------------------------------------------------------------------# checkpoint_period = ModelCheckpoint( log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5', monitor='val_loss', save_weights_only=True, save_best_only=False, period=1) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1) early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=100, verbose=1) tensorboard = TensorBoard(log_dir=log_dir) loss_history = LossHistory("logs/") if backbone == "mobilenet": freeze_layers = 146
#-------------------------------# # 创立模型 #-------------------------------# model = RetinaFace(cfg, backbone=backbone) model_path = "model_data/retinaface_mobilenet025.h5" model.load_weights(model_path,by_name=True,skip_mismatch=True) #-------------------------------# # 获得先验框和工具箱 #-------------------------------# anchors = Anchors(cfg, image_size=(img_dim, img_dim)).get_anchors() bbox_util = BBoxUtility(anchors) # 训练参数设置 logging = TensorBoard(log_dir="logs") checkpoint = ModelCheckpoint('logs/ep{epoch:03d}-loss{loss:.3f}.h5', monitor='loss', save_weights_only=True, save_best_only=False, period=1) reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=2, verbose=1) early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=6, verbose=1) for i in range(freeze_layers): model.layers[i].trainable = False print('Freeze the first {} layers of total {} layers.'.format(freeze_layers, len(model.layers))) #------------------------------------------------------# # 主干特征提取网络特征通用,冻结训练可以加快训练速度 # 也可以在训练初期防止权值被破坏。 # Init_Epoch为起始世代 # Freeze_Epoch为冻结训练的世代 # Epoch总训练世代 #------------------------------------------------------# if True: Init_epoch = 0
def main(model_name):
    """Train an EfficientNet classifier end-to-end.

    Args:
        model_name: name of the efficientnet variant passed through to
            ``efficientnet(net_name=...)``.

    Relies on module-level objects not defined here: ``config``, ``args``,
    ``device``, ``raw_metric``, ``FGVC7Data``, ``get_transform``,
    ``efficientnet``, ``ModelCheckpoint``, ``train`` and ``validate``.
    """
    ##################################
    # Initialize saving directory
    ##################################
    if not os.path.exists(config.save_dir):
        os.makedirs(config.save_dir)

    ##################################
    # Logging setting
    ##################################
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,  # output level
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    ##################################
    # Load dataset TODO: 10-fold cross validation
    ##################################
    train_dataset = FGVC7Data(root=args.datasets, phase='train',
                              transform=get_transform(config.image_size, 'train'))
    indices = range(len(train_dataset))
    split = int(0.3 * len(train_dataset))
    train_indices = indices[split:]
    test_indices = indices[:split]
    valid_sampler = SubsetRandomSampler(test_indices)
    # NOTE(review): train_loader shuffles the FULL dataset, so the first 30%
    # (the validation indices) is also seen during training — the commented-out
    # train_sampler suggests this was once a disjoint split. Confirm whether
    # this leakage is intentional before relying on the validation metrics.
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True,
                              num_workers=config.workers, pin_memory=True)
    validate_loader = DataLoader(train_dataset, batch_size=config.batch_size, sampler=valid_sampler,
                                 num_workers=config.workers, pin_memory=True)
    print('Train Size: {}'.format(len(train_indices)))
    print('Valid Size: {}'.format(len(test_indices)))

    ##################################
    # Initialize model
    ##################################
    logs = {}  # carries lr, epoch, val_loss (and checkpoint metrics) across epochs
    start_epoch = 0
    # Original code assigned num_classes = 4 earlier and then overwrote it with 5;
    # only the final value was ever used, so the dead store was removed.
    num_classes = 5
    net = efficientnet(net_name=model_name, num_classes=num_classes, weight_path='github')

    if config.ckpt:
        # Load ckpt and get state_dict
        checkpoint = torch.load(config.ckpt)
        # Resume epoch counter and logged metrics
        logs = checkpoint['logs']
        start_epoch = int(logs['epoch'])
        # Load weights
        state_dict = checkpoint['state_dict']
        net.load_state_dict(state_dict)
        logging.info('Network loaded from {}'.format(config.ckpt))
    logging.info('Network weights save to {}'.format(config.save_dir))

    ##################################
    # Use cuda
    ##################################
    net.to(device)
    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)

    ##################################
    # Optimizer, LR Scheduler
    ##################################
    # Resume the last LR when restarting from a checkpoint, else use the config value.
    learning_rate = logs['lr'] if 'lr' in logs else config.learning_rate
    optimizer = torch.optim.AdamW(net.parameters(), lr=learning_rate, amsgrad=True)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, config.epochs, eta_min=1e-6)

    ##################################
    # ModelCheckpoint
    ##################################
    # NOTE(review): raw_metric is not defined in this function — presumably a
    # module-level metric object (e.g. topk accuracy); verify it exists.
    callback_monitor = 'val_{}'.format(raw_metric.name)
    callback = ModelCheckpoint(savepath=os.path.join(config.save_dir, config.model_name),
                               monitor=callback_monitor,
                               mode='max')
    if callback_monitor in logs:
        callback.set_best_score(logs[callback_monitor])
    else:
        callback.reset()

    ##################################
    # TRAINING
    ##################################
    logging.info(
        'Start training: Total epochs: {}, Batch size: {}, Training size: {}, Validation size: {}'
        .format(config.epochs, config.batch_size, len(train_indices), len(test_indices)))
    logging.info('')

    for epoch in range(start_epoch, config.epochs):
        callback.on_epoch_begin()
        logs['epoch'] = epoch + 1
        logs['lr'] = optimizer.param_groups[0]['lr']

        logging.info('Epoch {:03d}, LR {:g}'.format(
            epoch + 1, optimizer.param_groups[0]['lr']))

        # One progress bar per epoch; 'batches' is the rate unit shown by tqdm.
        pbar = tqdm(total=len(train_loader), unit='batches')
        pbar.set_description('Epoch {}/{}'.format(epoch + 1, config.epochs))

        train(logs=logs, data_loader=train_loader, net=net, optimizer=optimizer, pbar=pbar)
        validate(logs=logs, data_loader=validate_loader, net=net, pbar=pbar)

        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(logs['val_loss'])
        else:
            # Passing the epoch keeps the cosine schedule aligned with the
            # global epoch counter when resuming from a checkpoint.
            scheduler.step(epoch)

        callback.on_epoch_end(logs, net)
        pbar.close()
def train_adr(frames, actions, states, hc_dim, ha_dim, ho_dim, za_dim=10, gaussian_a=False,
              context_frames=2, epochs=1, steps=1000, use_seq_len=12, clr_flag=False,
              base_lr=None, max_lr=None, half_cycle=4, learning_rate=0.001,
              action_net_units=256, val_iterator=None, val_steps=None,
              output_regularizer=None, lstm_units=256, lstm_layers=1, neptune_log=False,
              neptune_ckpt=False, save_dir='.', reg_lambda=0.0, ckpt_dir='.',
              ckpt_criteria='val_rec', config=None, ec_filename='Ec_o.h5',
              a_filename='A_o.h5', eo_filename='Eo.h5', do_filename='D_o.h5',
              la_filename='La_o.h5', da_filename='Da_o.h5', ec_load_name='Ec_a.h5',
              a_load_name='A_a.h5', da_load_name='D_a.h5', la_load_name='La.h5',
              continue_training=False, do_load_name='D_o.h5', eo_load_name='Eo.h5',
              random_window=True, keep_all=False, reconstruct_random_frame=False,
              save_model=True):
    """Fit the full ADR model on top of pre-trained context/action sub-models.

    The context encoder (Ec), action decoder (Da), action net (A) and — when
    gaussian_a — the LSTM prior (La) are loaded frozen from ckpt_dir; only the
    occlusion encoder/decoder (Eo/Do) are trained (loaded from checkpoints when
    continue_training, otherwise freshly built). Returns the Keras History of
    ``fit``.
    """
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)

    # Static batch geometry taken from the symbolic frames tensor.
    bs, seq_len, w, h, c = (int(dim) for dim in frames.shape)
    a_dim = actions.shape[-1] if actions is not None else 0
    s_dim = states.shape[-1] if states is not None else 0
    # The latent action noise is only used in the gaussian variant.
    za_dim = za_dim if gaussian_a else 0

    La = None
    clbks = []

    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    K.set_session(sess)

    # ---- Instantiate / load the frozen sub-models -------------------------
    Ec = load_recurrent_encoder([bs, context_frames, w, h, c], h_dim=hc_dim,
                                ckpt_dir=ckpt_dir, filename=ec_load_name,
                                trainable=False, load_model_state=True)
    Da = load_decoder(batch_shape=[bs, seq_len, hc_dim + ha_dim + za_dim],
                      model_name='Da', ckpt_dir=ckpt_dir, output_channels=3,
                      filename=da_load_name, output_activation='sigmoid',
                      trainable=False, load_model_state=True)

    # ---- Trainable occlusion encoder / decoder ----------------------------
    if continue_training:
        Eo = load_encoder(batch_shape=[bs, seq_len, w, h, 2 * c], h_dim=ho_dim,
                          model_name='Eo', ckpt_dir=ckpt_dir, filename=eo_load_name,
                          trainable=True, load_model_state=True)
        Do = load_decoder(batch_shape=[bs, seq_len, hc_dim + ha_dim + ho_dim],
                          model_name='Do', ckpt_dir=ckpt_dir, output_channels=6,
                          filename=do_load_name, output_activation='sigmoid',
                          trainable=True, load_model_state=True)
    else:
        Do = image_decoder(batch_shape=[bs, seq_len, hc_dim + ho_dim + ha_dim],
                           output_activation='sigmoid', output_channels=6, name='D_o',
                           reg_lambda=reg_lambda, output_initializer='glorot_uniform',
                           output_regularizer=output_regularizer)
        Eo = image_encoder(batch_shape=[bs, seq_len, w, h, c * 2], h_dim=ho_dim,
                           name='Eo', reg_lambda=reg_lambda)

    # ---- Frozen action pathway --------------------------------------------
    if gaussian_a:
        A = load_action_net(batch_shape=[bs, seq_len, a_dim + s_dim],
                            units=action_net_units, h_dim=ha_dim, ckpt_dir=ckpt_dir,
                            filename=a_load_name, trainable=False, load_model_state=True)
        # NOTE: load_model_state is deliberately False here (flagged '!!!' upstream).
        La = load_lstm(batch_shape=[bs, seq_len, hc_dim + ha_dim], h_dim=za_dim,
                       lstm_units=lstm_units, n_layers=lstm_layers, ckpt_dir=ckpt_dir,
                       filename=la_load_name, lstm_type='gaussian', trainable=False,
                       load_model_state=False)
    else:
        A = load_recurrent_action_net([bs, seq_len, a_dim + s_dim], action_net_units,
                                      ha_dim, ckpt_dir=ckpt_dir, filename=a_load_name,
                                      trainable=False, load_model_state=True)

    # Models (and target filenames) handed to the checkpoint callback.
    ckpt_models = [Ec, Eo, Da, Do, A]
    filenames = [ec_filename, eo_filename, da_filename, do_filename, a_filename]
    if gaussian_a:
        ckpt_models.append(La)
        filenames.append(la_filename)

    adr_model = adr(frames, actions, states, context_frames,
                    Ec=Ec, Eo=Eo, A=A, Da=Da, Do=Do, La=La,
                    use_seq_len=use_seq_len, gaussian_a=gaussian_a,
                    lstm_units=lstm_units, learning_rate=learning_rate,
                    random_window=random_window,
                    reconstruct_random_frame=reconstruct_random_frame)

    # ---- Callbacks ---------------------------------------------------------
    if neptune_log or neptune_ckpt:
        clbks.append(NeptuneCallback(user='******', project_name='adr',
                                     log=neptune_log, ckpt=neptune_ckpt))
    if save_model:
        clbks.append(ModelCheckpoint(models=ckpt_models, criteria=ckpt_criteria,
                                     ckpt_dir=save_dir, filenames=filenames,
                                     neptune_ckpt=neptune_ckpt, keep_all=keep_all))
    if clr_flag:
        clbks.append(CyclicLR(adr_model, base_lr, max_lr,
                              step_size=half_cycle * steps))

    # x=None: the model consumes its input tensors directly (feed-dict free).
    adr_model.fit(x=None, batch_size=bs, epochs=epochs, steps_per_epoch=steps,
                  callbacks=clbks, validation_data=val_iterator,
                  validation_steps=val_steps, verbose=2)

    return adr_model.history
def train_adr_ao(frames, actions, states=None, context_frames=2, hc_dim=128, ha_dim=16,
                 epochs=1, clr_flag=False, base_lr=None, max_lr=None,
                 continue_training=False, reg_lambda=0.0, recurrent_lambda=0.0,
                 output_regularizer=None, steps=1000, learning_rate=0.001, a_units=256,
                 gaussian=False, z_dim=10, kl_weight=0.1, lstm_units=256, lstm_layers=2,
                 config=None, half_cycle=4, val_steps=None, ckpt_dir='.',
                 ckpt_criteria='val_rec', ec_filename='Ec_a.h5', d_filename='D_a.h5',
                 a_filename='A_a.h5', l_filename='L_a.h5', ec_load_name='Ec_a.h5',
                 d_load_name='D_a.h5', a_load_name='A_a.h5', l_load_name='L_a.h5',
                 neptune_ckpt=False, neptune_log=False, train_iterator=None,
                 val_iterator=None, reconstruct_random_frame=False, random_window=True,
                 keep_all=False, use_seq_len=12, save_model=True,
                 save_gifs=True, evaluate=True):
    """Train the action-only ADR model (context encoder + action net + decoder).

    All sub-models are built (or reloaded when ``continue_training``) via
    ``get_sub_model`` and trained jointly through ``adr_ao``. Returns the Keras
    History object of ``fit``.

    Fixes vs. previous revision:
      * the assert message now matches the actual condition (> 1, i.e. >= 2);
      * the formerly hard-coded ``save_gifs_flag``/``eval_flag`` locals are now
        the keyword parameters ``save_gifs``/``evaluate`` (both default True,
        so existing callers are unaffected);
      * a large block of dead commented-out KLD-plotting code was removed.

    Raises:
        AssertionError: if ``context_frames`` is not greater than 1.
    """
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)

    # Static batch geometry from the symbolic frames tensor.
    bs, seq_len, w, h, c = [int(s) for s in frames.shape]
    assert context_frames > 1, 'context_frames must be greater than 1'

    # The gaussian latent is only used when the stochastic variant is enabled.
    z_dim = 0 if gaussian is False else z_dim
    a_dim = actions.shape[-1] if actions is not None else 0
    s_dim = states.shape[-1] if states is not None else 0
    clbks = []

    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    K.set_session(sess)

    # ===== Define the sub models
    a_name = 'A' if gaussian else 'rA'

    Ec = get_sub_model(name='Ec', batch_shape=[bs, context_frames, w, h, c],
                       h_dim=hc_dim, ckpt_dir=ckpt_dir, filename=ec_load_name,
                       trainable=True, load_model_state=continue_training,
                       load_flag=continue_training, conv_lambda=reg_lambda,
                       recurrent_lambda=recurrent_lambda)

    D = get_sub_model(name='Da', batch_shape=[bs, use_seq_len, hc_dim + ha_dim + z_dim],
                      h_dim=None, ckpt_dir=ckpt_dir, filename=d_load_name,
                      trainable=True, load_model_state=continue_training,
                      load_flag=continue_training, reg_lambda=reg_lambda,
                      output_regularizer=output_regularizer)

    A = get_sub_model(name=a_name, batch_shape=[bs, use_seq_len, a_dim + s_dim],
                      h_dim=ha_dim, ckpt_dir=ckpt_dir, filename=a_load_name,
                      trainable=True, load_model_state=continue_training,
                      load_flag=continue_training, units=a_units,
                      dense_lambda=reg_lambda, recurrent_lambda=recurrent_lambda)

    L = get_sub_model(name='La', batch_shape=[bs, use_seq_len, hc_dim + ha_dim],
                      h_dim=z_dim, ckpt_dir=ckpt_dir, filename=l_load_name,
                      trainable=True, load_model_state=continue_training,
                      load_flag=continue_training, units=lstm_units,
                      n_layers=lstm_layers, lstm_type='gaussian',
                      reparameterize=True, reg_lambda=recurrent_lambda)

    # Models (and target filenames) handed to the checkpoint callback; the LSTM
    # prior is only checkpointed in the gaussian variant.
    ckpt_models = [Ec, D, A]
    filenames = [ec_filename, d_filename, a_filename]
    if gaussian:
        ckpt_models.append(L)
        filenames.append(l_filename)

    ED = adr_ao(frames, actions, states, context_frames,
                Ec=Ec, A=A, D=D, L=L, use_seq_len=use_seq_len,
                learning_rate=learning_rate, gaussian=gaussian, kl_weight=kl_weight,
                lstm_units=lstm_units, lstm_layers=lstm_layers, training=True,
                random_window=random_window,
                reconstruct_random_frame=reconstruct_random_frame)

    # ===== Callbacks
    if save_model:
        clbks.append(ModelCheckpoint(models=ckpt_models, criteria=ckpt_criteria,
                                     ckpt_dir=ckpt_dir, filenames=filenames,
                                     neptune_ckpt=neptune_ckpt, keep_all=keep_all))
    if neptune_log or neptune_ckpt:
        clbks.append(NeptuneCallback(user='******', project_name='video-prediction',
                                     log=neptune_log, ckpt=neptune_ckpt))
    if save_gifs:
        clbks.append(SaveGifsCallback(period=25, iterator=val_iterator,
                                      ckpt_dir=os.path.join(os.path.expanduser('~/'),
                                                            'adr/gifs'),
                                      name='pred2', bs=bs))
    if clr_flag:
        clbks.append(CyclicLR(ED, base_lr, max_lr, step_size=half_cycle * steps))
    if evaluate:
        clbks.append(EvaluateCallback(model=ED, iterator=val_iterator,
                                      steps=val_steps, period=25))

    ED.fit(x=train_iterator, batch_size=bs, epochs=epochs, steps_per_epoch=steps,
           callbacks=clbks, validation_data=val_iterator,
           validation_steps=val_steps, verbose=2)

    return ED.history