def train(
    self,
    config,
    data_loader,
    dev_data_loader=None,
    aug=False,
):
    print('TRAINING(unsupervised)...')
    batch_size = config.batch_size * config.repeat
    logger = Logger(config.print_step)

    print(f'Pretraining for {config.pretrain_step} steps...')
    self.pretrain(
        data_loader=data_loader,
        dev_data_loader=dev_data_loader,
        batch_size=batch_size,
        config=config,
        logger=logger,
    )

    print(f'Finetuning for {config.finetune_step} steps...')
    data_loader.set_use_ratio(use_ratio=config.finetune_ratio, verbose=True)
    self.finetune(
        data_loader=data_loader,
        dev_data_loader=dev_data_loader,
        batch_size=batch_size,
        config=config,
        logger=logger,
        aug=aug,
    )
    print('=' * 80)
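# `Logger` is defined elsewhere. A minimal sketch consistent with how it is
# used in these training loops (constructed with a print interval, fed via
# update() with optional EMA smoothing and group names, advanced via step())
# is given below; the real implementation is an assumption and likely richer.
class Logger:
    def __init__(self, print_step, decay=0.99):
        self.print_step = print_step
        self.decay = decay  # EMA decay for smoothed scalar metrics
        self.values = {}
        self.counter = 0

    def update(self, metrics, ema=True, group_name='default'):
        # Scalars are smoothed with an exponential moving average by default;
        # ema=False entries (eval scores, sample strings) simply overwrite.
        for key, value in metrics.items():
            if ema and isinstance(value, float) and key in self.values:
                value = self.decay * self.values[key] + (1 - self.decay) * value
            self.values[key] = value

    def step(self):
        self.counter += 1
        if self.counter % self.print_step == 0:
            print(f'step {self.counter}: '
                  + ', '.join(f'{k}={v}' for k, v in self.values.items()))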
def create_callbacks(name, dumps):
    log_dir = Path(dumps['path']) / dumps['logs'] / name
    save_dir = Path(dumps['path']) / dumps['weights'] / name
    callbacks = Callbacks([
        Logger(log_dir),
        CheckpointSaver(
            metric_name=dumps['metric_name'],
            save_dir=save_dir,
            save_name='epoch_{epoch}.pth',
            num_checkpoints=4,
            mode='max',
        ),
        TensorBoard(str(log_dir)),
        FreezerCallback(),
    ])
    return callbacks
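# Hypothetical usage of create_callbacks. The `dumps` dict is assumed to be a
# config section holding the dump root, subdirectory names, and the metric to
# monitor, matching the keys accessed above; the concrete values and the run
# name 'baseline_run' are illustrative.
dumps = {
    'path': 'dumps',
    'logs': 'logs',
    'weights': 'weights',
    'metric_name': 'accuracy',
}
callbacks = create_callbacks('baseline_run', dumps)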
def train(
    self,
    config,
    data_loader,
    dev_data_loader=None,
    aug=False,
):
    print('TRAINING(unsupervised)...')
    if aug:
        get_target_batch = data_loader.get_aug_target_batch
    else:
        get_target_batch = data_loader.get_target_batch

    logger = Logger(config.print_step)
    batch_size = config.batch_size * config.repeat
    best_err = 100.0  # lowest phone error rate seen so far

    for step in range(1, config.step + 1):
        batch_sample_feat, batch_sample_len, batch_repeat_num, batch_phn_label = \
            data_loader.get_sample_batch(config.batch_size, repeat=config.repeat)

        self.optimizer.zero_grad()
        feat_loss, intra_s_loss, inter_s_loss = self.bert_model.predict_feats(
            batch_sample_feat, batch_sample_len, batch_repeat_num)
        batch_target_idx, batch_target_len = get_target_batch(batch_size)
        target_loss = self.bert_model.predict_targets(
            batch_target_idx, batch_target_len)

        total_loss = feat_loss + target_loss + intra_s_loss  # + inter_s_loss
        total_loss.backward()
        self.optimizer.step()

        logger.update({
            'feat_loss': feat_loss.item(),
            'target_loss': target_loss.item(),
            'total_loss': total_loss.item(),
        })
        logger.update(
            {
                'intra_s_loss': intra_s_loss.item(),
                'inter_s_loss': inter_s_loss.item(),
            },
            group_name='segment_losses',
        )

        if step % config.eval_step == 0:
            step_err, labels, preds = self.phn_eval(
                data_loader,
                batch_size=batch_size,
                repeat=config.repeat,
            )
            print(f'EVAL best: {best_err:.2f} step: {step_err:.2f}')
            logger.update({'val_per': step_err}, ema=False)
            logger.update(
                {
                    'LABEL': ' '.join('%3s' % str(l) for l in labels[0]),
                    'PREDICT': ' '.join('%3s' % str(p) for p in preds[0]),
                },
                ema=False,
            )
            # Save whenever the dev error improves.
            if step_err < best_err:
                best_err = step_err
                self.save(config.save_path)

        logger.step()
    print('=' * 80)
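# A hypothetical call site for the trainer above. The config fields shown are
# exactly the ones read inside train(); the concrete values, and the names
# `trainer`, `data_loader`, and `dev_loader`, are illustrative assumptions.
from types import SimpleNamespace

config = SimpleNamespace(
    batch_size=32, repeat=2, step=20000,
    eval_step=500, print_step=100,
    save_path='save/bert_model',
)
trainer.train(config, data_loader, dev_data_loader=dev_loader, aug=False)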
def train(
    self,
    config,
    data_loader,
    dev_data_loader=None,
    aug=False,
):
    print('TRAINING(unsupervised)...')
    if aug:
        get_target_batch = data_loader.get_aug_target_batch
    else:
        get_target_batch = data_loader.get_target_batch

    batch_size = config.batch_size * config.repeat
    logger = Logger(print_step=config.print_step)
    best_fer = 100.0  # lowest frame error rate seen so far
    frame_temp = 0.9

    for step in range(1, config.step + 1):
        # Anneal the frame-level softmax temperature during training.
        if step == 8000:
            frame_temp = 0.8
        if step == 12000:
            frame_temp = 0.7

        # Discriminator (critic) updates.
        for _ in range(config.dis_iter):
            batch_sample_feat, batch_sample_len, batch_repeat_num, _ = \
                data_loader.get_sample_batch(config.batch_size, repeat=config.repeat)
            batch_target_idx, batch_target_len = get_target_batch(batch_size)
            feed_dict = {
                self.sample_feat: batch_sample_feat,
                self.sample_len: batch_sample_len,
                self.target_idx: batch_target_idx,
                self.target_len: batch_target_len,
                self.learning_rate: config.dis_lr,
                self.frame_temp: frame_temp,
            }
            run_list = [self.dis_loss, self.train_dis_op]
            dis_loss, _ = self.sess.run(run_list, feed_dict=feed_dict)
            logger.update({'c_loss': float(dis_loss)})

        # Generator updates.
        for _ in range(config.gen_iter):
            batch_sample_feat, batch_sample_len, batch_repeat_num, _ = \
                data_loader.get_sample_batch(config.batch_size, repeat=config.repeat)
            batch_target_idx, batch_target_len = get_target_batch(batch_size)
            feed_dict = {
                self.sample_feat: batch_sample_feat,
                self.sample_len: batch_sample_len,
                self.target_idx: batch_target_idx,
                self.target_len: batch_target_len,
                self.sample_rep: batch_repeat_num,
                self.learning_rate: config.gen_lr,
                self.frame_temp: frame_temp,
            }
            run_list = [
                self.gen_loss, self.seg_loss, self.train_gen_op, self.fake_sample,
            ]
            gen_loss, seg_loss, _, sample = self.sess.run(
                run_list, feed_dict=feed_dict)
            logger.update({
                'g_loss': float(gen_loss),
                'seg_loss': float(seg_loss),
                'fake_sample': array_to_string(np.argmax(sample[0], axis=-1)),
            })

        if step % config.eval_step == 0:
            step_fer = frame_eval(self.predict_batch, dev_data_loader)
            print(f'EVAL best: {best_fer:.2f} step: {step_fer:.2f}')
            logger.update({'val_fer': step_fer}, ema=False)
            # Save whenever the dev frame error rate improves.
            if step_fer < best_fer:
                best_fer = step_fer
                self.saver.save(self.sess, config.save_path)

        logger.step()
    print('=' * 80)
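# `array_to_string` is imported from a utility module; a plausible
# implementation matching its use here (rendering a 1-D index array as a
# space-separated string for logging) is just:
def array_to_string(arr):
    return ' '.join(str(int(x)) for x in arr)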
def main():
    global args, best_loss
    args = parser.parse_args()
    print(args)

    # create model
    if args.model == 'coord':
        from model.model_lib import VideoModelCoord as VideoModel
    elif args.model == 'coord_latent':
        from model.model_lib import VideoModelCoordLatent as VideoModel
    elif args.model == 'coord_latent_nl':
        from model.model_lib import VideoModelCoordLatentNL as VideoModel
    elif args.model == 'global_coord_latent':
        from model.model_lib import VideoModelGlobalCoordLatent as VideoModel
    elif args.model == 'global_coord_latent_nl':
        from model.model_lib import VideoModelGlobalCoordLatentNL as VideoModel
    elif args.model == 'global_i3d':
        from model.model_lib import VideoGlobalModel as VideoModel
    elif args.model == 'global_coord':
        from model.model_lib import VideoModelGlobalCoord as VideoModel
    else:
        raise ValueError('Unknown model: {}'.format(args.model))
    model = VideoModel(args)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume), \
            "No checkpoint found at '{}'".format(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        if args.start_epoch is None:
            args.start_epoch = checkpoint['epoch']
        best_loss = checkpoint['best_loss']
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
    if args.start_epoch is None:
        args.start_epoch = 0

    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True

    # create training and validation dataset
    dataset_train = VideoFolder(
        root=args.root_frames,
        num_boxes=args.num_boxes,
        file_input=args.json_data_train,
        file_labels=args.json_file_labels,
        frames_duration=args.num_frames,
        args=args,
        is_val=False,
        if_augment=True,
        model=args.model,
    )
    dataset_val = VideoFolder(
        root=args.root_frames,
        num_boxes=args.num_boxes,
        file_input=args.json_data_val,
        file_labels=args.json_file_labels,
        frames_duration=args.num_frames,
        args=args,
        is_val=True,
        if_augment=True,
        model=args.model,
    )

    # create training and validation loader
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        drop_last=True,
        pin_memory=True,
    )
    val_loader = torch.utils.data.DataLoader(
        dataset_val,
        drop_last=True,
        batch_size=args.batch_size * 2,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )

    optimizer = torch.optim.SGD(
        model.parameters(),
        momentum=args.momentum,
        lr=args.lr,
        weight_decay=args.weight_decay,
    )
    criterion = torch.nn.CrossEntropyLoss()

    if args.evaluate:
        validate(val_loader, model, criterion,
                 class_to_idx=dataset_val.classes_dict)
        return

    # training, start a logger
    tb_logdir = os.path.join(args.logdir, args.logname)
    os.makedirs(tb_logdir, exist_ok=True)
    tb_logger = Logger(tb_logdir)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # train for one epoch
        train(train_loader, model, optimizer, epoch, criterion, tb_logger)

        # evaluate on validation set
        if (not args.fine_tune) or (epoch + 1) % 10 == 0:
            loss = validate(val_loader, model, criterion,
                            epoch=epoch, tb_logger=tb_logger)
        else:
            loss = 100

        # remember best loss and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.module.state_dict(),
                'best_loss': best_loss,
            },
            is_best,
            os.path.join(args.ckpt,
                         args.arch.lower() + '_{}'.format(args.logname)),
        )
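# `adjust_learning_rate` is defined elsewhere in this file. A common
# step-decay variant consistent with the (optimizer, epoch, lr_steps) call
# above is sketched here, assuming the rate is divided by 10 at each epoch
# listed in lr_steps; the decay factor is an assumption.
def adjust_learning_rate(optimizer, epoch, lr_steps):
    decay = 0.1 ** sum(epoch >= step for step in lr_steps)
    for param_group in optimizer.param_groups:
        param_group['lr'] = args.lr * decay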
def train(
    self,
    config,
    data_loader,
    dev_data_loader=None,
    aug=False,
):
    print('TRAINING(unsupervised)...')
    if aug:
        get_target_batch = data_loader.get_aug_target_batch
    else:
        get_target_batch = data_loader.get_target_batch

    batch_size = config.batch_size * config.repeat
    logger = Logger(print_step=config.print_step)
    best_fer = 100.0  # lowest frame error rate seen so far

    for step in range(1, config.step + 1):
        # Critic updates: generator frozen.
        self.generator.eval()
        for _ in range(config.dis_iter):
            self.c_opt.zero_grad()
            batch_sample_feat, batch_sample_len, batch_repeat_num, batch_phn_label = \
                data_loader.get_sample_batch(config.batch_size, repeat=config.repeat)
            real_target_idx, batch_target_len = get_target_batch(batch_size)

            batch_sample_feat = get_tensor_from_array(batch_sample_feat)
            real_target_idx = get_tensor_from_array(real_target_idx).long()
            mask = create_attention_mask(batch_sample_len, config.phn_max_length)

            fake_target_logits, fake_target_idx = self.generator(batch_sample_feat, mask)
            real_score = self.critic(real_target_idx)
            fake_score = self.critic(fake_target_idx)
            if not self.wgan:
                # Standard GAN critic loss.
                c_loss = torch.mean(-torch.log(real_score + epsilon)) + \
                    torch.mean(-torch.log(1 - fake_score + epsilon))
            else:
                # WGAN critic loss.
                c_loss = torch.mean(-real_score) + torch.mean(fake_score)

            c_loss.backward()
            self.c_opt.step()
            logger.update({
                'c_loss': c_loss.item(),
                'true_sample': array_to_string(real_target_idx[0].cpu().data.numpy()),
            })

        # Generator updates: critic frozen.
        self.generator.train()
        self.critic.eval()
        for _ in range(config.gen_iter):
            self.g_opt.zero_grad()
            batch_sample_feat, batch_sample_len, batch_repeat_num, batch_phn_label = \
                data_loader.get_sample_batch(config.batch_size, repeat=config.repeat)

            batch_sample_feat = get_tensor_from_array(batch_sample_feat)
            mask = create_attention_mask(batch_sample_len, config.phn_max_length)
            batch_repeat_num = get_tensor_from_array(batch_repeat_num)

            fake_target_logits, fake_target_idx = self.generator(batch_sample_feat, mask)
            fake_score = self.critic(fake_target_idx)  # shape: (N, 1)
            reward = self.critic.compute_G_reward(fake_score)
            kernel = self.critic.get_kernel()
            g_loss = self.generator.compute_loss(
                reward, kernel, fake_target_logits, fake_target_idx, mask)
            segment_loss = intra_segment_loss(
                fake_target_logits,
                batch_repeat_num,
                mask,
                sep_size=(config.batch_size * config.repeat) // 2,
            )
            total_loss = g_loss + config.seg_loss_ratio * segment_loss
            total_loss.backward()
            self.g_opt.step()
            logger.update({
                'g_loss': g_loss.item(),
                'seg_loss': segment_loss.item(),
                'fake_sample': array_to_string(fake_target_idx[0].cpu().data.numpy()),
                'baseline': self.critic.ema.average.item(),
            })
        self.critic.train()

        if step % config.eval_step == 0:
            step_fer = frame_eval(self.predict_batch, dev_data_loader)
            logger.update({'val_fer': step_fer}, ema=False)
            print(f'EVAL best: {best_fer:.2f} step: {step_fer:.2f}')
            if step_fer < best_fer:
                best_fer = step_fer

        logger.step()
    print('=' * 80)
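# `create_attention_mask` is imported from a utility module; a minimal sketch
# matching its use here (a (batch, max_length) 0/1 mask built from per-sample
# lengths) could look like this. Device placement is ignored for brevity.
import torch

def create_attention_mask(lengths, max_length):
    positions = torch.arange(max_length).unsqueeze(0)  # (1, T)
    lengths = torch.as_tensor(lengths).unsqueeze(1)    # (N, 1)
    return (positions < lengths).float()               # (N, T)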
def main():
    global args, best_loss
    args = parser.parse_args()
    if not os.path.exists(args.ckpt):
        os.mkdir(args.ckpt)
    print(args)

    # create model
    if args.model == 'coord':
        from model.model_lib import VideoModelCoord as VideoModel
    elif args.model == 'coordAttention':
        from model.model_lib import VideoModelCoordAttention as VideoModel
    elif args.model == 'coordAdd':
        from model.model_lib import VideoModelCoordAdd as VideoModel
    elif args.model == 'coordSemDualAttention':
        from model.model_lib import VideoModelCoorSemDualdAttention as VideoModel
    elif args.model == 'coord_latent':
        from model.model_lib import VideoModelCoordLatent as VideoModel
    elif args.model == 'coord_latent_nl':
        from model.model_lib import VideoModelCoordLatentNL as VideoModel
    elif args.model == 'global_coord_latent':
        from model.model_lib import VideoModelGlobalCoordLatent as VideoModel
    elif args.model == 'global_coord_latent_nl':
        from model.model_lib import VideoModelGlobalCoordLatentNL as VideoModel
    elif args.model == 'global_i3d':
        from model.model_lib import VideoGlobalModel as VideoModel
    elif args.model == 'global_coord':
        from model.model_lib import VideoModelGlobalCoord as VideoModel
    else:
        raise ValueError('Unknown model: {}'.format(args.model))
    model = VideoModel(args)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume), \
            "No checkpoint found at '{}'".format(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        if args.start_epoch is None:
            args.start_epoch = checkpoint['epoch']
        best_loss = checkpoint['best_loss']
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
    if args.start_epoch is None:
        args.start_epoch = 0

    model.to(cuda_device)
    # model = torch.nn.DataParallel(model)
    cudnn.benchmark = True

    # create training and validation dataset
    dataset_train = VideoFolder(
        root=args.root_frames,
        num_boxes=args.num_boxes,
        file_input=args.json_data_train,
        file_labels=args.json_file_labels,
        word2vec_weights=args.word2vec_weights_path,
        frames_duration=args.num_frames,
        video_root=args.video_root,
        args=args,
        is_val=False,
        if_augment=True,
        model=args.model,
    )
    dataset_val = VideoFolder(
        root=args.root_frames,
        num_boxes=args.num_boxes,
        file_input=args.json_data_val,
        file_labels=args.json_file_labels,
        word2vec_weights=args.word2vec_weights_path,
        frames_duration=args.num_frames,
        video_root=args.video_root,
        args=args,
        is_val=True,
        if_augment=True,
        model=args.model,
    )

    # create training and validation loader
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        drop_last=True,
        pin_memory=True,
    )
    val_loader = torch.utils.data.DataLoader(
        dataset_val,
        drop_last=True,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    optimizer = torch.optim.SGD(
        model.parameters(),
        momentum=args.momentum,
        lr=args.lr,
        weight_decay=args.weight_decay,
    )
    criterion = torch.nn.CrossEntropyLoss()

    if args.evaluate:
        validate(val_loader, model, criterion,
                 class_to_idx=dataset_val.classes_dict)
        return

    # training, start a logger
    tb_logdir = os.path.join(args.logdir, args.logname)
    os.makedirs(tb_logdir, exist_ok=True)
    tb_logger = Logger(tb_logdir)

    acc_history = {}
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # train for one epoch
        loss, acc_top1, acc_top5 = train(train_loader, model, optimizer,
                                         epoch, criterion, tb_logger)
        acc_history['acc_top1_epoch_training'] = (epoch, acc_top1)
        acc_history['acc_top5_epoch_training'] = (epoch, acc_top5)
        acc_history['loss_epoch_training'] = (epoch, loss)

        # evaluate on validation set
        # if (not args.fine_tune) or (epoch + 1) % 10 == 0:
        if not args.fine_tune:
            loss, acc_top1, acc_top5 = validate(
                val_loader, model, criterion, epoch=epoch, tb_logger=tb_logger)
            acc_history['acc_top1_epoch_val'] = (epoch, acc_top1)
            acc_history['acc_top5_epoch_val'] = (epoch, acc_top5)
            acc_history['loss_epoch_val'] = (epoch, loss)
        else:
            loss = 100

        # remember best loss and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                # 'state_dict': model.module.state_dict(),
                'state_dict': model.state_dict(),
                'best_loss': best_loss,
            },
            is_best,
            os.path.join(args.ckpt,
                         args.arch.lower() + '_{}'.format(args.logname)),
        )

        # finished one epoch: append this epoch's metrics to per-metric files
        accu_path = Path(args.acc_history_dir) / args.logname
        accu_path.mkdir(parents=True, exist_ok=True)
        for key, (ep, value) in acc_history.items():
            acc_file = accu_path / f'{key}.txt'
            mode = 'a' if acc_file.exists() else 'w'
            with open(acc_file, mode) as f:
                f.write(f'{ep}\t{value}\n')