def fit(self, train_loader, valid_loader, start_epoch=0, max_epochs=200):
    best_acc = 0.
    bar = IncrementalBar(max=max_epochs - start_epoch)
    for e in range(start_epoch, max_epochs):
        # update the progress bar message/suffix with percentage, ETA and iteration speed
        bar.message = '{:>5.2f}%%'.format(bar.percent)
        bar.suffix = '{}/{} [{}<{}\t{:.2f}it/s]'.format(
            bar.index, bar.max, bar.elapsed_td, bar.eta_td, bar.avg)
        bar.next()
        # step the learning rate down once at each milestone epoch
        if e == self.milestones[0]:
            schedule_lr(self.optimizer)
        if e == self.milestones[1]:
            schedule_lr(self.optimizer)
        if e == self.milestones[2]:
            schedule_lr(self.optimizer)
        # one epoch of training, then validation on agedb_30
        self.train(train_loader, self.model, self.criterion, self.optimizer, e)
        accuracy, best_threshold, roc_curve_tensor = self.evaluate(
            self.conf, *valid_loader['agedb_30'])
        self.board_val('agedb_30', accuracy, best_threshold, roc_curve_tensor, e)
        # checkpoint only when validation accuracy improves
        if accuracy > best_acc:
            best_acc = accuracy
            save_checkpoint(self.model, self.optimizer, self.conf, best_acc, e)
    bar.finish()
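# schedule_lr is called at the milestone epochs above (and later with
# factor=args.scheduler_lambda) but is not defined in this excerpt. A minimal
# sketch, assuming it simply scales every parameter group's learning rate by a
# fixed factor; the default of 0.1 is an assumption, not the repo's actual value.
def schedule_lr(optimizer, factor=0.1):
    for param_group in optimizer.param_groups:
        param_group['lr'] *= factor
    print(optimizer)  # log the updated learning rates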
DISP_FREQ = len(train_loader) // 100  # frequency to display training loss & acc
NUM_EPOCH_WARM_UP = NUM_EPOCH // 25  # use the first 1/25 epochs to warm up
NUM_BATCH_WARM_UP = len(train_loader) * NUM_EPOCH_WARM_UP  # number of warm-up batches
batch = 0  # batch index
lambda_t = 0.3
L1_LOSS = nn.L1Loss()
MSE_LOSS = nn.MSELoss()

for epoch in range(NUM_EPOCH):  # start training process
    # adjust LR for each training stage after warm up; you can also choose to adjust
    # LR manually (with slight modification) once a plateau is observed
    if epoch == STAGES[0]:
        schedule_lr(OPTIMIZER)
    if epoch == STAGES[1]:
        schedule_lr(OPTIMIZER)
    if epoch == STAGES[2]:
        schedule_lr(OPTIMIZER)

    # set to different modes: the backbone stays in eval mode, only MaskNet is trained
    BACKBONE.eval()
    MaskNet.train()

    losses = AverageMeter()
    l1_losses = AverageMeter()
    mse_losses = AverageMeter()

    for inputs, labels in tqdm(iter(train_loader)):
        # LR warm-up applies during the first NUM_BATCH_WARM_UP batches
        if (epoch + 1 <= NUM_EPOCH_WARM_UP) and (batch + 1 <= NUM_BATCH_WARM_UP
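# The condition above gates a learning-rate warm-up over the first
# NUM_BATCH_WARM_UP batches. A common way to implement this is a linear ramp
# from 0 to the base LR; the helper below is a hedged sketch of that idea, and
# its name, signature, and the init_lr argument are assumptions rather than
# this script's actual code.
def warm_up_lr(batch, num_batch_warm_up, init_lr, optimizer):
    # scale the LR proportionally to how far we are into the warm-up period
    for params in optimizer.param_groups:
        params['lr'] = batch * init_lr / num_batch_warm_up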
def train(ds_train, ds_adapt, args):
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    print('=' * 30)
    print('USE_CUDA SET TO: {}'.format(use_cuda))
    print('CUDA AVAILABLE?: {}'.format(torch.cuda.is_available()))
    print('=' * 30)
    device = torch.device("cuda" if use_cuda else "cpu")

    writer = SummaryWriter(comment=os.path.basename(args.cfg))
    num_classes = ds_train.num_classes

    # embedding extractor (generator)
    if args.model_type == 'XTDNN':
        generator = XTDNN()
    if args.model_type == 'ETDNN':
        generator = ETDNN()
    if args.model_type == 'FTDNN':
        generator = FTDNN()

    # classification head / loss layer on top of the 512-d embedding
    if args.loss_type == 'adm':
        classifier = AMSMLoss(512, num_classes)
    if args.loss_type == 'adacos':
        classifier = AdaCos(512, num_classes)
    if args.loss_type == 'l2softmax':
        classifier = L2SoftMax(512, num_classes)
    if args.loss_type == 'softmax':
        classifier = SoftMax(512, num_classes)
    if args.loss_type == 'xvec':
        classifier = XVecHead(512, num_classes)
    if args.loss_type == 'arcface':
        classifier = ArcFace(512, num_classes)
    if args.loss_type == 'sphereface':
        classifier = SphereFace(512, num_classes)

    generator.train()
    classifier.train()
    generator = generator.to(device)
    classifier = classifier.to(device)

    if args.resume_checkpoint != 0:
        model_str = os.path.join(args.model_dir, '{}_{}.pt')
        for model, modelstr in [(generator, 'g'), (classifier, 'c')]:
            model.load_state_dict(torch.load(model_str.format(modelstr, args.resume_checkpoint)))

    if args.use_dropadapt and args.use_dropclass:
        model_str = os.path.join(args.model_dir, '{}_adapt_start.pt')
        for model, modelstr in [(generator, 'g'), (classifier, 'c')]:
            model_path = model_str.format(modelstr)
            assert os.path.isfile(model_path), "Couldn't find [g|c]_adapt_start.pt models in {}".format(args.model_dir)
            model.load_state_dict(torch.load(model_path))

    optimizer = torch.optim.SGD(
        [{'params': generator.parameters(), 'lr': args.lr},
         {'params': classifier.parameters(), 'lr': args.lr * args.classifier_lr_mult}],
        momentum=args.momentum)

    if args.label_smooth_type == 'None':
        criterion = nn.CrossEntropyLoss()
    if args.label_smooth_type == 'disturb':
        criterion = DisturbLabelLoss(device, disturb_prob=args.label_smooth_prob)
    if args.label_smooth_type == 'uniform':
        criterion = LabelSmoothingLoss(smoothing=args.label_smooth_prob)

    iterations = 0
    total_loss = 0
    running_loss = [np.nan for _ in range(500)]

    best_vc1_eer = (-1, 1.0)
    best_sitw_eer = (-1, 1.0)

    # resume best-EER bookkeeping from a previous run if a results pickle exists
    if os.path.isfile(args.results_pkl):
        rpkl = pickle.load(open(args.results_pkl, "rb"))
        if args.test_data_vc1:
            v1eers = [(rpkl[key]['vc1_eer'], i) for i, key in enumerate(rpkl)]
            bestvc1 = min(v1eers)
            best_vc1_eer = (bestvc1[1], bestvc1[0])
        if args.test_data_sitw:
            sitweers = [(rpkl[key]['sitw_eer'], i) for i, key in enumerate(rpkl)]
            bestsitw = min(sitweers)
            best_sitw_eer = (bestsitw[1], bestsitw[0])
    else:
        rpkl = OrderedDict({})

    if args.multi_gpu:
        dpp_generator = nn.DataParallel(generator).to(device)

    data_generator = ds_train.get_batches(batch_size=args.batch_size, max_seq_len=args.max_seq_len)

    if args.use_dropclass:
        classifier.drop()
    else:
        classifier.nodrop()

    if args.model_type == 'FTDNN':
        # FTDNN dropout schedule: ramp alpha from 0 up to 0.5 and back to 0 over training
        drop_indexes = np.linspace(0, 1, args.num_iterations)
        drop_sch = ([0, 0.5, 1], [0, 0.5, 0])
        drop_schedule = np.interp(drop_indexes, drop_sch[0], drop_sch[1])

    for iterations in range(1, args.num_iterations + 1):
        if iterations > args.num_iterations:
            break
        if iterations in args.scheduler_steps:
            schedule_lr(optimizer, factor=args.scheduler_lambda)
        if iterations <= args.resume_checkpoint:
            print('Skipping iteration {}'.format(iterations))
            print('Skipping iteration {}'.format(iterations), file=open(args.log_file, "a"))
            continue

        if args.model_type == 'FTDNN':
            generator.set_dropout_alpha(drop_schedule[iterations - 1])

        # standard dropclass: periodically drop a random subset of classes
        if args.use_dropclass and not args.drop_per_batch and not args.use_dropadapt:
            if iterations % args.its_per_drop == 0 or iterations == 1:
                ds_train, classifier = drop_classes(ds_train, classifier, num_drop=args.num_drop)
                if args.reset_affine_each_it:
                    classifier.reset_parameters()

        # dropadapt: drop the classes that are least probable on the adaptation set
        if args.use_dropclass and args.use_dropadapt:
            if iterations % args.its_per_drop == 0 or iterations == 2:  # this feeds one batch in to 'reserve' CUDA memory, having iterations == 1 fails
                if args.dropadapt_random:
                    ds_train, classifier = drop_adapt_random(classifier, ds_train, num_drop=args.num_drop)
                else:
                    with torch.no_grad():
                        print('------ [{}/{}] classes remaining'.format(len(classifier.rem_classes), classifier.n_classes))
                        print('------ Aggregating training class probs on {}'.format(args.ds_adapt))
                        full_probs = aggregate_probs(ds_adapt, generator, classifier, device,
                                                     batch_size=300, max_seq_len=args.max_seq_len,
                                                     uniform=args.dropadapt_uniform_agg)
                        np.save(os.path.join(args.model_dir, 'probs_{}.npy'.format(iterations)), full_probs)
                        print('------ Dropping ~{} more classes from the next {} training steps'.format(args.num_drop, args.its_per_drop))
                        if args.dropadapt_combine:
                            print('------ Combining least probable classes into one...')
                            ds_train, classifier = drop_adapt_combine(full_probs, classifier, ds_train, num_drop=args.num_drop)
                        else:
                            if args.dropadapt_onlydata:
                                ds_train = drop_adapt_onlydata(full_probs, ds_train, num_drop=args.num_drop)
                            else:
                                ds_train, classifier = drop_adapt(full_probs, classifier, ds_train, num_drop=args.num_drop)
                        print('------ [{}/{}] classes remaining'.format(len(classifier.rem_classes), classifier.n_classes))
                        np.save(os.path.join(args.model_dir, 'remclasses_{}.npy'.format(iterations)), classifier.rem_classes)
                        del full_probs

        feats, iden = next(data_generator)

        if args.drop_per_batch and args.use_dropclass:
            classifier = drop_per_batch(iden, classifier)
            if args.reset_affine_each_it:
                classifier.reset_parameters()

        feats = feats.to(device)
        if args.use_dropclass:
            # map original labels into the reduced label space of the remaining classes
            iden = classifier.get_mini_labels(iden).to(device)
        else:
            iden = torch.LongTensor(iden).to(device)

        if args.multi_gpu:
            embeds = dpp_generator(feats)
        else:
            embeds = generator(feats)

        if args.loss_type == 'softmax':
            preds = classifier(embeds)
        else:
            preds = classifier(embeds, iden)

        loss = criterion(preds, iden)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if args.model_type == 'FTDNN':
            generator.step_ftdnn_layers()

        running_loss.pop(0)
        running_loss.append(loss.item())
        rmean_loss = np.nanmean(np.array(running_loss))

        if iterations % 10 == 0:
            msg = "{}: {}: [{}/{}] \t C-Loss:{:.4f}, AvgLoss:{:.4f}, lr: {}, bs: {}".format(
                args.model_dir, time.ctime(), iterations, args.num_iterations,
                loss.item(), rmean_loss, get_lr(optimizer), len(feats))
            print(msg)
            print(msg, file=open(args.log_file, "a"))
            writer.add_scalar('class loss', loss.item(), iterations)
            writer.add_scalar('Avg loss', rmean_loss, iterations)

        if iterations % args.checkpoint_interval == 0:
            # save intermediate checkpoints for both generator and classifier
            for model, modelstr in [(generator, 'g'), (classifier, 'c')]:
                model.eval().cpu()
                cp_filename = "{}_{}.pt".format(modelstr, iterations)
                cp_model_path = os.path.join(args.model_dir, cp_filename)
                torch.save(model.state_dict(), cp_model_path)
                model.to(device).train()

            rpkl[iterations] = {}

            if args.test_data_vc1:
                vc1_eer = test(generator, ds_test_vc1, device)
                print('EER on VoxCeleb1: {}'.format(vc1_eer))
                print('EER on VoxCeleb1: {}'.format(vc1_eer), file=open(args.log_file, "a"))
                writer.add_scalar('vc1_eer', vc1_eer, iterations)
                if vc1_eer < best_vc1_eer[1]:
                    best_vc1_eer = (iterations, vc1_eer)
                print('Best VC1 EER: {}'.format(best_vc1_eer))
                print('Best VC1 EER: {}'.format(best_vc1_eer), file=open(args.log_file, "a"))
                rpkl[iterations]['vc1_eer'] = vc1_eer

            if args.test_data_sitw:
                sitw_eer = test_nosil(generator, ds_test_sitw, device)
                print('EER on SITW(DEV): {}'.format(sitw_eer))
                print('EER on SITW(DEV): {}'.format(sitw_eer), file=open(args.log_file, "a"))
                writer.add_scalar('sitw_eer', sitw_eer, iterations)
                if sitw_eer < best_sitw_eer[1]:
                    best_sitw_eer = (iterations, sitw_eer)
                print('Best SITW(DEV) EER: {}'.format(best_sitw_eer))
                print('Best SITW(DEV) EER: {}'.format(best_sitw_eer), file=open(args.log_file, "a"))
                rpkl[iterations]['sitw_eer'] = sitw_eer

            pickle.dump(rpkl, open(args.results_pkl, "wb"))

    # ---- Final model saving -----
    for model, modelstr in [(generator, 'g'), (classifier, 'c')]:
        model.eval().cpu()
        cp_filename = "final_{}_{}.pt".format(modelstr, iterations)
        cp_model_path = os.path.join(args.model_dir, cp_filename)
        torch.save(model.state_dict(), cp_model_path)
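# The loop above dumps per-checkpoint EERs into args.results_pkl, keyed by
# iteration number. A hedged sketch of how such a pickle could be inspected
# after training to pick the best checkpoint; the standalone helper and the
# example path are illustrative assumptions, not part of the repo.
import pickle

def best_checkpoint(results_pkl, metric='vc1_eer'):
    rpkl = pickle.load(open(results_pkl, "rb"))
    # keep only checkpoints that actually recorded this metric
    scored = [(res[metric], it) for it, res in rpkl.items() if metric in res]
    eer, it = min(scored)
    return it, eer

# e.g. it, eer = best_checkpoint('exp/model_dir/results.pkl', metric='sitw_eer')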
def train(ds_train):
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    print('=' * 30)
    print('USE_CUDA SET TO: {}'.format(use_cuda))
    print('CUDA AVAILABLE?: {}'.format(torch.cuda.is_available()))
    print('=' * 30)
    device = torch.device("cuda" if use_cuda else "cpu")

    writer = SummaryWriter(comment=os.path.basename(args.cfg))

    # embedding extractor (generator)
    if args.model_type == 'XTDNN':
        generator = XTDNN(features_per_frame=args.input_dim, embed_features=args.embedding_dim)
    if args.model_type == 'ETDNN':
        generator = ETDNN(features_per_frame=args.input_dim, embed_features=args.embedding_dim)
    if args.model_type == 'FTDNN':
        generator = FTDNN(in_dim=args.input_dim, embedding_dim=args.embedding_dim)

    generator.train()
    generator = generator.to(device)

    # one entry per trainable module: the generator plus one classifier head per task
    model_dict = {'generator': {'model': generator, 'lr_mult': 1., 'loss_weight': None}}
    clf_head_dict = {k: {'model': None, 'lr_mult': lr_mult, 'loss_weight': loss_weight}
                     for k, lr_mult, loss_weight in zip(args.classifier_heads,
                                                        args.classifier_lr_mults,
                                                        args.classifier_loss_weights)}
    num_cls_per_task = [ds_train.num_classes[t] for t in args.classifier_heads]

    for clf_target, clf_type, num_classes, clf_smooth_type in zip(args.classifier_heads,
                                                                  args.classifier_types,
                                                                  num_cls_per_task,
                                                                  args.classifier_smooth_types):
        if clf_type == 'adm':
            clf = AMSMLoss(args.embedding_dim, num_classes)
        elif clf_type == 'adacos':
            clf = AdaCos(args.embedding_dim, num_classes)
        elif clf_type == 'l2softmax':
            clf = L2SoftMax(args.embedding_dim, num_classes)
        elif clf_type == 'softmax':
            clf = SoftMax(args.embedding_dim, num_classes)
        elif clf_type == 'xvec':
            clf = XVecHead(args.embedding_dim, num_classes)
        elif clf_type == 'xvec_regression':
            clf = XVecHead(args.embedding_dim, 1)
        elif clf_type == 'xvec_uncertain':
            clf = XVecHeadUncertain(args.embedding_dim, num_classes)
        elif clf_type == 'arcface':
            clf = ArcFace(args.embedding_dim, num_classes)
        elif clf_type == 'sphereface':
            clf = SphereFace(args.embedding_dim, num_classes)
        else:
            assert None, 'Classifier type {} not found'.format(clf_type)

        if clf_head_dict[clf_target]['loss_weight'] >= 0.0:
            clf_head_dict[clf_target]['model'] = clf.train().to(device)
        else:
            # a negative loss weight marks an adversarial head: prepend a gradient
            # reversal layer (GRL) and use the absolute weight as its lambda_
            abs_lw = np.abs(clf_head_dict[clf_target]['loss_weight'])
            clf_head_dict[clf_target]['model'] = nn.Sequential(
                GradientReversal(lambda_=abs_lw),
                clf
            ).train().to(device)
            clf_head_dict[clf_target]['loss_weight'] = 1.0  # lambda_ in the GRL now carries the weighting

        # per-head criterion, with optional label smoothing
        if clf_smooth_type == 'none':
            if clf_target.endswith('regression'):
                clf_smooth = nn.SmoothL1Loss()
            else:
                clf_smooth = nn.CrossEntropyLoss()
        elif clf_smooth_type == 'twoneighbour':
            clf_smooth = TwoNeighbourSmoothingLoss(smoothing=args.label_smooth_prob)
        elif clf_smooth_type == 'uniform':
            clf_smooth = LabelSmoothingLoss(smoothing=args.label_smooth_prob)
        elif clf_smooth_type == 'disturb':
            clf_smooth = DisturbLabelLoss(device, disturb_prob=args.label_smooth_prob)
        else:
            assert None, 'Smooth type not found: {}'.format(clf_smooth_type)

        clf_head_dict[clf_target]['criterion'] = clf_smooth

    model_dict.update(clf_head_dict)

    # optional learned multi-task loss weighting
    if args.classifier_loss_weighting_type == 'uncertainty_kendall':
        model_dict['loss_aggregator'] = {
            'model': MultiTaskUncertaintyLossKendall(len(args.classifier_heads)).to(device),
            'lr_mult': 1.,
            'loss_weight': None
        }
    if args.classifier_loss_weighting_type == 'uncertainty_liebel':
        model_dict['loss_aggregator'] = {
            'model': MultiTaskUncertaintyLossLiebel(len(args.classifier_heads)).to(device),
            'lr_mult': 1.,
            'loss_weight': None
        }

    if args.resume_checkpoint != 0:
        model_str = os.path.join(args.model_dir, '{}_{}.pt')
        for m in model_dict:
            model_dict[m]['model'].load_state_dict(torch.load(model_str.format(m, args.resume_checkpoint)))

    optimizer = torch.optim.SGD(
        [{'params': model_dict[m]['model'].parameters(), 'lr': args.lr * model_dict[m]['lr_mult']}
         for m in model_dict],
        momentum=args.momentum)

    iterations = 0
    total_loss = 0
    running_loss = [np.nan for _ in range(500)]

    non_spk_clf_heads = [a for a in args.classifier_heads if a != 'speaker']

    best_test_eer = (-1, 1.0)
    best_test_dcf = (-1, 1.0)
    best_acc = {k: (-1, 0.0) for k in non_spk_clf_heads}

    # resume best-result bookkeeping from a previous run if a results pickle exists
    if os.path.isfile(args.results_pkl) and args.resume_checkpoint != 0:
        rpkl = pickle.load(open(args.results_pkl, "rb"))
        keylist = list(rpkl.keys())
        if args.test_data:
            test_eers = [(rpkl[key]['test_eer'], key) for i, key in enumerate(rpkl)]
            best_teer = min(test_eers)
            best_test_eer = (best_teer[1], best_teer[0])
            test_dcfs = [(rpkl[key]['test_dcf'], key) for i, key in enumerate(rpkl)]
            besttest_dcf = min(test_dcfs)
            best_test_dcf = (besttest_dcf[1], besttest_dcf[0])
    else:
        rpkl = OrderedDict({})

    if args.multi_gpu:
        dpp_generator = nn.DataParallel(generator).to(device)

    data_generator = ds_train.get_batches(batch_size=args.batch_size, max_seq_len=args.max_seq_len)

    if args.model_type == 'FTDNN':
        # FTDNN dropout schedule: ramp alpha from 0 up to 0.5 and back to 0 over training
        drop_indexes = np.linspace(0, 1, args.num_iterations)
        drop_sch = ([0, 0.5, 1], [0, 0.5, 0])
        drop_schedule = np.interp(drop_indexes, drop_sch[0], drop_sch[1])

    for iterations in range(1, args.num_iterations + 1):
        if iterations > args.num_iterations:
            break
        if iterations in args.scheduler_steps:
            schedule_lr(optimizer, factor=args.scheduler_lambda)
        if iterations <= args.resume_checkpoint:
            print('Skipping iteration {}'.format(iterations), file=open(args.log_file, "a"))
            continue

        if args.model_type == 'FTDNN':
            if args.dropout:
                generator.set_dropout_alpha(drop_schedule[iterations - 1])

        feats, labels = next(data_generator)
        feats = feats.to(device)
        if args.multi_gpu:
            embeds = dpp_generator(feats)
        else:
            embeds = generator(feats)

        # forward pass through every classifier head, collecting per-task losses
        total_loss = 0
        losses = []
        loss_tensors = []
        for m in args.classifier_heads:
            lab = labels[m].to(device)
            if m == 'rec':
                preds = model_dict[m]['model'](embeds)
            else:
                preds = model_dict[m]['model'](embeds, lab)
            loss = model_dict[m]['criterion'](preds, lab)
            if args.classifier_loss_weighting_type == 'none':
                total_loss += loss * model_dict[m]['loss_weight']
            else:
                loss_tensors.append(loss)
            losses.append(round(loss.item(), 4))

        if args.classifier_loss_weighting_type.startswith('uncertainty'):
            # stack (rather than construct a new tensor) so gradients still flow
            # through the per-task losses into the learned uncertainty weights
            loss_tensors = torch.stack(loss_tensors).to(device)
            total_loss = model_dict['loss_aggregator']['model'](loss_tensors)

        if args.classifier_loss_weighting_type == 'dwa':
            # dynamic weight averaging: weight each task by the ratio of its last two losses
            if iterations < 4:
                loss_t_1 = np.ones(len(loss_tensors))
                for l in loss_tensors:
                    total_loss += l
            else:
                dwa_w = loss_t_1 / loss_t_2
                K = len(loss_tensors)
                per_task_weight = torch.FloatTensor(dwa_w / args.dwa_temperature)  # lambda_k
                per_task_weight = torch.nn.functional.softmax(per_task_weight, dim=0) * K
                per_task_weight = per_task_weight.numpy()
                for l, w in zip(loss_tensors, per_task_weight):
                    total_loss += l * w
            loss_t_2 = loss_t_1.copy()
            # keep a detached copy of the current per-task losses for the next DWA step
            loss_t_1 = np.array([l.item() for l in loss_tensors])

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        if args.model_type == 'FTDNN':
            generator.step_ftdnn_layers()

        running_loss.pop(0)
        running_loss.append(total_loss.item())
        rmean_loss = np.nanmean(np.array(running_loss))

        if iterations % 10 == 0:
            msg = "{}: {}: [{}/{}] \t C-Loss:{:.4f}, AvgLoss:{:.4f}, losses: {}, lr: {}, bs: {}".format(
                args.model_dir, time.ctime(), iterations, args.num_iterations,
                total_loss.item(), rmean_loss, losses, get_lr(optimizer), len(feats))
            print(msg)
            print(msg, file=open(args.log_file, "a"))
            writer.add_scalar('combined loss', total_loss.item(), iterations)
            writer.add_scalar('Avg loss', rmean_loss, iterations)

        if iterations % args.checkpoint_interval == 0:
            # save intermediate checkpoints for every module, then continue training
            for m in model_dict:
                model_dict[m]['model'].eval().cpu()
                cp_filename = "{}_{}.pt".format(m, iterations)
                cp_model_path = os.path.join(args.model_dir, cp_filename)
                torch.save(model_dict[m]['model'].state_dict(), cp_model_path)
                model_dict[m]['model'].to(device).train()

            if args.test_data:
                rpkl, best_test_eer, best_test_dcf = eval_step(model_dict, device, ds_test, iterations,
                                                               rpkl, writer, best_test_eer, best_test_dcf,
                                                               best_acc)

    # ---- Final model saving -----
    for m in model_dict:
        model_dict[m]['model'].eval().cpu()
        cp_filename = "final_{}_{}.pt".format(m, iterations)
        cp_model_path = os.path.join(args.model_dir, cp_filename)
        torch.save(model_dict[m]['model'].state_dict(), cp_model_path)
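# The 'uncertainty_kendall' branch above delegates the weighting to
# MultiTaskUncertaintyLossKendall, whose internals are not shown here. Below is
# a hedged sketch of the usual Kendall & Gal formulation such a module tends to
# implement: total = sum_i exp(-s_i) * L_i + s_i, with one learnable log-variance
# s_i per task. This class is an illustration only, not the repo's implementation.
import torch
import torch.nn as nn

class UncertaintyWeightingSketch(nn.Module):
    def __init__(self, num_tasks):
        super().__init__()
        # s_i = log(sigma_i^2), one per task, learned jointly with the rest of the model
        self.log_vars = nn.Parameter(torch.zeros(num_tasks))

    def forward(self, losses):
        # losses: 1-D tensor of per-task losses, e.g. torch.stack(loss_tensors)
        weighted = torch.exp(-self.log_vars) * losses + self.log_vars
        return weighted.sum()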