def test_mandatory_mapping(self):
    mandatory_mapping = [[['"group.1.1"', '"group.1.2"', '"group.1.3"']]]
    self.assertTrue(
        validate(self.variables, mandatory_mapping=mandatory_mapping)['passed'])

    mandatory_mapping = [[['"group.2.1"', '"group.2.2"']]]
    self.assertFalse(
        validate(self.variables, mandatory_mapping=mandatory_mapping)['passed'])

    mandatory_mapping = [[['"group.3.1"', '"group.3.2"', '"group.3.3"']]]
    self.assertFalse(
        validate(self.variables, mandatory_mapping=mandatory_mapping)['passed'])

    mandatory_mapping = [[['"group.1.1"', '"group.1.2"', '"group.1.3"'],
                          ['"group.3.1"', '"group.3.2"', '"group.3.3"']]]
    self.assertTrue(
        validate(self.variables, mandatory_mapping=mandatory_mapping)['passed'])

    mandatory_mapping = [[['"group.2.1"', '"group.2.2"'],
                          ['"group.3.1"', '"group.3.2"', '"group.3.3"']]]
    self.assertFalse(
        validate(self.variables, mandatory_mapping=mandatory_mapping)['passed'])
def test_type_mapping_int(self):
    type_mapping = {'int': 'int'}
    self.assertTrue(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'string': 'int'}
    self.assertFalse(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'int': {'type': 'int', 'min_value': -10}}
    self.assertTrue(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'int': {'type': 'int', 'min_value': 10}}
    self.assertFalse(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'int': {'type': 'int', 'max_value': 10}}
    self.assertTrue(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'int': {'type': 'int', 'max_value': -10}}
    self.assertFalse(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'int': {'type': 'int', 'min_value': -10, 'max_value': 10}}
    self.assertTrue(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'int': {'type': 'int', 'min_value': 5, 'max_value': 10}}
    self.assertFalse(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'int': {'type': 'int', 'ranges': [[-10, 10]]}}
    self.assertTrue(
        validate(self.variables, type_mapping=type_mapping)['passed'])

    type_mapping = {'int': {'type': 'int', 'ranges': [[5, 10]]}}
    self.assertFalse(
        validate(self.variables, type_mapping=type_mapping)['passed'])
def test_incompatible_mapping(self):
    incompatible_mapping = [[['"group.1.1"', '"group.1.2"', '"group.1.3"'],
                             ['"group.2.1"', '"group.2.2"']]]
    self.assertFalse(
        validate(self.variables,
                 incompatible_mapping=incompatible_mapping)['passed'])

    incompatible_mapping = [[['"group.1.1"', '"group.1.2"', '"group.1.3"'],
                             ['"group.3.1"', '"group.3.2"', '"group.3.3"']]]
    self.assertTrue(
        validate(self.variables,
                 incompatible_mapping=incompatible_mapping)['passed'])

    incompatible_mapping = [[['"group.2.1"', '"group.2.2"'],
                             ['"group.3.1"', '"group.3.2"', '"group.3.3"']]]
    self.assertTrue(
        validate(self.variables,
                 incompatible_mapping=incompatible_mapping)['passed'])
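# The three test methods above rely on a validate() function and a self.variables
# fixture that are not part of this excerpt. Below is a minimal sketch of the
# surrounding unittest scaffolding; the import path and the fixture contents are
# assumptions, not taken from the source.
import unittest

from validator import validate  # hypothetical module path for validate()


class ValidateTests(unittest.TestCase):

    def setUp(self):
        # Placeholder fixture: the real contents of self.variables are not shown
        # in this excerpt; they must define the '"group.x.y"' variables and the
        # 'int'/'string' variables the tests above check against.
        self.variables = {}

    # test_mandatory_mapping, test_type_mapping_int and
    # test_incompatible_mapping from above belong in this class.


if __name__ == '__main__':
    unittest.main()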
import sys

import torch

from models.own_model import create_own_model
from utils.dataloaders import validation_dataloader, training_dataloader
from utils.confusion_matrix import generate_conf_matrix
from utils.validation import get_predicted_actual, validate

device = get_device()

if len(sys.argv) > 1 and sys.argv[1] == '-s':
    print("Using Squeezenet model")
    model = create_squeezenet_model()
    path = PATH_TO_SQUEEZENET_MODEL
else:
    print("Using own model")
    model = create_own_model()
    path = PATH_TO_OWN_MODEL

model.to(device)

# data = training_dataloader
data = validation_dataloader

# Load from map of layers to parameter tensors
model.load_state_dict(torch.load(path))

accuracy = validate(model, data)
print(f"Accuracy: {accuracy}%")

predicted, actual = get_predicted_actual(model, data)
generate_conf_matrix(predicted, actual)
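# The script above calls get_device() without importing it, and create_squeezenet_model
# plus the PATH_TO_* constants are likewise assumed to come from project modules that
# are not shown here. A minimal sketch of what such a device helper typically looks
# like (the name and location are assumptions, not confirmed by the source):
import torch


def get_device() -> torch.device:
    """Pick the GPU when one is available, otherwise fall back to the CPU."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")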
def train(args, pt_dir, chkpt_path, trainloader, valloader, writer, logger, hp,
          hp_str):
    model_g = Generator(hp.audio.n_mel_channels).cuda()
    model_d = MultiScaleDiscriminator(hp.model.num_D, hp.model.ndf,
                                      hp.model.n_layers,
                                      hp.model.downsampling_factor,
                                      hp.model.disc_out).cuda()
    model_d_mpd = MPD().cuda()

    optim_g = torch.optim.Adam(model_g.parameters(),
                               lr=hp.train.adam.lr,
                               betas=(hp.train.adam.beta1, hp.train.adam.beta2))
    optim_d = torch.optim.Adam(itertools.chain(model_d.parameters(),
                                               model_d_mpd.parameters()),
                               lr=hp.train.adam.lr,
                               betas=(hp.train.adam.beta1, hp.train.adam.beta2))

    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
                        hop_length=hp.audio.hop_length,
                        win_length=hp.audio.win_length,
                        n_mel_channels=hp.audio.n_mel_channels,
                        sampling_rate=hp.audio.sampling_rate,
                        mel_fmin=hp.audio.mel_fmin,
                        mel_fmax=hp.audio.mel_fmax)

    # githash = get_commit_hash()

    init_epoch = -1
    step = 0

    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model_g.load_state_dict(checkpoint['model_g'])
        model_d.load_state_dict(checkpoint['model_d'])
        model_d_mpd.load_state_dict(checkpoint['model_d_mpd'])
        optim_g.load_state_dict(checkpoint['optim_g'])
        optim_d.load_state_dict(checkpoint['optim_d'])
        step = checkpoint['step']
        init_epoch = checkpoint['epoch']

        if hp_str != checkpoint['hp_str']:
            logger.warning(
                "New hparams is different from checkpoint. Will use new.")

        # if githash != checkpoint['githash']:
        #     logger.warning("Code might be different: git hash is different.")
        #     logger.warning("%s -> %s" % (checkpoint['githash'], githash))
    else:
        logger.info("Starting new training run.")

    # This accelerates training when the size of the minibatch is always consistent.
    # If not consistent, it'll horribly slow down.
    torch.backends.cudnn.benchmark = True

    try:
        model_g.train()
        model_d.train()
        stft_loss = MultiResolutionSTFTLoss()
        criterion = torch.nn.MSELoss().cuda()
        l1loss = torch.nn.L1Loss()

        for epoch in itertools.count(init_epoch + 1):
            if epoch % hp.log.validation_interval == 0:
                with torch.no_grad():
                    validate(hp, model_g, model_d, model_d_mpd, valloader,
                             stft_loss, l1loss, criterion, stft, writer, step)

            trainloader.dataset.shuffle_mapping()
            loader = tqdm.tqdm(trainloader, desc='Loading train data')
            avg_g_loss = []
            avg_d_loss = []
            avg_adv_loss = []

            for (melG, audioG), (melD, audioD) in loader:
                melG = melG.cuda()      # torch.Size([16, 80, 64])
                audioG = audioG.cuda()  # torch.Size([16, 1, 16000])
                melD = melD.cuda()      # torch.Size([16, 80, 64])
                audioD = audioD.cuda()  # torch.Size([16, 1, 16000])

                # generator
                optim_g.zero_grad()
                fake_audio = model_g(melG)[:, :, :hp.audio.segment_length]  # torch.Size([16, 1, 12800])

                loss_g = 0.0
                sc_loss, mag_loss = stft_loss(
                    fake_audio[:, :, :audioG.size(2)].squeeze(1),
                    audioG.squeeze(1))
                loss_g += sc_loss + mag_loss  # STFT loss

                adv_loss = 0.0
                loss_mel = 0.0
                if step > hp.train.discriminator_train_start_steps:
                    disc_real = model_d(audioG)
                    disc_fake = model_d(fake_audio)
                    # for multi-scale discriminator
                    for feats_fake, score_fake in disc_fake:
                        # adv_loss += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
                        adv_loss += criterion(score_fake,
                                              torch.ones_like(score_fake))
                    adv_loss = adv_loss / len(disc_fake)  # len(disc_fake) = 3

                    # MPD adversarial loss
                    out1, out2, out3, out4, out5 = model_d_mpd(fake_audio)
                    adv_mpd_loss = criterion(out1, torch.ones_like(out1)) + \
                        criterion(out2, torch.ones_like(out2)) + \
                        criterion(out3, torch.ones_like(out3)) + \
                        criterion(out4, torch.ones_like(out4)) + \
                        criterion(out5, torch.ones_like(out5))
                    adv_mpd_loss = adv_mpd_loss / 5
                    adv_loss = adv_loss + adv_mpd_loss  # Adv loss

                    # Mel loss
                    mel_fake = stft.mel_spectrogram(fake_audio.squeeze(1))
                    loss_mel += l1loss(melG[:, :, :mel_fake.size(2)],
                                       mel_fake.cuda())  # Mel L1 loss
                    loss_g += hp.model.lambda_mel * loss_mel

                    if hp.model.feat_loss:
                        for (feats_fake, score_fake), (feats_real, _) in zip(
                                disc_fake, disc_real):
                            for feat_f, feat_r in zip(feats_fake, feats_real):
                                adv_loss += hp.model.feat_match * torch.mean(
                                    torch.abs(feat_f - feat_r))

                    loss_g += hp.model.lambda_adv * adv_loss

                loss_g.backward()
                optim_g.step()

                # discriminator
                loss_d_avg = 0.0
                if step > hp.train.discriminator_train_start_steps:
                    fake_audio = model_g(melD)[:, :, :hp.audio.segment_length]
                    fake_audio = fake_audio.detach()
                    loss_d_sum = 0.0
                    for _ in range(hp.train.rep_discriminator):
                        optim_d.zero_grad()
                        disc_fake = model_d(fake_audio)
                        disc_real = model_d(audioD)
                        loss_d = 0.0
                        loss_d_real = 0.0
                        loss_d_fake = 0.0
                        for (_, score_fake), (_, score_real) in zip(
                                disc_fake, disc_real):
                            loss_d_real += criterion(
                                score_real, torch.ones_like(score_real))
                            loss_d_fake += criterion(
                                score_fake, torch.zeros_like(score_fake))
                        loss_d_real = loss_d_real / len(disc_real)  # len(disc_real) = 3
                        loss_d_fake = loss_d_fake / len(disc_fake)  # len(disc_fake) = 3
                        loss_d += loss_d_real + loss_d_fake  # MSD loss
                        loss_d_sum += loss_d

                        # MPD adversarial loss
                        out1, out2, out3, out4, out5 = model_d_mpd(fake_audio)
                        out1_real, out2_real, out3_real, out4_real, out5_real = model_d_mpd(
                            audioD)
                        loss_mpd_fake = criterion(out1, torch.zeros_like(out1)) + \
                            criterion(out2, torch.zeros_like(out2)) + \
                            criterion(out3, torch.zeros_like(out3)) + \
                            criterion(out4, torch.zeros_like(out4)) + \
                            criterion(out5, torch.zeros_like(out5))
                        loss_mpd_real = criterion(out1_real, torch.ones_like(out1_real)) + \
                            criterion(out2_real, torch.ones_like(out2_real)) + \
                            criterion(out3_real, torch.ones_like(out3_real)) + \
                            criterion(out4_real, torch.ones_like(out4_real)) + \
                            criterion(out5_real, torch.ones_like(out5_real))
                        loss_mpd = (loss_mpd_fake + loss_mpd_real) / 5  # MPD loss
                        loss_d += loss_mpd

                        loss_d.backward()
                        optim_d.step()
                        loss_d_sum += loss_mpd

                    loss_d_avg = loss_d_sum / hp.train.rep_discriminator
                    loss_d_avg = loss_d_avg.item()

                step += 1

                # logging
                loss_g = loss_g.item()
                avg_g_loss.append(loss_g)
                avg_d_loss.append(loss_d_avg)
                avg_adv_loss.append(adv_loss)

                if any([
                        loss_g > 1e8,
                        math.isnan(loss_g),
                        loss_d_avg > 1e8,
                        math.isnan(loss_d_avg)
                ]):
                    logger.error("loss_g %.01f loss_d_avg %.01f at step %d!" %
                                 (loss_g, loss_d_avg, step))
                    raise Exception("Loss exploded")

                if step % hp.log.summary_interval == 0:
                    writer.log_training(loss_g, loss_d_avg, adv_loss, loss_mel,
                                        step)
                    loader.set_description(
                        "Avg : g %.04f d %.04f ad %.04f| step %d" %
                        (sum(avg_g_loss) / len(avg_g_loss),
                         sum(avg_d_loss) / len(avg_d_loss),
                         sum(avg_adv_loss) / len(avg_adv_loss), step))

            if epoch % hp.log.save_interval == 0:
                save_path = os.path.join(pt_dir,
                                         '%s_%04d.pt' % (args.name, epoch))
                torch.save(
                    {
                        'model_g': model_g.state_dict(),
                        'model_d': model_d.state_dict(),
                        'model_d_mpd': model_d_mpd.state_dict(),
                        'optim_g': optim_g.state_dict(),
                        'optim_d': optim_d.state_dict(),
                        'step': step,
                        'epoch': epoch,
                        'hp_str': hp_str
                    }, save_path)
                logger.info("Saved checkpoint to: %s" % save_path)

    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
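# The checkpoints saved above bundle both models, both optimizers and the training
# state in a single dictionary. A minimal sketch of restoring just the generator for
# inference; the checkpoint path, the Generator import path and the mel-channel count
# are assumptions (hp.audio.n_mel_channels in the trainer, 80 being a typical value).
import torch

from model.generator import Generator  # hypothetical module path for the Generator used above

checkpoint = torch.load("chkpt/example_0100.pt", map_location="cpu")  # placeholder path
model_g = Generator(80)
model_g.load_state_dict(checkpoint["model_g"])
model_g.eval()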
def main_worker(local_rank, nprocs, args):
    args.local_rank = local_rank
    init_seeds(local_rank + 1)

    # Build the init_method endpoint (address and port) used for distributed init
    init_method = 'tcp://' + args.ip + ':' + args.port

    # 1. Distributed initialization: every process has to run this,
    #    which is why it lives inside main_worker.
    cudnn.benchmark = True
    dist.init_process_group(backend='nccl',
                            init_method=init_method,
                            world_size=args.nprocs,
                            rank=local_rank)

    # 2. Basic setup: model, loss function, optimizer
    model = resnet18()
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    criterion = nn.CrossEntropyLoss().cuda(local_rank)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=1e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[60, 120, 160],
                                                     gamma=0.2)

    # apex initialization
    model = apex.parallel.convert_syncbn_model(model).to(local_rank)  # use the SyncBatchNorm provided by apex
    model, optimizer = amp.initialize(model, optimizer)
    model = DDP(model)

    # 3. Load the data
    batch_size = int(args.batch_size / nprocs)
    train_dataset = get_train_dataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=4,
                                               pin_memory=True,
                                               sampler=train_sampler)

    test_dataset = get_test_dataset()
    test_sampler = torch.utils.data.distributed.DistributedSampler(test_dataset)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              num_workers=4,
                                              pin_memory=True,
                                              sampler=test_sampler)

    for epoch in range(args.epochs):
        start = time.time()
        model.train()
        train_sampler.set_epoch(epoch)
        train_scheduler.step(epoch)

        for step, (images, labels) in enumerate(train_loader):
            # move this process's share of the data onto its own GPU
            images = images.cuda(local_rank, non_blocking=True)
            labels = labels.cuda(local_rank, non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, args.nprocs)

            # update the model weights, wrapping the loss with scale_loss
            optimizer.zero_grad()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            if args.local_rank == 0:
                print(
                    'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
                        reduced_loss,
                        optimizer.param_groups[0]['lr'],
                        epoch=epoch + 1,
                        trained_samples=step * args.batch_size + len(images),
                        total_samples=len(train_loader.dataset)))

        finish = time.time()
        if args.local_rank == 0:
            print('epoch {} training time consumed: {:.2f}s'.format(
                epoch, finish - start))

        # validate after every epoch
        validate(test_loader, model, criterion, local_rank, args)
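# Both this worker and the native-DDP variant below call reduce_mean(loss, args.nprocs),
# which is not shown in the source. A typical implementation averages a tensor across
# all ranks with all_reduce; a minimal sketch under that assumption:
import torch
import torch.distributed as dist


def reduce_mean(tensor, nprocs):
    """Average a tensor over all processes so every rank logs the same value."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= nprocs
    return rt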
def main_worker(local_rank, nprocs, args):
    args.local_rank = local_rank
    init_seeds(local_rank + 1)  # set a different seed for each worker

    # Build the init_method endpoint (address and port) used for distributed init
    init_method = 'tcp://' + args.ip + ':' + args.port

    # 1. Distributed initialization: every process has to run this,
    #    which is why it lives inside main_worker.
    cudnn.benchmark = True
    dist.init_process_group(backend='nccl',
                            init_method=init_method,
                            world_size=args.nprocs,
                            rank=local_rank)

    # 2. Basic setup: model, loss function, optimizer
    model = resnet18()  # build the model, then pin it to this process's GPU via set_device(local_rank) / .cuda(local_rank)

    # The parts below need local_rank: the model
    # ================================
    torch.cuda.set_device(local_rank)  # use set_device and cuda to select the GPU for this process
    model.cuda(local_rank)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(local_rank)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[local_rank])  # wrap the model with DistributedDataParallel
    # ================================

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=1e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[60, 120, 160],
                                                     gamma=0.2)

    # 3. Load the data
    batch_size = int(args.batch_size / nprocs)  # split the global batch size into per-process mini-batches by hand
    train_dataset = get_train_dataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=4,
                                               pin_memory=True,
                                               sampler=train_sampler)

    test_dataset = get_test_dataset()
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              num_workers=4,
                                              pin_memory=True,
                                              sampler=test_sampler)

    for epoch in range(args.epochs):
        start = time.time()
        model.train()
        # set the sampler's epoch to the current epoch so the dataloader's shuffle stays effective
        train_sampler.set_epoch(epoch)
        # step train_scheduler to adjust the learning rate
        train_scheduler.step(epoch)

        for step, (images, labels) in enumerate(train_loader):
            # move this process's share of the data onto the GPU
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # torch.distributed.barrier() blocks each process until all processes have reached
            # this line, so the averaged loss/accuracy are not skewed by processes running at
            # different speeds
            torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, args.nprocs)

            # update the model weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if args.local_rank == 0:
                print(
                    'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'
                    .format(reduced_loss,
                            optimizer.param_groups[0]['lr'],
                            epoch=epoch + 1,
                            trained_samples=step * args.batch_size + len(images),
                            total_samples=len(train_loader.dataset)))

        finish = time.time()
        if args.local_rank == 0:
            print('epoch {} training time consumed: {:.2f}s'.format(
                epoch, finish - start))

        # validate after every epoch
        validate(test_loader, model, criterion, local_rank, args)
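# Both variants of main_worker take (local_rank, nprocs, args), which matches the
# calling convention of torch.multiprocessing.spawn. A minimal sketch of the launcher
# side; the argument names mirror the attributes used above, but the defaults and the
# argparse wiring are assumptions, not taken from the source.
import argparse

import torch
import torch.multiprocessing as mp


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ip', default='127.0.0.1')    # address used to build init_method
    parser.add_argument('--port', default='23456')      # port used to build init_method
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=256)  # global batch size, split across processes
    parser.add_argument('--epochs', type=int, default=200)
    args = parser.parse_args()

    args.nprocs = torch.cuda.device_count()
    # spawn one process per GPU; each process runs main_worker(local_rank, args.nprocs, args)
    mp.spawn(main_worker, nprocs=args.nprocs, args=(args.nprocs, args))


if __name__ == '__main__':
    main()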