Example #1

# Standard-library and PyTorch imports used by the examples below. The
# project-local helpers (DeepSpeakerUttDataset, DeepSpeakerUttDataLoader,
# DeepSpeakerSeqDataset, BucketingSampler, model_select, margin_select, load,
# cal_loss, reduce_loss_dict, evaluate, utils, RAdam, ...) are assumed to be
# imported from the surrounding repository.
import math
import os
import shutil
import time

import numpy as np
import torch
import torch.distributed
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel
from torch.optim import lr_scheduler
def train(opt, logging):
    
    ## Data Preparation ##
    if opt.main_proc:
        logging.info("Building dataset")
                                           
    train_dataset = DeepSpeakerUttDataset(opt, os.path.join(opt.dataroot, 'train'))
    if not opt.distributed:
        train_sampler = BucketingSampler(train_dataset, batch_size=opt.batch_size)
    else:
        train_sampler = DistributedBucketingSampler(train_dataset, batch_size=opt.batch_size,
                                                    num_replicas=opt.num_gpus, rank=opt.local_rank)
    train_loader = DeepSpeakerUttDataLoader(train_dataset, num_workers=opt.num_workers, batch_sampler=train_sampler)
             
    val_dataset = DeepSpeakerTestDataset(opt, os.path.join(opt.dataroot, 'test'))
    val_loader = DeepSpeakerTestDataLoader(val_dataset, batch_size=1, num_workers=opt.num_workers, shuffle=False, pin_memory=True)
    
    opt.in_size = train_dataset.in_size
    opt.out_size = train_dataset.class_nums  
    print('opt.in_size {} opt.out_size {}'.format(opt.in_size, opt.out_size))  
                                           
    if opt.main_proc:
        logging.info("Building dataset Sucessed")
    
    ##  Building Model ##
    if opt.main_proc:
        logging.info("Building Model")
    
    model = model_select(opt)
    margin = margin_select(opt)
    
    if opt.resume:
        model, opt.total_iters = load(model, opt.resume, 'state_dict')
        margin, opt.total_iters = load(margin, opt.resume, 'margin_state_dict')
    
    # Define the loss criterion and an optimizer over both the model and margin parameters
    criterion = torch.nn.CrossEntropyLoss().to(opt.device)
    if opt.optim_type == 'sgd':
        optimizer = optim.SGD([
            {'params': model.parameters(), 'weight_decay': 5e-4},
            {'params': margin.parameters(), 'weight_decay': 5e-4}
        ], lr=opt.lr, momentum=0.9, nesterov=True)
    elif opt.optim_type == 'adam':
        optimizer = optim.Adam([
            {'params': model.parameters(), 'weight_decay': 5e-4},
            {'params': margin.parameters(), 'weight_decay': 5e-4}
        ], lr=opt.lr, betas=(opt.beta1, 0.999))
    elif opt.optim_type == 'radam':
        optimizer = RAdam([
            {'params': model.parameters(), 'weight_decay': 5e-4},
            {'params': margin.parameters(), 'weight_decay': 5e-4}
        ], lr=opt.lr)
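    # The same weight decay (5e-4) is applied to both parameter groups; the
    # margin head (e.g. an ArcFace-style classifier chosen by margin_select)
    # is optimized jointly with the backbone.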
        
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 40], gamma=0.1)
        
    model.to(opt.device)
    margin.to(opt.device)
    
    if opt.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[opt.local_rank],
                                                          output_device=opt.local_rank)
        margin = torch.nn.parallel.DistributedDataParallel(margin, device_ids=[opt.local_rank],
                                                           output_device=opt.local_rank)
    if opt.main_proc:
        print(model)
        print(margin)
        logging.info("Building Model Sucessed") 
        
    best_perform_eer = 1.0
    
    losses = utils.AverageMeter()
    acc = utils.AverageMeter()

    # Initial performance
    if opt.main_proc:
        EER = evaluate(opt, model, val_loader, logging)
        best_perform_eer = EER
        print('>>Start performance: EER = {}<<'.format(best_perform_eer))
    
    total_iters = opt.total_iters
    for epoch in range(1, opt.total_epoch + 1):
        train_sampler.shuffle(epoch)
        scheduler.step()
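        # Epoch-level LR step: with the MultiStepLR above, the learning rate
        # drops 10x after epochs 10, 20 and 40 (pre-1.1 PyTorch convention of
        # stepping the scheduler at the start of the epoch).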
        # train model
        if opt.main_proc:
            logging.info('Train Epoch: {}/{} ...'.format(epoch, opt.total_epoch))
        model.train()
        margin.train()

        since = time.time()
        for i, data in enumerate(train_loader):
            utt_ids, inputs, targets = data
            inputs, label = inputs.to(opt.device), targets.to(opt.device)
            optimizer.zero_grad()
            
            raw_logits, attn, w, b = model(inputs)
            output = margin(raw_logits, label)
            #loss = criterion(output, label)
            loss = cal_loss(output, label, criterion, smoothing=opt.smoothing)
            loss_dict_reduced = reduce_loss_dict(opt, {'loss': loss})
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()
            
            # Check the loss and skip the batch on an invalid (infinite) loss
            inf = float("inf")
            if loss_value == inf or loss_value == -inf:
                print("WARNING: received an inf loss, skipping this batch")
                continue
                    
            loss.backward()
            if utils.check_grad(model.parameters(), opt.clip_grad, opt.ignore_grad):
                if opt.main_proc:
                    logging.info('Not a finite gradient or too big, ignoring')
                optimizer.zero_grad()
                continue
            optimizer.step()

            total_iters += opt.num_gpus
            losses.update(loss_value)
            
            # print train information
            if total_iters % opt.print_freq == 0 and opt.main_proc:
                # current training accuracy
                _, predict = torch.max(output.data, 1)
                total = label.size(0)
                correct = (np.array(predict.cpu()) == np.array(label.data.cpu())).sum()
                time_cur = (time.time() - since) / opt.print_freq  # average seconds per iteration over the print interval
                since = time.time()
                logging.info("Iters: {:0>6d}/[{:0>2d}], loss: {:.4f} ({:.4f}), train_accuracy: {:.4f}, time: {:.2f} s/iter, learning rate: {}".format(total_iters, epoch, loss_value, losses.avg, correct/total, time_cur, scheduler.get_lr()[0]))
              
            # save model
            if total_iters % opt.save_freq == 0 and opt.main_proc:
                logging.info('Saving checkpoint: {}'.format(total_iters))
                if opt.distributed:
                    model_state_dict = model.module.state_dict()
                    margin_state_dict = margin.module.state_dict()
                else:
                    model_state_dict = model.state_dict()
                    margin_state_dict = margin.state_dict()
                state = {'state_dict': model_state_dict, 'margin_state_dict': margin_state_dict, 'total_iters': total_iters,}
                filename = 'newest_model.pth'
                if os.path.isfile(os.path.join(opt.model_dir, filename)):
                    shutil.copy(os.path.join(opt.model_dir, filename), os.path.join(opt.model_dir, 'newest_model.pth_bak'))
                utils.save_checkpoint(state, opt.model_dir, filename=filename)
                    
            # Validate the trained model
            if total_iters % opt.validate_freq == 0:
                EER = evaluate(opt, model, val_loader, logging)
                ##scheduler.step(EER)
                
                if opt.main_proc and EER < best_perform_eer:
                    best_perform_eer = EER
                    logging.info("Found better validated model (EER = %.3f), saving to model_best.pth" % (best_perform_eer))
                    if opt.distributed:
                        model_state_dict = model.module.state_dict()
                        margin_state_dict = margin.module.state_dict()
                    else:
                        model_state_dict = model.state_dict()
                        margin_state_dict = margin.state_dict()
                    state = {'state_dict': model_state_dict, 'margin_state_dict': margin_state_dict, 'total_iters': total_iters,}  
                    filename = 'model_best.pth'
                    if os.path.isfile(os.path.join(opt.model_dir, filename)):
                        shutil.copy(os.path.join(opt.model_dir, filename), os.path.join(opt.model_dir, 'model_best.pth_bak'))                   
                    utils.save_checkpoint(state, opt.model_dir, filename=filename)

                model.train()
                margin.train()
                losses.reset()
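

# ---------------------------------------------------------------------------
# Hedged sketch: `cal_loss` is a project-local helper that is not shown here.
# Judging from the call site `cal_loss(output, label, criterion,
# smoothing=opt.smoothing)`, it is presumably cross entropy with optional
# label smoothing. A minimal version consistent with that signature might
# look like the following; the name and exact smoothing scheme are
# assumptions, not the repository's actual code.
# ---------------------------------------------------------------------------
def cal_loss_sketch(pred, gold, criterion, smoothing=0.0):
    if smoothing > 0.0:
        n_class = pred.size(1)
        # Smoothed target: (1 - eps) on the true class, eps spread uniformly
        # over the remaining classes.
        one_hot = torch.full_like(pred, smoothing / (n_class - 1))
        one_hot.scatter_(1, gold.unsqueeze(1), 1.0 - smoothing)
        log_prob = torch.nn.functional.log_softmax(pred, dim=1)
        return -(one_hot * log_prob).sum(dim=1).mean()
    # No smoothing: fall back to the plain criterion (CrossEntropyLoss above).
    return criterion(pred, gold)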
                   
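# ---------------------------------------------------------------------------
# Hedged sketch: `utils.check_grad(parameters, clip_grad, ignore_grad)` is
# assumed to clip the global gradient norm to `clip_grad` and to return True
# when the pre-clip norm was non-finite or exceeded `ignore_grad`, which is
# when the loops above zero the gradients and skip the step. The repository's
# real helper may differ.
# ---------------------------------------------------------------------------
def check_grad_sketch(parameters, clip_grad, ignore_grad):
    params = [p for p in parameters if p.grad is not None]
    # clip_grad_norm_ clips in place and returns the total norm before clipping.
    total_norm = float(torch.nn.utils.clip_grad_norm_(params, clip_grad))
    return not math.isfinite(total_norm) or total_norm > ignore_grad
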
Example #2
def train(opt, logging):
    ## Data Preparation ##
    if opt.main_proc:
        logging.info("Building dataset")

    train_dataset = DeepSpeakerSeqDataset(opt,
                                          os.path.join(opt.dataroot, 'dev'))
    train_loader = DeepSpeakerSeqDataLoader(train_dataset,
                                            batch_size=1,
                                            num_workers=opt.num_workers,
                                            shuffle=True,
                                            pin_memory=True)

    val_dataset = DeepSpeakerTestDataset(opt,
                                         os.path.join(opt.dataroot, 'test'))
    val_loader = DeepSpeakerTestDataLoader(val_dataset,
                                           batch_size=1,
                                           num_workers=opt.num_workers,
                                           shuffle=False,
                                           pin_memory=True)

    opt.in_size = train_dataset.in_size
    opt.out_size = train_dataset.class_nums
    print('opt.in_size {} opt.out_size {}'.format(opt.in_size, opt.out_size))

    if opt.main_proc:
        logging.info("Building dataset Sucessed")

    ##  Building Model ##
    if opt.main_proc:
        logging.info("Building Model")

    model = model_select(opt, seq_training=True)

    if opt.resume:
        model, opt.total_iters = load(model, opt.resume, 'state_dict')

    # Define the optimizer over the model parameters
    if opt.optim_type == 'sgd':
        optimizer = optim.SGD([
            {
                'params': model.parameters(),
                'weight_decay': 5e-4
            },
        ],
                              lr=opt.lr,
                              momentum=0.9,
                              nesterov=True)
    elif opt.optim_type == 'adam':
        optimizer = optim.Adam([
            {
                'params': model.parameters(),
                'weight_decay': 5e-4
            },
        ],
                               lr=opt.lr,
                               betas=(opt.beta1, 0.999))

    scheduler = lr_scheduler.StepLR(optimizer=optimizer,
                                    step_size=opt.lr_reduce_step,
                                    gamma=opt.lr_reduce_factor,
                                    last_epoch=-1)

    model.to(opt.device)

    if opt.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[opt.local_rank], output_device=opt.local_rank)
    if opt.main_proc:
        print(model)
        logging.info("Building Model Sucessed")

    best_perform_eer = 1.0  # equal error rate: lower is better

    losses = utils.AverageMeter()
    embedding_losses = utils.AverageMeter()
    embedding_segment_losses = utils.AverageMeter()
    penalty_losses = utils.AverageMeter()

    # Initial performance
    if opt.main_proc:
        EER = evaluate(opt, model, val_loader, logging)
        best_perform_eer = EER
        print('>>Start performance: EER = {}<<'.format(best_perform_eer))

    save_model = model
    if isinstance(model, DistributedDataParallel):
        save_model = model.module
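    # DistributedDataParallel wraps the network, so the custom helpers used
    # below (similarity, loss_cal, penalty_loss_cal, ...) must be reached
    # through model.module.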

    # Start Training
    total_iters = opt.total_iters
    for epoch in range(1, opt.total_epoch + 1):
        while True:
            model.train()
            for i, data in enumerate(train_loader):
                # Guard in case the sampler yields more batches than len(train_loader).
                if i == len(train_loader):
                    break

                optimizer.zero_grad()

                # Forward pass and loss computation
                feature_input, seq_len, spk_ids = data
                feature_input = feature_input.to(opt.device)
                seq_len = seq_len.squeeze(0).to(opt.device)
                out, out_segment, attn, w, b = model(feature_input, seq_len)

                sim_matrix_out = save_model.similarity(out, w, b)
                embedding_loss = opt.embedding_loss_lamda * save_model.loss_cal(
                    sim_matrix_out)

                if opt.segment_type == 'average':
                    sim_matrix_out_seg = save_model.similarity(
                        out_segment, w, b)
                    embedding_loss_segment = opt.segment_loss_lamda * save_model.loss_cal(
                        sim_matrix_out_seg)
                elif opt.segment_type == 'all':
                    sim_matrix_out_seg = save_model.similarity_segment(
                        out_segment, seq_len, w, b)
                    embedding_loss_segment = opt.segment_loss_lamda * save_model.loss_cal_segment(
                        sim_matrix_out_seg, seq_len)
                else:
                    sim_matrix_out_seg = None
                    embedding_loss_segment = 0

                if opt.att_type == 'multi_attention' and attn is not None:
                    penalty_loss = opt.penalty_loss_lamda * save_model.penalty_loss_cal(
                        attn)
                else:
                    penalty_loss = 0

                loss_dict_reduced = reduce_loss_dict(
                    opt, {
                        'embedding_loss': embedding_loss,
                        'penalty_loss': penalty_loss,
                        'embedding_loss_segment': embedding_loss_segment
                    })
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
                loss_value = losses_reduced.item()
                embedding_loss_value = loss_dict_reduced[
                    'embedding_loss'].item()
                penalty_loss_value = loss_dict_reduced['penalty_loss'].item()
                embedding_loss_segment_value = loss_dict_reduced[
                    'embedding_loss_segment'].item()
                loss = embedding_loss + penalty_loss  # note: the segment loss is logged above but not added to the training loss here

                # Check the loss and skip the batch on an invalid (infinite) loss
                inf = float("inf")
                if loss_value == inf or loss_value == -inf:
                    print("WARNING: received an inf loss, skipping this batch")
                    continue

                # Backward pass; skip the step if the gradients are not finite or too large
                loss.backward()
                if utils.check_grad(model.parameters(), opt.clip_grad,
                                    opt.ignore_grad):
                    if opt.main_proc:
                        logging.info(
                            'Not a finite gradient or too big, ignoring')
                    optimizer.zero_grad()
                    continue
                optimizer.step()

                total_iters += opt.num_gpus

                # Update the loss for logging
                losses.update(loss_value)
                embedding_losses.update(embedding_loss_value)
                penalty_losses.update(penalty_loss_value)
                embedding_segment_losses.update(embedding_loss_segment_value)

                # Log the performance on the training set and checkpoint the newest model
                if total_iters % opt.print_freq == 0:
                    scheduler.step(total_iters)
                    if opt.main_proc:
                        lr = scheduler.get_lr()
                        if isinstance(lr, list):
                            lr = max(lr)
                        logging.info(
                            '==> Train set steps {} lr: {:.6f}, loss: {:.4f} [ embedding: {:.4f}, embedding_segment: {:.4f}, penalty_loss {:.4f}]'
                            .format(total_iters, lr, losses.avg,
                                    embedding_losses.avg,
                                    embedding_segment_losses.avg,
                                    penalty_losses.avg))

                        save_model = model
                        if isinstance(model, DistributedDataParallel):
                            save_model = model.module
                        state = {
                            'state_dict': save_model.state_dict(),
                            'total_iters': total_iters
                        }
                        filename = 'newest_model.pth'
                        if os.path.isfile(os.path.join(opt.model_dir,
                                                       filename)):
                            shutil.copy(
                                os.path.join(opt.model_dir, filename),
                                os.path.join(opt.model_dir,
                                             'newest_model.pth_bak'))
                        utils.save_checkpoint(state,
                                              opt.model_dir,
                                              filename=filename)

                # Validate the trained model
                if total_iters % opt.validate_freq == 0:
                    EER = evaluate(opt, model, val_loader, logging)
                    ##scheduler.step(EER)

                    if opt.main_proc and EER < best_perform_eer:
                        best_perform_eer = EER
                        logging.info(
                            "Found better validated model (EER = %.3f), saving to model_best.pth"
                            % (best_perform_eer))
                        save_model = model
                        if isinstance(model, DistributedDataParallel):
                            save_model = model.module
                        state = {
                            'state_dict': save_model.state_dict(),
                            'total_iters': total_iters
                        }
                        filename = 'model_best.pth'
                        if os.path.isfile(os.path.join(opt.model_dir,
                                                       filename)):
                            shutil.copy(
                                os.path.join(opt.model_dir, filename),
                                os.path.join(opt.model_dir,
                                             'model_best.pth_bak'))
                        utils.save_checkpoint(state,
                                              opt.model_dir,
                                              filename=filename)

                    model.train()
                    losses.reset()
                    embedding_losses.reset()
                    penalty_losses.reset()
                    embedding_segment_losses.reset()

                if total_iters > opt.max_iters:
                    if opt.main_proc:
                        logging.info('Finished training at step {}'.format(total_iters))
                    return model
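

# ---------------------------------------------------------------------------
# Hedged sketch: `reduce_loss_dict(opt, loss_dict)` (used in both examples) is
# assumed to average each loss across the distributed workers so the logged
# values are global rather than per-rank; plain numbers (e.g. the literal 0
# used for a disabled penalty term) are promoted to tensors so that `.item()`
# works on the result. The repository's real helper may differ.
# ---------------------------------------------------------------------------
def reduce_loss_dict_sketch(opt, loss_dict):
    reduced = {}
    for name, loss in loss_dict.items():
        if not torch.is_tensor(loss):
            loss = torch.tensor(float(loss), device=opt.device)
        loss = loss.detach().clone()
        if opt.distributed:
            # Sum over all ranks, then divide by the world size.
            torch.distributed.all_reduce(loss)
            loss = loss / opt.num_gpus
        reduced[name] = loss
    return reduced


# Hedged sketch: `utils.AverageMeter` is assumed to be the usual running
# average tracker (as in the PyTorch ImageNet example).
class AverageMeterSketch(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count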