Example #1
def main_worker(ngpus_per_node, args):
    cprint('=> modeling the network ...', 'green')
    model = builder_inf(args)
    model = torch.nn.DataParallel(model).cuda()

    cprint('=> building the dataloader ...', 'green')
    trans = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0., 0., 0.], std=[1., 1., 1.]),
    ])
    inf_dataset = ImgInfLoader(ann_file=args.inf_list, transform=trans)

    inf_loader = torch.utils.data.DataLoader(inf_dataset,
                                             batch_size=args.batch_size,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             shuffle=False)

    cprint('=> starting inference engine ...', 'green')
    cprint('=> embedding features will be saved into {}'.format(
        args.feat_list))

    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')

    progress = utils.ProgressMeter(len(inf_loader), [batch_time, data_time],
                                   prefix="Extract Features: ")

    # switch to evaluate mode
    model.eval()

    fio = open(args.feat_list, 'w')
    with torch.no_grad():
        end = time.time()

        for i, (input, img_paths) in enumerate(inf_loader):
            # measure data loading time
            data_time.update(time.time() - end)

            # compute output
            embedding_feat = model(input[0])

            # embedding_feat = F.normalize(embedding_feat, p=2, dim=1)
            _feat = embedding_feat.data.cpu().numpy()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

            # write feat into files
            for feat, path in zip(_feat, img_paths):
                fio.write('{} '.format(path))
                for e in feat:
                    fio.write('{} '.format(e))
                fio.write('\n')
    # close
    fio.close()
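All of the examples on this page rely on AverageMeter and ProgressMeter helpers from an unshown utils (or util) module. The sketch below is modeled on the PyTorch ImageNet reference example and is only an assumption about what those helpers look like; note that some snippets pass the meters as a list and call display(i), while others pass them positionally and call print(i) — the sketch follows the list/display convention and aliases print to display.

class AverageMeter:
    """Tracks the latest value, running sum, count, and average of a metric."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter:
    """Prints one prefixed, batch-indexed line assembled from a list of meters."""

    def __init__(self, num_batches, meters, prefix=""):
        num_fmt = '{:' + str(len(str(num_batches))) + 'd}'
        self.batch_fmtstr = '[' + num_fmt + '/' + num_fmt.format(num_batches) + ']'
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    # some repositories name this method print() instead of display()
    print = display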
Example #2
def validate_epoch(val_loader, model, criterion, use_cuda=True):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    top5 = utils.AverageMeter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(len(val_loader),
                                   batch_time,
                                   top1,
                                   top5,
                                   losses,
                                   prefix='Val: ')

    # switch to evaluate mode
    all_preds = []
    all_labels = []
    model.eval()
    print_freq = len(val_loader) // 4 + 1
    with torch.no_grad():
        end = time.time()
        for i, (_, inputs, labels) in enumerate(val_loader):
            if use_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            # compute output
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # measure accuracy and record loss
            acc1, acc5 = utils.accuracy(outputs, labels, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))

            # for confusion matrix calculation
            _, preds = outputs.topk(1, 1, True, True)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0 or i + 1 == len(val_loader):
                progress.print(i + 1)

        print(confusion_matrix(all_labels, all_preds))
        return top1.avg, top5.avg
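Example #2 and several later snippets also call utils.accuracy(outputs, labels, topk=(1, 5)) and sklearn's confusion_matrix. The accuracy helper shown below is the standard top-k implementation from the PyTorch ImageNet reference; the actual utils module behind these examples may differ, so treat it as a sketch. Each returned entry is a one-element tensor, which is why the callers index acc1[0].

import torch


def accuracy(output, target, topk=(1,)):
    """Computes the top-k accuracy (in percent) of a batch of logits."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # indices of the k highest-scoring classes, transposed to (maxk, batch)
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res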
Example #3
def run_train_epoch(model, optimizer, criterion, train_dataloader, epoch,
                    args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    #data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(train_dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    # train_dataloader is an iterator; each iteration pulls one minibatch at a time
    for i, data in enumerate(train_dataloader, 0):
        feat = data["x"]
        label = data["y"]

        x = feat.to(th.float32)
        y = label.unsqueeze(2).long()

        if th.cuda.is_available():
            x = x.cuda()
            y = y.cuda()

        prediction = model(x)
        loss = criterion(prediction.view(-1, prediction.shape[2]), y.view(-1))

        optimizer.zero_grad()
        loss.backward()

        # Gradient Clipping
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # update loss
        losses.update(loss.item(), x.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)

        if i % args.print_freq == 0:
            #        if not args.hvd or hvd.rank() == 0:
            progress.print(i)
Example #4
def run_train_epoch(model, optimizer, criterion, train_dataloader, epoch,
                    args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(train_dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    # train_dataloader is an iterator; each iteration pulls one minibatch at a time
    for i, data in enumerate(train_dataloader, 0):
        feat = data["x"]
        label = data["y"]
        num_frs = data["num_frs"]
        utt_ids = data["utt_ids"]

        x = feat.to(th.float32)
        y = label.squeeze(2).long()

        if th.cuda.is_available():
            x = x.cuda()
            y = y.cuda()

        x = x.transpose(0, 1)
        key_padding_mask = th.ones((x.size(1), x.size(0)))

        for utt in range(len(num_frs)):
            key_padding_mask[utt, :num_frs[utt]] = 0

        src_mask = None
        if (args.look_ahead > -1):
            src_mask = th.tril(th.ones(x.size(0), x.size(0)),
                               diagonal=args.look_ahead)
            src_mask = src_mask.float().masked_fill(src_mask == 0,
                                                    float('-inf')).masked_fill(
                                                        src_mask == 1,
                                                        float(0.0))
            src_mask = src_mask.cuda()

        key_padding_mask = key_padding_mask.bool().cuda()
        prediction = model(x, src_mask, key_padding_mask)
        prediction = prediction.transpose(0, 1).contiguous()
        loss = criterion(prediction.view(-1, prediction.size(2)), y.view(-1))

        optimizer.zero_grad()
        loss.backward()

        # Gradient Clipping
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        #update lr
        step = len(train_dataloader) * epoch + i + 1
        lr = utils.noam_decay(step, args.warmup_step, args.lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()

        grad_norm.update(norm)

        # update loss
        losses.update(loss.item(), x.size(1))

        # measure elapsed time
        batch_time.update(time.time() - end)

        if i % args.print_freq == 0:
            #        if not args.hvd or hvd.rank() == 0:
            progress.print(i)
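Examples #4 and #11 set the learning rate on every step with utils.noam_decay(step, warmup, lr). That helper is not shown on this page; the sketch below is one common form of the Noam schedule (linear warm-up followed by inverse-square-root decay, reaching the base lr at the end of warm-up) and is only an assumption about what the real helper computes.

def noam_decay(step, warmup_steps, base_lr):
    """Noam schedule: lr ramps linearly up to base_lr over warmup_steps,
    then decays proportionally to 1 / sqrt(step)."""
    step = max(step, 1)
    return base_lr * (warmup_steps ** 0.5) * min(step ** -0.5,
                                                 step * warmup_steps ** -1.5)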
Example #5
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch,
                    asr_decoder, trans_model, silence_ids, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')

    if args.criterion == "mmi":
        se_criterion = ops.MMIFunction.apply
    else:
        se_criterion = ops.sMBRFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader, 0):
        feat = batch["x"]
        label = batch["y"]  #pdf-ids for ce loss
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  #trans_ids for se loss

        x = feat.to(th.float32)
        y = label.long()
        x = x.cuda()
        y = y.cuda()

        prediction = model(x)
        ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]),
                               y.view(-1))

        se_loss = 0.0
        for j in range(len(num_frs)):
            log_like_j = prediction[j, :, :]
            log_like_j = log_like_j[:num_frs[j], :]
            log_like_j = log_like_j - log_prior
            #trans_id = label[j, :num_frs[j], 0].tolist()
            trans_id = th.from_numpy(aux[j][0][0].astype(int)).tolist()
            #    print(len(trans_id), num_frs[j])

            if args.criterion == "mmi":
                se_loss += se_criterion(log_like_j, asr_decoder, trans_model,
                                        trans_id)
            else:
                se_loss += se_criterion(log_like_j, asr_decoder, trans_model,
                                        trans_id, args.criterion, silence_ids)

        loss = se_loss.cuda() + args.ce_ratio * ce_loss
        optimizer.zero_grad()
        loss.backward()

        # Gradient Clipping (th 5.0)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # update loss
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure elapsed time
        batch_time.update(time.time() - end)

        # save model
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            output_file = args.exp_dir + '/model.se.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
Example #6
def train_model(trainloader, testloader, net, device):
    if torch.cuda.device_count() > 1:
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        print("Activate multi GPU support.")
        net = nn.DataParallel(net)
    net.to(device)
    # define the loss function
    criterion = (nn.CrossEntropyLoss().cuda()
                 if torch.cuda.is_available() else nn.CrossEntropyLoss())
    # Scale the lr linearly with the batch size.
    # Should be 0.1 when batch_size=128
    initial_lr = 0.1 * batch_size / 128
    # initialize the optimizer
    optimizer = optim.SGD(net.parameters(),
                          lr=initial_lr,
                          momentum=0.9,
                          weight_decay=_WEIGHT_DECAY)
    # decay the lr by 0.1 at 50% and 75% of the total number of epochs
    div = num_epoch // 4
    lr_decay_milestones = [div * 2, div * 3]
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=lr_decay_milestones,
                                               gamma=0.1,
                                               last_epoch=_LAST_EPOCH)

    for epoch in range(num_epoch):  # loop over the dataset multiple times

        # set printing functions
        batch_time = util.AverageMeter('Time/batch', ':.3f')
        losses = util.AverageMeter('Loss', ':6.2f')
        top1 = util.AverageMeter('Acc', ':6.2f')
        progress = util.ProgressMeter(len(trainloader),
                                      [losses, top1, batch_time],
                                      prefix="Epoch: [{}]".format(epoch + 1))

        # switch the model to the training mode
        net.train()

        print('current learning rate = {}'.format(
            optimizer.param_groups[0]['lr']))

        # each epoch
        end = time.time()
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            for name, param in net.named_parameters():
                if 'threshold' in name:
                    loss += args.sigma * torch.norm(param - args.gtarget)
            loss.backward()
            optimizer.step()

            # measure accuracy and record loss
            _, batch_predicted = torch.max(outputs.data, 1)
            batch_accu = 100.0 * (batch_predicted
                                  == labels).sum().item() / labels.size(0)
            losses.update(loss.item(), labels.size(0))
            top1.update(batch_accu, labels.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 50 == 49:
                # print statistics every 50 mini-batches within each epoch
                progress.display(i)  # i = batch id in the epoch

        # update the learning rate
        scheduler.step()

        # print test accuracy every few epochs
        if epoch % 10 == 9:
            print('epoch {}'.format(epoch + 1))
            test_accu(testloader, net, device)

    # save the model if required
    if args.save:
        print("Saving the trained model.")
        util.save_models(net.state_dict(), save_folder, suffix=_ARCH)

    print('Finished Training')
Example #7
def do_train(train_loader, model, criterion, optimizer, grad_scaler, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')

    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [batch_time, data_time, throughputs, 'images/s',
                         losses, losses_id, losses_mag, 
                         top1, learning_rate]

    progress = utils.ProgressMeter(
        len(train_loader),
        progress_template,
        prefix="Epoch: [{}]".format(epoch))
    end = time.time()

    # update lr
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with autocast(enabled=args.amp_mode):
            output, x_norm = model(input, target)
        
        # x_norm does not need to be gathered, since the feature x stays on each rank
        target = ts.distributed.gather(target, dim=0)

        # loss
        with autocast(enabled=args.amp_mode):
            loss_id, loss_g, one_hot = criterion(output, 
                                                 target,
                                                 x_norm)
        loss = loss_id + args.lambda_g * loss_g
        # compute gradient and do solver step
        optimizer.zero_grad()

        # backward
        grad_scaler.scale(loss).backward()
        # update weights
        grad_scaler.step(optimizer)
        grad_scaler.update() 

        # syn for logging
        torch.cuda.synchronize()   

        # measure elapsed time
        if args.rank == 0:
            duration = time.time() - end
            end = time.time()
            batch_time.update(duration)
            bs = args.batch_size
            throughputs.update(args.world_size * bs / duration)

        # measure accuracy and record loss
        output = ts.distributed.gather(output[0], dim=-1)
        acc1, acc5 = accuracy(output, target, topk=(1, 5))

        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))

        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g*loss_g.item(), input.size(0))

        if i % args.print_freq == 0 and args.rank == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a,
                       args.l_margin, args.u_margin)
Example #8
def do_train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    top5 = utils.AverageMeter('Acc@5', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')

    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [
        batch_time, data_time, throughputs, 'images/s', losses, losses_id,
        losses_mag, top1, top5, learning_rate
    ]

    progress = utils.ProgressMeter(len(train_loader),
                                   progress_template,
                                   prefix="Epoch: [{}]".format(epoch))
    end = time.time()

    # update lr
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        output, x_norm = model(input, target)

        loss_id, loss_g, one_hot = criterion(output, target, x_norm)
        loss = loss_id + args.lambda_g * loss_g

        # measure accuracy and record loss
        acc1, acc5 = utils.accuracy(args, output[0], target, topk=(1, 5))

        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))

        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g * loss_g.item(), input.size(0))

        # compute gradient and do solver step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        duration = time.time() - end
        batch_time.update(duration)
        end = time.time()
        throughputs.update(args.batch_size / duration)

        if i % args.print_freq == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a, args.l_margin,
                       args.u_margin)

        if args.vis_mag:
            if (i > 10000) and (i % 100 == 0):
                x_norm = x_norm.detach().cpu().numpy()
                cos_theta = torch.masked_select(
                    output[0], one_hot.bool()).detach().cpu().numpy()
                logit = torch.masked_select(F.softmax(
                    output[0]), one_hot.bool()).detach().cpu().numpy()
                np.savez(
                    '{}/vis/epoch_{}_iter{}'.format(args.pth_save_fold, epoch,
                                                    i), x_norm, logit,
                    cos_theta)
Example #9
def do_train(train_loader, model, criterion, optimizer, grad_scaler, epoch,
             args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')

    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [
        batch_time, data_time, throughputs, 'images/s', losses, losses_id,
        losses_mag, top1, learning_rate
    ]

    progress = utils.ProgressMeter(len(train_loader),
                                   progress_template,
                                   prefix="Epoch: [{}]".format(epoch))
    end = time.time()

    # update lr
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with autocast(enabled=args.amp_mode):
            output, x_norm = model(input, target)

        # x_norm does not need to be gathered, since the feature x stays on each rank
        target = mpu._gather(target, dim=0)

        # loss
        with autocast(enabled=args.amp_mode):
            loss_id, loss_g, one_hot = criterion(output, target, x_norm)
        loss = loss_id + args.lambda_g * loss_g * args.world_size
        # compute gradient and do solver step
        optimizer.zero_grad()

        # backward
        grad_scaler.scale(loss).backward()
        # update weights
        grad_scaler.step(optimizer)
        grad_scaler.update()

        # syn for logging
        torch.cuda.synchronize()

        # measure elapsed time
        if args.rank == 0:
            duration = time.time() - end
            end = time.time()
            batch_time.update(duration)
            bs = args.batch_size
            throughputs.update(args.world_size * bs / duration)

        # measure accuracy and record loss
        acc1, _ = mpu.accuracy(args, output, target, topk=(1, 1))

        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))

        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g * loss_g.item(), input.size(0))

        if i % args.print_freq == 0 and args.rank == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a, args.l_margin,
                       args.u_margin)

        if args.vis_mag:
            if (epoch == args.epochs - 1) and (i % 1000 == 0):
                one_hot = one_hot.bool()
                mask = torch.sum(one_hot, dim=1).bool()
                x_norm_cur_rank = torch.masked_select(
                    x_norm.squeeze(), mask).detach().cpu().numpy()
                cos_theta_cur_rank = torch.masked_select(
                    output[0], one_hot).detach().cpu().numpy()
                np.savez(
                    '{}/vis/epoch_{}_iter{}_rank_{}'.format(
                        args.pth_save_fold, epoch, i, args.rank),
                    x_norm_cur_rank, cos_theta_cur_rank)
Example #10
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch,
                    asr_decoder, trans_model, silence_ids, aligner, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')
    if args.criterion == "mmi":
        criterion = ops.MMIFunction.apply
    else:
        criterion = ops.sMBRFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader):
        feat = batch["x"]
        label = batch["y"]
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  #word labels for se loss

        x = feat.to(th.float32)
        y = label.long()
        x = x.cuda()
        y = y.cuda()

        prediction = model(x)
        ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]),
                               y.view(-1))
        loss = args.ce_ratio * ce_loss

        for j in range(len(num_frs)):
            loglike = prediction[j, :, :]
            loglike_j = loglike[:num_frs[j], :]
            loglike_j = loglike_j - log_prior

            text = th.from_numpy(aux[j][0][0].astype(int)).tolist()
            #text = ' '.join(str(k) for k in text)
            try:
                align_in = kaldi_matrix.Matrix(
                    loglike_j.detach().cpu().numpy())
                align_out = aligner.align(align_in, text)
                trans_ids = align_out["alignment"]

                if args.criterion == "mmi":
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids)
                else:
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids, args.criterion, silence_ids)
                loss += se_loss.cuda()
            except Exception:
                print(
                    "Warning: failed to align utterance {}, skip the utterance for SE loss"
                    .format(utt_ids[j]))

        optimizer.zero_grad()
        loss.backward()

        # Gradient Clipping (th 5.0)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # update loss
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure elapsed time
        batch_time.update(time.time() - end)

        # save model
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            output_file = args.exp_dir + '/model.se.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
Example #11
def run_train_epoch(model, optimizer, dataloader, epoch, trans_model, tree,
                    supervision_opts, aligner, den, chain_opts, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    criterion = ops.ChainObjtiveFunction.apply
    end = time.time()
    for i, batch in enumerate(dataloader):
        feat = batch["x"]
        label = batch["y"]
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  #word labels for se loss

        frame_shift = (epoch % supervision_opts.frame_subsampling_factor) * -1

        x = feat.to(th.float32)
        x = th.roll(x, frame_shift, 1)
        x = x.unfold(1, 1,
                     supervision_opts.frame_subsampling_factor).squeeze(-1)
        x = x.cuda()
        y = label.squeeze(2)

        loss = 0.0
        prediction = model(x)
        for j in range(len(num_frs)):
            trans_ids = y[j, :num_frs[j]].tolist()
            phone_ali = aligner.to_phone_alignment(trans_ids)

            phones = list()
            durations = list()
            for item in phone_ali:
                phones.append(item[0])
                durations.append(item[2])

            proto_supervision = kaldi_chain.alignment_to_proto_supervision(
                supervision_opts, phones, durations)
            supervision = kaldi_chain.proto_supervision_to_supervision(
                tree, trans_model, proto_supervision, True)

            loglike_j = prediction[j, :supervision.frames_per_sequence, :]
            loss += criterion(loglike_j, den, supervision, chain_opts)

        optimizer.zero_grad()
        loss.backward()

        #update lr
        step = len(dataloader) * epoch + i + 1
        lr = utils.noam_decay(step, args.warmup_steps, args.lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # Gradient Clipping (th 5.0)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # update the loss
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure the elapsed time
        batch_time.update(time.time() - end)

        # save model
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            output_file = args.exp_dir + '/chain.model.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
Example #12
def train_epoch(epoch,
                train_loader,
                model,
                criterion,
                optimizer,
                use_cuda=True):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    top5 = utils.AverageMeter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(len(train_loader),
                                   batch_time,
                                   data_time,
                                   top1,
                                   top5,
                                   losses,
                                   prefix="Epoch: [{}]".format(epoch + 1))

    print_freq = len(train_loader) // 4 + 1
    all_preds = []
    all_labels = []
    model.train()
    end = time.time()
    for i, (paths, inputs, labels) in enumerate(train_loader):

        if use_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        data_time.update(time.time() - end)

        # forward + backward + optimize
        if type(model).__name__ == 'Inception3' and model.aux_logits:
            outputs, aux_outputs = model(inputs)
            loss_aux = criterion(aux_outputs, labels)
            loss_final = criterion(outputs, labels)
            loss = loss_final + 0.4 * loss_aux
        else:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        acc1, acc5 = utils.accuracy(outputs, labels, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(acc1[0], inputs.size(0))
        top5.update(acc5[0], inputs.size(0))

        # for confusion matrix calculation
        _, preds = outputs.topk(1, 1, True, True)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # zero the parameter gradients
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print statistics
        if i % print_freq == 0 or i + 1 == len(train_loader):
            progress.print(i + 1)

    print(confusion_matrix(all_labels, all_preds))
    return top1.avg, top5.avg
Example #13
def train_model(trainloader, testloader, net, optimizer, scheduler,
                start_epoch, device):
    # define the loss function
    criterion = (nn.CrossEntropyLoss().cuda()
                 if torch.cuda.is_available() else nn.CrossEntropyLoss())

    best_acc = 0.0
    best_model = copy.deepcopy(net.state_dict())

    for epoch in range(start_epoch,
                       args.num_epoch):  # loop over the dataset multiple times

        # set printing functions
        batch_time = util.AverageMeter('Time/batch', ':.2f')
        losses = util.AverageMeter('Loss', ':6.2f')
        top1 = util.AverageMeter('Acc', ':6.2f')
        progress = util.ProgressMeter(len(trainloader),
                                      [losses, top1, batch_time],
                                      prefix="Epoch: [{}]".format(epoch + 1))

        # switch the model to the training mode
        net.train()

        print('current learning rate = {}'.format(
            optimizer.param_groups[0]['lr']))

        # each epoch
        end = time.time()
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            if 'pg' in _ARCH:
                for name, param in net.named_parameters():
                    if 'threshold' in name:
                        loss += (0.00001 * 0.5 *
                                 torch.norm(param - args.gtarget) *
                                 torch.norm(param - args.gtarget))
            loss.backward()
            optimizer.step()

            # measure accuracy and record loss
            _, batch_predicted = torch.max(outputs.data, 1)
            batch_accu = 100.0 * (batch_predicted
                                  == labels).sum().item() / labels.size(0)
            losses.update(loss.item(), labels.size(0))
            top1.update(batch_accu, labels.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 99:
                # print statistics every 100 mini-batches each epoch
                progress.display(i)  # i = batch id in the epoch

        # update the learning rate
        scheduler.step()

        # print test accuracy every few epochs
        if epoch % 1 == 0:
            print('epoch {}'.format(epoch + 1))
            epoch_acc = test_accu(testloader, net, device)
            if 'pg' in _ARCH:
                sparsity(testloader, net, device)
            if epoch_acc >= best_acc:
                best_acc = epoch_acc
                best_model = copy.deepcopy(net.state_dict())
            print("The best test accuracy so far: {:.1f}".format(best_acc))

            # save the model if required
            if args.save:
                print("Saving the trained model and states.")
                this_file_path = os.path.dirname(os.path.abspath(__file__))
                save_folder = os.path.join(this_file_path,
                                           'save_CIFAR10_model')
                util.save_models(best_model,
                                 save_folder,
                                 suffix=_ARCH +
                                 '-finetune' if args.finetune else _ARCH)
                """
                states = {'epoch':epoch+1, 
                          'optimizer':optimizer.state_dict(), 
                          'scheduler':scheduler.state_dict()}
                util.save_states(states, save_folder, suffix=_ARCH)
                """

    print('Finished Training')
Example #14
def train_model(trainloader, testloader, net, optimizer, scheduler,
                start_epoch, num_epoch, device):
    # define the loss function
    criterion = (nn.KLDivLoss(reduction='batchmean').cuda()
                 if torch.cuda.is_available() else nn.KLDivLoss(
                     reduction='batchmean'))

    best_acc = 0.
    best_model = copy.deepcopy(net.state_dict())
    states = {
        'epoch': start_epoch,
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
    }

    for epoch in range(start_epoch, num_epoch):

        # set printing functions
        batch_time = util.AverageMeter('Time/batch', ':.2f')
        losses = util.AverageMeter('Loss', ':6.2f')
        top1 = util.AverageMeter('Acc@1', ':6.2f')
        top5 = util.AverageMeter('Acc@5', ':6.2f')
        progress = util.ProgressMeter(len(trainloader),
                                      [losses, top1, top5, batch_time],
                                      prefix="Epoch: [{}]".format(epoch + 1))

        # switch the model to the training mode
        net.train()

        print('current learning rate = {}'.format(
            optimizer.param_groups[0]['lr']))

        # each epoch
        end = time.time()
        for i, data in enumerate(trainloader, 0):

            # get the inputs; data is a tuple of (inputs, labels)
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs, lessons = net(inputs)
            loss = criterion(outputs.log_softmax(dim=1),
                             lessons.softmax(dim=1))
            if 'pg' in _ARCH:
                for name, param in net.named_parameters():
                    if 'threshold' in name:
                        loss += (0.00001 * 0.5 *
                                 torch.norm(param - args.gtarget) *
                                 torch.norm(param - args.gtarget))
            loss.backward()
            optimizer.step()

            # measure accuracy and record loss
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 99:
                # print statistics every 100 mini-batches each epoch
                progress.display(i)  # i = batch id in the epoch

        # update the learning rate every epoch
        scheduler.step()

        # print test accuracy every few epochs
        if epoch % 1 == 0:
            print('epoch {}'.format(epoch + 1))
            epoch_acc = test_accu(testloader, net, device)
            if 'pg' in _ARCH:
                sparsity(testloader, net, device)
            if epoch_acc >= best_acc:
                best_acc = epoch_acc
                best_model = copy.deepcopy(net.state_dict())
                states = {
                    'epoch': epoch + 1,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }
            print("Best test accuracy so far: {:.1f}".format(best_acc))

            # save the model if required
            if args.save:
                print("Saving the trained model.")
                this_file_path = os.path.dirname(os.path.abspath(__file__))
                save_folder = os.path.join(this_file_path,
                                           'save_ImageNet_model')
                util.save_models(best_model,
                                 save_folder,
                                 suffix=_ARCH +
                                 '-finetune' if args.finetune else _ARCH)
                util.save_states(states,
                                 save_folder,
                                 suffix=_ARCH +
                                 '-finetune' if args.finetune else _ARCH)

    print('Finished Training')