def main_worker(ngpus_per_node, args):
    cprint('=> modeling the network ...', 'green')
    model = builder_inf(args)
    model = torch.nn.DataParallel(model).cuda()

    cprint('=> building the dataloader ...', 'green')
    trans = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0., 0., 0.], std=[1., 1., 1.]),
    ])
    inf_dataset = ImgInfLoader(ann_file=args.inf_list, transform=trans)
    inf_loader = torch.utils.data.DataLoader(inf_dataset,
                                             batch_size=args.batch_size,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             shuffle=False)

    cprint('=> starting inference engine ...', 'green')
    cprint('=> embedding features will be saved into {}'.format(args.feat_list))

    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    progress = utils.ProgressMeter(len(inf_loader),
                                   [batch_time, data_time],
                                   prefix="Extract Features: ")

    # switch to evaluate mode
    model.eval()

    with open(args.feat_list, 'w') as fio, torch.no_grad():
        end = time.time()
        for i, (input, img_paths) in enumerate(inf_loader):
            # measure data loading time
            data_time.update(time.time() - end)

            # compute output
            embedding_feat = model(input[0])
            # embedding_feat = F.normalize(embedding_feat, p=2, dim=1)
            _feat = embedding_feat.data.cpu().numpy()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

            # write each feature as: <image path> <embedding values>, one line per image
            for feat, path in zip(_feat, img_paths):
                fio.write('{} '.format(path))
                for e in feat:
                    fio.write('{} '.format(e))
                fio.write('\n')
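# A minimal sketch of how the feature file written above could be read back.
# `load_feat_list` is a hypothetical helper, not part of the original code;
# it only assumes the format main_worker produces: one image per line, the
# path followed by the embedding values, all space-separated.
import numpy as np

def load_feat_list(feat_list):
    paths, feats = [], []
    with open(feat_list) as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            paths.append(parts[0])
            feats.append(np.array([float(v) for v in parts[1:]],
                                  dtype=np.float32))
    return paths, np.stack(feats)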
def validate_epoch(val_loader, model, criterion, use_cuda=True):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    top5 = utils.AverageMeter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(len(val_loader),
                                   batch_time, top1, top5, losses,
                                   prefix='Val: ')

    # switch to evaluate mode
    all_preds = []
    all_labels = []
    model.eval()
    print_freq = len(val_loader) // 4 + 1

    with torch.no_grad():
        end = time.time()
        for i, (_, inputs, labels) in enumerate(val_loader):
            if use_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            # compute output
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # measure accuracy and record loss
            acc1, acc5 = utils.accuracy(outputs, labels, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))

            # collect predictions for the confusion matrix
            _, preds = outputs.topk(1, 1, True, True)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0 or i + 1 == len(val_loader):
                progress.print(i + 1)

    print(confusion_matrix(all_labels, all_preds))
    return top1.avg, top5.avg
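# utils.accuracy is not defined in this excerpt. The call sites above match
# the top-k accuracy helper from the official PyTorch ImageNet example; a
# sketch for reference (an assumption about what the repos use):
import torch

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        # top-k predicted class indices, shape (maxk, batch)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res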
def run_train_epoch(model, optimizer, criterion, train_dataloader, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    # data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(train_dataloader),
                                   batch_time, losses, grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    # the dataloader is an iterator; each step pulls one minibatch
    for i, data in enumerate(train_dataloader, 0):
        feat = data["x"]
        label = data["y"]

        x = feat.to(th.float32)
        y = label.unsqueeze(2).long()
        if th.cuda.is_available():
            x = x.cuda()
            y = y.cuda()

        prediction = model(x)
        loss = criterion(prediction.view(-1, prediction.shape[2]), y.view(-1))

        optimizer.zero_grad()
        loss.backward()

        # gradient clipping
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # update loss
        losses.update(loss.item(), x.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            # if not args.hvd or hvd.rank() == 0:
            progress.print(i)
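# The AverageMeter / ProgressMeter pair used throughout is also not defined
# here. A sketch in the style of the PyTorch ImageNet example; note the
# functions in this file come from different repos, so some pass the meters
# as a list and call progress.display(i), while others pass them as varargs
# and call progress.print(i) -- this sketch accepts both:
class AverageMeter(object):
    """Tracks the current value, running sum, count and average."""
    def __init__(self, name, fmt=':f'):
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, *meters, prefix=""):
        # accept both a single list of meters and meters passed as varargs
        if len(meters) == 1 and isinstance(meters[0], (list, tuple)):
            meters = meters[0]
        width = len(str(num_batches))
        self.batch_fmtstr = '[{:' + str(width) + 'd}/' + str(num_batches) + ']'
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    print = display  # some repo versions expose the same method as `print`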
def run_train_epoch(model, optimizer, criterion, train_dataloader, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(train_dataloader),
                                   batch_time, losses, grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    # the dataloader is an iterator; each step pulls one minibatch
    for i, data in enumerate(train_dataloader, 0):
        feat = data["x"]
        label = data["y"]
        num_frs = data["num_frs"]
        utt_ids = data["utt_ids"]

        x = feat.to(th.float32)
        y = label.squeeze(2).long()
        if th.cuda.is_available():
            x = x.cuda()
            y = y.cuda()

        # (batch, time, feat) -> (time, batch, feat) for the transformer
        x = x.transpose(0, 1)

        # mask out the padded frames of each utterance
        key_padding_mask = th.ones((x.size(1), x.size(0)))
        for utt in range(len(num_frs)):
            key_padding_mask[utt, :num_frs[utt]] = 0

        # optional look-ahead mask: position t may attend up to t + look_ahead
        src_mask = None
        if args.look_ahead > -1:
            src_mask = th.tril(th.ones(x.size(0), x.size(0)),
                               diagonal=args.look_ahead)
            src_mask = src_mask.float() \
                .masked_fill(src_mask == 0, float('-inf')) \
                .masked_fill(src_mask == 1, float(0.0))
            src_mask = src_mask.cuda()
        key_padding_mask = key_padding_mask.bool().cuda()

        prediction = model(x, src_mask, key_padding_mask)
        prediction = prediction.transpose(0, 1).contiguous()
        loss = criterion(prediction.view(-1, prediction.size(2)), y.view(-1))

        optimizer.zero_grad()
        loss.backward()

        # gradient clipping
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        # update lr with the Noam schedule before stepping
        step = len(train_dataloader) * epoch + i + 1
        lr = utils.noam_decay(step, args.warmup_step, args.lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()

        grad_norm.update(norm)

        # update loss
        losses.update(loss.item(), x.size(1))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            # if not args.hvd or hvd.rank() == 0:
            progress.print(i)
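# utils.noam_decay is not shown. The call sites above suggest the "Noam"
# warmup schedule from "Attention Is All You Need": the lr rises linearly
# for warmup_step steps, then decays with the inverse square root of the
# step. A sketch of one common parameterization (an assumption, not the
# repo's exact formula), scaled so the lr peaks at base_lr when
# step == warmup_step:
def noam_decay(step, warmup_step, base_lr):
    return base_lr * warmup_step ** 0.5 * min(step ** -0.5,
                                              step * warmup_step ** -1.5)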
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch,
                    asr_decoder, trans_model, silence_ids, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time, losses, grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')
    if args.criterion == "mmi":
        se_criterion = ops.MMIFunction.apply
    else:
        se_criterion = ops.sMBRFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader, 0):
        feat = batch["x"]
        label = batch["y"]          # pdf-ids for the CE loss
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]          # trans_ids for the SE loss

        x = feat.to(th.float32).cuda()
        y = label.long().cuda()

        prediction = model(x)
        ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]),
                               y.view(-1))

        # accumulate the sequence-level loss utterance by utterance
        se_loss = 0.0
        for j in range(len(num_frs)):
            log_like_j = prediction[j, :num_frs[j], :]
            log_like_j = log_like_j - log_prior
            # trans_id = label[j, :num_frs[j], 0].tolist()
            trans_id = th.from_numpy(aux[j][0][0].astype(int)).tolist()
            if args.criterion == "mmi":
                se_loss += se_criterion(log_like_j, asr_decoder, trans_model,
                                        trans_id)
            else:
                se_loss += se_criterion(log_like_j, asr_decoder, trans_model,
                                        trans_id, args.criterion, silence_ids)

        loss = se_loss.cuda() + args.ce_ratio * ce_loss

        optimizer.zero_grad()
        loss.backward()

        # gradient clipping (threshold args.max_grad_norm)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # normalize the loss by the total number of frames in the batch
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # periodically checkpoint the model (rank 0 only)
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            output_file = args.exp_dir + '/model.se.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
def train_model(trainloader, testloader, net, device):
    if torch.cuda.device_count() > 1:
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        print("Activate multi GPU support.")
        net = nn.DataParallel(net)
    net.to(device)

    # define the loss function
    criterion = (nn.CrossEntropyLoss().cuda()
                 if torch.cuda.is_available() else nn.CrossEntropyLoss())

    # scale the lr linearly with the batch size; should be 0.1 when batch_size=128
    initial_lr = 0.1 * batch_size / 128

    # initialize the optimizer
    optimizer = optim.SGD(net.parameters(),
                          lr=initial_lr,
                          momentum=0.9,
                          weight_decay=_WEIGHT_DECAY)

    # multiply the lr by 0.1 at 50% and 75% of the total epochs
    div = num_epoch // 4
    lr_decay_milestones = [div * 2, div * 3]
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=lr_decay_milestones,
                                               gamma=0.1,
                                               last_epoch=_LAST_EPOCH)

    for epoch in range(num_epoch):  # loop over the dataset multiple times
        # set printing functions
        batch_time = util.AverageMeter('Time/batch', ':.3f')
        losses = util.AverageMeter('Loss', ':6.2f')
        top1 = util.AverageMeter('Acc', ':6.2f')
        progress = util.ProgressMeter(len(trainloader),
                                      [losses, top1, batch_time],
                                      prefix="Epoch: [{}]".format(epoch + 1))

        # switch the model to the training mode
        net.train()

        print('current learning rate = {}'.format(
            optimizer.param_groups[0]['lr']))

        # each epoch
        end = time.time()
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            # regularize the gating thresholds toward the target
            for name, param in net.named_parameters():
                if 'threshold' in name:
                    loss += args.sigma * torch.norm(param - args.gtarget)
            loss.backward()
            optimizer.step()

            # measure accuracy and record loss
            _, batch_predicted = torch.max(outputs.data, 1)
            batch_accu = 100.0 * (batch_predicted == labels).sum().item() / labels.size(0)
            losses.update(loss.item(), labels.size(0))
            top1.update(batch_accu, labels.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 50 == 49:
                # print statistics every 50 mini-batches each epoch
                progress.display(i)  # i = batch id in the epoch

        # update the learning rate
        scheduler.step()

        # print test accuracy every few epochs
        if epoch % 10 == 9:
            print('epoch {}'.format(epoch + 1))
            test_accu(testloader, net, device)

    # save the model if required
    if args.save:
        print("Saving the trained model.")
        util.save_models(net.state_dict(), save_folder, suffix=_ARCH)

    print('Finished Training')
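# test_accu is not defined in this excerpt. A minimal evaluation loop,
# assuming (as the callers below do) that it returns top-1 accuracy in
# percent; the real helper in these repos may differ:
def test_accu(testloader, net, device):
    net.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accu = 100.0 * correct / total
    print('Test accuracy: {:.2f}%'.format(accu))
    return accu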
def do_train(train_loader, model, criterion, optimizer, grad_scaler, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')
    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [batch_time, data_time, throughputs, 'images/s',
                         losses, losses_id, losses_mag, top1, learning_rate]
    progress = utils.ProgressMeter(len(train_loader),
                                   progress_template,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    # update lr (`current_lr` is expected to be set by the surrounding script)
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with autocast(enabled=args.amp_mode):
            output, x_norm = model(input, target)

        # x_norm does not need to be gathered, as feature x lives on each rank
        target = ts.distributed.gather(target, dim=0)

        # loss
        with autocast(enabled=args.amp_mode):
            loss_id, loss_g, one_hot = criterion(output, target, x_norm)
            loss = loss_id + args.lambda_g * loss_g

        # compute gradient and do solver step
        optimizer.zero_grad()
        grad_scaler.scale(loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()

        # synchronize for logging
        torch.cuda.synchronize()

        # measure elapsed time
        if args.rank == 0:
            duration = time.time() - end
            end = time.time()
            batch_time.update(duration)
            bs = args.batch_size
            throughputs.update(args.world_size * bs / duration)

        # measure accuracy and record loss
        output = ts.distributed.gather(output[0], dim=-1)
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g * loss_g.item(), input.size(0))

        if i % args.print_freq == 0 and args.rank == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a, args.l_margin, args.u_margin)
def do_train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    top5 = utils.AverageMeter('Acc@5', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')
    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [batch_time, data_time, throughputs, 'images/s',
                         losses, losses_id, losses_mag, top1, top5,
                         learning_rate]
    progress = utils.ProgressMeter(len(train_loader),
                                   progress_template,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    # update lr (`current_lr` is expected to be set by the surrounding script)
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        output, x_norm = model(input, target)
        loss_id, loss_g, one_hot = criterion(output, target, x_norm)
        loss = loss_id + args.lambda_g * loss_g

        # measure accuracy and record loss
        acc1, acc5 = utils.accuracy(args, output[0], target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))
        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g * loss_g.item(), input.size(0))

        # compute gradient and do solver step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        duration = time.time() - end
        batch_time.update(duration)
        end = time.time()
        throughputs.update(args.batch_size / duration)

        if i % args.print_freq == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a, args.l_margin, args.u_margin)

        if args.vis_mag:
            if (i > 10000) and (i % 100 == 0):
                # dump feature norms, target logits and cosines for visualization
                x_norm = x_norm.detach().cpu().numpy()
                cos_theta = torch.masked_select(
                    output[0], one_hot.bool()).detach().cpu().numpy()
                logit = torch.masked_select(
                    F.softmax(output[0], dim=1),
                    one_hot.bool()).detach().cpu().numpy()
                np.savez(
                    '{}/vis/epoch_{}_iter{}'.format(args.pth_save_fold,
                                                    epoch, i),
                    x_norm, logit, cos_theta)
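# The np.savez call above stores the arrays positionally, so they come back
# as arr_0/arr_1/arr_2 (np.savez also appends the .npz extension). A quick
# way to reload them for plotting; the file name below is hypothetical:
import numpy as np

data = np.load('vis/epoch_0_iter10100.npz')
x_norm, logit, cos_theta = data['arr_0'], data['arr_1'], data['arr_2']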
def do_train(train_loader, model, criterion, optimizer, grad_scaler, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')
    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [batch_time, data_time, throughputs, 'images/s',
                         losses, losses_id, losses_mag, top1, learning_rate]
    progress = utils.ProgressMeter(len(train_loader),
                                   progress_template,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    # update lr (`current_lr` is expected to be set by the surrounding script)
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with autocast(enabled=args.amp_mode):
            output, x_norm = model(input, target)

        # x_norm does not need to be gathered, as feature x lives on each rank
        target = mpu._gather(target, dim=0)

        # loss
        with autocast(enabled=args.amp_mode):
            loss_id, loss_g, one_hot = criterion(output, target, x_norm)
            loss = loss_id + args.lambda_g * loss_g * args.world_size

        # compute gradient and do solver step
        optimizer.zero_grad()
        grad_scaler.scale(loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()

        # synchronize for logging
        torch.cuda.synchronize()

        # measure elapsed time
        if args.rank == 0:
            duration = time.time() - end
            end = time.time()
            batch_time.update(duration)
            bs = args.batch_size
            throughputs.update(args.world_size * bs / duration)

        # measure accuracy and record loss
        acc1, _ = mpu.accuracy(args, output, target, topk=(1, 1))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g * loss_g.item(), input.size(0))

        if i % args.print_freq == 0 and args.rank == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a, args.l_margin, args.u_margin)

        if args.vis_mag:
            if (epoch == args.epochs - 1) and (i % 1000 == 0):
                # dump this rank's feature norms and target cosines
                one_hot = one_hot.bool()
                mask = torch.sum(one_hot, dim=1).bool()
                x_norm_cur_rank = torch.masked_select(
                    x_norm.squeeze(), mask).detach().cpu().numpy()
                cos_theta_cur_rank = torch.masked_select(
                    output[0], one_hot).detach().cpu().numpy()
                np.savez(
                    '{}/vis/epoch_{}_iter{}_rank_{}'.format(
                        args.pth_save_fold, epoch, i, args.rank),
                    x_norm_cur_rank, cos_theta_cur_rank)
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch,
                    asr_decoder, trans_model, silence_ids, aligner, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time, losses, grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')
    if args.criterion == "mmi":
        criterion = ops.MMIFunction.apply
    else:
        criterion = ops.sMBRFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader):
        feat = batch["x"]
        label = batch["y"]
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  # word labels for the SE loss

        x = feat.to(th.float32).cuda()
        y = label.long().cuda()

        prediction = model(x)
        ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]),
                               y.view(-1))
        loss = args.ce_ratio * ce_loss

        for j in range(len(num_frs)):
            loglike_j = prediction[j, :num_frs[j], :]
            loglike_j = loglike_j - log_prior
            text = th.from_numpy(aux[j][0][0].astype(int)).tolist()
            # text = ' '.join(str(k) for k in text)
            try:
                # align the utterance on the fly to get the transition-ids
                align_in = kaldi_matrix.Matrix(loglike_j.detach().cpu().numpy())
                align_out = aligner.align(align_in, text)
                trans_ids = align_out["alignment"]
                if args.criterion == "mmi":
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids)
                else:
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids, args.criterion, silence_ids)
                loss += se_loss.cuda()
            except Exception:
                print("Warning: failed to align utterance {}, "
                      "skipping it for the SE loss".format(utt_ids[j]))

        optimizer.zero_grad()
        loss.backward()

        # gradient clipping (threshold args.max_grad_norm)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # normalize the loss by the total number of frames in the batch
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # periodically checkpoint the model (rank 0 only)
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            output_file = args.exp_dir + '/model.se.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
def run_train_epoch(model, optimizer, dataloader, epoch, trans_model, tree,
                    supervision_opts, aligner, den, chain_opts, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time, losses, grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    criterion = ops.ChainObjtiveFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader):
        feat = batch["x"]
        label = batch["y"]
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  # word labels for the SE loss

        # shift the frames by a different offset each epoch before subsampling
        frame_shift = (epoch % supervision_opts.frame_subsampling_factor) * -1
        x = feat.to(th.float32)
        x = th.roll(x, frame_shift, 1)
        x = x.unfold(1, 1, supervision_opts.frame_subsampling_factor).squeeze(-1)
        x = x.cuda()
        y = label.squeeze(2)

        loss = 0.0
        prediction = model(x)
        for j in range(len(num_frs)):
            # build the numerator supervision from the alignment
            trans_ids = y[j, :num_frs[j]].tolist()
            phone_ali = aligner.to_phone_alignment(trans_ids)
            phones = [item[0] for item in phone_ali]
            durations = [item[2] for item in phone_ali]
            proto_supervision = kaldi_chain.alignment_to_proto_supervision(
                supervision_opts, phones, durations)
            supervision = kaldi_chain.proto_supervision_to_supervision(
                tree, trans_model, proto_supervision, True)
            loglike_j = prediction[j, :supervision.frames_per_sequence, :]
            loss += criterion(loglike_j, den, supervision, chain_opts)

        optimizer.zero_grad()
        loss.backward()

        # update lr with the Noam schedule before stepping
        step = len(dataloader) * epoch + i + 1
        lr = utils.noam_decay(step, args.warmup_steps, args.lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # gradient clipping (threshold args.max_grad_norm)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # normalize the loss by the total number of frames in the batch
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # periodically checkpoint the model (rank 0 only)
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            output_file = args.exp_dir + '/chain.model.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
def train_epoch(epoch, train_loader, model, criterion, optimizer, use_cuda=True):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    top5 = utils.AverageMeter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(len(train_loader),
                                   batch_time, data_time, top1, top5, losses,
                                   prefix="Epoch: [{}]".format(epoch + 1))
    print_freq = len(train_loader) // 4 + 1

    all_preds = []
    all_labels = []

    model.train()
    end = time.time()
    for i, (paths, inputs, labels) in enumerate(train_loader):
        if use_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        data_time.update(time.time() - end)

        # forward + backward + optimize
        if type(model).__name__ == 'Inception3' and model.aux_logits:
            # Inception-v3 returns an auxiliary head during training
            outputs, aux_outputs = model(inputs)
            loss_aux = criterion(aux_outputs, labels)
            loss_final = criterion(outputs, labels)
            loss = loss_final + 0.4 * loss_aux
        else:
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # measure accuracy and record loss
        acc1, acc5 = utils.accuracy(outputs, labels, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(acc1[0], inputs.size(0))
        top5.update(acc5[0], inputs.size(0))

        # collect predictions for the confusion matrix
        _, preds = outputs.topk(1, 1, True, True)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # zero the parameter gradients, then backprop and step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print statistics
        if i % print_freq == 0 or i + 1 == len(train_loader):
            progress.print(i + 1)

    print(confusion_matrix(all_labels, all_preds))
    return top1.avg, top5.avg
def train_model(trainloader, testloader, net, optimizer, scheduler,
                start_epoch, device):
    # define the loss function
    criterion = (nn.CrossEntropyLoss().cuda()
                 if torch.cuda.is_available() else nn.CrossEntropyLoss())

    best_acc = 0.0
    best_model = copy.deepcopy(net.state_dict())

    for epoch in range(start_epoch, args.num_epoch):  # loop over the dataset multiple times
        # set printing functions
        batch_time = util.AverageMeter('Time/batch', ':.2f')
        losses = util.AverageMeter('Loss', ':6.2f')
        top1 = util.AverageMeter('Acc', ':6.2f')
        progress = util.ProgressMeter(len(trainloader),
                                      [losses, top1, batch_time],
                                      prefix="Epoch: [{}]".format(epoch + 1))

        # switch the model to the training mode
        net.train()

        print('current learning rate = {}'.format(
            optimizer.param_groups[0]['lr']))

        # each epoch
        end = time.time()
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            if 'pg' in _ARCH:
                # L2 penalty pulling the gating thresholds toward the target
                for name, param in net.named_parameters():
                    if 'threshold' in name:
                        loss += (0.00001 * 0.5 *
                                 torch.norm(param - args.gtarget) *
                                 torch.norm(param - args.gtarget))
            loss.backward()
            optimizer.step()

            # measure accuracy and record loss
            _, batch_predicted = torch.max(outputs.data, 1)
            batch_accu = 100.0 * (batch_predicted == labels).sum().item() / labels.size(0)
            losses.update(loss.item(), labels.size(0))
            top1.update(batch_accu, labels.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 99:
                # print statistics every 100 mini-batches each epoch
                progress.display(i)  # i = batch id in the epoch

        # update the learning rate
        scheduler.step()

        # evaluate on the test set every epoch and track the best model
        if epoch % 1 == 0:
            print('epoch {}'.format(epoch + 1))
            epoch_acc = test_accu(testloader, net, device)
            if 'pg' in _ARCH:
                sparsity(testloader, net, device)
            if epoch_acc >= best_acc:
                best_acc = epoch_acc
                best_model = copy.deepcopy(net.state_dict())
            print("The best test accuracy so far: {:.1f}".format(best_acc))

    # save the model if required
    if args.save:
        print("Saving the trained model and states.")
        this_file_path = os.path.dirname(os.path.abspath(__file__))
        save_folder = os.path.join(this_file_path, 'save_CIFAR10_model')
        util.save_models(best_model, save_folder,
                         suffix=_ARCH + '-finetune' if args.finetune else _ARCH)
        """
        states = {'epoch': epoch + 1,
                  'optimizer': optimizer.state_dict(),
                  'scheduler': scheduler.state_dict()}
        util.save_states(states, save_folder, suffix=_ARCH)
        """

    print('Finished Training')
def train_model(trainloader, testloader, net, optimizer, scheduler,
                start_epoch, num_epoch, device):
    # define the distillation loss (KL divergence between the two outputs)
    criterion = (nn.KLDivLoss(reduction='batchmean').cuda()
                 if torch.cuda.is_available()
                 else nn.KLDivLoss(reduction='batchmean'))

    best_acc = 0.
    best_model = copy.deepcopy(net.state_dict())
    states = {
        'epoch': start_epoch,
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
    }

    for epoch in range(start_epoch, num_epoch):
        # set printing functions
        batch_time = util.AverageMeter('Time/batch', ':.2f')
        losses = util.AverageMeter('Loss', ':6.2f')
        top1 = util.AverageMeter('Acc@1', ':6.2f')
        top5 = util.AverageMeter('Acc@5', ':6.2f')
        progress = util.ProgressMeter(len(trainloader),
                                      [losses, top1, top5, batch_time],
                                      prefix="Epoch: [{}]".format(epoch + 1))

        # switch the model to the training mode
        net.train()

        print('current learning rate = {}'.format(
            optimizer.param_groups[0]['lr']))

        # each epoch
        end = time.time()
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a tuple of (inputs, labels)
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize; the net returns its own logits
            # and the soft targets ("lessons") to match
            outputs, lessons = net(inputs)
            loss = criterion(outputs.log_softmax(dim=1), lessons.softmax(dim=1))
            if 'pg' in _ARCH:
                # L2 penalty pulling the gating thresholds toward the target
                for name, param in net.named_parameters():
                    if 'threshold' in name:
                        loss += (0.00001 * 0.5 *
                                 torch.norm(param - args.gtarget) *
                                 torch.norm(param - args.gtarget))
            loss.backward()
            optimizer.step()

            # measure accuracy and record loss
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 99:
                # print statistics every 100 mini-batches each epoch
                progress.display(i)  # i = batch id in the epoch

        # update the learning rate every epoch
        scheduler.step()

        # evaluate on the test set every epoch and track the best model
        if epoch % 1 == 0:
            print('epoch {}'.format(epoch + 1))
            epoch_acc = test_accu(testloader, net, device)
            if 'pg' in _ARCH:
                sparsity(testloader, net, device)
            if epoch_acc >= best_acc:
                best_acc = epoch_acc
                best_model = copy.deepcopy(net.state_dict())
                states = {
                    'epoch': epoch + 1,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }
            print("Best test accuracy so far: {:.1f}".format(best_acc))

    # save the model if required
    if args.save:
        print("Saving the trained model.")
        this_file_path = os.path.dirname(os.path.abspath(__file__))
        save_folder = os.path.join(this_file_path, 'save_ImageNet_model')
        util.save_models(best_model, save_folder,
                         suffix=_ARCH + '-finetune' if args.finetune else _ARCH)
        util.save_states(states, save_folder,
                         suffix=_ARCH + '-finetune' if args.finetune else _ARCH)

    print('Finished Training')
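# util.save_models / util.save_states are not shown; the call sites suggest
# thin wrappers around torch.save. A sketch of what they might look like --
# the file names and layout below are assumptions, not the repos' actual
# helpers:
import os
import torch

def save_models(state_dict, save_folder, suffix=''):
    os.makedirs(save_folder, exist_ok=True)
    torch.save(state_dict, os.path.join(save_folder, 'model_' + suffix + '.pt'))

def save_states(states, save_folder, suffix=''):
    os.makedirs(save_folder, exist_ok=True)
    torch.save(states, os.path.join(save_folder, 'states_' + suffix + '.pt'))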