def train():
    """Run one training epoch over the shuffled training data.

    Relies on module-level globals: ``model``, ``optimizer``, ``args``,
    ``epoch``, ``lr``, ``step_slope``, ``datafile_train`` and the helpers
    ``data_shuffle`` / ``data_producer`` / ``repackage_hidden``.
    """
    # Turn on training mode which enables dropout.
    print('Load training data')
    model.train()
    # Anneal the step-function slope for RNN variants that expose it.
    if hasattr(model.rnn, 'step_slope'):
        model.rnn.step_slope = step_slope
    total_loss = 0.0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    # Shuffle order of talks
    train_data = data_shuffle(datafile_train)
    print('Start training')
    for (data, targets, batch) in data_producer(train_data, args.batch_size, args.bptt,
                                                cuda=args.cuda, use_durs=args.use_durs):
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        if args.tier == 'combined':
            loss, loss_phone, loss_word = model.criterion(output, targets)
        else:
            loss = model.criterion(output, targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        # (the un-suffixed `clip_grad_norm` was deprecated in PyTorch 0.4 and later removed)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        # .item() works for 0-dim loss tensors (the old `total_loss[0]` / `.data`
        # pattern raises on modern PyTorch) and avoids holding tensors alive.
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data[0]) // args.batch_size // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
def evaluate(data_source):
    """Evaluate the model on ``data_source`` and return the mean per-batch loss.

    Returns a single scalar, or a ``(total, phone, word)`` tuple when
    ``args.tier == 'combined'``.  Relies on module-level globals ``model``,
    ``args``, ``eval_batch_size`` and the ``data_producer`` /
    ``repackage_hidden`` helpers.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.0
    phone_loss = 0.0
    word_loss = 0.0
    hidden = model.init_hidden(eval_batch_size)
    # no_grad(): evaluation needs no autograd graph — saves memory and time
    # (supersedes the old volatile/`evaluation=True`-only mechanism).
    with torch.no_grad():
        for (data, targets, batch) in data_producer(data_source, eval_batch_size, args.bptt,
                                                    cuda=args.cuda, use_durs=args.use_durs,
                                                    evaluation=True):
            output, hidden = model(data, hidden)
            if args.tier == 'combined':
                loss, loss_phone, loss_word = model.criterion(output, targets)
                # .item() replaces the old `total_loss[0]` indexing, which
                # raises on 0-dim loss tensors in modern PyTorch.
                total_loss += loss.item()
                phone_loss += loss_phone.item()
                word_loss += loss_word.item()
            else:
                total_loss += model.criterion(output, targets).item()
            hidden = repackage_hidden(hidden)
    # NOTE(review): assumes data_producer yields at least one batch;
    # `batch` is the 0-based index of the last batch produced.
    if args.tier == 'combined':
        return (total_loss / (batch + 1), phone_loss / (batch + 1), word_loss / (batch + 1))
    else:
        return total_loss / (batch + 1)
def test_forward_and_backward(self):
    """Smoke-test one forward/backward pass of Net on a single mini-batch."""
    # Note: model and dataset all transfer to CUDA device.
    batch_iter = self.loader.load(self.trainset, batch_size=10, shuffle=True,
                                  to_tensor=True, to_cuda=True)
    params = self.params
    net = Net(word_vocab_size=params.word_vocab_size,
              tag_vocab_size=params.tag_vocab_size,
              embedding_dim=params.embedding_dim,
              lstm_hidden_dim=params.lstm_hidden_dim).cuda()
    for minibatch in batch_iter:
        features, labels = minibatch
        predictions = net(features)
        batch_loss = criterion(predictions, labels)
        batch_loss.backward()
        self.logger.debug('loss: {}'.format(batch_loss.item()))
        break  # one batch suffices for a smoke test
def evaluate_model(epoch, history=None):
    """Validate the model on the dev set.

    Accumulates the summed criterion loss over ``dev_loader``, decodes
    predictions at two confidence thresholds (0.5 and 0.6), and records dev
    loss and mAP into the module-level prediction dataframes and
    ``mAP_history``.
    """
    model.eval()
    loss = 0
    preds_at_05 = []  # prediction strings decoded with threshold 0.5
    preds_at_06 = []  # prediction strings decoded with threshold 0.6
    with torch.no_grad():  # no autograd needed during validation
        for img_batch, mask_batch, gaussian_batch, regr_batch in tqdm(
                dev_loader, desc="验证中"):
            img_batch = img_batch.to(device)
            mask_batch = mask_batch.to(device)
            gaussian_batch = gaussian_batch.to(device)
            regr_batch = regr_batch.to(device)
            output = model(img_batch)
            # size_average=False -> summed loss; normalized by dataset size below
            loss += criterion(output, mask_batch, gaussian_batch, regr_batch,
                              size_average=False).data
            for out in output.data.cpu().numpy():
                preds_at_05.append(coords2str(extract_coords(out, threshold=0.5)))
                preds_at_06.append(coords2str(extract_coords(out, threshold=0.6)))
    loss /= len(dev_loader.dataset)
    df_dev_pred_1['PredictionString'] = preds_at_05
    df_dev_pred_2['PredictionString'] = preds_at_06
    if history is not None:
        history.loc[epoch, 'dev_loss'] = loss.cpu().numpy()
    print('Dev loss: {:.4f}'.format(loss))
    mAP_1 = calculate_mAP(valid_df=df_dev_pred_1, train_df=df_dev)
    mAP_2 = calculate_mAP(valid_df=df_dev_pred_2, train_df=df_dev)
    print('mAP threshold: 0.5:', mAP_1)
    print('mAP threshold: 0.6:', mAP_2)
    mAP_history.loc[epoch, '0.5'] = mAP_1
    mAP_history.loc[epoch, '0.6'] = mAP_2
def train_model(epoch, history=None):
    """Train for one epoch over ``train_loader``.

    Steps the optimizer and ``exp_lr_scheduler`` once per batch, optionally
    logging a fractional-epoch loss curve into ``history``, and prints the
    epoch-mean total / binary / regression losses at the end.
    """
    model.train()
    total_loss = 0
    total_mask_focal_loss = 0
    total_regr_loss = 0
    n_batches = len(train_loader)
    for batch_idx, (img_batch, mask_batch, gaussian_mask_batch,
                    regr_batch) in enumerate(tqdm(train_loader, desc="训练中")):
        img_batch = img_batch.to(device)
        mask_batch = mask_batch.to(device)
        gaussian_mask_batch = gaussian_mask_batch.to(device)
        regr_batch = regr_batch.to(device)

        optimizer.zero_grad()
        output = model(img_batch)
        loss, mask_focal_loss, regr_loss = criterion(
            output, mask_batch, gaussian_mask_batch, regr_batch, split_loss=True)
        if history is not None:
            # fractional epoch index yields a per-batch loss curve
            history.loc[epoch + batch_idx / n_batches,
                        'train_loss'] = loss.data.cpu().numpy()
        loss.backward()
        total_loss += loss.data
        total_mask_focal_loss += mask_focal_loss.data
        total_regr_loss += regr_loss.data
        optimizer.step()
        exp_lr_scheduler.step()  # learning-rate decay applied per batch
    print(
        '\nTrain Epoch: {} \tLR: {:.6f}\tLoss: {:.6f}\tbinary loss {:.6f}\tregression loss {:.6f}'
        .format(epoch, optimizer.state_dict()['param_groups'][0]['lr'],
                total_loss / n_batches,
                total_mask_focal_loss / n_batches,
                total_regr_loss / n_batches))
# --- Top-level training driver (chunk of a longer script; truncated at the
# --- evaluation loop below — the remainder is outside this view).
model = opts.model(base_model, dataset_train.num_training_classes).cuda()
# Partition trainable parameters into the four (base?, bias?) groups so that
# base-model params can get a scaled LR and biases can skip weight decay.
# Comprehension order: (base=F,bias=F), (F,T), (T,F), (T,T) — matching the
# unpacking order below.
model_weights, model_biases, base_model_weights, base_model_biases = [
    [p for k, p in model.named_parameters()
     if p.requires_grad and ('bias' in k) == is_bias and ('base' in k) == is_base]
    for is_base in [False, True] for is_bias in [False, True]]
# pop() so the multiplier is not forwarded to the optimizer constructor.
base_model_lr_mult = model.optimizer_params.pop('base_model_lr_mult', 1.0)
optimizer = model.optimizer(
    [dict(params = base_model_weights, lr = base_model_lr_mult * model.optimizer_params['lr']),
     dict(params = base_model_biases, lr = base_model_lr_mult * model.optimizer_params['lr'],
          weight_decay = 0.0),
     dict(params = model_biases, weight_decay = 0.0)],
    **model.optimizer_params)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, **model.lr_scheduler_params)
# NOTE(review): open(..., 'w', 0) (unbuffered) is only valid for binary mode
# in Python 3 — this looks like Python 2 code; confirm target interpreter.
log = open(opts.log, 'w', 0)
for epoch in range(opts.epochs):
    scheduler.step()
    model.train()
    loss_all, norm_all = [], []
    # Skip training entirely when the model defines no criterion.
    for batch_idx, batch in enumerate(loader_train if model.criterion is not None else []):
        tic = time.time()
        images, labels = [tensor.cuda() for tensor in batch]
        loss = model.criterion(model(images), labels)
        loss_all.append(float(loss))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('train {:>3}.{:05} loss {:.04f} hz {:.02f}'.format(
            epoch, batch_idx, loss_all[-1], len(images) / (time.time() - tic)))
    # `loss_all or [0.0]` guards the criterion-less case above.
    log.write('loss epoch {}: {:.04f}\n'.format(epoch, torch.Tensor(loss_all or [0.0]).mean()))
    # Evaluate frequently early on, then every 5th epoch, and at the end.
    if epoch < 10 or epoch % 5 == 0 or epoch == opts.epochs - 1:
        model.eval()
        embeddings_all, labels_all = [], []
        for batch_idx, batch in enumerate(loader_eval):
            tic = time.time()
            images, labels = [tensor.cuda() for tensor in batch]
            with torch.no_grad():
                output = model(images)
num_workers=8, batch_size=opts.batch, drop_last=True) model.train() # batch train scheduler.step() loss_all = [] for batch_idx, batch in enumerate( loader_train if model.criterion is not None else []): a_images, p_images, n_images, p_w, n_w = [ torch.autograd.Variable(tensor.cuda()) for tensor in batch ] loss = model.criterion(model(a_images), model(p_images), model(n_images), p_w, n_w, margin=opts.margin) loss_all.append(loss.data.item()) optimizer.zero_grad() loss.backward() optimizer.step() print('loss epoch {}: {:.04f}'.format(epoch, np.mean(loss_all))) log.write('loss epoch {}: {:.04f}\n'.format(epoch, np.mean(loss_all))) # evaluate on test set if epoch < 10 or (epoch + 1) % 5 == 0 or (epoch + 1) == opts.epochs: model.eval() embeddings_all, labels_all = get_dataset_embeddings( model, dataset_eval) rec = [