def validate_hvd(val_loader, model, epoch, writer, verbose, early_stopping, hvd, start):
    """Run one validation pass under Horovod and log averaged metrics.

    Computes cross-entropy loss and top-1/top-5 accuracy over `val_loader`,
    feeds the epoch-average loss to `early_stopping` (which may checkpoint),
    and writes scalars to the TensorBoard `writer`.

    Returns the averaged top-1 accuracy metric value.
    """
    model.eval()
    loss_meter = utils.Metric('val_loss', hvd)
    top1_meter = utils.Metric('val_top1', hvd)
    top5_meter = utils.Metric('val_top5', hvd)

    progress = tqdm(total=len(val_loader),
                    desc=f'Validate Epoch #{epoch + 1}',
                    disable=not verbose)
    with progress as pbar, torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.cuda(), labels.cuda()
            logits = model(inputs)
            loss_meter.update(F.cross_entropy(logits, labels))
            acc1, acc5 = utils.accuracy(logits, labels, topk=(1, 5))
            top1_meter.update(acc1)
            top5_meter.update(acc5)
            pbar.set_postfix({
                'loss': loss_meter.avg.item(),
                'top1': 100. * top1_meter.avg.item(),
                'top5': 100. * top5_meter.avg.item()
            })
            pbar.update(1)

    # NOTE(review): `config` is not a parameter of this function — presumably a
    # module-level global in the original file; confirm before moving this code.
    early_stopping(loss_meter.avg.item(), model, ckpt_dir=config.path)
    if early_stopping.early_stop:
        print("Early stopping")
        utils.time(time.time() - start)
        # hard-exits the whole process (all threads) once patience is exhausted
        os._exit(0)

    writer.add_scalar('val/loss', loss_meter.avg, epoch)
    writer.add_scalar('val/top1', top1_meter.avg, epoch)
    writer.add_scalar('val/top5', top5_meter.avg, epoch)
    return top1_meter.avg
def train_hvd(train_loader, model, optimizer, epoch, config, writer, verbose, hvd):
    """Train `model` for one epoch under Horovod with gradient accumulation.

    Each loader batch is split into sub-batches of `config.batch_size`; the
    loss of every sub-batch is scaled by the number of sub-batches so the
    accumulated gradient equals the full-batch average before the single
    `optimizer.step()` (applied across all ranks by the Horovod optimizer).

    FIX: the top-5 meter was constructed as `utils.Metric('train_top1', hvd)`,
    so top-1 and top-5 shared the same allreduce name; it is now 'train_top5'.
    """
    train_loss = utils.Metric('train_loss', hvd)
    train_top1 = utils.Metric('train_top1', hvd)
    train_top5 = utils.Metric('train_top5', hvd)  # was mistakenly 'train_top1'
    model.train()
    with tqdm(total=len(train_loader),
              desc='Train Epoch #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            utils.adjust_learning_rate_hvd(epoch, batch_idx, config,
                                           train_loader, hvd.size(), optimizer)
            data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            # Split data into sub-batches of size batch_size
            for i in range(0, len(data), config.batch_size):
                data_batch = data[i:i + config.batch_size]
                target_batch = target[i:i + config.batch_size]
                output = model(data_batch)
                prec1, prec5 = utils.accuracy(output, target_batch, topk=(1, 5))
                train_top1.update(prec1)
                train_top5.update(prec5)
                loss = F.cross_entropy(output, target_batch)
                train_loss.update(loss)
                # Average gradients among sub-batches
                loss.div_(math.ceil(float(len(data)) / config.batch_size))
                loss.backward()
            # Gradient is applied across all ranks
            optimizer.step()
            t.set_postfix({
                'loss': train_loss.avg.item(),
                'top1': 100. * train_top1.avg.item(),
                'top5': 100. * train_top5.avg.item()
            })
            t.update(1)
    writer.add_scalar('train/loss', train_loss.avg, epoch)
    writer.add_scalar('train/top1', train_top1.avg, epoch)
    writer.add_scalar('train/top5', train_top5.avg, epoch)
def train_classify(self, net, loss_fn, args, trainLoader, valLoader):
    """Train `net` as a plain classifier for a single epoch over `trainLoader`.

    Uses Adam with `args.lr_siamese`; tracks running loss and accuracy via
    `utils.Metric` and shows them on a tqdm bar. Returns the trained net.

    NOTE(review): `valLoader` is accepted but never used here — kept for
    interface compatibility with callers. Removed dead locals from the
    original (`train_losses`, `time_start`, `queue`, `total_batch_id` were
    written but never read).
    """
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=args.lr_siamese)
    opt.zero_grad()
    # Single epoch; the original step-based epoch computation was commented out.
    epochs = 1
    metric = utils.Metric()
    for epoch in range(epochs):
        train_loss = 0
        metric.reset_acc()
        with tqdm(total=len(trainLoader), desc=f'Epoch {epoch + 1}/{epochs}') as t:
            for batch_id, (img, label) in enumerate(trainLoader, 1):
                # Variable is a deprecated no-op wrapper in modern PyTorch;
                # kept for compatibility with the rest of this codebase.
                if args.cuda:
                    img, label = Variable(img.cuda()), Variable(label.cuda())
                else:
                    img, label = Variable(img), Variable(label)
                net.train()
                opt.zero_grad()
                output = net.forward(img)
                metric.update_acc(output, label)
                loss = loss_fn(output, label)
                train_loss += loss.item()
                loss.backward()
                opt.step()
                t.set_postfix(loss=f'{train_loss / batch_id:.4f}',
                              train_acc=f'{metric.get_acc():.4f}')
                t.update()
    return net
def eval(model, dataset, args):
    """Evaluate `model` over every batch of `dataset` and print P/R/F1.

    Collects argmax tag predictions per batch, flattens them, and scores
    union tags, aspect terms, and opinion terms via `utils.Metric`.
    Restores train mode before returning (precision, recall, f1).

    NOTE(review): this function shadows the builtin `eval`; renaming would
    break existing callers, so the name is kept.
    """
    model.eval()
    with torch.no_grad():
        ids = []
        pred_batches = []
        label_batches = []
        length_batches = []
        sens_lengths = []
        ranges_all = []
        for batch_i in range(dataset.batch_count):
            (sentence_ids, tokens, lengths, masks, sens_lens,
             token_ranges, aspect_tags, tags) = dataset.get_batch(batch_i)
            batch_preds = torch.argmax(model(tokens, masks), dim=3)
            pred_batches.append(batch_preds)
            label_batches.append(tags)
            length_batches.append(lengths)
            sens_lengths.extend(sens_lens)
            ranges_all.extend(token_ranges)
            ids.extend(sentence_ids)

        flat_preds = torch.cat(pred_batches, dim=0).cpu().tolist()
        flat_labels = torch.cat(label_batches, dim=0).cpu().tolist()
        flat_lengths = torch.cat(length_batches, dim=0).cpu().tolist()

        metric = utils.Metric(args, flat_preds, flat_labels, flat_lengths,
                              sens_lengths, ranges_all, ignore_index=-1)
        precision, recall, f1 = metric.score_uniontags()
        aspect_results = metric.score_aspect()
        opinion_results = metric.score_opinion()
        print('Aspect term\tP:{:.5f}\tR:{:.5f}\tF1:{:.5f}'.format(
            aspect_results[0], aspect_results[1], aspect_results[2]))
        print('Opinion term\tP:{:.5f}\tR:{:.5f}\tF1:{:.5f}'.format(
            opinion_results[0], opinion_results[1], opinion_results[2]))
        print(args.task + '\tP:{:.5f}\tR:{:.5f}\tF1:{:.5f}\n'.format(
            precision, recall, f1))
    model.train()
    return precision, recall, f1
drop_hidden=0.1, initer_stddev=0.02, loss=ks.losses.SparseCategoricalCrossentropy(from_logits=True), metric=ks.metrics.SparseCategoricalCrossentropy(from_logits=True), num_epochs=2, num_heads=3, num_rounds=2, num_shards=2, optimizer=ks.optimizers.Adam(), width_dec=40, width_enc=50, ) params.update( loss=qu.Loss(), metric=qu.Metric(), ) def main(ps, fn, root=None, groups=None, count=None): qu.Config.runtime.is_training = True groups = groups or qs.groups for r in range(ps.num_rounds): for g in groups: print(f'\nRound {r + 1}, group {g}...\n=======================') fn(ps, qd.dset_for(ps, root, g, count=count), model_for(ps, g)) if __name__ == '__main__': ps = qu.Params(**params) root = f'/tmp/q/data/small'
def test(data, model, args, iteration, device, logger=None, num=None, plot=False):
    """Evaluate `model` on batches from `data.load()` and report mean loss.

    Per batch: computes two BCE segmentation losses (source/target masks) plus
    a weighted detection BCE term, accumulates the total loss, and feeds
    sigmoid-activated predictions into `utils.Metric`.  Optionally logs to
    TensorBoard and saves side-by-side image/GT/prediction plots.

    Returns (metric.final(), mean test loss).
    # assumes data.load() yields (Xs, Xt, Ys, Yt, labels) with tensor Xs..Yt
    # and array-like labels — TODO confirm against the dataset class.
    """
    model.eval()
    metric = utils.Metric()
    # metric_im = utils.Metric_image()
    loss_list = []
    if iteration is not None:
        print(f"{iteration}")
    for i, ret in enumerate(data.load()):
        Xs, Xt, Ys, Yt, labels = ret
        labels = torch.from_numpy(np.array(labels, dtype=np.float32)).to(device)
        Xs, Xt, Ys, Yt = (
            Xs.to(device),
            Xt.to(device),
            Ys.to(device),
            Yt.to(device),
        )
        preds, predt, pred_det = model(Xs, Xt)
        # Segmentation losses on raw logits for both directions.
        loss_p = BCE_loss(predt, Yt, with_logits=True)
        loss_q = BCE_loss(preds, Ys, with_logits=True)
        # Image-level detection loss, weighted by args.gamma below.
        loss_det = F.binary_cross_entropy_with_logits(pred_det.squeeze(), labels.squeeze())
        loss = loss_p + loss_q + args.gamma * loss_det
        loss_list.append(loss.data.cpu().numpy())
        print(f"{i}:")

        # Small helper: tensor -> numpy on CPU (redefined each iteration,
        # harmless but could live outside the loop).
        def fnp(x):
            return x.data.cpu().numpy()

        # Metric consumes probabilities, so apply sigmoid to the logits first.
        predt = torch.sigmoid(predt)
        preds = torch.sigmoid(preds)
        metric.update([fnp(Ys), fnp(Yt)], [fnp(preds), fnp(predt)])
        if logger:
            logger.add_scalar("test_loss/total", loss, iteration)
        if plot:
            plot_dir = Path("tmp_plot") / args.dataset
            plot_dir.mkdir(exist_ok=True, parents=True)
            # One figure per sample: rows = input / ground truth / prediction,
            # columns = target vs source.
            for ii in range(Xt.shape[0]):
                im1, im2 = torch_to_im(Xt[ii]), torch_to_im(Xs[ii])
                gt1, gt2 = torch_to_im(Yt[ii]), torch_to_im(Ys[ii])
                pred1, pred2 = torch_to_im(predt[ii]), torch_to_im(preds[ii])
                fig, axes = plt.subplots(nrows=3, ncols=2)
                axes[0, 0].imshow(im1)
                axes[0, 1].imshow(im2)
                axes[1, 0].imshow(gt1, cmap="jet")
                axes[1, 1].imshow(gt2, cmap="jet")
                axes[2, 0].imshow(pred1, cmap="jet")
                axes[2, 1].imshow(pred2, cmap="jet")
                fig.savefig(str(plot_dir / f"{i}_{ii}.jpg"))
                plt.close("all")
        # Optional cap on the number of evaluated batches.
        if num is not None and i >= num:
            break
    out = metric.final()
    test_loss = np.mean(loss_list)
    print(f"\ntest loss : {test_loss:.4f}\n")
    return out, test_loss
def train_fewshot(self, net, loss_fn, args, train_loader, val_loaders):
    """Train a two-branch (siamese) few-shot net with periodic validation.

    Optimizes `net.sm_net` at `args.lr_siamese` and `net.ft_net` at
    `args.lr_resnet` with Adam. Every `args.test_freq` epochs runs validation
    ('fewshot' or 'simple' mode; with the new split type, separate known /
    unknown loaders), checkpoints on improvement via `self.save_model`, and
    early-stops after `args.early_stopping` non-improving validations.

    Returns (net, best_model) where `best_model` is the last saved checkpoint
    path ('' if nothing was ever saved).

    FIX: corrected the error-message typo 'Unsupporeted' -> 'Unsupported'.
    """
    net.train()
    val_tol = args.early_stopping
    opt = torch.optim.Adam([{
        'params': net.sm_net.parameters()
    }, {
        'params': net.ft_net.parameters(),
        'lr': args.lr_resnet
    }], lr=args.lr_siamese)
    opt.zero_grad()
    train_losses = []
    time_start = time.time()
    # Rolling window of the last 20 validation accuracies (summed at the end).
    queue = deque(maxlen=20)
    epochs = args.epochs
    metric = utils.Metric()
    max_val_acc = 0
    max_val_acc_knwn = 0
    max_val_acc_unknwn = 0
    best_model = ''
    drew_graph = False
    val_counter = 0
    for epoch in range(epochs):
        train_loss = 0
        metric.reset_acc()
        with tqdm(total=len(train_loader), desc=f'Epoch {epoch + 1}/{args.epochs}') as t:
            for batch_id, (img1, img2, label) in enumerate(train_loader, 1):
                if args.cuda:
                    img1, img2, label = Variable(img1.cuda()), Variable(
                        img2.cuda()), Variable(label.cuda())
                else:
                    img1, img2, label = Variable(img1), Variable(
                        img2), Variable(label)
                # One-time TensorBoard graph dump on the first batch.
                if not drew_graph:
                    self.writer.add_graph(net, (img1, img2), verbose=True)
                    self.writer.flush()
                    drew_graph = True
                net.train()
                opt.zero_grad()
                output = net.forward(img1, img2)
                metric.update_acc(output, label)
                loss = loss_fn(output, label)
                train_loss += loss.item()
                loss.backward()
                opt.step()
                t.set_postfix(loss=f'{train_loss / batch_id:.4f}',
                              train_acc=f'{metric.get_acc():.4f}')
                train_losses.append(train_loss)
                t.update()
        self.writer.add_scalar('Train/Loss', train_loss / len(train_loader), epoch)
        self.writer.add_scalar('Train/Acc', metric.get_acc(), epoch)
        self.writer.flush()
        if val_loaders is not None and epoch % args.test_freq == 0:
            net.eval()
            val_acc_unknwn, val_acc_knwn = -1, -1
            if args.eval_mode == 'fewshot':
                if not self.new_split_type:
                    val_rgt, val_err, val_acc = self.test_fewshot(
                        args, net, val_loaders[0], loss_fn, val=True, epoch=epoch)
                else:
                    # New split type: validate known and unknown classes separately.
                    val_rgt_knwn, val_err_knwn, val_acc_knwn = self.test_fewshot(
                        args, net, val_loaders[0], loss_fn, val=True,
                        epoch=epoch, comment='known')
                    val_rgt_unknwn, val_err_unknwn, val_acc_unknwn = self.test_fewshot(
                        args, net, val_loaders[1], loss_fn, val=True,
                        epoch=epoch, comment='unknown')
            elif args.eval_mode == 'simple':
                # todo not compatible with new data-splits
                val_rgt, val_err, val_acc = self.test_simple(
                    args, net, val_loaders, loss_fn, val=True, epoch=epoch)
            else:
                raise Exception('Unsupported eval mode')
            if self.new_split_type:
                self.logger.info(
                    'known val acc: [%f], unknown val acc [%f]'
                    % (val_acc_knwn, val_acc_unknwn))
                self.logger.info('*' * 30)
                if val_acc_knwn > max_val_acc_knwn:
                    self.logger.info(
                        'known val acc: [%f], beats previous max [%f]'
                        % (val_acc_knwn, max_val_acc_knwn))
                    self.logger.info(
                        'known rights: [%d], known errs [%d]'
                        % (val_rgt_knwn, val_err_knwn))
                    max_val_acc_knwn = val_acc_knwn
                if val_acc_unknwn > max_val_acc_unknwn:
                    self.logger.info(
                        'unknown val acc: [%f], beats previous max [%f]'
                        % (val_acc_unknwn, max_val_acc_unknwn))
                    self.logger.info(
                        'unknown rights: [%d], unknown errs [%d]'
                        % (val_rgt_unknwn, val_err_unknwn))
                    max_val_acc_unknwn = val_acc_unknwn
                # Combined accuracy over known + unknown for model selection.
                val_acc = ((val_rgt_knwn + val_rgt_unknwn) * 1.0) / (
                    val_rgt_knwn + val_rgt_unknwn + val_err_knwn + val_err_unknwn)
                self.writer.add_scalar('Total_Val/Acc', val_acc, epoch)
                self.writer.flush()
                val_rgt = (val_rgt_knwn + val_rgt_unknwn)
                val_err = (val_err_knwn + val_err_unknwn)
            if val_acc > max_val_acc:
                val_counter = 0
                self.logger.info(
                    'saving model... current val acc: [%f], previous val acc [%f]'
                    % (val_acc, max_val_acc))
                best_model = self.save_model(args, net, epoch, val_acc)
                max_val_acc = val_acc
            else:
                val_counter += 1
                self.logger.info(
                    'Not saving, best val [%f], current was [%f]'
                    % (max_val_acc, val_acc))
                if val_counter >= val_tol:
                    # early stopping
                    self.logger.info(
                        '*** Early Stopping, validation acc did not exceed [%f] in %d val accuracies ***'
                        % (max_val_acc, val_tol))
                    break
            queue.append(val_rgt * 1.0 / (val_rgt + val_err))
        self._tb_draw_histograms(args, net, epoch)
    with open('train_losses', 'wb') as f:
        pickle.dump(train_losses, f)
    acc = 0.0
    for d in queue:
        acc += d
    print("#" * 70)
    print('queue len: ', len(queue))
    if args.project_tb:
        print("Start projecting")
        # self._tb_project_embeddings(args, net.ft_net, train_loader, 1000)
        print("Projecting done")
    return net, best_model