# training
tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True,
                   num_workers=4, drop_last=True, collate_fn=batchify)
val_ds = Corpus(data_config.validation, tokenizer.split_and_transform)
val_dl = DataLoader(val_ds, batch_size=model_config.batch_size, num_workers=4,
                    collate_fn=batchify)

loss_fn = nn.CrossEntropyLoss()
opt = optim.Adam(params=model.parameters(), lr=model_config.learning_rate)
scheduler = ReduceLROnPlateau(opt, patience=5)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

writer = SummaryWriter('{}/runs'.format(model_dir))
checkpoint_manager = CheckpointManager(model_dir)
summary_manager = SummaryManager(model_dir)
best_val_loss = 1e+10

for epoch in tqdm(range(model_config.epochs), desc='epochs'):
    tr_loss = 0
    tr_acc = 0
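# The `batchify` collate_fn used by the DataLoaders above is not shown in this
# snippet. Below is a minimal, hypothetical sketch of what it is assumed to do,
# given that each Corpus item appears to be a (query_a_indices, query_b_indices,
# label) triple and each mini-batch is later unpacked into three tensors.
# The padding value of 0 is an assumption.
import torch


def batchify(batch):
    """Pad both token-id sequences of each query pair and stack the labels."""
    queries_a, queries_b, labels = zip(*batch)
    qa = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(q) for q in queries_a], batch_first=True, padding_value=0)
    qb = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(q) for q in queries_b], batch_first=True, padding_value=0)
    return qa, qb, torch.tensor(labels)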
def main(args):
    dataset_config = Config(args.dataset_config)
    model_config = Config(args.model_config)

    exp_dir = Path("experiments") / model_config.type
    exp_dir = exp_dir.joinpath(
        f"epochs_{args.epochs}_batch_size_{args.batch_size}_learning_rate_{args.learning_rate}"
    )
    if not exp_dir.exists():
        exp_dir.mkdir(parents=True)

    if args.fix_seed:
        torch.manual_seed(777)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    tokenizer = get_tokenizer(dataset_config)
    tr_dl, val_dl = get_data_loaders(dataset_config, tokenizer, args.batch_size,
                                     collate_fn=batchify)

    model = SAN(num_classes=model_config.num_classes,
                lstm_hidden_dim=model_config.lstm_hidden_dim,
                hidden_dim=model_config.hidden_dim,
                da=model_config.da,
                r=model_config.r,
                vocab=tokenizer.vocab)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(model.parameters(), lr=args.learning_rate)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    writer = SummaryWriter(f"{exp_dir}/runs")
    checkpoint_manager = CheckpointManager(exp_dir)
    summary_manager = SummaryManager(exp_dir)
    best_val_loss = 1e10

    for epoch in tqdm(range(args.epochs), desc="epochs"):
        tr_loss = 0
        tr_acc = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc="steps", total=len(tr_dl)):
            qa_mb, qb_mb, y_mb = map(lambda elm: elm.to(device), mb)
            q_mb = (qa_mb, qb_mb)

            opt.zero_grad()
            y_hat_mb, qa_attn_mat, qb_attn_mat = model(q_mb)
            # attention regularization terms added to the classification loss
            a_reg = regularize(qa_attn_mat, model_config.r, device)
            b_reg = regularize(qb_attn_mat, model_config.r, device)
            mb_loss = loss_fn(y_hat_mb, y_mb)
            mb_loss.add_(a_reg)
            mb_loss.add_(b_reg)
            mb_loss.backward()
            opt.step()

            with torch.no_grad():
                mb_acc = acc(y_hat_mb, y_mb)

            tr_loss += mb_loss.item()
            tr_acc += mb_acc.item()

            if (epoch * len(tr_dl) + step) % args.summary_step == 0:
                val_loss = evaluate(model, val_dl, {"loss": loss_fn}, device)["loss"]
                writer.add_scalars("loss", {"train": tr_loss / (step + 1),
                                            "val": val_loss},
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= step + 1
            tr_acc /= step + 1

            tr_summary = {"loss": tr_loss, "acc": tr_acc}
            val_summary = evaluate(model, val_dl, {"loss": loss_fn, "acc": acc}, device)
            tqdm.write(
                f"epoch: {epoch+1}\n"
                f"tr_loss: {tr_summary['loss']:.3f}, val_loss: {val_summary['loss']:.3f}\n"
                f"tr_acc: {tr_summary['acc']:.2%}, val_acc: {val_summary['acc']:.2%}"
            )

            val_loss = val_summary["loss"]
            is_best = val_loss < best_val_loss

            if is_best:
                state = {
                    "epoch": epoch + 1,
                    "model_state_dict": model.state_dict(),
                    "opt_state_dict": opt.state_dict(),
                }
                summary = {"train": tr_summary, "validation": val_summary}

                summary_manager.update(summary)
                summary_manager.save("summary.json")
                checkpoint_manager.save_checkpoint(state, "best.tar")

                best_val_loss = val_loss
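# `regularize` is not defined in this script. A plausible sketch, assuming it is
# the Frobenius-norm penalty ||A·Aᵀ − I||_F² from "A Structured Self-Attentive
# Sentence Embedding" (Lin et al., 2017), averaged over the batch. The attention
# matrix is assumed to have shape (batch, r, seq_len); this is a hypothetical
# reconstruction, not necessarily the repository's implementation.
import torch


def regularize(attn_mat, r, device):
    """Penalize attention rows that are far from mutually orthogonal."""
    identity = torch.eye(r, device=device)
    gram = torch.bmm(attn_mat, attn_mat.transpose(1, 2))       # (batch, r, r)
    penalty = torch.norm(gram - identity, dim=(1, 2)) ** 2     # squared Frobenius norm per example
    return penalty.mean()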
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs)

    # model
    num_classes = params['model'].get('num_classes')
    lstm_hidden_dim = params['model'].get('lstm_hidden_dim')
    hidden_dim = params['model'].get('hidden_dim')
    da = params['model'].get('da')
    r = params['model'].get('r')
    model = SAN(num_classes=num_classes, lstm_hidden_dim=lstm_hidden_dim,
                hidden_dim=hidden_dim, da=da, r=r, vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')

    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True,
                       num_workers=4, drop_last=True, collate_fn=batchify)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4,
                        collate_fn=batchify)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))

    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            queries_a_mb, queries_b_mb, y_mb = map(lambda elm: elm.to(device), mb)
            queries_mb = (queries_a_mb, queries_b_mb)

            opt.zero_grad()
            score, queries_a_attn_mat, queries_b_attn_mat = model(queries_mb)
            a_reg = regularize(queries_a_attn_mat, r, device)
            b_reg = regularize(queries_b_attn_mat, r, device)
            mb_loss = loss_fn(score, y_mb)
            mb_loss.add_(a_reg)
            mb_loss.add_(b_reg)
            mb_loss.backward()
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {'train': tr_loss / (step + 1),
                                            'validation': val_loss},
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= (step + 1)

            val_loss = evaluate(model, val_dl, loss_fn, device)
            scheduler.step(val_loss)
            tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(
                epoch + 1, tr_loss, val_loss))

    ckpt = {
        'model_state_dict': model.state_dict(),
        'opt_state_dict': opt.state_dict()
    }
    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)
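# The `evaluate` helper called above is not shown. A minimal sketch consistent
# with this version's call signature (a single loss_fn, returning the average
# validation loss as a float); the batch layout is assumed to match the training
# loop, and the model is assumed to return (score, attn_a, attn_b).
import torch


def evaluate(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for mb in data_loader:
            queries_a_mb, queries_b_mb, y_mb = map(lambda elm: elm.to(device), mb)
            score, _, _ = model((queries_a_mb, queries_b_mb))
            total_loss += loss_fn(score, y_mb).item()
    return total_loss / len(data_loader)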
def main(args):
    dataset_config = Config(args.dataset_config)
    model_config = Config(args.model_config)

    exp_dir = Path("experiments") / model_config.type
    exp_dir = exp_dir.joinpath(
        f"epochs_{args.epochs}_batch_size_{args.batch_size}_learning_rate_{args.learning_rate}"
    )
    if not exp_dir.exists():
        exp_dir.mkdir(parents=True)

    if args.fix_seed:
        torch.manual_seed(777)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    preprocessor = get_preprocessor(dataset_config, coarse_split_fn=split_morphs,
                                    fine_split_fn=split_jamos)
    tr_dl, val_dl = get_data_loaders(dataset_config, preprocessor, args.batch_size,
                                     collate_fn=batchify)

    # model
    model = SAN(model_config.num_classes, preprocessor.coarse_vocab,
                preprocessor.fine_vocab, model_config.fine_embedding_dim,
                model_config.hidden_dim, model_config.multi_step,
                model_config.prediction_drop_ratio)
    opt = optim.Adam(model.parameters(), lr=args.learning_rate)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    writer = SummaryWriter(f"{exp_dir}/runs")
    checkpoint_manager = CheckpointManager(exp_dir)
    summary_manager = SummaryManager(exp_dir)
    best_val_loss = 1e10

    for epoch in tqdm(range(args.epochs), desc="epochs"):
        tr_loss = 0
        tr_acc = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc="steps", total=len(tr_dl)):
            qa_mb, qb_mb, y_mb = map(
                lambda elm: (el.to(device) for el in elm)
                if isinstance(elm, tuple) else elm.to(device), mb)
            opt.zero_grad()
            y_hat_mb = model((qa_mb, qb_mb))
            mb_loss = log_loss(y_hat_mb, y_mb)
            mb_loss.backward()
            opt.step()

            with torch.no_grad():
                mb_acc = acc(y_hat_mb, y_mb)

            tr_loss += mb_loss.item()
            tr_acc += mb_acc.item()

            if (epoch * len(tr_dl) + step) % args.summary_step == 0:
                val_loss = evaluate(model, val_dl, {"loss": log_loss}, device)["loss"]
                writer.add_scalars("loss", {"train": tr_loss / (step + 1),
                                            "val": val_loss},
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= step + 1
            tr_acc /= step + 1

            tr_summary = {"loss": tr_loss, "acc": tr_acc}
            val_summary = evaluate(model, val_dl, {"loss": log_loss, "acc": acc}, device)
            tqdm.write(
                f"epoch: {epoch+1}\n"
                f"tr_loss: {tr_summary['loss']:.3f}, val_loss: {val_summary['loss']:.3f}\n"
                f"tr_acc: {tr_summary['acc']:.2%}, val_acc: {val_summary['acc']:.2%}"
            )

            val_loss = val_summary["loss"]
            is_best = val_loss < best_val_loss

            if is_best:
                state = {
                    "epoch": epoch + 1,
                    "model_state_dict": model.state_dict(),
                    "opt_state_dict": opt.state_dict(),
                }
                summary = {"train": tr_summary, "validation": val_summary}

                summary_manager.update(summary)
                summary_manager.save("summary.json")
                checkpoint_manager.save_checkpoint(state, "best.tar")

                best_val_loss = val_loss
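# `acc` and the dict-based `evaluate` used in this version are not shown.
# Minimal sketches, assuming `y_hat` holds per-class scores of shape
# (batch, num_classes), the model here returns only those scores, and each
# batch element is either a tensor or a tuple of tensors (coarse/fine ids).
# Both helpers are hypothetical reconstructions, not the repository's actual code.
import torch


def acc(y_hat, y):
    """Fraction of correctly classified examples in a mini-batch."""
    with torch.no_grad():
        return (y_hat.argmax(dim=-1) == y).float().mean()


def evaluate(model, data_loader, metric_fns, device):
    """Average each metric in `metric_fns` (name -> fn) over the loader."""
    model.eval()
    totals = {name: 0.0 for name in metric_fns}
    with torch.no_grad():
        for mb in data_loader:
            qa_mb, qb_mb, y_mb = map(
                lambda elm: tuple(el.to(device) for el in elm)
                if isinstance(elm, tuple) else elm.to(device), mb)
            y_hat_mb = model((qa_mb, qb_mb))
            for name, fn in metric_fns.items():
                totals[name] += fn(y_hat_mb, y_mb).item()
    return {name: total / len(data_loader) for name, total in totals.items()}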