def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    token_vocab_path = params['filepath'].get('token_vocab')
    label_vocab_path = params['filepath'].get('label_vocab')
    with open(token_vocab_path, mode='rb') as io:
        token_vocab = pickle.load(io)
    with open(label_vocab_path, mode='rb') as io:
        label_vocab = pickle.load(io)
    token_tokenizer = Tokenizer(token_vocab, split_to_self)
    label_tokenizer = Tokenizer(label_vocab, split_to_self)

    # model (restore)
    save_path = cwd / params['filepath'].get('ckpt')
    ckpt = torch.load(save_path)
    lstm_hidden_dim = params['model'].get('lstm_hidden_dim')
    model = BilstmCRF(label_vocab, token_vocab, lstm_hidden_dim)
    model.load_state_dict(ckpt['model_state_dict'])

    # evaluation
    batch_size = params['training'].get('batch_size')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, token_tokenizer.split_and_transform, label_tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)
    val_ds = Corpus(val_path, token_tokenizer.split_and_transform, label_tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    tr_acc = get_accuracy(model, tr_dl, device)
    val_acc = get_accuracy(model, val_dl, device)

    print('tr_acc: {:.2%}, val_acc: {:.2%}'.format(tr_acc, val_acc))
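# Most of the scripts in this section expose main(json_path); the original entry point is
# not shown here. A minimal, assumed wrapper could look like the following (the flag name
# and its "required" setting are illustrative only, not the repository's actual CLI).
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--json_path', required=True,
                        help='path to the experiment config (json), relative to the working directory')
    args = parser.parse_args()
    main(args.json_path)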
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo)

    # model (restore)
    save_path = cwd / params['filepath'].get('ckpt')
    ckpt = torch.load(save_path)
    num_classes = params['model'].get('num_classes')
    embedding_dim = params['model'].get('embedding_dim')
    hidden_dim = params['model'].get('hidden_dim')
    model = ConvRec(num_classes=num_classes, embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                    vocab=tokenizer.vocab)
    model.load_state_dict(ckpt['model_state_dict'])

    # evaluation
    batch_size = params['training'].get('batch_size')
    min_length = params['training'].get('min_length')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tst_path = cwd / params['filepath'].get('tst')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform, min_length, tokenizer.vocab.to_indices(' '))
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)
    val_ds = Corpus(val_path, tokenizer.split_and_transform, min_length, tokenizer.vocab.to_indices(' '))
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)
    tst_ds = Corpus(tst_path, tokenizer.split_and_transform, min_length, tokenizer.vocab.to_indices(' '))
    tst_dl = DataLoader(tst_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    tr_acc = get_accuracy(model, tr_dl, device)
    val_acc = get_accuracy(model, val_dl, device)
    tst_acc = get_accuracy(model, tst_dl, device)

    print('tr_acc: {:.2%}, val_acc: {:.2%}, tst_acc: {:.2%}'.format(tr_acc, val_acc, tst_acc))
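# A minimal sketch of the `split_to_jamo` split_fn assumed above: decompose each composed
# Hangul syllable (U+AC00..U+D7A3) into its leading consonant, vowel, and optional trailing
# consonant via the standard Unicode arithmetic. The helper actually used in this repo (and
# its jamo inventory or handling of non-Hangul characters) may differ; this is illustrative.
CHOSUNG = list('ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')
JUNGSUNG = list('ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')
JONGSUNG = [''] + list('ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ')


def split_to_jamo(string):
    """Splits a string into a list of jamo characters."""
    jamos = []
    for char in string:
        code = ord(char) - 0xAC00
        if 0 <= code < 11172:  # composed Hangul syllable block (19 * 21 * 28 syllables)
            jamos.append(CHOSUNG[code // 588])
            jamos.append(JUNGSUNG[(code % 588) // 28])
            if code % 28:  # trailing consonant is optional
                jamos.append(JONGSUNG[code % 28])
        else:  # pass non-Hangul characters (spaces, digits, latin) through unchanged
            jamos.append(char)
    return jamos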
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs)

    # model (restore)
    save_path = cwd / params['filepath'].get('ckpt')
    ckpt = torch.load(save_path)
    num_classes = params['model'].get('num_classes')
    lstm_hidden_dim = params['model'].get('lstm_hidden_dim')
    hidden_dim = params['model'].get('hidden_dim')
    da = params['model'].get('da')
    r = params['model'].get('r')
    model = SAN(num_classes=num_classes, lstm_hidden_dim=lstm_hidden_dim, hidden_dim=hidden_dim,
                da=da, r=r, vocab=tokenizer.vocab)
    model.load_state_dict(ckpt['model_state_dict'])

    # evaluation
    batch_size = params['training'].get('batch_size')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    tr_acc = get_accuracy(model, tr_dl, device)
    val_acc = get_accuracy(model, val_dl, device)

    print('tr_acc: {:.2%}, val_acc: {:.2%}'.format(tr_acc, val_acc))
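# A rough sketch of the Tokenizer interface the scripts above assume: a split_fn (e.g.
# MeCab().morphs or split_to_jamo) produces string tokens, the Vocab maps them to indices,
# and an optional pad_fn fixes the sequence length. Method names mirror the calls above,
# but the actual class in this repo may differ in detail.
class Tokenizer:
    def __init__(self, vocab, split_fn, pad_fn=None):
        self.vocab = vocab
        self._split = split_fn
        self._pad = pad_fn

    def split(self, string):
        return self._split(string)

    def transform(self, tokens):
        indices = self.vocab.to_indices(tokens)
        return self._pad(indices) if self._pad else indices

    def split_and_transform(self, string):
        return self.transform(self.split(string))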
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    length = params['padder'].get('length')
    padder = PadSequence(length=length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=padder)

    # model (restore)
    save_path = cwd / params['filepath'].get('ckpt')
    ckpt = torch.load(save_path)
    num_classes = params['model'].get('num_classes')
    model = SenCNN(num_classes=num_classes, vocab=tokenizer.vocab)
    model.load_state_dict(ckpt['model_state_dict'])

    # evaluation
    batch_size = params['training'].get('batch_size')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tst_path = cwd / params['filepath'].get('tst')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, num_workers=4)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)
    tst_ds = Corpus(tst_path, tokenizer.split_and_transform)
    tst_dl = DataLoader(tst_ds, batch_size=batch_size, num_workers=4)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    tr_acc = get_accuracy(model, tr_dl, device)
    val_acc = get_accuracy(model, val_dl, device)
    tst_acc = get_accuracy(model, tst_dl, device)

    print('tr_acc: {:.2%}, val_acc: {:.2%}, tst_acc: {:.2%}'.format(tr_acc, val_acc, tst_acc))
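# A minimal sketch of the `get_accuracy` helper the evaluation scripts rely on: run the
# model over a DataLoader with gradients disabled and count argmax hits. It assumes each
# batch is a plain (inputs, labels) pair and that the model returns raw class scores; the
# actual helper (and batch layout) in this repo may differ.
import torch


def get_accuracy(model, data_loader, device):
    correct, total = 0, 0
    model.eval()
    with torch.no_grad():
        for x_mb, y_mb in data_loader:
            x_mb, y_mb = x_mb.to(device), y_mb.to(device)
            y_hat = model(x_mb).argmax(dim=-1)
            correct += (y_hat == y_mb).sum().item()
            total += y_mb.size(0)
    return correct / total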
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo)

    # model
    num_classes = params['model'].get('num_classes')
    embedding_dim = params['model'].get('embedding_dim')
    hidden_dim = params['model'].get('hidden_dim')
    model = ConvRec(num_classes=num_classes, embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                    vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')
    min_length = params['training'].get('min_length')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform, min_length, tokenizer.vocab.to_indices(' '))
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True,
                       collate_fn=batchify)
    val_ds = Corpus(val_path, tokenizer.split_and_transform, min_length, tokenizer.vocab.to_indices(' '))
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))
    for epoch in tqdm(range(epochs), desc='epochs'):

        tr_loss = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            x_mb, y_mb, _ = map(lambda elm: elm.to(device), mb)
            opt.zero_grad()
            mb_loss = loss_fn(model(x_mb), y_mb)
            mb_loss.backward()
            clip_grad_norm_(model.parameters(), 5)
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {'train': tr_loss / (step + 1), 'validation': val_loss},
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_dl, loss_fn, device)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt = {'model_state_dict': model.state_dict(),
            'opt_state_dict': opt.state_dict()}
    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)
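# A minimal sketch of the `evaluate` helper called during training above: average the loss
# over the validation DataLoader with gradients disabled. The three-element batch unpacking
# mirrors the training loop above; the real helper in this repo may differ.
import torch


def evaluate(model, data_loader, loss_fn, device):
    model.eval()
    avg_loss = 0
    with torch.no_grad():
        for step, mb in enumerate(data_loader):
            x_mb, y_mb, _ = map(lambda elm: elm.to(device), mb)
            avg_loss += loss_fn(model(x_mb), y_mb).item()
    return avg_loss / (step + 1)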
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epoch', default=10, type=int)
    parser.add_argument('--batch_size', default=128, type=int)
    # parser.add_argument('--data_type', default='senCNN')
    parser.add_argument('--classes', default=2, type=int)
    parser.add_argument('--gpu', default=0, type=int)
    parser.add_argument('--learning_rate', default=1e-3, type=float)
    # parser.add_argument('--print_freq', default=3000, type=int)
    # parser.add_argument('--weight_decay', default=5e-5, type=float)
    parser.add_argument('--word_dim', default=16, type=int)
    parser.add_argument('--word_max_len', default=300, type=int)
    parser.add_argument('--global_step', default=1000, type=int)
    parser.add_argument('--data_path', default='../data_in')
    parser.add_argument('--file_path', default='../nsmc-master')
    # parser.add_argument('--build_preprocessing', default=False)
    # parser.add_argument('--build_vocab', default=False)
    args = parser.parse_args()

    # p = Preprocessing(args)
    # p.makeProcessing()
    # v = Build_Vocab(args)
    # v.make_vocab()

    with open(args.data_path + '/' + 'vocab_char.pkl', mode='rb') as io:
        vocab = pickle.load(io)

    padder = PadSequence(length=args.word_max_len, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=padder)
    model = EfficientCharCRNN(args, vocab)

    epochs = args.epoch
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    global_step = args.global_step

    tr_ds = Corpus(args.data_path + '/train.txt', tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(args.data_path + '/val.txt', tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    best_val_loss = 1e+10

    for epoch in tqdm(range(args.epoch), desc='epochs'):

        tr_loss = 0
        tr_acc = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            x, y = map(lambda elm: elm.to(device), mb)
            opt.zero_grad()
            y_h = model(x)
            m_loss = loss_fn(y_h, y)
            m_loss.backward()
            clip_grad_norm_(model._fc.weight, 5)
            opt.step()

            with torch.no_grad():
                m_acc = acc(y_h, y)

            tr_loss += m_loss.item()
            tr_acc += m_acc.item()
        else:
            tr_loss /= (step + 1)
            tr_acc /= (step + 1)

        tr_summ = {'loss': tr_loss, 'acc': tr_acc}
        val_summ = evaluate(model, val_dl, {'loss': loss_fn, 'acc': acc}, device)
        scheduler.step(val_summ['loss'])
        tqdm.write('epoch : {}, tr_loss: {:.3f}, val_loss: {:.3f}, tr_acc: {:.2%}, val_acc: {:.2%}'.format(
            epoch + 1, tr_summ['loss'], val_summ['loss'], tr_summ['acc'], val_summ['acc']))

        val_loss = val_summ['loss']
        is_best = val_loss < best_val_loss

        if is_best:
            state = {'epoch': epoch + 1,
                     'model_state_dict': model.state_dict(),
                     'opt_state_dict': opt.state_dict()}
            summary = {'tr': tr_summ, 'val': val_summ}

            # manager.update_summary(summary)
            # manager.save_summary('summary.json')
            # manager.save_checkpoint(state, 'best.tar')

            best_val_loss = val_loss
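# A minimal sketch of the `acc` metric used in the loop above: the fraction of argmax
# predictions matching the labels, returned as a tensor so that .item() works as shown.
# The metric actually defined in the original code may differ.
import torch


def acc(y_hat, y):
    with torch.no_grad():
        return (y_hat.argmax(dim=-1) == y).float().mean()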
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs)

    # model
    num_classes = params['model'].get('num_classes')
    lstm_hidden_dim = params['model'].get('lstm_hidden_dim')
    hidden_dim = params['model'].get('hidden_dim')
    da = params['model'].get('da')
    r = params['model'].get('r')
    model = SAN(num_classes=num_classes, lstm_hidden_dim=lstm_hidden_dim, hidden_dim=hidden_dim,
                da=da, r=r, vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True,
                       collate_fn=batchify)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))
    for epoch in tqdm(range(epochs), desc='epochs'):

        tr_loss = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            queries_a_mb, queries_b_mb, y_mb = map(lambda elm: elm.to(device), mb)
            queries_mb = (queries_a_mb, queries_b_mb)
            opt.zero_grad()
            score, queries_a_attn_mat, queries_b_attn_mat = model(queries_mb)
            a_reg = regularize(queries_a_attn_mat, r, device)
            b_reg = regularize(queries_b_attn_mat, r, device)
            mb_loss = loss_fn(score, y_mb)
            mb_loss.add_(a_reg)
            mb_loss.add_(b_reg)
            mb_loss.backward()
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {'train': tr_loss / (step + 1), 'validation': val_loss},
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_dl, loss_fn, device)
        scheduler.step(val_loss)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt = {'model_state_dict': model.state_dict(),
            'opt_state_dict': opt.state_dict()}
    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)
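# A minimal sketch of the `regularize` penalty used above, following the Frobenius-norm
# term ||A A^T - I||_F^2 from the structured self-attention formulation the SAN model
# appears to be based on. It assumes attn_mat has shape (batch, r, seq_len); averaging the
# penalty over the batch is an assumption and may differ from the original implementation.
import torch


def regularize(attn_mat, r, device):
    identity = torch.eye(r, device=device)
    gram = torch.bmm(attn_mat, attn_mat.transpose(1, 2))  # (batch, r, r)
    return ((gram - identity) ** 2).sum(dim=(1, 2)).mean()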
# NOTE: the original snippet starts mid-file; the parser construction and the --data_dir
# argument below are assumed from the surrounding usage (args.data_dir, args.model_dir).
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', default='data', help="Directory containing config.json of data")
parser.add_argument('--model_dir', default='experiments/base_model',
                    help="Directory containing config.json of model")


if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    with open('data/vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)
    # pad with the padding token's index, matching the other scripts in this section
    padding = PadSequence(model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=padding)

    # model
    model = VDCNN(num_classes=model_config.num_classes, embedding_dim=model_config.embedding_dim,
                  k_max=model_config.k_max, vocab=tokenizer.vocab)  # pass the vocab instance, not the Vocab class

    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(data_config.validation, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=model_config.batch_size, num_workers=4)  # completed to mirror tr_dl; the original snippet ends mid-call
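# A minimal sketch of the `PadSequence` pad_fn used above: truncate or right-pad a list of
# indices to a fixed length. The real class may expose more options (e.g. padding side);
# this version only mirrors the constructor arguments seen in the scripts above.
class PadSequence:
    def __init__(self, length, pad_val=0):
        self.length = length
        self.pad_val = pad_val

    def __call__(self, indices):
        if len(indices) >= self.length:
            return indices[:self.length]
        return indices + [self.pad_val] * (self.length - len(indices))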