def build_vocab(args, tokenizer):
    """Build a vocabulary from the training file.

    Reads the TSV at ``args.train_path``, tokenizes the first column of every
    row with ``tokenizer`` (applied to the project's ``load_sent`` output), and
    counts token frequencies.

    Returns:
        tuple: ``(words, word2idx)`` where ``words`` is the special tokens
        ``<pad>``, ``<unk>``, ``<bos>``, ``<eos>`` followed by all observed
        tokens in sorted order, and ``word2idx`` maps each word to its index.
    """
    vocab = collections.Counter()
    df = pd.read_csv(args.train_path, sep="\t")
    # NOTE(review): row[0] is label-based access on a Series; presumably the
    # first column's label is 0 (header-less data) — confirm against the data.
    for _, row in df.iterrows():
        tokens = tokenizer(load_sent(row[0], -1))
        vocab.update(tokens)
    # Special tokens come first so they get stable, low indices.
    words = ['<pad>', '<unk>', '<bos>', '<eos>'] + sorted(vocab)
    return (words, {w: i for i, w in enumerate(words)})
def main(args):
    """Train a DAE/VAE/AAE autoencoder and checkpoint the best model.

    Loads train/valid sentences, builds a vocabulary, trains for
    ``args.epochs`` epochs, logs running averages every ``args.log_interval``
    batches, and saves ``model.pt`` whenever validation loss improves.
    All progress is appended to ``<save_dir>/log.txt`` via the project's
    ``logging`` helper (which shadows the stdlib name).
    """
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    log_file = os.path.join(args.save_dir, 'log.txt')
    logging(str(args), log_file)

    # Prepare data
    train_sents = load_sent(args.train)
    logging('# train sents {}, tokens {}'.format(
        len(train_sents), sum(len(s) for s in train_sents)), log_file)
    valid_sents = load_sent(args.valid)
    logging('# valid sents {}, tokens {}'.format(
        len(valid_sents), sum(len(s) for s in valid_sents)), log_file)

    # Rebuild the vocab from the training data on every run.
    vocab_file = os.path.join(args.save_dir, 'vocab.txt')
    Vocab.build(train_sents, vocab_file, args.vocab_size)
    vocab = Vocab(vocab_file)
    logging('# vocab size {}'.format(vocab.size), log_file)

    set_seed(args.seed)
    cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if cuda else 'cpu')
    # Dispatch on model type; KeyError here means an unknown --model-type.
    model = {'dae': DAE, 'vae': VAE, 'aae': AAE
             }[args.model_type](vocab, args).to(device)
    if args.load_model:
        ckpt = torch.load(args.load_model)
        model.load_state_dict(ckpt['model'])
        model.flatten()
    logging('# model parameters: {}'.format(
        sum(x.data.nelement() for x in model.parameters())), log_file)

    train_batches, _ = get_batches(train_sents, vocab, args.batch_size, device)
    valid_batches, _ = get_batches(valid_sents, vocab, args.batch_size, device)
    best_val_loss = None
    for epoch in range(args.epochs):
        start_time = time.time()
        logging('-' * 80, log_file)
        model.train()
        # One running-average meter per loss component, created on demand.
        meters = collections.defaultdict(AverageMeter)

        # Shuffle batch order each epoch.
        indices = list(range(len(train_batches)))
        random.shuffle(indices)
        for i, idx in enumerate(indices):
            inputs, targets = train_batches[idx]
            losses = model.autoenc(inputs, targets, is_train=True)
            losses['loss'] = model.loss(losses)
            model.step(losses)
            for k, v in losses.items():
                meters[k].update(v.item())
            if (i + 1) % args.log_interval == 0:
                log_output = '| epoch {:3d} | {:5d}/{:5d} batches |'.format(
                    epoch + 1, i + 1, len(indices))
                for k, meter in meters.items():
                    log_output += ' {} {:.2f},'.format(k, meter.avg)
                    # Reset so each report covers only the last interval.
                    meter.clear()
                logging(log_output, log_file)

        valid_meters = evaluate(model, valid_batches)
        logging('-' * 80, log_file)
        log_output = '| end of epoch {:3d} | time {:5.0f}s | valid'.format(
            epoch + 1, time.time() - start_time)
        for k, meter in valid_meters.items():
            log_output += ' {} {:.2f},'.format(k, meter.avg)
        # `is None` (not truthiness): a valid loss of exactly 0.0 must still
        # count as a recorded best.
        if best_val_loss is None or valid_meters['loss'].avg < best_val_loss:
            log_output += ' | saving model'
            ckpt = {'args': args, 'model': model.state_dict()}
            torch.save(ckpt, os.path.join(args.save_dir, 'model.pt'))
            best_val_loss = valid_meters['loss'].avg
        logging(log_output, log_file)
    logging('Done training', log_file)
def main(args):
    """Run evaluation and/or text generation with a trained checkpoint.

    Depending on the flags in ``args`` this will:
      * ``--eval``:   score a dataset with a PyTorch Lightning trainer;
      * ``--sample``: generate ``args.sample`` sentences from scratch;
      * ``--fill``:   fill in blanks of partially specified sentences.

    Outputs are written under ``<checkpoint_grandparent>/outputs/<args.output>``.
    NOTE(review): ``device`` is not defined in this function — presumably a
    module-level global; confirm it is set before calling.
    """
    pl.seed_everything(args.seed)
    model = load_model(args.checkpoint).to(device)
    model.eval()
    # Vocab file lives next to the training run's root dir.
    vocab = Vocab(os.path.join(model.hparams.root_dir, 'vocab.txt'))

    if args.eval:
        data = load_data(args.eval, model.hparams.add_eos,
                         model.hparams.cat_sent, model.hparams.max_len)
        dl = get_eval_dataloader(
            data, vocab, args.max_tok,
            data_workers=args.data_workers,
            model_type=model.hparams.model_type)
        trainer = pl.Trainer(
            gpus=args.gpus,
            amp_level=args.fp16_opt_level,
            precision=16 if args.fp16 else 32,
            default_root_dir='testing_logs')
        # n_mc is read by the model during test; set before trainer.test.
        model.hparams.n_mc = args.n_mc
        trainer.test(model, test_dataloaders=dl)

    if args.output:
        output = os.path.join(
            os.path.dirname(os.path.dirname(args.checkpoint)),
            'outputs/', args.output)
        makedir(output)

    if args.sample:
        # Unconditional generation: start from an empty canvas.
        with open(output, 'w') as f:
            for i in tqdm(range(args.sample)):
                # 'inst' (insertion) models take (seq, blanks); others take
                # an initial canvas — the generate() arity differs by type.
                if model.hparams.model_type == 'inst':
                    _, full = model.generate([], [0], args.decode, device)
                else:
                    _, full = model.generate([model.init_canvas()],
                                             args.decode, device)
                full = [[vocab.idx2word[id] for id in ids] for ids in full]
                write(f, full, args.write_mid)

    if args.fill:
        sents = load_sent(args.fill, model.hparams.add_eos)
        sents = [[vocab.word_to_idx(w) for w in s] for s in sents]
        # Two parallel outputs: '.fill' holds only the filled-in pieces,
        # '.full' holds the complete sentences.
        with open(output + '.fill', 'w') as f_fill:
            with open(output + '.full', 'w') as f_full:
                for s in tqdm(sents):
                    if model.hparams.model_type == 'inst':
                        # Split the input into the visible tokens (seq) and
                        # the positions of blank markers (blanks).
                        seq, blanks = [], []
                        for w in s:
                            if w == vocab.blank:
                                blanks.append(len(seq))
                            else:
                                seq.append(w)
                        if args.anywhere:
                            # Allow insertion at every position, not just
                            # where the input had blanks.
                            blanks = list(range(len(seq) + 1))
                        fill, full = model.generate(
                            seq, blanks, args.decode, device,
                            args.force_insert, args.prioritize_unfilled)
                    else:
                        fill, full = model.generate(s, args.decode, device)
                    fill = [[vocab.idx2word[id] for id in ids] for ids in fill]
                    full = [[vocab.idx2word[id] for id in ids] for ids in full]
                    write(f_fill, fill, args.write_mid)
                    write(f_full, full, args.write_mid)