def main():
    # prepare corpus
    corpus = Corpus(args.data_file, args.dict_file, vocab_size=args.vocab_size)

    # dump the vocabulary
    with open(os.path.join(out_dir, 'vocab.json'), 'w') as f:
        json.dump(corpus.dictionary.word2idx, f)

    # save arguments
    ntokens = len(corpus.dictionary.word2idx)
    args.ntokens = ntokens
    with open(os.path.join(out_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    log.info('[Data Loaded.]')

    autoencoder = AutoEncoder()

    if args.split:
        train, valid = corpus.get_data(split=args.split)
        valid = batchify(valid, args.batch_size, shuffle=False)
    else:
        train = corpus.get_data()

    for epoch in range(1, args.epochs + 1):
        # reshuffle the training data at the start of each epoch
        batches = batchify(train, args.batch_size, shuffle=True)
        global_iters = 0
        start_time = datetime.now()
        for i, batch in enumerate(batches):
            loss = autoencoder.update(batch)
            if i % args.log_interval == 0 and i > 0:
                log.info('[Epoch {} {}/{} Loss {:.5f} ETA {}]'.format(
                    epoch, i, len(batches), loss,
                    str((datetime.now() - start_time) / (i + 1) *
                        (len(batches) - i - 1)).split('.')[0]))
            global_iters += 1
            if global_iters % 100 == 0:
                autoencoder.anneal()

        if args.split:
            word_acc, sent_acc = autoencoder.evaluate(valid)
            msg = 'Epoch {} word acc: {} | sent acc: {}'.format(
                epoch, word_acc, sent_acc)
            log.warn(msg)

        autoencoder.save(out_dir, 'autoencoder_model_{}.pt'.format(epoch))
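# The training loop above relies on a batchify helper that is not shown in this
# excerpt. Below is a minimal, hypothetical sketch of one plausible
# implementation that returns (source, target, lengths) batches padded with
# index 0; the repo's own helper may also add <sos>/<eos> markers or sort
# sentences by length.
import random
import torch


def batchify(data, bsz, shuffle=False):
    # data: list of sentences, each a list of word ids
    if shuffle:
        random.shuffle(data)
    batches = []
    for i in range(0, len(data), bsz):
        chunk = data[i:i + bsz]
        lengths = torch.LongTensor([len(s) for s in chunk])
        max_len = int(lengths.max())
        # pad every sentence in the batch to the longest one
        source = torch.zeros(len(chunk), max_len).long()
        target = torch.zeros(len(chunk), max_len).long()
        for j, sent in enumerate(chunk):
            source[j, :len(sent)] = torch.LongTensor(sent)
            target[j, :len(sent)] = torch.LongTensor(sent)
        batches.append((source, target, lengths))
    return batches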
def main():
    # prepare corpus
    ae_args = json.load(open(args.ae_args))
    corpus = Corpus(args.data_file, args.dict_file,
                    vocab_size=ae_args['vocab_size'])

    # rebuild the autoencoder with the saved hyperparameters and load weights
    autoencoder = Seq2Seq(emsize=ae_args['emsize'],
                          nhidden=ae_args['nhidden'],
                          ntokens=ae_args['ntokens'],
                          nlayers=ae_args['nlayers'],
                          hidden_init=ae_args['hidden_init'],
                          max_len=ae_args['max_len'],
                          gpu=args.cuda)
    autoencoder.load_state_dict(torch.load(args.model))
    if args.cuda:
        autoencoder.cuda()
    autoencoder.eval()

    if args.split:
        train, valid = corpus.get_data(split=args.split)
    else:
        valid = corpus.get_data()
    samples = batchify(random.sample(valid, args.len_samples),
                       args.batch_size, shuffle=False)
    valid = batchify(valid, args.batch_size, shuffle=False)

    word_accuracies = []
    sent_accuracies = []
    f = open(args.err_f, 'w')
    for i, batch in enumerate(tqdm(valid, desc='acc')):
        source, target, length = batch
        source = to_gpu(args.cuda, Variable(source, volatile=True))
        target = to_gpu(args.cuda, Variable(target, volatile=True))
        length = to_gpu(args.cuda, Variable(length, volatile=True))

        # output: batch x seq_len x ntokens
        code = autoencoder.encode(source)
        max_indices = autoencoder.generate(code, length).contiguous()

        # ============ word accuracy ============
        word_accuracies.extend(
            # strip the last <eos>
            max_indices.view(-1).eq(
                target[:, :-1].contiguous().view(-1)).data.cpu().tolist())

        # ============ generate examples ============
        max_indices = max_indices.data.cpu().numpy()
        target = target.data.cpu().numpy()
        for t, idx in zip(target, max_indices):
            # real sentence
            real = "".join([
                corpus.dictionary.idx2word[x] for x in t
                if x >= corpus.dictionary.offset
            ])
            # autoencoder output sentence
            gen = "".join([
                corpus.dictionary.idx2word[x] for x in idx
                if x >= corpus.dictionary.offset
            ])
            correct = real == gen
            sent_accuracies.append(correct)
            if not correct:
                f.write('{} | {}\n'.format(real, gen))
    f.close()
    log.info('word acc: {} sent acc: {}'.format(np.mean(word_accuracies),
                                                np.mean(sent_accuracies)))

    f = open(args.len_f, 'w')
    for i, batch in enumerate(tqdm(samples, desc='len')):
        source, target, length = batch
        source = to_gpu(args.cuda, Variable(source, volatile=True))
        target = to_gpu(args.cuda, Variable(target, volatile=True))
        target = target.view_as(source).data.cpu().numpy()

        one = torch.LongTensor([1]).expand_as(length)
        indices = []
        # decode the same sentence with the true length shifted by -2 .. +2
        for j in range(-2, 3):
            length_ = torch.max(length + j, one)
            length_ = to_gpu(args.cuda, Variable(length_, volatile=True))
            code = autoencoder.encode(source)
            max_indices = autoencoder.generate(code, length_)
            indices.append(max_indices.data.cpu().numpy())

        for k, target_ in enumerate(target):
            # real sentence
            real = "".join([
                corpus.dictionary.idx2word[x] for x in target_
                if x >= corpus.dictionary.offset
            ])
            f.write('origin: {}\n'.format(real))
            for j in range(-2, 3):
                # indices was filled in order j = -2 .. 2, so offset by 2
                idx = indices[j + 2][k]
                # autoencoder output sentence
                gen = "".join([
                    corpus.dictionary.idx2word[x] for x in idx
                    if x >= corpus.dictionary.offset
                ])
                f.write('{} {}\n'.format(j if j < 0 else '+' + str(j), gen))
    f.close()
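# The evaluation script above also assumes a small to_gpu helper. A minimal
# sketch, assuming it simply moves a tensor/Variable onto the GPU only when
# CUDA is requested (hypothetical; the repo's own utility may differ):
def to_gpu(cuda, var):
    # keep the object on CPU unless CUDA was enabled on the command line
    return var.cuda() if cuda else var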
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# hyperparameters
embed_size = 128
hidden_size = 1024
num_layers = 2
num_epoches = 10
num_samples = 1000   # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# load the data
corpus = Corpus()
# ids: tensor of word ids with shape (batch_size, tokens_per_row)
ids = corpus.get_data("./data/train.txt", batch_size)
vocab_size = len(corpus.dictionary)
# number of seq_length-sized chunks per row
num_batches = ids.size(1) // seq_length


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        # word embedding lookup table
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
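    # Sketch of the forward pass, which the excerpt above omits (hypothetical,
    # not part of the original code): the usual embed -> LSTM -> linear flow,
    # with the hidden state passed in and returned so it can be detached
    # between truncated-BPTT chunks of length seq_length.
    def forward(self, x, h):
        x = self.embed(x)                 # (batch, seq_len, embed_size)
        out, h = self.lstm(x, h)          # (batch, seq_len, hidden_size)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))
        out = self.linear(out)            # (batch * seq_len, vocab_size)
        return out, h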