def main():
    config = get_config(mode='test')

    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')
    config.vocab_size = vocab.vocab_size

    if config.users:
        test_users = load_pickle(config.convs_users_path)
        config.user_size = max([x for xx in test_users for x in xx]) + 1
        print(f'User size: {config.user_size}')
    else:
        test_users = None

    data_loader = get_loader(
        convs=load_pickle(config.convs_path),
        convs_length=load_pickle(config.conversations_length_path),
        utterances_length=load_pickle(config.utterances_length_path),
        vocab=vocab,
        batch_size=config.batch_size,
        shuffle=False,
        convs_users=test_users)

    model_solver = getattr(solvers, "Solver{}".format(config.model))
    test_solver = model_solver(config, None, data_loader,
                               vocab=vocab, is_train=False)
    test_solver.build()
    test_solver.export_samples()
def test(args):
    vocab = Vocab()
    vocab.load(args.vocab)
    vocab.add_special_token()
    pos2id = Vocab()
    pos2id.load(args.poslist)

    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np

    model = WordnnTagger.load(args.model)
    out_path = making_data(args.test_path, model.window)
    if args.gpu > -1:
        model.to_gpu()
    model.make_oov_vector(args.gpu > -1)

    # start evaluation
    n_data = 0
    n_correct = 0
    sum_loss = xp.zeros((), dtype=xp.float32)
    start = time.time()
    for tags, contexts in line_iter(out_path, args.minibatch, False):
        batch_ts = xp.array([pos2id[tag] for tag in tags], dtype=xp.int32)
        batch_xs = xp.array(
            [[vocab[word] for word in vocab.check_words(context)]
             for context in contexts], dtype=xp.int32)
        cur_batch_size = batch_ts.shape[0]
        ys, loss = model(batch_xs, batch_ts)
        sum_loss += loss.data * cur_batch_size
        pred_labels = ys.data.argmax(1)
        n_correct += sum(1 for j in range(cur_batch_size)
                         if pred_labels[j] == batch_ts[j])
        n_data += cur_batch_size
    end = time.time()

    accuracy = float(n_correct / n_data)
    print('test loss : {}'.format(sum_loss))
    print('test accuracy : {}'.format(accuracy))
    print('(time to run : {})'.format(end - start))
def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    # Get the checkpoint path
    ckpt_dir = os.path.join(config['training']['ckpt_dir'],
                            config['experiment_name'])

    # Load model vocab
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)

    # Initialize models
    logging.info('Initializing the generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt.best')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
    else:
        logging.error('No model checkpoint found. Terminating.')
        sys.exit(1)

    inference_network.eval()
    generative_model.eval()

    if FLAGS.which == 'interpolate':
        interpolate(inference_network, generative_model, vocab)
    elif FLAGS.which == 'sample':
        sample(inference_network, generative_model, vocab)
def main(_):
    if FLAGS.vocab:
        print('Loading vocab...')
        with open(FLAGS.vocab, 'r') as f:
            vocab = Vocab.load(f)

    print('Loading embeddings...')
    words = []
    embeddings = []
    with open(FLAGS.embedding_file, 'r') as f:
        for line in f:
            split = line.split()
            word = ' '.join(split[:-200])
            embedding = list(map(float, split[-200:]))
            words.append(word)
            embeddings.append(embedding)
    embedding_size = len(embedding)

    if FLAGS.vocab:
        truncated_embeddings = []
        word2id = {w: i for i, w in enumerate(words)}
        for word in vocab._word2id:
            try:
                id = word2id[word]
                truncated_embeddings.append(embeddings[id])
            except KeyError:
                print('WARNING: Word "%s" has no predefined embedding' % word)
                random_embedding = [random.random()
                                    for _ in range(embedding_size)]
                truncated_embeddings.append(random_embedding)
        # Done!
        embedding_matrix = np.array(truncated_embeddings)
    else:
        embedding_matrix = np.array(embeddings)

    print('Producing Tensor:')
    embedding_matrix = tf.Variable(embedding_matrix, dtype=tf.float32,
                                   name='desc_word_embeddings')
    print(embedding_matrix)

    print('Saving checkpoint...')
    saver = tf.train.Saver([embedding_matrix])
    with tf.Session() as sess:
        sess.run(tf.variables_initializer([embedding_matrix]))
        saver.save(sess, FLAGS.output_file, write_meta_graph=False)
    print('Done')
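# Illustrative restore sketch (not part of the original script): how the
# checkpoint written above could be loaded back in TF1. The variable name
# matches the export ('desc_word_embeddings'); `vocab_size` and the 200-dim
# width are assumptions that must match the saved matrix.
import tensorflow as tf

vocab_size = 10000  # assumption: must equal the saved matrix's row count
embeddings = tf.get_variable('desc_word_embeddings',
                             shape=[vocab_size, 200], dtype=tf.float32)
saver = tf.train.Saver([embeddings])
with tf.Session() as sess:
    saver.restore(sess, 'ckpt/embeddings')  # same path passed to saver.save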
import os
import pickle

from models import VariationalModels


def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    config = get_config(mode='test')

    print('Loading Vocabulary...')
    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')
    config.vocab_size = vocab.vocab_size

    data_loader = get_loader(
        sentences=load_pickle(config.sentences_path),
        conversation_length=load_pickle(config.conversation_length_path),
        sentence_length=load_pickle(config.sentence_length_path),
        vocab=vocab,
        batch_size=config.batch_size)

    if config.model in VariationalModels:
        solver = VariationalSolver(config, None, data_loader,
                                   vocab=vocab, is_train=False)
        # (continuation assumed from the analogous solver calls above)
def main():
    config = get_config(mode='test')

    if config.data_name == "cornell":
        vocab = Vocab()
        vocab.load(config.word2id_path, config.id2word_path,
                   ptb=(config.model == "PTB"))
        print(f'Vocabulary size: {vocab.vocab_size}')
        config.vocab_size = vocab.vocab_size

        if config.users:
            test_users = load_pickle(config.convs_users_path)
            config.user_size = max([x for xx in test_users for x in xx]) + 1
            print(f'User size: {config.user_size}')
        else:
            test_users = None

        data_loader = get_loader(
            convs=load_pickle(config.convs_path),
            convs_length=load_pickle(config.conversations_length_path),
            utterances_length=load_pickle(config.utterances_length_path),
            vocab=vocab,
            batch_size=config.batch_size,
            shuffle=False,
            convs_users=test_users,
            is_ptb_model=(config.model == "PTB"))
    elif config.model == "DialoGPT":
        if config.users:
            vocab = GPT2Tokenizer.from_pretrained(config.user_vocab_path)
        else:
            vocab = GPT2Tokenizer.from_pretrained('gpt2')
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.export_test = True
        data_loader = get_loader(convs=load_pickle(config.convs_path),
                                 vocab=vocab,
                                 batch_size=config.batch_size,
                                 model=config.model,
                                 dataset=config.data_name,
                                 config=config,
                                 shuffle=False)
    elif config.data_name in ("cornell2", "ubuntu", "twitter_s"):
        vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        special_tokens = {
            'pad_token': PAD_TOKEN,
            'bos_token': SOS_TOKEN,
            'eos_token': EOS_TOKEN,
            'sep_token': SEP_TOKEN,
        }
        vocab.add_special_tokens(special_tokens)
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.pad_id = vocab.pad_token_id
        config.eos_id = vocab.eos_token_id
        config.sos_id = vocab.bos_token_id
        data_loader = get_loader(convs=load_pickle(config.convs_path),
                                 vocab=vocab,
                                 batch_size=config.batch_size,
                                 model=config.model,
                                 dataset=config.data_name,
                                 config=config,
                                 shuffle=False)
    else:
        raise ValueError("{} Sorry... We don't support that data".format(
            config.data_name))

    model_solver = getattr(solvers, "Solver{}".format(config.model))
    test_solver = model_solver(config, None, data_loader,
                               vocab=vocab, is_train=False)
    test_solver.build()
    test_solver.export_samples(config.beam_size)
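# Illustrative note (not from the original script): after add_special_tokens()
# the tokenizer vocabulary grows, so any pretrained transformer paired with it
# must have its embedding matrix resized to match. The model class below is an
# assumption for demonstration.
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
num_added = tokenizer.add_special_tokens({'pad_token': '<pad>'})
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.resize_token_embeddings(len(tokenizer))  # accounts for the new rows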
import pickle
import re

from models import VariationalModels


def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    test_freq_config = get_config(mode='test_freq')
    test_rare_config = get_config(mode='test_rare')

    print('Loading freq Vocabulary...')
    vocab_freq = Vocab()
    vocab_freq.load(test_freq_config.word2id_path,
                    test_freq_config.id2word_path)
    vocab_rare = Vocab()
    vocab_rare.load(test_rare_config.word2id_path,
                    test_rare_config.id2word_path)
    print(f'freq Vocabulary size: {vocab_freq.vocab_size}')
    print(f'rare Vocabulary size: {vocab_rare.vocab_size}')
    test_freq_config.vocab_size = vocab_freq.vocab_size
    test_rare_config.vocab_size = vocab_rare.vocab_size

    freq_data_loader = get_loader(
        sentences=load_pickle(test_freq_config.sentences_path),
        conversation_length=load_pickle(
            test_freq_config.conversation_length_path),
        sentence_length=load_pickle(test_freq_config.sentence_length_path),
        vocab=vocab_freq,
        batch_size=test_freq_config.batch_size)
        # (continuation assumed, following the loader call pattern above)
def train(args):
    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np

    if args.log:
        log_dir = args.log
    else:
        log_dir = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            '{}_{}'.format(DIR_NAME, datetime.now().strftime('%Y%m%d_%H:%M')))
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # setting for logging
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    log_path = os.path.join(log_dir, 'log')
    file_handler = logging.FileHandler(log_path)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logger.info('Arguments...')
    for arg, val in vars(args).items():
        logger.info('{} : {}'.format(arg, val))

    logger.info('Loading Vocab...')
    vocab = Vocab()
    vocab.load(args.vocab, args.lowercase)
    vocab.add_special_token()
    sufvocab = Vocab()
    sufvocab.load(args.sufvocab, args.lowercase)
    sufvocab.add_special_token(['s>', '<UNK>'])
    pos2id = Vocab()
    pos2id.load(args.poslist)

    logger.info('preparation for training data...')
    out_path = making_data(args.train_data, args.window)

    model = WordCSnnTagger(args.wembed, args.fembed, args.hidden, len(vocab),
                           len(sufvocab), len(pos2id), args.window,
                           args.objct, args.alpha)
    model.save_model_config(log_dir)
    if args.gpu > -1:
        model.to_gpu()

    opt = getattr(optimizers, args.opt)()
    opt.setup(model)
    opt.add_hook(optimizer.GradientClipping(args.gclip))
    opt.add_hook(optimizer.WeightDecay(args.wdecay))

    for epoch in range(args.epoch):
        logger.info('START epoch {}/{}'.format(epoch + 1, args.epoch))
        start = time.time()
        sum_loss = xp.zeros((), dtype=xp.float32)
        n_data = 0
        n_correct = 0
        for i, [tags, contexts] in enumerate(line_iter(out_path,
                                                       args.minibatch)):
            batch_ts = xp.array([pos2id[tag] for tag in tags],
                                dtype=xp.int32)
            # Capitalization features must be extracted before lowercasing,
            # so compute them once here (the original recomputed them after
            # lowercasing, which destroyed the feature).
            batch_caps = xp.array([[get_capf(word) for word in context]
                                   for context in contexts], dtype=xp.int32)
            if args.lowercase:
                contexts = [[word.lower() for word in context]
                            for context in contexts]
            batch_xs = xp.array([[vocab[word] for word in context]
                                 for context in contexts], dtype=xp.int32)
            batch_sufs = xp.array([[sufvocab[word[-2:]] for word in context]
                                   for context in contexts], dtype=xp.int32)
            batch_features = [batch_xs, batch_sufs, batch_caps]
            cur_batch_size = batch_ts.shape[0]
            ys, loss = model(batch_features, batch_ts)
            sum_loss += loss.data * cur_batch_size
            model.zerograds()
            loss.backward()
            opt.update()
            pred_labels = ys.data.argmax(1)
            n_correct += sum(1 for j in range(cur_batch_size)
                             if pred_labels[j] == batch_ts[j])
            n_data += cur_batch_size
            logger.info('done {} batches'.format(i + 1))

        logger.info('{} epoch train loss = {}'.format(epoch + 1, sum_loss))
        logger.info('{} epoch train accuracy = {}'.format(
            epoch + 1, float(n_correct / n_data)))
        logger.info('{} sec for training per epoch'.format(
            time.time() - start))

        if args.valid_data:
            start = time.time()
            valid_loss, valid_accuracy = evaluation(
                model, args.valid_data, pos2id, vocab, sufvocab, args)
            logger.info('{} epoch valid loss = {}'.format(
                epoch + 1, valid_loss))
            logger.info('{} epoch valid accuracy = {}'.format(
                epoch + 1, valid_accuracy))
            logger.info('{} sec for validation per epoch'.format(
                time.time() - start))

        if args.test_data:
            start = time.time()
            test_loss, test_accuracy = evaluation(
                model, args.test_data, pos2id, vocab, sufvocab, args)
            logger.info('{} epoch test loss = {}'.format(
                epoch + 1, test_loss))
            logger.info('{} epoch test accuracy = {}'.format(
                epoch + 1, test_accuracy))
            logger.info('{} sec for testing per epoch'.format(
                time.time() - start))

        logger.info('serializing...')
        prefix = ('{}_{}ep_{}wembed_{}fembed_{}hidden_{}window_'
                  '{}minibatch_{}opt').format(
            DIR_NAME, epoch + 1, args.wembed, args.fembed, args.hidden,
            args.window, args.minibatch, args.opt)
        model_path = os.path.join(log_dir, prefix + '.model')
        model.save(model_path)

    logger.info('done training')
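# `get_capf` above is not defined in this file; a minimal hypothetical
# sketch of a capitalization feature consistent with how it is used
# (one small integer per word):
def get_capf(word):
    # 0 = all lowercase, 1 = initial capital, 2 = all caps, 3 = other
    if word.islower():
        return 0
    if word.istitle():
        return 1
    if word.isupper():
        return 2
    return 3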
def main(_):
    # Load the configuration file.
    with open(FLAGS.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    # Create the checkpoint directory if it does not already exist.
    ckpt_dir = os.path.join(config['data']['ckpt'],
                            config['experiment_name'])
    if not os.path.exists(ckpt_dir):
        os.mkdir(ckpt_dir)

    # Check if a pre-existing configuration file exists and matches the
    # current configuration. Otherwise save a copy of the configuration to
    # the checkpoint directory.
    prev_config_path = os.path.join(ckpt_dir, 'config.yaml')
    if os.path.exists(prev_config_path):
        with open(prev_config_path, 'r') as f:
            prev_config = yaml.load(f, Loader=yaml.SafeLoader)
        assert config == prev_config
    else:
        shutil.copyfile(FLAGS.config, prev_config_path)

    # Load the vocabularies.
    src_vocab = Vocab.load(config['data']['src']['vocab'])
    tgt_vocab = Vocab.load(config['data']['tgt']['vocab'])

    # Load the training and dev datasets.
    train_data = ShakespeareDataset('train', config, src_vocab, tgt_vocab)
    dev_data = ShakespeareDataset('dev', config, src_vocab, tgt_vocab)

    # Build the model.
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)
    encoder = Encoder(src_vocab_size, config['model']['embedding_dim'])
    decoder = Decoder(tgt_vocab_size, config['model']['embedding_dim'])
    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # Define the loss function + optimizer. The padding index (0) gets zero
    # weight so that padding does not contribute to the loss.
    loss_weights = torch.ones(decoder.tgt_vocab_size)
    loss_weights[0] = 0
    if torch.cuda.is_available():
        loss_weights = loss_weights.cuda()
    criterion = torch.nn.NLLLoss(loss_weights)
    learning_rate = config['training']['learning_rate']
    encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                         lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                         lr=learning_rate)

    # Restore saved model (if one exists).
    ckpt_path = os.path.join(ckpt_dir, 'model.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        epoch = ckpt['epoch']
        encoder.load_state_dict(ckpt['encoder'])
        decoder.load_state_dict(ckpt['decoder'])
        encoder_optimizer.load_state_dict(ckpt['encoder_optimizer'])
        decoder_optimizer.load_state_dict(ckpt['decoder_optimizer'])
    else:
        epoch = 0

    train_log_string = '%s :: Epoch %i :: Iter %i / %i :: train loss: %0.4f'
    dev_log_string = '\n%s :: Epoch %i :: dev loss: %0.4f'
    while epoch < config['training']['num_epochs']:

        # Main training loop.
        train_loss = []
        sampler = RandomSampler(train_data)
        for i, train_idx in enumerate(sampler):
            src, tgt = train_data[train_idx]

            # Clear gradients.
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # Feed inputs one by one from src into encoder (in reverse).
            src_length = src.size()[0]
            hidden = None
            for j in reversed(range(src_length)):
                encoder_output, hidden = encoder(src[j], hidden)

            # Feed desired outputs one by one from tgt into decoder
            # and measure loss.
            tgt_length = tgt.size()[0]
            loss = 0
            for j in range(tgt_length - 1):
                decoder_output, hidden = decoder(tgt[j], hidden)
                loss += criterion(decoder_output, tgt[j + 1])

            # Backpropagate the loss and update the model parameters.
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            train_loss.append(loss.data.cpu())

            # Every once in a while check on the loss.
            if ((i + 1) % 100) == 0:
                print(train_log_string % (datetime.now(), epoch, i + 1,
                                          len(train_data),
                                          np.mean(train_loss)),
                      end='\r')
                train_loss = []

        # Evaluation loop.
        dev_loss = []
        for src, tgt in dev_data:

            # Feed inputs one by one from src into encoder.
            src_length = src.size()[0]
            hidden = None
            for j in reversed(range(src_length)):
                encoder_output, hidden = encoder(src[j], hidden)

            # Feed desired outputs one by one from tgt into decoder
            # and measure loss.
            tgt_length = tgt.size()[0]
            loss = 0
            for j in range(tgt_length - 1):
                decoder_output, hidden = decoder(tgt[j], hidden)
                loss += criterion(decoder_output, tgt[j + 1])
            dev_loss.append(loss.data.cpu())

        print(dev_log_string % (datetime.now(), epoch, np.mean(dev_loss)))

        # Save a checkpoint at the end of each epoch.
        state_dict = {
            'epoch': epoch,
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'encoder_optimizer': encoder_optimizer.state_dict(),
            'decoder_optimizer': decoder_optimizer.state_dict()
        }
        torch.save(state_dict, ckpt_path)
        epoch += 1
def main(_):
    # Load the configuration file.
    with open(FLAGS.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    # Load the vocabularies.
    src_vocab = Vocab.load(config['data']['src']['vocab'])
    tgt_vocab = Vocab.load(config['data']['tgt']['vocab'])

    # Load the test dataset.
    test_data = ShakespeareDataset('test', config, src_vocab, tgt_vocab)

    # Restore the model.
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)
    encoder = Encoder(src_vocab_size,
                      config['model']['embedding_dim'],
                      config['model']['bidirection'],
                      config['model']['dropout'],
                      config['model']['layer'],
                      config['model']['mode'])
    decoder = Decoder(tgt_vocab_size,
                      config['model']['embedding_dim'],
                      config['model']['bidirection'],
                      config['model']['dropout'],
                      config['model']['layer'],
                      config['model']['mode'])
    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    ckpt_path = os.path.join(config['data']['ckpt'],
                             config['experiment_name'], 'model.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        encoder.load_state_dict(ckpt['encoder'])
        decoder.load_state_dict(ckpt['decoder'])
    else:
        print('Unable to find checkpoint. Terminating.')
        sys.exit(1)
    encoder.eval()
    decoder.eval()

    # Initialize translator.
    greedy_translator = GreedyTranslator(encoder, decoder, tgt_vocab)

    # Qualitative evaluation - print translations for the first couple of
    # sentences in the test corpus.
    for i in range(10):
        src, tgt = test_data[i]
        translation = greedy_translator(src)
        src_sentence = [src_vocab.id2word(id)
                        for id in src.data.cpu().numpy()]
        tgt_sentence = [tgt_vocab.id2word(id)
                        for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        print('---')
        print('Source: %s' % ' '.join(src_sentence))
        print('Ground truth: %s' % ' '.join(tgt_sentence))
        print('Model output: %s' % ' '.join(translated_sentence))
        print('---')

    # Quantitative evaluation - compute corpus-level BLEU score. The model
    # outputs are the hypotheses and the ground-truth sentences are the
    # references (the original had these swapped); each hypothesis may have
    # several references, hence the nested list.
    hypotheses = []
    references = []
    for src, tgt in test_data:
        translation = greedy_translator(src)
        tgt_sentence = [tgt_vocab.id2word(id)
                        for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        # Remove start and end of sentence tokens.
        tgt_sentence = tgt_sentence[1:-1]
        translated_sentence = translated_sentence[1:-1]
        hypotheses.append(translated_sentence)
        references.append([tgt_sentence])
    print("Corpus BLEU score: %0.4f" % corpus_bleu(references, hypotheses))
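# Quick sanity check of NLTK's corpus_bleu argument order (illustrative,
# not part of the original script): references come first, and each entry
# is a *list* of reference token lists.
from nltk.translate.bleu_score import corpus_bleu

refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
print(corpus_bleu(refs, hyps))  # 1.0 for an exact match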
def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    # Get the directory paths
    ckpt_dir = os.path.join(config['training']['ckpt_dir'],
                            config['experiment_name'])
    summary_dir = os.path.join(config['training']['summary_dir'],
                               config['experiment_name'])

    # Create the directories if they do not already exist
    if not os.path.exists(ckpt_dir):
        logging.info('Creating checkpoint directory: `%s`.' % ckpt_dir)
        os.makedirs(ckpt_dir)
    if not os.path.exists(summary_dir):
        logging.info('Creating summary directory: `%s`.' % summary_dir)
        os.makedirs(summary_dir)

    # Check for conflicting configurations
    safe_copy_config(config, FLAGS.force_overwrite)

    # Init summary writer
    summary_writer = SummaryWriter(summary_dir)

    # Load vocab and datasets
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)
    logging.info('Loading train and valid data.')
    train_data = TextDataset(config['data']['train'], vocab=vocab,
                             max_length=config['training']['max_length'])
    valid_data = TextDataset(config['data']['valid'], vocab=vocab,
                             max_length=config['training']['max_length'])

    # Initialize models
    logging.info('Initializing the inference network and generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Setup model optimizers
    optimizer_in = torch.optim.Adam(inference_network.parameters(),
                                    lr=config['training']['learning_rate'])
    optimizer_gm = torch.optim.Adam(generative_model.parameters(),
                                    lr=config['training']['learning_rate'])

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        epoch = checkpoint['epoch']
        t = checkpoint['t']
        best_loss = checkpoint['best_loss']
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
        optimizer_in.load_state_dict(checkpoint['optimizer_in'])
        optimizer_gm.load_state_dict(checkpoint['optimizer_gm'])
    else:
        logging.info('No existing checkpoint found.')
        epoch = 0
        t = 0
        best_loss = float('inf')

    # Start train
    weight = torch.ones(len(vocab))
    weight[vocab.unk_idx] = config['training']['unk_weight']
    if torch.cuda.is_available():
        weight = weight.cuda()

    while epoch < config['training']['epochs']:
        logging.info('Starting epoch - %i.' % epoch)
        inference_network.train()
        generative_model.train()

        # Training step
        logging.info('Start train step.')
        train_loader = DataLoader(
            dataset=train_data,
            batch_size=config['training']['batch_size'],
            shuffle=True,
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available())

        # Init train summaries
        train_nll = 0.0
        train_kl = 0.0
        train_loss = 0.0

        for batch in train_loader:
            optimizer_in.zero_grad()
            optimizer_gm.zero_grad()

            x = batch['input']
            target = batch['target']
            lengths = batch['lengths']
            if torch.cuda.is_available():
                x = x.cuda()
                target = target.cuda()
                lengths = lengths.cuda()

            # Forward pass of inference network
            z, kl = inference_network(x, lengths)

            # Teacher forcing
            x_hat = word_dropout(x, config['training']['word_dropout_rate'],
                                 vocab.unk_idx)
            logp, _ = generative_model(z, x_hat, lengths)

            # Obtain current value of the annealing constant with beta trick
            beta = get_beta(config, epoch)

            # Compute annealed loss
            length = logp.shape[1]
            logp = logp.view(-1, len(vocab))
            target = target[:, :length].contiguous().view(-1)
            nll = F.nll_loss(logp, target, ignore_index=vocab.pad_idx,
                             weight=weight, size_average=False)
            loss = nll + beta * kl

            # Update summaries
            train_nll += nll.data
            train_kl += kl.data
            train_loss += loss.data

            # Backpropagate gradients
            batch_size = config['training']['batch_size']
            loss /= batch_size
            kl /= batch_size
            nll /= batch_size
            loss.backward()
            optimizer_in.step()
            optimizer_gm.step()

            # Log
            if not t % config['training']['log_frequency']:
                # Note: logged train loss is only for a single batch - see
                # tensorboard for the summary over epochs
                line = 'Iteration: %i - Loss: %0.4f. - KL: %0.4f - NLL: %0.4f'
                logging.info(line % (t, loss.data, kl.data, nll.data))
                # Print a greedy sample
                z_k, _ = inference_network(x, lengths)
                _, sample = generative_model(z_k)
                example = [vocab.id2word(int(x)) for x in sample[0]]
                try:
                    T = example.index(vocab.eos_token)
                    example = example[:T]
                except ValueError:
                    pass
                example = ' '.join(example)
                logging.info('Example - `%s`' % example)
            t += 1

        # Validation step
        logging.info('Start valid step.')
        valid_loader = DataLoader(
            dataset=valid_data,
            batch_size=config['training']['batch_size'],
            shuffle=False,
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available())

        # Init valid summaries
        valid_nll = 0.0
        valid_kl = 0.0
        valid_loss = 0.0

        for batch in valid_loader:
            x = batch['input']
            target = batch['target']
            lengths = batch['lengths']
            if torch.cuda.is_available():
                x = x.cuda()
                target = target.cuda()
                lengths = lengths.cuda()

            # Forward pass of inference network
            z, kl = inference_network(x, lengths)

            # Teacher forcing
            logp, _ = generative_model(z, x, lengths)

            # Compute loss (no annealing during validation)
            length = logp.shape[1]
            logp = logp.view(-1, len(vocab))
            target = target[:, :length].contiguous().view(-1)
            nll = F.nll_loss(logp, target, ignore_index=vocab.pad_idx,
                             size_average=False)
            loss = nll + kl

            # Update summaries
            valid_nll += nll.data
            valid_kl += kl.data
            valid_loss += loss.data

        # Normalize losses
        train_nll /= len(train_data)
        train_kl /= len(train_data)
        train_loss /= len(train_data)
        valid_nll /= len(valid_data)
        valid_kl /= len(valid_data)
        valid_loss /= len(valid_data)

        # Tensorboard logging
        summary_writer.add_scalar("elbo/train", train_loss.data, epoch)
        summary_writer.add_scalar("kl/train", train_kl.data, epoch)
        summary_writer.add_scalar("nll/train", train_nll.data, epoch)
        summary_writer.add_scalar("elbo/val", valid_loss.data, epoch)
        summary_writer.add_scalar("kl/val", valid_kl.data, epoch)
        summary_writer.add_scalar("nll/val", valid_nll.data, epoch)

        # Save checkpoint. Track the best *validation* loss (the original
        # compared against the last training batch loss here).
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        save_checkpoint({
            'epoch': epoch + 1,
            't': t,
            'best_loss': best_loss,
            'state_dict_in': inference_network.state_dict(),
            'state_dict_gm': generative_model.state_dict(),
            'optimizer_in': optimizer_in.state_dict(),
            'optimizer_gm': optimizer_gm.state_dict()
        }, is_best, ckpt)

        epoch += 1
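# `get_beta` above is not shown in this file; a minimal sketch assuming a
# linear KL-annealing schedule ('anneal_epochs' is a hypothetical config key):
def get_beta(config, epoch):
    # Ramp beta linearly from 0 to 1 over `anneal_epochs`, then hold at 1.
    anneal_epochs = config['training'].get('anneal_epochs', 10)
    return min(1.0, epoch / anneal_epochs)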
def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    # Get the checkpoint path
    ckpt_dir = os.path.join(config['training']['ckpt_dir'],
                            config['experiment_name'])

    # Load vocab and datasets
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)
    logging.info('Loading test data.')
    test_data = TextDataset(config['data']['test'], vocab=vocab,
                            max_length=config['training']['max_length'])
    test_loader = DataLoader(dataset=test_data,
                             batch_size=config['training']['batch_size'],
                             shuffle=False,
                             num_workers=cpu_count(),
                             pin_memory=torch.cuda.is_available())

    # Initialize models
    logging.info('Initializing the inference network and generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt.best')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
    else:
        logging.error('No model checkpoint found. Terminating.')
        sys.exit(1)

    # Init test summaries
    test_nll = 0.0
    test_kl = 0.0
    test_loss = 0.0
    test_suml2p = 0.0
    test_n = 0.0

    # Evaluate
    inference_network.eval()
    generative_model.eval()
    for batch in test_loader:
        x = batch['input']
        target = batch['target']
        lengths = batch['lengths']
        if torch.cuda.is_available():
            x = x.cuda()
            target = target.cuda()
            lengths = lengths.cuda()

        # Forward pass of inference network
        z, kl = inference_network(x, lengths)

        # Teacher forcing
        logp, _ = generative_model(z, x, lengths)

        # Compute loss
        length = logp.shape[1]
        logp = logp.view(-1, len(vocab))
        target = target[:, :length].contiguous().view(-1)
        nll = F.nll_loss(logp, target, ignore_index=vocab.pad_idx,
                         size_average=False)
        loss = nll + kl
        l2p, n = suml2p(logp, target, vocab.pad_idx)

        # Update summaries
        test_nll += nll.data
        test_kl += kl.data
        test_loss += loss.data
        test_suml2p += l2p.data
        test_n += n

    # Normalize losses
    test_nll /= len(test_data)
    test_kl /= len(test_data)
    test_loss /= len(test_data)

    # H is the average negative log2-probability per token, so 2**H is the
    # per-token perplexity.
    H = -test_suml2p / test_n
    test_perplexity = 2**H

    # Log output
    logging.info('NLL: %0.4f' % test_nll)
    logging.info('KL: %0.4f' % test_kl)
    logging.info('ELBO: %0.4f' % test_loss)
    logging.info('Perplexity: %0.4f' % test_perplexity)
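# Hypothetical sketch of the `suml2p` helper used above (its definition is
# not shown in this file): sum the log2-probability assigned to each gold
# token, ignoring padding, and return that sum with the token count. `logp`
# holds natural-log probabilities of shape (N, vocab); `target` is (N,).
import math
import torch


def suml2p(logp, target, pad_idx):
    mask = target != pad_idx
    gold_logp = logp.gather(1, target.unsqueeze(1)).squeeze(1)
    l2p = gold_logp[mask].sum() / math.log(2)  # nats -> bits
    return l2p, mask.sum().item()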
def main(matrix=False):
    import pickle

    # Load the configuration file.
    with open('config.yaml', 'r') as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    # Load the vocabularies.
    src_vocab = Vocab.load(config['data']['src']['vocab'])
    tgt_vocab = Vocab.load(config['data']['tgt']['vocab'])

    # Load the test dataset.
    test_data = ShakespeareDataset('test', config, src_vocab, tgt_vocab)

    # Restore the model.
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    # If a saved attention matrix exists, just visualize it and exit.
    if matrix:
        with open('attention_mat.pkl', 'rb') as f:
            attention_matrix = pickle.load(f)
        for i in range(10):
            src, tgt = test_data[i]
            decoder_attn = attention_matrix[i]
            src_sentence = [src_vocab.id2word(id)
                            for id in src.data.cpu().numpy()]
            tgt_sentence = [tgt_vocab.id2word(id)
                            for id in tgt.data.cpu().numpy()]
            src_sentence_ = ' '.join(src_sentence)
            tgt_sentence_ = ' '.join(tgt_sentence)
            show_attention(src_sentence_, tgt_sentence_, decoder_attn)
        return

    encoder = EncoderRNN(src_vocab_size, config['model']['embedding_dim'],
                         config['model']['layer'])
    attn = 'general'
    decoder = AttnDecoderRNN(attn, config['model']['embedding_dim'],
                             tgt_vocab_size, config['model']['layer'])
    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    ckpt_path = os.path.join(config['data']['ckpt'],
                             config['experiment_name'], 'model.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        encoder.load_state_dict(ckpt['encoder'])
        decoder.load_state_dict(ckpt['decoder'])
    else:
        print('Unable to find checkpoint. Terminating.')
        sys.exit(1)
    encoder.eval()
    decoder.eval()

    # Initialize translator.
    greedy_translator = GreedyTranslator(encoder, decoder, tgt_vocab)

    # Qualitative evaluation - print translations for the first couple of
    # sentences in the test corpus and save their attention matrices.
    attention_matrix = []
    for i in range(10):
        src, tgt = test_data[i]
        translation, decoder_attn = greedy_translator(src)
        attention_matrix.append(decoder_attn.numpy())
        src_sentence = [src_vocab.id2word(id)
                        for id in src.data.cpu().numpy()]
        tgt_sentence = [tgt_vocab.id2word(id)
                        for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        print('---')
        print('Source: %s' % ' '.join(src_sentence))
        print('Ground truth: %s' % ' '.join(tgt_sentence))
        print('Model output: %s' % ' '.join(translated_sentence))
        print('---')
    with open('attention_mat.pkl', 'wb') as f:
        pickle.dump(attention_matrix, f)

    # Quantitative evaluation - compute corpus-level BLEU score (model
    # outputs as hypotheses, ground truth as references; the original had
    # these swapped).
    hypotheses = []
    references = []
    for src, tgt in test_data:
        translation, decoder_attn = greedy_translator(src)
        tgt_sentence = [tgt_vocab.id2word(id)
                        for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        # Remove start and end of sentence tokens.
        tgt_sentence = tgt_sentence[1:-1]
        translated_sentence = translated_sentence[1:-1]
        hypotheses.append(translated_sentence)
        references.append([tgt_sentence])
    print("Corpus BLEU score: %0.4f" % corpus_bleu(references, hypotheses))
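# Hypothetical sketch of the `show_attention` helper used above (the real
# implementation is not shown here): a heatmap of the decoder attention
# with source tokens on the x-axis and output tokens on the y-axis. The
# (tgt_len, src_len) orientation of `attentions` is an assumption.
import matplotlib.pyplot as plt


def show_attention(src_sentence, tgt_sentence, attentions):
    src_tokens = src_sentence.split()
    tgt_tokens = tgt_sentence.split()
    fig, ax = plt.subplots()
    ax.matshow(attentions, cmap='bone')
    ax.set_xticks(range(len(src_tokens)))
    ax.set_xticklabels(src_tokens, rotation=90)
    ax.set_yticks(range(len(tgt_tokens)))
    ax.set_yticklabels(tgt_tokens)
    plt.show()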
def train(args):
    vocab = Vocab.load(args.vocab, max_size=args.vocab_size)
    data_reader = DataReader(data_dir=args.data_dir, shuffle=True)
    preprocessor = Preprocessor(
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next,
        vocab=vocab, max_length=args.max_length, gpu=args.gpu)
    model = SkipThought(
        rnn_type=args.rnn_type,
        num_words=len(vocab),
        word_dim=args.word_dim,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next)
    print(model)
    if args.pretrained is not None:
        print(f'Loading pretrained model from {args.pretrained}')
        model.load_state_dict(
            torch.load(args.pretrained,
                       map_location=lambda storage, loc: storage))
    if args.gpu > -1:
        model.cuda(args.gpu)
    optimizer = optim.Adam(model.parameters())

    summary_writer = SummaryWriter(os.path.join(args.save_dir, 'log'))

    def add_scalar_summary(name, value, step):
        summary_writer.add_scalar(tag=name, scalar_value=value,
                                  global_step=step)

    def add_text_summary(name, value, step):
        summary_writer.add_text(tag=name, text_string=value,
                                global_step=step)

    def variable(tensor, volatile=False):
        return Variable(tensor, volatile=volatile)

    def run_train_iter(batch):
        if not model.training:
            model.train()
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0]), tgt[k][1])
        logits = model.forward(src=src, tgt=tgt)
        loss = 0
        for k in tgt:
            logits_k = logits[k]
            tgt_k = tgt[k]
            loss = loss + basic.sequence_cross_entropy(
                logits=logits_k[:-1], targets=tgt_k[0][1:],
                length=tgt_k[1] - 1)
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), max_norm=10)
        optimizer.step()
        return loss.data[0]

    def ids_to_words(ids):
        words = []
        eos_id = vocab.stoi(vocab.eos)
        for id_ in ids:
            words.append(vocab.itos(id_))
            if id_ == eos_id:
                break
        return words

    def generate_using_decoder(name, src, max_length):
        _, encoder_state = model.encoder(words=src[0], length=src[1])
        if isinstance(encoder_state, tuple):  # LSTM
            encoder_state = encoder_state[0]
        context = (encoder_state.transpose(0, 1).contiguous()
                   .view(-1, args.hidden_dim))
        batch_size = src[1].size(0)
        bos_id = vocab.stoi(vocab.bos)
        bos = Variable(src[1].new(1, batch_size).fill_(bos_id))
        decoder = model.get_decoder(name)
        prev_pred = bos
        done = torch.zeros(batch_size).byte()
        hyps = []
        prev_state = context.unsqueeze(0)
        for t in range(max_length):
            if done.all():
                break
            decoder_input = prev_pred
            logit, prev_state = decoder(words=decoder_input,
                                        prev_state=prev_state)
            pred = logit.max(2)[1]
            prev_pred = pred
            hyps.append(pred.data)
        hyps = torch.cat(hyps, dim=0).transpose(0, 1).tolist()
        return hyps

    def generate(batch):
        # Greedy search
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0], volatile=True), tgt[k][1])
        batch_size = src[0].size(1)
        max_length = src[0].size(0) * 2
        generated = {}
        for k in tgt:
            generated[k] = generate_using_decoder(
                name=k, src=src, max_length=max_length)
        results = []
        for i in range(batch_size):
            res = {'src': ' '.join(ids_to_words(src[0][:src[1][i], i].data)),
                   'tgt': {}, 'out': {}}
            for k in tgt:
                res['tgt'][k] = ' '.join(ids_to_words(tgt[k][0][1:, i].data))
                res['out'][k] = ' '.join(ids_to_words(generated[k][i]))
            results.append(res)
        return results

    def generate_synthetic_batch(real_batch):
        def sort_by_length(tgt_of_key):
            sorted_length, sort_inds = tgt_of_key[1].sort(
                dim=0, descending=True)
            return tgt_of_key[0][:, sort_inds], sorted_length

        # Forward: given prev, generate cur'
        _, tgt = preprocessor(real_batch)
        tgt_prev, tgt_prev_length = sort_by_length(tgt['prev'])
        syn_src_fw = generate_using_decoder(
            name='next',
            src=(variable(tgt_prev[1:], volatile=True),
                 tgt_prev_length - 1),
            max_length=args.max_length)
        # Backward: given next, generate cur''
        tgt_next, tgt_next_length = sort_by_length(tgt['next'])
        syn_src_bw = generate_using_decoder(
            name='prev',
            src=(variable(tgt_next[1:], volatile=True),
                 tgt_next_length - 1),
            max_length=args.max_length)
        syn_batch_fw = []
        syn_batch_bw = []
        for i in range(len(real_batch)):
            syn_src_fw_str = ' '.join(ids_to_words(syn_src_fw[i]))
            syn_src_bw_str = ' '.join(ids_to_words(syn_src_bw[i]))
            syn_batch_fw.append(
                (real_batch[i][0], syn_src_fw_str, real_batch[i][2]))
            syn_batch_bw.append(
                (real_batch[i][0], syn_src_bw_str, real_batch[i][2]))
        return syn_batch_fw, syn_batch_bw

    global_step = 0

    def print_samples():
        model.eval()
        num_samples = 2
        samples = data_reader.next_batch(size=num_samples, peek=True)
        syn_samples_fw, syn_samples_bw = generate_synthetic_batch(samples)
        gen_results = generate(samples)
        syn_gen_results_fw = generate(syn_samples_fw)
        syn_gen_results_bw = generate(syn_samples_bw)
        text_val = ''
        for i, res in enumerate(gen_results):
            text_val += f'* sample (real) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_fw):
            text_val += f'* sample (syn_fw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_bw):
            text_val += f'* sample (syn_bw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        add_text_summary('Sample', value=text_val, step=global_step)

    for epoch in range(args.max_epoch):
        data_reader.start_epoch()
        for batch in tqdm(data_reader.iterator(args.batch_size),
                          desc=f'Epoch {epoch}'):
            # Train on real batch
            real_loss = run_train_iter(batch)
            # Train on synthetic batches
            syn_batch_fw, syn_batch_bw = generate_synthetic_batch(batch)
            syn_loss_fw = run_train_iter(syn_batch_fw)
            syn_loss_bw = run_train_iter(syn_batch_bw)
            global_step += 1
            add_scalar_summary(name='real_loss', value=real_loss,
                               step=global_step)
            add_scalar_summary(name='syn_loss_fw', value=syn_loss_fw,
                               step=global_step)
            add_scalar_summary(name='syn_loss_bw', value=syn_loss_bw,
                               step=global_step)
            if global_step % args.print_every == 0:
                print_samples()
            if global_step % args.save_every == 0:
                model_filename = f'model-{global_step}.pt'
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                print(f'\nIter #{global_step}: '
                      f'Saved checkpoint to {model_path}')
import os

import torch
import sentencepiece as spm
from transformers import OpenAIGPTTokenizer, GPT2Tokenizer

import solvers
from utils import (load_pickle, PAD_TOKEN, UNK_TOKEN, EOS_TOKEN, SOS_TOKEN,
                   SEP_TOKEN, EOS_ID)

if __name__ == '__main__':
    config = get_config(mode='train')
    val_config = get_config(mode='valid')
    with open(os.path.join(config.save_path, 'config.txt'), 'w') as f:
        print(config, file=f)

    if config.data_name == "cornell":
        vocab = Vocab()
        vocab.load(config.word2id_path, config.id2word_path,
                   ptb=(config.model == "PTB"))
        config.vocab_size = vocab.vocab_size
        config.pad_id = vocab.pad_id
        config.eos_id = EOS_ID
        print(f'Vocabulary size: {vocab.vocab_size}')

        if config.users:
            train_users = load_pickle(config.convs_users_path)
            config.user_size = max([x for xx in train_users for x in xx]) + 1
            print(f'User size: {config.user_size}')
            eval_users = load_pickle(val_config.convs_users_path)
        else:
            train_users = None
            eval_users = None