def main():
    config = get_config(mode='test')

    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')
    config.vocab_size = vocab.vocab_size

    if config.users:
        test_users = load_pickle(config.convs_users_path)
        config.user_size = max([x for xx in test_users for x in xx]) + 1
        print(f'User size: {config.user_size}')
    else:
        test_users = None

    data_loader = get_loader(convs=load_pickle(config.convs_path),
                             convs_length=load_pickle(config.conversations_length_path),
                             utterances_length=load_pickle(config.utterances_length_path),
                             vocab=vocab, batch_size=config.batch_size, shuffle=False, convs_users=test_users)

    model_solver = getattr(solvers, "Solver{}".format(config.model))
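    # (Solver classes are resolved dynamically by model name; e.g. a model
    # named "HRED" would map to a solvers.SolverHRED class, assuming one
    # exists in the solvers module.)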
    test_solver = model_solver(config, None, data_loader, vocab=vocab, is_train=False)

    test_solver.build()
    test_solver.export_samples()
Example #2
def test(args):
    vocab = Vocab()
    vocab.load(args.vocab)
    vocab.add_special_token()

    pos2id = Vocab()
    pos2id.load(args.poslist)

    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np

    model = WordnnTagger.load(args.model)

    out_path = making_data(args.test_path, model.window)

    if args.gpu > -1:
        model.to_gpu()
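    # (Assumed semantics of the call below: build the model's
    # out-of-vocabulary embedding vector, on the GPU when args.gpu > -1.)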
    model.make_oov_vector(args.gpu > -1)

    # start evaluation
    n_data = 0
    n_correct = 0
    sum_loss = xp.zeros((), dtype=xp.float32)
    start = time.time()
    for tags, contexts in line_iter(out_path, args.minibatch, False):
        batch_ts = xp.array([pos2id[tag] for tag in tags], dtype=xp.int32)
        batch_xs = xp.array(
            [[vocab[word] for word in vocab.check_words(context)]
             for context in contexts],
            dtype=xp.int32)
        cur_batch_size = batch_ts.shape[0]
        ys, loss = model(batch_xs, batch_ts)
        sum_loss += loss.data * cur_batch_size
        pred_labels = ys.data.argmax(1)
        n_correct += sum(1 for j in range(cur_batch_size)
                         if pred_labels[j] == batch_ts[j])
        n_data += cur_batch_size
    end = time.time()
    accuracy = n_correct / n_data
    print('test loss : {}'.format(sum_loss))
    print('test accuracy : {}'.format(accuracy))
    print('(time to run : {})'.format(end - start))
Example #3
def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Get the checkpoint path
    ckpt_dir = os.path.join(config['training']['ckpt_dir'],
                            config['experiment_name'])

    # Load model vocab
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)

    # Initialize models
    logging.info('Initializing the generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt.best')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
    else:
        logging.error('No model checkpoint found. Terminating.')
        sys.exit(1)

    inference_network.eval()
    generative_model.eval()

    if FLAGS.which == 'interpolate':
        interpolate(inference_network, generative_model, vocab)
    elif FLAGS.which == 'sample':
        sample(inference_network, generative_model, vocab)
Example #4
def main(_):
    if FLAGS.vocab:
        print('Loading vocab...')
        with open(FLAGS.vocab, 'r') as f:
            vocab = Vocab.load(f)

    print('Loading embeddings...')
    words = []
    embeddings = []
    with open(FLAGS.embedding_file, 'r') as f:
        for line in f:
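            # NOTE: the slicing below assumes 200-dimensional embeddings; any
            # fields before the last 200 make up the token, which may itself
            # contain spaces.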
            split = line.split()
            word = ' '.join(split[:-200])
            embedding = split[-200:]
            embedding = list(map(float, embedding))
            words.append(word)
            embeddings.append(embedding)
    embedding_size = len(embedding)
    if FLAGS.vocab:
        truncated_embeddings = []
        word2id = {w: i for i, w in enumerate(words)}
        for word in vocab._word2id:
            try:
                idx = word2id[word]
                truncated_embeddings.append(embeddings[idx])
            except KeyError:
                print('WARNING: Word "%s" has no predefined embedding' % word)
                random_embedding = [random.random() for _ in
                                    range(embedding_size)]
                truncated_embeddings.append(random_embedding)
        # Done!
        embedding_matrix = np.array(truncated_embeddings)
    else:
        embedding_matrix = np.array(embeddings)

    print('Producing Tensor:')
    embedding_matrix = tf.Variable(embedding_matrix,
                                   dtype=tf.float32,
                                   name='desc_word_embeddings')
    print(embedding_matrix)
    print('Saving checkpoint...')
    saver = tf.train.Saver([embedding_matrix])
    with tf.Session() as sess:
        sess.run(tf.variables_initializer([embedding_matrix]))
        saver.save(sess, FLAGS.output_file, write_meta_graph=False)

    print('Done')
Example #5
import os
import pickle
from models import VariationalModels


def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    config = get_config(mode='test')

    print('Loading Vocabulary...')
    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')

    config.vocab_size = vocab.vocab_size

    data_loader = get_loader(
        sentences=load_pickle(config.sentences_path),
        conversation_length=load_pickle(config.conversation_length_path),
        sentence_length=load_pickle(config.sentence_length_path),
        vocab=vocab,
        batch_size=config.batch_size)

    if config.model in VariationalModels:
        solver = VariationalSolver(config,
                                   None,
                                   data_loader,
Example #6
def main():
    config = get_config(mode='test')

    if config.data_name == "cornell":
        vocab = Vocab()
        vocab.load(config.word2id_path,
                   config.id2word_path,
                   ptb=(config.model == "PTB"))
        print(f'Vocabulary size: {vocab.vocab_size}')
        config.vocab_size = vocab.vocab_size

        if config.users:
            test_users = load_pickle(config.convs_users_path)
            config.user_size = max([x for xx in test_users for x in xx]) + 1
            print(f'User size: {config.user_size}')
        else:
            test_users = None

        data_loader = get_loader(
            convs=load_pickle(config.convs_path),
            convs_length=load_pickle(config.conversations_length_path),
            utterances_length=load_pickle(config.utterances_length_path),
            vocab=vocab,
            batch_size=config.batch_size,
            shuffle=False,
            convs_users=test_users,
            is_ptb_model=(config.model == "PTB"))

    elif config.model == "DialoGPT":
        if config.users:
            vocab = GPT2Tokenizer.from_pretrained(config.user_vocab_path)
        else:
            vocab = GPT2Tokenizer.from_pretrained('gpt2')
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.export_test = True
        data_loader = get_loader(convs=load_pickle(config.convs_path),
                                 vocab=vocab,
                                 batch_size=config.batch_size,
                                 model=config.model,
                                 dataset=config.data_name,
                                 config=config,
                                 shuffle=False)

    elif config.data_name == "cornell2" or config.data_name == "ubuntu" or config.data_name == "twitter_s":
        vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        special_tokens = {
            'pad_token': PAD_TOKEN,
            'bos_token': SOS_TOKEN,
            'eos_token': EOS_TOKEN,
            'sep_token': SEP_TOKEN,
        }
        vocab.add_special_tokens(special_tokens)
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.pad_id = vocab.pad_token_id
        config.eos_id = vocab.eos_token_id
        config.sos_id = vocab.bos_token_id

        data_loader = get_loader(convs=load_pickle(config.convs_path),
                                 vocab=vocab,
                                 batch_size=config.batch_size,
                                 model=config.model,
                                 dataset=config.data_name,
                                 config=config,
                                 shuffle=False)
    else:
        raise ValueError("{} Sorry... We don't support that data".format(
            config.data_name))

    model_solver = getattr(solvers, "Solver{}".format(config.model))
    test_solver = model_solver(config,
                               None,
                               data_loader,
                               vocab=vocab,
                               is_train=False)

    test_solver.build()
    test_solver.export_samples(config.beam_size)
Example #7
import pickle
import re

from models import VariationalModels


def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    test_freq_config = get_config(mode='test_freq')
    test_rare_config = get_config(mode='test_rare')

    print('Loading freq Vocabulary...')
    vocab_freq = Vocab()
    vocab_freq.load(test_freq_config.word2id_path,
                    test_freq_config.id2word_path)
    vocab_rare = Vocab()
    vocab_rare.load(test_rare_config.word2id_path,
                    test_rare_config.id2word_path)
    print(f'freq Vocabulary size: {vocab_freq.vocab_size}')
    print(f'rare Vocabulary size: {vocab_rare.vocab_size}')

    test_freq_config.vocab_size = vocab_freq.vocab_size
    test_rare_config.vocab_size = vocab_rare.vocab_size

    freq_data_loader = get_loader(
        sentences=load_pickle(test_freq_config.sentences_path),
        conversation_length=load_pickle(
            test_freq_config.conversation_length_path),
        sentence_length=load_pickle(test_freq_config.sentence_length_path),
        vocab=vocab_freq,
Example #8
def train(args):
    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np

    if args.log:
        log_dir = args.log
    else:
        log_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), '{}_{}'.format(DIR_NAME, datetime.now().strftime('%Y%m%d_%H%M')))  # no ':' so the path stays valid on Windows

    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # setting for logging
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    log_path = os.path.join(log_dir, 'log')
    file_handler = logging.FileHandler(log_path)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logger.info('Arguments...')
    for arg, val in vars(args).items():
        logger.info('{} : {}'.format(arg, val))

    logger.info('Loading Vocab...')
    vocab = Vocab()
    vocab.load(args.vocab, args.lowercase)
    vocab.add_special_token()

    sufvocab = Vocab()
    sufvocab.load(args.sufvocab, args.lowercase)
    sufvocab.add_special_token(['s>', '<UNK>'])

    pos2id = Vocab()
    pos2id.load(args.poslist)

    logger.info('preparation for training data...')
    out_path = making_data(args.train_data, args.window)

    model = WordCSnnTagger(args.wembed, args.fembed, args.hidden, len(vocab), len(sufvocab), len(pos2id), args.window, args.objct, args.alpha)
    model.save_model_config(log_dir)

    if args.gpu > -1:
        model.to_gpu()

    opt = getattr(optimizers, args.opt)()
    opt.setup(model)
    opt.add_hook(optimizer.GradientClipping(args.gclip))
    opt.add_hook(optimizer.WeightDecay(args.wdecay))

    for epoch in range(args.epoch):
        logger.info('START epoch {}/{}'.format(epoch + 1, args.epoch))
        start = time.time()
        sum_loss = xp.zeros((), dtype=xp.float32)
        n_data = 0
        n_correct = 0
        for i, [tags, contexts] in enumerate(line_iter(out_path, args.minibatch)):
            batch_ts = xp.array([pos2id[tag] for tag in tags], dtype=xp.int32)
            # Capitalization features must be computed before lowercasing.
            batch_caps = xp.array([[get_capf(word) for word in context] for context in contexts], dtype=xp.int32)
            if args.lowercase:
                contexts = [[word.lower() for word in context] for context in contexts]
            batch_xs = xp.array([[vocab[word] for word in context] for context in contexts], dtype=xp.int32)
            batch_sufs = xp.array([[sufvocab[word[-2:]] for word in context] for context in contexts], dtype=xp.int32)
            batch_features = [batch_xs, batch_sufs, batch_caps]
            cur_batch_size = batch_ts.shape[0]
            ys, loss = model(batch_features, batch_ts)
            sum_loss += loss.data * cur_batch_size
            model.zerograds()
            loss.backward()
            opt.update()
            pred_labels = ys.data.argmax(1)
            n_correct += sum(1 for j in range(cur_batch_size) if pred_labels[j] == batch_ts[j])
            n_data += cur_batch_size
            logger.info('done {} batches'.format(i + 1))
        logger.info('{} epoch train loss = {}'.format(epoch + 1, sum_loss))
        logger.info('{} epoch train accuracy = {}'.format(epoch + 1, n_correct / n_data))
        logger.info('{} sec for training per epoch'.format(time.time() - start))

        if args.valid_data:
            start = time.time()
            valid_loss, valid_accuracy = evaluation(model, args.valid_data, pos2id, vocab, sufvocab, args)
            logger.info('{} epoch valid loss = {}'.format(epoch + 1, valid_loss))
            logger.info('{} epoch valid accuracy = {}'.format(epoch + 1, valid_accuracy))
            logger.info('{} sec for validation per epoch'.format(time.time() - start))

        if args.test_data:
            start = time.time()
            test_loss, test_accuracy = evaluation(model, args.test_data, pos2id, vocab, sufvocab, args)
            logger.info('{} epoch test loss = {}'.format(epoch + 1, test_loss))
            logger.info('{} epoch test accuracy = {}'.format(epoch + 1, test_accuracy))
            logger.info('{} sec for testing per epoch'.format(time.time() - start))

        logger.info('serializing...')
        prefix = '{}_{}ep_{}wembed_{}fembed_{}hidden_{}window_{}minibatch_{}opt'.format(DIR_NAME, epoch + 1, args.wembed, args.fembed, args.hidden, args.window, args.minibatch, args.opt)
        model_path = os.path.join(log_dir, prefix + '.model')
        model.save(model_path)

    logger.info('done training')
Example #9
def main(_):
    # Load the configuration file.
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Create the checkpoint directory if it does not already exist.
    ckpt_dir = os.path.join(config['data']['ckpt'], config['experiment_name'])
    if not os.path.exists(ckpt_dir):
        os.mkdir(ckpt_dir)

    # Check if a pre-existing configuration file exists and matches the current
    # configuration. Otherwise save a copy of the configuration to the
    # checkpoint directory.
    prev_config_path = os.path.join(ckpt_dir, 'config.yaml')
    if os.path.exists(prev_config_path):
        with open(prev_config_path, 'r') as f:
            prev_config = yaml.safe_load(f)
        assert config == prev_config, 'Config does not match the existing experiment config.'
    else:
        shutil.copyfile(FLAGS.config, prev_config_path)

    # Load the vocabularies.
    src_vocab = Vocab.load(config['data']['src']['vocab'])
    tgt_vocab = Vocab.load(config['data']['tgt']['vocab'])

    # Load the training and dev datasets.
    train_data = ShakespeareDataset('train', config, src_vocab, tgt_vocab)
    dev_data = ShakespeareDataset('dev', config, src_vocab, tgt_vocab)

    # Build the model.
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)
    encoder = Encoder(src_vocab_size, config['model']['embedding_dim'])
    decoder = Decoder(tgt_vocab_size, config['model']['embedding_dim'])
    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # Define the loss function + optimizer.
    loss_weights = torch.ones(decoder.tgt_vocab_size)
    loss_weights[0] = 0
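    # Index 0 is assumed to be the padding token; zeroing its weight
    # excludes padding from the NLL loss.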
    if torch.cuda.is_available():
        loss_weights = loss_weights.cuda()
    criterion = torch.nn.NLLLoss(loss_weights)

    learning_rate = config['training']['learning_rate']
    encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                         lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                         lr=learning_rate)

    # Restore saved model (if one exists).
    ckpt_path = os.path.join(ckpt_dir, 'model.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        epoch = ckpt['epoch']
        encoder.load_state_dict(ckpt['encoder'])
        decoder.load_state_dict(ckpt['decoder'])
        encoder_optimizer.load_state_dict(ckpt['encoder_optimizer'])
        decoder_optimizer.load_state_dict(ckpt['decoder_optimizer'])
    else:
        epoch = 0

    train_log_string = '%s :: Epoch %i :: Iter %i / %i :: train loss: %0.4f'
    dev_log_string = '\n%s :: Epoch %i :: dev loss: %0.4f'
    while epoch < config['training']['num_epochs']:

        # Main training loop.
        train_loss = []
        sampler = RandomSampler(train_data)
        for i, train_idx in enumerate(sampler):
            src, tgt = train_data[train_idx]

            # Clear gradients
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # Feed inputs one by one from src into encoder (in reverse).
            src_length = src.size()[0]
            hidden = None
            for j in reversed(range(src_length)):
                encoder_output, hidden = encoder(src[j], hidden)

            # Feed desired outputs one by one from tgt into decoder
            # and measure loss.
            tgt_length = tgt.size()[0]
            loss = 0
            for j in range(tgt_length - 1):
                decoder_output, hidden = decoder(tgt[j], hidden)
                loss += criterion(decoder_output, tgt[j + 1])

            # Backpropagate the loss and update the model parameters.
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            train_loss.append(loss.data.cpu())

            # Every once in a while, check on the loss
            if ((i + 1) % 100) == 0:
                print(train_log_string %
                      (datetime.now(), epoch, i + 1, len(train_data),
                       np.mean(train_loss)),
                      end='\r')
                train_loss = []

        # Evaluation loop.
        dev_loss = []
        for src, tgt in dev_data:

            # Feed inputs one by one from src into encoder (in reverse).
            src_length = src.size()[0]
            hidden = None
            for j in reversed(range(src_length)):
                encoder_output, hidden = encoder(src[j], hidden)

            # Feed desired outputs one by one from tgt into decoder
            # and measure loss.
            tgt_length = tgt.size()[0]
            loss = 0
            for j in range(tgt_length - 1):
                decoder_output, hidden = decoder(tgt[j], hidden)
                loss += criterion(decoder_output, tgt[j + 1])

            dev_loss.append(loss.data.cpu())

        print(dev_log_string % (datetime.now(), epoch, np.mean(dev_loss)))

        state_dict = {
            'epoch': epoch,
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'encoder_optimizer': encoder_optimizer.state_dict(),
            'decoder_optimizer': decoder_optimizer.state_dict()
        }
        torch.save(state_dict, ckpt_path)

        epoch += 1
Example #10
def main(_):
    # Load the configuration file.
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Load the vocabularies.
    src_vocab = Vocab.load(config['data']['src']['vocab'])
    tgt_vocab = Vocab.load(config['data']['tgt']['vocab'])

    # Load the training and dev datasets.
    test_data = ShakespeareDataset('test', config, src_vocab, tgt_vocab)

    # Restore the model.
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    encoder = Encoder(src_vocab_size, config['model']['embedding_dim'],
                      config['model']['bidirection'],
                      config['model']['dropout'], config['model']['layer'],
                      config['model']['mode'])
    decoder = Decoder(tgt_vocab_size, config['model']['embedding_dim'],
                      config['model']['bidirection'],
                      config['model']['dropout'], config['model']['layer'],
                      config['model']['mode'])

    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    ckpt_path = os.path.join(config['data']['ckpt'], config['experiment_name'],
                             'model.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        encoder.load_state_dict(ckpt['encoder'])
        decoder.load_state_dict(ckpt['decoder'])
    else:
        print('Unable to find checkpoint. Terminating.')
        sys.exit(1)
    encoder.eval()
    decoder.eval()

    # Initialize translator.
    greedy_translator = GreedyTranslator(encoder, decoder, tgt_vocab)

    # Qualitative evaluation - print translations for the first ten sentences
    # in the test corpus.
    for i in range(10):
        src, tgt = test_data[i]
        translation = greedy_translator(src)
        src_sentence = [src_vocab.id2word(id) for id in src.data.cpu().numpy()]
        tgt_sentence = [tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        print('---')
        print('Source: %s' % ' '.join(src_sentence))
        print('Ground truth: %s' % ' '.join(tgt_sentence))
        print('Model output: %s' % ' '.join(translated_sentence))
    print('---')

    # Quantitative evaluation - compute corpus level BLEU scores.
    hypotheses = []
    references = []
    for src, tgt in test_data:
        translation = greedy_translator(src)
        tgt_sentence = [tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        # Remove start and end of sentence tokens.
        tgt_sentence = tgt_sentence[1:-1]
        translated_sentence = translated_sentence[1:-1]
        # The model output is the hypothesis; the ground truth is the reference.
        hypotheses.append(translated_sentence)
        references.append([tgt_sentence])
    print("Corpus BLEU score: %0.4f" % corpus_bleu(references, hypotheses))
Example #11
def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Get the directory paths
    ckpt_dir = os.path.join(config['training']['ckpt_dir'],
                            config['experiment_name'])
    summary_dir = os.path.join(config['training']['summary_dir'],
                               config['experiment_name'])

    # Create the directories if they do not already exist
    if not os.path.exists(ckpt_dir):
        logging.info('Creating checkpoint directory: `%s`.' % ckpt_dir)
        os.makedirs(ckpt_dir)
    if not os.path.exists(summary_dir):
        logging.info('Creating summary directory: `%s`.' % summary_dir)
        os.makedirs(summary_dir)

    # Check for conflicting configurations
    safe_copy_config(config, FLAGS.force_overwrite)

    # Init summary writer
    summary_writer = SummaryWriter(summary_dir)

    # Load vocab and datasets
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)
    logging.info('Loading train and valid data.')
    train_data = TextDataset(config['data']['train'],
                             vocab=vocab,
                             max_length=config['training']['max_length'])
    valid_data = TextDataset(config['data']['valid'],
                             vocab=vocab,
                             max_length=config['training']['max_length'])

    # Initialize models
    logging.info('Initializing the inference network and generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Setup model optimizers
    optimizer_in = torch.optim.Adam(inference_network.parameters(),
                                    lr=config['training']['learning_rate'])
    optimizer_gm = torch.optim.Adam(generative_model.parameters(),
                                    lr=config['training']['learning_rate'])

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        epoch = checkpoint['epoch']
        t = checkpoint['t']
        best_loss = checkpoint['best_loss']
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
        optimizer_in.load_state_dict(checkpoint['optimizer_in'])
        optimizer_gm.load_state_dict(checkpoint['optimizer_gm'])
    else:
        logging.info('No existing checkpoint found.')
        epoch = 0
        t = 0
        best_loss = float('inf')

    # Start train
    weight = torch.ones(len(vocab))
    weight[vocab.unk_idx] = config['training']['unk_weight']
    if torch.cuda.is_available():
        weight = weight.cuda()
    while epoch < config['training']['epochs']:
        logging.info('Starting epoch - %i.' % epoch)

        inference_network.train()
        generative_model.train()

        # Training step
        logging.info('Start train step.')
        train_loader = DataLoader(
            dataset=train_data,
            batch_size=config['training']['batch_size'],
            shuffle=True,
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available())

        # Init train summaries
        train_nll = 0.0
        train_kl = 0.0
        train_loss = 0.0

        for batch in train_loader:

            optimizer_in.zero_grad()
            optimizer_gm.zero_grad()

            x = batch['input']
            target = batch['target']
            lengths = batch['lengths']
            if torch.cuda.is_available():
                x = x.cuda()
                target = target.cuda()
                lengths = lengths.cuda()

            # Forward pass of inference network
            z, kl = inference_network(x, lengths)

            # Teacher forcing
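            # (`word_dropout` is defined elsewhere in this codebase; the
            # assumed behavior is that it replaces input tokens with
            # vocab.unk_idx at the given rate, weakening the decoder's
            # reliance on ground-truth history.)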
            x_hat = word_dropout(x, config['training']['word_dropout_rate'],
                                 vocab.unk_idx)
            logp, _ = generative_model(z, x_hat, lengths)

            # Obtain current value of the annealing constant with beta trick
            beta = get_beta(config, epoch)
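            # (`get_beta` is not shown in this snippet. A minimal sketch under
            # the common linear KL-annealing scheme, with an assumed config
            # key `anneal_epochs`, would be:
            #     def get_beta(config, epoch):
            #         return min(1.0, epoch / config['training']['anneal_epochs'])
            # )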

            # Compute annealed loss
            length = logp.shape[1]
            logp = logp.view(-1, len(vocab))
            target = target[:,:length].contiguous().view(-1)
            nll = F.nll_loss(logp, target,
                             ignore_index=vocab.pad_idx,
                             weight=weight,
                             size_average=False)
            loss = nll + beta * kl

            # Update summaries
            train_nll += nll.data
            train_kl += kl.data
            train_loss += loss.data

            # Backpropagate gradients
            batch_size = config['training']['batch_size']
            loss /= batch_size
            kl /= batch_size
            nll /= batch_size
            loss.backward()
            optimizer_in.step()
            optimizer_gm.step()

            # Log
            if not t % config['training']['log_frequency']:
                # Note: the logged train loss covers only a single batch - see
                # tensorboard for the per-epoch summary.
                line = 'Iteration: %i - Loss: %0.4f. - KL: %0.4f - NLL: %0.4f'
                logging.info(line % (t, loss.data, kl.data, nll.data))

                # Print a greedy sample
                z_k, _ = inference_network(x, lengths)
                _, sample = generative_model(z_k)
                example = [vocab.id2word(int(x)) for x in sample[0]]
                try:
                    T = example.index(vocab.eos_token)
                    example = example[:T]
                except ValueError:
                    pass
                example = ' '.join(example)
                logging.info('Example - `%s`' % example)

            t += 1

        # Validation step
        logging.info('Start valid step.')
        valid_loader = DataLoader(
            dataset=valid_data,
            batch_size=config['training']['batch_size'],
            shuffle=False,
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available())

        # Init valid summaries
        valid_nll = 0.0
        valid_kl = 0.0
        valid_loss = 0.0

        for batch in valid_loader:

            x = batch['input']
            target = batch['target']
            lengths = batch['lengths']
            if torch.cuda.is_available():
                x = x.cuda()
                target = target.cuda()
                lengths = lengths.cuda()

            # Forward pass of inference network
            z, kl = inference_network(x, lengths)

            # Teacher forcing
            logp, _ = generative_model(z, x, lengths)

            # Compute annealed loss
            length = logp.shape[1]
            logp = logp.view(-1, len(vocab))
            target = target[:,:length].contiguous().view(-1)
            nll = F.nll_loss(logp, target, ignore_index=vocab.pad_idx,
                             size_average=False)
            loss = nll + kl

            # Update summaries
            valid_nll += nll.data
            valid_kl += kl.data
            valid_loss += loss.data

        # Normalize losses
        train_nll /= len(train_data)
        train_kl /= len(train_data)
        train_loss /= len(train_data)
        valid_nll /= len(valid_data)
        valid_kl /= len(valid_data)
        valid_loss /= len(valid_data)

        # Tensorboard logging
        summary_writer.add_scalar("elbo/train", train_loss.data, epoch)
        summary_writer.add_scalar("kl/train", train_kl.data, epoch)
        summary_writer.add_scalar("nll/train", train_nll.data, epoch)
        summary_writer.add_scalar("elbo/val", valid_loss.data, epoch)
        summary_writer.add_scalar("kl/val", valid_kl.data, epoch)
        summary_writer.add_scalar("nll/val", valid_nll.data, epoch)

        # Save checkpoint
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        save_checkpoint({
            'epoch': epoch + 1,
            't': t,
            'best_loss': best_loss,
            'state_dict_in': inference_network.state_dict(),
            'state_dict_gm': generative_model.state_dict(),
            'optimizer_in': optimizer_in.state_dict(),
            'optimizer_gm': optimizer_gm.state_dict()
        }, is_best, ckpt)

        epoch += 1
Example #12
def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Get the checkpoint path
    ckpt_dir = os.path.join(config['training']['ckpt_dir'],
                            config['experiment_name'])

    # Load vocab and datasets
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)
    logging.info('Loading test data.')
    test_data = TextDataset(config['data']['test'],
                            vocab=vocab,
                            max_length=config['training']['max_length'])
    test_loader = DataLoader(dataset=test_data,
                             batch_size=config['training']['batch_size'],
                             shuffle=False,
                             num_workers=cpu_count(),
                             pin_memory=torch.cuda.is_available())

    # Initialize models
    logging.info('Initializing the inference network and generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt.best')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
    else:
        logging.error('No model checkpoint found. Terminating.')
        sys.exit(1)

    # Init test summaries
    test_nll = 0.0
    test_kl = 0.0
    test_loss = 0.0
    test_suml2p = 0.0
    test_n = 0.0

    # Evaluate
    inference_network.eval()
    generative_model.eval()

    for batch in test_loader:

        x = batch['input']
        target = batch['target']
        lengths = batch['lengths']
        if torch.cuda.is_available():
            x = x.cuda()
            target = target.cuda()
            lengths = lengths.cuda()

        # Forward pass of inference network
        z, kl = inference_network(x, lengths)

        # Teacher forcing
        logp, _ = generative_model(z, x, lengths)

        # Compute loss
        length = logp.shape[1]
        logp = logp.view(-1, len(vocab))
        target = target[:, :length].contiguous().view(-1)
        nll = F.nll_loss(logp,
                         target,
                         ignore_index=vocab.pad_idx,
                         size_average=False)
        loss = nll + kl
        l2p, n = suml2p(logp, target, vocab.pad_idx)
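        # (`suml2p` is defined elsewhere; given the perplexity computation at
        # the end of this function, it is assumed to return the sum of log2
        # probabilities of the non-pad target tokens and the token count.)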

        # Update summaries
        test_nll += nll.data
        test_kl += kl.data
        test_loss += loss.data
        test_suml2p += l2p.data
        test_n += n

    # Normalize losses
    test_nll /= len(test_data)
    test_kl /= len(test_data)
    test_loss /= len(test_data)
    H = -test_suml2p / test_n
    test_perplexity = 2**H

    # Log output
    logging.info('NLL: %0.4f' % test_nll)
    logging.info('KL: %0.4f' % test_kl)
    logging.info('ELBO: %0.4f' % test_loss)
    logging.info('Perplexity: %0.4f' % test_perplexity)
Example #13
def main(matrix=False):
    # Load the configuration file.
    with open('config.yaml', 'r') as f:
        config = yaml.safe_load(f)

    # Load the vocabularies.
    src_vocab = Vocab.load(config['data']['src']['vocab'])
    tgt_vocab = Vocab.load(config['data']['tgt']['vocab'])

    # Load the training and dev datasets.
    test_data = ShakespeareDataset('test', config, src_vocab, tgt_vocab)

    # Restore the model.
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)
    import pickle
    if matrix:
        with open('attention_mat.pkl', 'rb') as f:
            attention_matrix = pickle.load(f)
        for i in range(10):
            src, tgt = test_data[i]
            decoder_attn = attention_matrix[i]
            src_sentence = [
                src_vocab.id2word(id) for id in src.data.cpu().numpy()
            ]
            tgt_sentence = [
                tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()
            ]

            src_sentence_ = ' '.join(src_sentence)
            tgt_sentence_ = ' '.join(tgt_sentence)
            show_attention(src_sentence_, tgt_sentence_, decoder_attn)
        return

    encoder = EncoderRNN(src_vocab_size, config['model']['embedding_dim'],
                         config['model']['layer'])
    attn = 'general'
    decoder = AttnDecoderRNN(attn, config['model']['embedding_dim'],
                             tgt_vocab_size, config['model']['layer'])

    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    ckpt_path = os.path.join(config['data']['ckpt'], config['experiment_name'],
                             'model.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        encoder.load_state_dict(ckpt['encoder'])
        decoder.load_state_dict(ckpt['decoder'])
    else:
        print('Unable to find checkpoint. Terminating.')
        sys.exit(1)
    encoder.eval()
    decoder.eval()

    # Initialize translator.
    greedy_translator = GreedyTranslator(encoder, decoder, tgt_vocab)

    # Qualitative evaluation - print translations for the first ten sentences
    # in the test corpus.

    attention_matrix = []
    for i in range(10):
        src, tgt = test_data[i]
        translation, decoder_attn = greedy_translator(src)
        attention_matrix.append(decoder_attn.numpy())
        src_sentence = [src_vocab.id2word(id) for id in src.data.cpu().numpy()]
        tgt_sentence = [tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        print('---')
        print('Source: %s' % ' '.join(src_sentence))
        print('Ground truth: %s' % ' '.join(tgt_sentence))
        print('Model output: %s' % ' '.join(translated_sentence))
    print('---')
    with open('attention_mat.pkl', 'wb') as f:
        pickle.dump(attention_matrix, f)

    # Quantitative evaluation - compute corpus level BLEU scores.
    hypotheses = []
    references = []
    for src, tgt in test_data:
        translation, decoder_attn = greedy_translator(src)
        tgt_sentence = [tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        # Remove start and end of sentence tokens.
        tgt_sentence = tgt_sentence[1:-1]
        translated_sentence = translated_sentence[1:-1]
        # The model output is the hypothesis; the ground truth is the reference.
        hypotheses.append(translated_sentence)
        references.append([tgt_sentence])
    print("Corpus BLEU score: %0.4f" % corpus_bleu(references, hypotheses))
Example #14
def train(args):
    vocab = Vocab.load(args.vocab, max_size=args.vocab_size)
    data_reader = DataReader(data_dir=args.data_dir, shuffle=True)
    preprocessor = Preprocessor(
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next,
        vocab=vocab, max_length=args.max_length, gpu=args.gpu)
    model = SkipThought(
        rnn_type=args.rnn_type, num_words=len(vocab),
        word_dim=args.word_dim, hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next)
    print(model)

    if args.pretrained is not None:
        print(f'Loading pretrained model from {args.pretrained}')
        model.load_state_dict(
            torch.load(args.pretrained,
                       map_location=lambda storage, loc: storage))
    if args.gpu > -1:
        model.cuda(args.gpu)
    optimizer = optim.Adam(model.parameters())

    summary_writer = SummaryWriter(os.path.join(args.save_dir, 'log'))

    def add_scalar_summary(name, value, step):
        summary_writer.add_scalar(tag=name, scalar_value=value,
                                  global_step=step)

    def add_text_summary(name, value, step):
        summary_writer.add_text(tag=name, text_string=value,
                                global_step=step)

    def variable(tensor, volatile=False):
        return Variable(tensor, volatile=volatile)

    def run_train_iter(batch):
        if not model.training:
            model.train()
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0]), tgt[k][1])
        logits = model.forward(src=src, tgt=tgt)
        loss = 0
        for k in tgt:
            logits_k = logits[k]
            tgt_k = tgt[k]
            loss = loss + basic.sequence_cross_entropy(
                logits=logits_k[:-1], targets=tgt_k[0][1:],
                length=tgt_k[1] - 1)
        optimizer.zero_grad()
        loss.backward()
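        # Clip the global gradient norm to 10 to keep RNN training stable.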
        clip_grad_norm(model.parameters(), max_norm=10)
        optimizer.step()
        return loss.data[0]

    def ids_to_words(ids):
        words = []
        eos_id = vocab.stoi(vocab.eos)
        for id_ in ids:
            words.append(vocab.itos(id_))
            if id_ == eos_id:
                break
        return words

    def generate_using_decoder(name, src, max_length):
        _, encoder_state = model.encoder(words=src[0], length=src[1])
        if isinstance(encoder_state, tuple):  # LSTM
            encoder_state = encoder_state[0]
        context = (encoder_state.transpose(0, 1).contiguous()
                   .view(-1, args.hidden_dim))
        batch_size = src[1].size(0)

        bos_id = vocab.stoi(vocab.bos)
        eos_id = vocab.stoi(vocab.eos)
        bos = Variable(src[1].new(1, batch_size).fill_(bos_id))
        decoder = model.get_decoder(name)
        prev_pred = bos
        done = torch.zeros(batch_size).byte()
        hyps = []
        prev_state = context.unsqueeze(0)
        for t in range(max_length):
            if done.all():
                break
            decoder_input = prev_pred
            logit, prev_state = decoder(words=decoder_input,
                                        prev_state=prev_state)
            pred = logit.max(2)[1]
            prev_pred = pred
            hyps.append(pred.data)
            # Mark sequences that emitted <eos> as finished; without this
            # update, the `done.all()` early exit above can never trigger.
            done = done | (pred.data[0].cpu() == eos_id)
        hyps = torch.cat(hyps, dim=0).transpose(0, 1).tolist()
        return hyps

    def generate(batch):
        # Greedy search
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0], volatile=True), tgt[k][1])
        batch_size = src[0].size(1)
        max_length = src[0].size(0) * 2
        generated = {}
        for k in tgt:
            generated[k] = generate_using_decoder(
                name=k, src=src, max_length=max_length)
        results = []
        for i in range(batch_size):
            res = {'src': ' '.join(ids_to_words(src[0][:src[1][i], i].data)),
                   'tgt': {},
                   'out': {}}
            for k in tgt:
                res['tgt'][k] = ' '.join(ids_to_words(tgt[k][0][1:, i].data))
                res['out'][k] = ' '.join(ids_to_words(generated[k][i]))
            results.append(res)
        return results

    def generate_synthetic_batch(real_batch):
        def sort_by_length(tgt_of_key):
            sorted_length, sort_inds = tgt_of_key[1].sort(
                dim=0, descending=True)
            return tgt_of_key[0][:, sort_inds], sorted_length

        # Forward: given prev, generate cur'
        _, tgt = preprocessor(real_batch)
        tgt_prev, tgt_prev_length = sort_by_length(tgt['prev'])
        syn_src_fw = generate_using_decoder(
            name='next',
            src=(variable(tgt_prev[1:], volatile=True),
                 tgt_prev_length - 1),
            max_length=args.max_length)
        # Backward: given next, generate cur''
        tgt_next, tgt_next_length = sort_by_length(tgt['next'])
        syn_src_bw = generate_using_decoder(
            name='prev',
            src=(variable(tgt_next[1:], volatile=True),
                 tgt_next_length - 1),
            max_length=args.max_length)
        syn_batch_fw = []
        syn_batch_bw = []
        for i in range(len(real_batch)):
            syn_src_fw_str = ' '.join(ids_to_words(syn_src_fw[i]))
            syn_src_bw_str = ' '.join(ids_to_words(syn_src_bw[i]))
            syn_batch_fw.append(
                (real_batch[i][0], syn_src_fw_str, real_batch[i][2]))
            syn_batch_bw.append(
                (real_batch[i][0], syn_src_bw_str, real_batch[i][2]))
        return syn_batch_fw, syn_batch_bw

    global_step = 0

    def print_samples():
        model.eval()
        num_samples = 2
        samples = data_reader.next_batch(size=num_samples, peek=True)
        syn_samples_fw, syn_samples_bw = generate_synthetic_batch(samples)
        gen_results = generate(samples)
        syn_gen_results_fw = generate(syn_samples_fw)
        syn_gen_results_bw = generate(syn_samples_bw)
        text_val = ''
        for i, res in enumerate(gen_results):
            text_val += f'* sample (real) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_fw):
            text_val += f'* sample (syn_fw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_bw):
            text_val += f'* sample (syn_bw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        add_text_summary('Sample', value=text_val, step=global_step)

    for epoch in range(args.max_epoch):
        data_reader.start_epoch()
        for batch in tqdm(data_reader.iterator(args.batch_size),
                          desc=f'Epoch {epoch}'):
            # Train on real batch
            real_loss = run_train_iter(batch)
            # Train on synthetic batches
            syn_batch_fw, syn_batch_bw = generate_synthetic_batch(batch)
            syn_loss_fw = run_train_iter(syn_batch_fw)
            syn_loss_bw = run_train_iter(syn_batch_bw)
            global_step += 1
            add_scalar_summary(name='real_loss', value=real_loss,
                               step=global_step)
            add_scalar_summary(name='syn_loss_fw', value=syn_loss_fw,
                               step=global_step)
            add_scalar_summary(name='syn_loss_bw', value=syn_loss_bw,
                               step=global_step)
            if global_step % args.print_every == 0:
                print_samples()
            if global_step % args.save_every == 0:
                model_filename = f'model-{global_step}.pt'
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                print(f'\nIter #{global_step}: '
                      f'Saved checkpoint to {model_path}')
Example #15
import solvers
from utils import load_pickle, PAD_TOKEN, UNK_TOKEN, EOS_TOKEN, SOS_TOKEN, SEP_TOKEN, EOS_ID
import torch
import sentencepiece as spm
from transformers import OpenAIGPTTokenizer, GPT2Tokenizer
import os

if __name__ == '__main__':
    config = get_config(mode='train')
    val_config = get_config(mode='valid')
    with open(os.path.join(config.save_path, 'config.txt'), 'w') as f:
        print(config, file=f)

    if config.data_name == "cornell":
        vocab = Vocab()
        vocab.load(config.word2id_path, config.id2word_path, ptb=(config.model == "PTB"))
        config.vocab_size = vocab.vocab_size
        config.pad_id = vocab.pad_id
        config.eos_id = EOS_ID

        print(f'Vocabulary size: {vocab.vocab_size}')

        if config.users:
            train_users = load_pickle(config.convs_users_path)
            config.user_size = max([x for xx in train_users for x in xx]) + 1
            print(f'User size: {config.user_size}')
            eval_users = load_pickle(val_config.convs_users_path)
        else:
            train_users = None
            eval_users = None