Example no. 1
def train_session():
    """ Executes a training session on the LM. """
    # Clear the default graph within which the model graph is constructed
    tf.reset_default_graph()
    # Load data
    train_data = load_pickle(train_pickle)
    valid_data = load_pickle(valid_pickle)
    # Construct the model graph
    cog_lm = CogLM(vocab, train_opt, 'cog_lm')
    # Declare OP for initializing model variables
    init_op = tf.global_variables_initializer()
    # Time training duration
    starting_time = time.time()

    with tf.Session(config=config) as train_sess:
        # Initialize variables
        train_sess.run(init_op)
        # Initialize LM trainer
        trainer = CogLMTrainer(vocab, train_opt, cog_lm, train_sess,
                               train_data, valid_data)
        # Train model (either for a predefined number of epochs or until early stopping)
        print('+++TRAINING+++')
        trainer.train_model()

    # Report training duration
    elapsed = time.time() - starting_time
    logging.info(
        'Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
            int(elapsed // 3600),
            int((elapsed % 3600)) // 60, elapsed % 60))
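
Every session in this listing assumes two small helpers, load_pickle and load_model, which are not shown here. A minimal sketch of what they might look like; the checkpoint file-name pattern is an assumption for illustration, not the project's confirmed convention:

import os
import pickle


def load_pickle(path):
    """ Loads a pickled object from the given path. """
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_model(session, saver, ckpt_dir, epoch):
    """ Restores saved parameters into the active session; the
    '{epoch}_model.ckpt' naming below is illustrative only. """
    ckpt_path = os.path.join(ckpt_dir, '{:s}_model.ckpt'.format(str(epoch)))
    saver.restore(session, ckpt_path)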
Example no. 2
def train_session():
    """ Executes a training session on the SAE model. """
    # Clear the default graph within which the model graph is constructed
    tf.reset_default_graph()
    # Load data
    train_data = load_pickle(train_pickle)
    valid_data = load_pickle(valid_pickle)
    # Construct the model graph
    sent_sim_class = SentSimClassifier(vocab, train_opt, 'ssc')
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    # Declare OP for initializing model variables
    init_op = tf.global_variables_initializer()

    # During domain adaptation, restore learned SSC parameters with the exception of the embeddings,
    # which are extracted from the pre-trained IDGAN-internal LM
    restored_vars = [var for var in all_vars if 'embedding_table' not in var.name]
    pre_train_saver = tf.train.Saver(restored_vars)
    embeddings_ssc_keys = [var.name for var in all_vars if var not in restored_vars and 'optimization' not in var.name]
    embedding_lm_keys = list()
    # Handle scoping discrepancies between SSC and LM checkpoints, to make LM variables compatible with the SSC graph
    for k in embeddings_ssc_keys:
        k = k.replace('ssc', 'cog_lm')
        k = k.replace('encoder/embeddings', 'embeddings')
        k = k.replace('Adam', 'optimizer')
        k = k.split(':')[0]
        embedding_lm_keys.append(k)
    embeddings_dir = os.path.join(train_opt.root_dir, 'cognitive_language_model/src/checkpoints/')
    embeddings_epoch = 'best'
    # Map SSC embedding variables to LM embedding variables,
    # so that the former may be initialized with values extracted from the latter
    embeddings_dict = {embedding_lm_keys[i]: [v for v in tf.global_variables() if v.name == embeddings_ssc_keys[i]][0]
                       for i in range(len(embedding_lm_keys))}
    # Declare saver object for initializing the SSC's embedding table with embeddings learned by IDGAN's LM
    embeddings_saver = tf.train.Saver(embeddings_dict)
    # Time training duration
    starting_time = time.time()

    with tf.Session(config=config) as train_sess:
        if train_opt.pre_train:
            # Initialize variables
            train_sess.run(init_op)
        else:
            # Restore pre-trained model parameters for domain adaptation (sans embedding table)
            load_model(train_sess, pre_train_saver, os.path.join(train_opt.save_dir, 'pre_training'), 'best')
            # Restore embedding parameters from the specified LM checkpoint
            load_model(train_sess, embeddings_saver, embeddings_dir, embeddings_epoch)

        # Initialize SSC trainer
        trainer = SentSimClassTrainer(vocab, train_opt, sent_sim_class, train_sess, train_data, valid_data)
        # Train model (either for a predefined number of epochs or until early stopping)
        print('+++TRAINING+++')
        trainer.train_model()

    # Report training duration
    elapsed = time.time() - starting_time
    logging.info('Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
        int(elapsed // 3600), int((elapsed % 3600)) // 60, elapsed % 60))
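
The embedding restoration above (and again in Example no. 14) relies on tf.train.Saver accepting a dictionary that maps checkpoint variable names to graph variables. A minimal sketch of that mechanism, with illustrative names and shapes rather than the project's actual ones:

import tensorflow as tf

tf.reset_default_graph()
# A table saved under 'cog_lm/embeddings/embedding_table' can initialize a
# differently scoped variable, here 'ssc/encoder/embeddings/embedding_table'.
table = tf.Variable(tf.zeros([10000, 300]),
                    name='ssc/encoder/embeddings/embedding_table')
restore_map = {'cog_lm/embeddings/embedding_table': table}
embeddings_saver = tf.train.Saver(restore_map)
# with tf.Session() as sess:
#     embeddings_saver.restore(sess, '/path/to/best_cog_lm.ckpt')  # path illustrative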
Example no. 3
def test_session(batch_size=1, target_epoch='best', beam_decoding=False):
    """ Executes a quick test session on the SAE model by sampling a small quantity of items from the test set
    and using the model to first compress them into a meaningful representation and subsequently reconstruct them. """
    # Clear the default graph
    tf.reset_default_graph()
    # Declare the batch size, if left unspecified in test options
    if test_opt.batch_size is None:
        test_opt.batch_size = batch_size
    # Load test data
    test_data = load_pickle(test_pickle)
    # Build model graph
    autoencoder = SeqAE(vocab, test_opt,
                        'seq_ae' + '_{:s}'.format(train_opt.train_id))
    # Declare saver object for restoring learned model parameters
    test_saver = tf.train.Saver()

    # Initiate testing session
    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, test_opt.save_dir, target_epoch)
        # Initialize model interface containing inference methods
        interface = SeqAEInterface(autoencoder, vocab, test_sess, test_opt)
        # Sample candidate sentences from the test set
        samples = np.random.choice(test_data, test_opt.num_samples).tolist()
        while max([len(sample.split()) for sample in samples]) > 100:
            samples = np.random.choice(test_data,
                                       test_opt.num_samples).tolist()
        # Initialize a loader object to pre-process the sampled sentences
        sample_loader = DataServer(samples, vocab, test_opt)
        samples_read = 0

        print('Sampled sentences:')
        for i, s in enumerate(samples):
            print('{:d}: {:s}'.format(i, s))
        print('-' * 10 + '\n')

        if not beam_decoding:
            # Perform greedy encoding-decoding
            print('Greedy decoding:')
            for i, sample_data in enumerate(sample_loader):
                _, enc_input, dec_input = sample_data
                generated = interface.greedy_generation(enc_input, dec_input)
                for j in range(test_opt.batch_size):
                    print('Encoded: {:s}\nDecoded: {:s}\n'.format(
                        samples[samples_read + j], generated[j]))
                samples_read += test_opt.batch_size
        else:
            # Perform encoding-decoding with beam-search (limited use for reconstruction)
            assert (
                test_opt.batch_size == 1
            ), 'Beam search not defined for batches with more than one element.'
            print('Beam search decoding:')
            for i, sample_data in enumerate(sample_loader):
                _, enc_input, _ = sample_data
                print('Encoded: {:s}'.format(samples[i]))
                interface.beam_generation(enc_input, print_results=True)

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('Auto-encoder evaluation completed!')
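
The beam_generation method is not shown in this listing; for orientation, a minimal log-space beam search over a generic next-token scoring function could look like this (a sketch under assumed interfaces, not the project's implementation):

import numpy as np


def beam_search(step_fn, start_token, eos_token, beam_width=4, max_len=20):
    """ Sketch: step_fn(prefix) returns a vector of next-token
    log-probabilities for the given token prefix. """
    beams = [([start_token], 0.0)]
    complete = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            log_probs = step_fn(prefix)
            # Expand each beam with its beam_width best continuations
            for token in np.argsort(log_probs)[-beam_width:]:
                candidates.append((prefix + [int(token)],
                                   score + float(log_probs[token])))
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for prefix, score in candidates[:beam_width]:
            if prefix[-1] == eos_token:
                complete.append((prefix, score))
            else:
                beams.append((prefix, score))
        if not beams:
            break
    return sorted(complete + beams, key=lambda c: c[1], reverse=True)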
Example no. 4
def score_corpus():
    """ Executes a session during which the source corpus is annotated with sentence-wise model perplexity scores. """
    # Clear the default graph
    tf.reset_default_graph()
    # Declare path leading to corpus to be scored
    scored_path = os.path.join(data_dir, '{:s}.txt'.format(scored_name))
    ppx_scores = list()
    # Load data
    full_data = load_pickle(full_pickle)
    # Build model graph
    cog_lm = CogLM(vocab, test_opt, 'cog_lm')
    # Declare saver object for restoring learned model parameters
    sort_saver = tf.train.Saver()
    # Time the duration of the scoring process
    starting_time = time.time()

    with tf.Session(config=config) as sort_sess:
        # Load learned model parameters
        load_model(sort_sess, sort_saver, test_opt.save_dir, 'best')
        # Initialize LM interface
        interface = CogLMInterface(cog_lm, vocab, sort_sess, test_opt)
        # Run the scoring loop
        pos = 0
        with codecs.open(scored_path, 'w') as out_file:
            while pos < len(full_data):
                # Fill a single batch of sentences to be scored;
                # slicing yields a shorter final batch automatically
                batch = full_data[pos:pos + test_opt.batch_size]
                pos += len(batch)
                # Get sentence-wise model perplexity scores
                batch_ppx = interface.get_sequence_perplexity(batch)
                # Write the scored sentences to file
                for i in range(len(batch)):
                    sentence_ppx = batch_ppx[i, :].tolist()[0]
                    scored_sent = '{:s}\t{:.4f}\n'.format(
                        batch[i], sentence_ppx)
                    out_file.write(scored_sent)
                    # Keep track of corpus-wide statistics
                    ppx_scores.append(sentence_ppx)

    # Archive corpus statistics
    with open(lm_notes, 'w') as notes_file:
        notes_file.write(
            '------------ Scored Corpus Statistics -------------\n')
        notes_file.write('Metric\tMean\tMedian\n')
        notes_file.write('Sentence Perplexity\t{:.4f}\t{:.4f}\n'.format(
            np.mean(ppx_scores), np.median(ppx_scores)))

    # Report scoring duration
    elapsed = time.time() - starting_time
    print('Scoring took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
        int(elapsed // 3600),
        int((elapsed % 3600)) // 60, elapsed % 60))
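
For reference, sentence-level perplexity is conventionally the exponentiated mean negative log-probability of the sequence; a toy illustration (not the CogLMInterface implementation, whose exact definition is not shown here):

import numpy as np


def sequence_perplexity(word_log_probs):
    """ Perplexity as exp of the mean negative log-probability
    (natural log assumed). """
    word_log_probs = np.asarray(word_log_probs, dtype=np.float64)
    return float(np.exp(-np.mean(word_log_probs)))


# sequence_perplexity(np.log([0.2, 0.1, 0.05])) -> 10.0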
Example no. 5
def test_session(target_epoch='best'):
    """ Evaluates the accuracy of the learned SSC model by using it to predict the similarity score of
    sentence pairs contained within the specified test set. """
    # Clear the default graph
    tf.reset_default_graph()
    # Load data
    test_data = load_pickle(test_pickle)
    # Build model graph
    sent_sim_class = SentSimClassifier(vocab, test_opt, 'ssc')
    # Declare saver
    test_saver = tf.train.Saver()
    save_dir = train_opt.save_dir

    # Initiate testing session
    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, save_dir, target_epoch)
        # Initialize model interface
        interface = SentSimClassInterface(sent_sim_class, vocab, test_sess, test_opt)
        # Initialize a loader object to pre-process and serve items drawn from the source corpus
        sample_loader = DataServer(test_data, vocab, test_opt)
        # Evaluate model's performance on a withheld test corpus to estimate its capacity for generalization beyond
        # seen data
        # Track prediction accuracy and the divergence of predicted similarity scores from target values
        total_error = 0.0
        total_differential = 0.0
        total_items = 0

        for i, test_batch in enumerate(sample_loader):
            # Obtain model predictions for the current test batch
            predictions, prediction_error = interface.infer_step(test_batch)
            total_error += np.sum(np.abs(prediction_error))
            try:
                for j in range(test_opt.batch_size):
                    cj = total_items + j
                    differential = np.abs(np.subtract(float(test_data[1][cj]), predictions[j][0]))
                    total_differential += differential
                    # Report model prediction and error
                    print('Sentence 1: {:s}\nSentence 2: {:s}\n'
                          'True score: {:.4f} | Model Prediction: {:.4f} | Differential: {:.4f}'
                          .format(test_data[0][cj][0], test_data[0][cj][1], float(test_data[1][cj]), predictions[j][0],
                                  differential))
                    print('-' * 10)
                total_items += test_opt.batch_size
            except IndexError:
                break
        # Report test corpus statistics
        print('Total model error: {:.4f} | Average model error: {:.4f} | Average differential: {:.4f}'.format(
            total_error, total_error / total_items, total_differential / total_items))

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('Sentence similarity classifier evaluation completed!')
Example no. 6
def make_embedding_dict(opt, dict_pkl):
    """ Creates a dictionary with vocabulary items designated as keys and their embeddings as learned by an LM
    as values. """
    # Declare the source vocabulary path
    vocab_pkl = os.path.join(opt.root_dir,
                             'data/europarl/europarl_v7_train_vocab.pkl')
    # Declare the path to an LM checkpoint containing the learned embeddings
    embeddings_ckpt = os.path.join(opt.local_dir,
                                   'checkpoints/best_cog_lm.ckpt')

    # Extract embeddings from the checkpoint file
    vocab = load_pickle(vocab_pkl)
    reader = pywrap_tensorflow.NewCheckpointReader(embeddings_ckpt)
    embedding_table = reader.get_tensor('embeddings/embedding_table')

    # Construct the embedding dictionary and pickle it for future access
    embedding_dict = {
        vocab.index_to_word[idx]: embedding_table[[idx], :]
        for idx in range(vocab.n_words)
    }
    with open(dict_pkl, 'wb') as out_file:
        pickle.dump(embedding_dict, out_file)
    print('Embedding dictionary created and pickled!')
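
A possible use of the pickled dictionary is nearest-neighbor lookup in the embedding space; a sketch, assuming sklearn's cosine_similarity and the fact that each dictionary value has shape [1, embedding_dim]:

from sklearn.metrics.pairwise import cosine_similarity


def nearest_neighbor(embedding_dict, query_word):
    """ Returns the vocabulary item closest to query_word in embedding
    space, excluding the query itself. """
    query_vec = embedding_dict[query_word]
    best_word, best_sim = None, -1.0
    for word, vec in embedding_dict.items():
        if word == query_word:
            continue
        sim = cosine_similarity(query_vec, vec)[0][0]
        if sim > best_sim:
            best_word, best_sim = word, sim
    return best_word, best_sim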
Example no. 7
def test_to_file(batch_size=1, target_epoch='best', beam_decoding=True):
    """ Executes a comprehensive test session on the entire test corpus;
    output is written to file for the calculation of the achieved corpus-wide ID reduction and
    the BLEU score between source sentences and their ID-reduced translations. """
    def _reconstruct_input(input_array):
        """ Reconstructs input sentences from numpy arrays; used to derive an accurate representation of the
        pre-processed, encoded sequences. """
        # Convert input array to list of lists of word indices
        input_idx = [
            np.squeeze(array).tolist()
            for array in np.split(input_array, input_array.shape[0], axis=0)
        ]
        # Translate indices into corresponding word tokens; truncated after sentence-final <EOS>
        input_boundaries = [
            idx_list.index(vocab.eos_id)
            if vocab.eos_id in idx_list else len(idx_list)
            for idx_list in input_idx
        ]
        input_sentences = [[
            vocab.index_to_word[idx]
            for idx in input_idx[j][:input_boundaries[j]]
        ] for j in range(len(input_idx))]
        input_sentences = [
            ' '.join(word_list) + '.' for word_list in input_sentences
        ]
        return input_sentences

    assert (test_opt.batch_size == 1), \
        'Function is defined for a batch size of 1 due to the nature of beam search implementation.'

    # Clear the default graph
    tf.reset_default_graph()
    # Declare the batch size
    if train_opt.batch_size is None:
        train_opt.batch_size = batch_size
    # Load test data from the high-ID corpus
    source_test_data = load_pickle(pickle_paths[2])
    # Build model graph
    seq_gan = IDGAN(opts, vocab, 'IDGAN')
    # Declare saver object for restoring learned model parameters
    test_saver = tf.train.Saver()

    # Declare paths pointing to locations of output files (i.e. reference and translations sets for BLEU)
    encoded_path = os.path.join(
        test_opt.out_dir,
        'source_encoded_test_corpus_beam_{:s}.txt'.format(str(beam_decoding)))
    decoded_path = os.path.join(
        test_opt.out_dir,
        'source_decoded_test_corpus_beam_{:s}.txt'.format(str(beam_decoding)))

    # Initiate testing session
    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, test_opt.save_dir, target_epoch)
        # Initialize the model interface containing inference methods
        interface = IDGANInterface(seq_gan, vocab, test_sess, test_opt)
        # Initialize a loader object to pre-process the test corpus
        test_loader = DataServer(source_test_data, vocab, test_opt)

        with open(encoded_path, 'w') as enc_file:
            with open(decoded_path, 'w') as dec_file:
                if not beam_decoding:
                    # Perform greedy ID-reduction on the sampled sentences
                    print('Greedy decoding:')
                    for s_id, test_items in enumerate(test_loader):
                        enc_labels, enc_inputs, dec_inputs = test_items
                        generated = interface.greedy_generation(
                            enc_labels, enc_inputs, dec_inputs)
                        enc_file.write(
                            _reconstruct_input(enc_labels)[0] + '\n')
                        dec_file.write(generated[0] + '\n')
                        if s_id % 10 == 0 and s_id > 0:
                            print(
                                '{:d} sentences written to file.'.format(s_id))

                else:
                    # Perform greedy ID-reduction with beam-search on the sampled sentences
                    assert (
                        test_opt.batch_size == 1
                    ), 'Beam search not defined for batches with more than one element.'
                    print('Beam search decoding:')
                    for s_id, test_items in enumerate(test_loader):
                        enc_labels, enc_input, _ = test_items
                        generated = interface.beam_generation(
                            enc_labels, enc_input, print_results=False)
                        # Write best beam result only
                        enc_file.write(
                            _reconstruct_input(enc_labels)[0] + '\n')
                        dec_file.write(generated[0][0] + '\n')
                        if s_id % 10 == 0 and s_id > 0:
                            print(
                                '{:d} sentences written to file.'.format(s_id))

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('IDGAN documented evaluation completed!')
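
The two files written above pair each encoded source sentence with its decoded counterpart, so corpus-level BLEU can be computed over them; a sketch using NLTK (the project's own BLEU tooling is not shown, so this is an assumed substitute):

from nltk.translate.bleu_score import corpus_bleu

with open(encoded_path) as ref_file, open(decoded_path) as hyp_file:
    # One reference per hypothesis, tokenized by whitespace
    references = [[line.split()] for line in ref_file]
    hypotheses = [line.split() for line in hyp_file]
print('Corpus BLEU: {:.4f}'.format(corpus_bleu(references, hypotheses)))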
Example no. 8
def test_session(batch_size=1, target_epoch='best', beam_decoding=False):
    """ Executes a quick test session on the IDGAN system by sampling a small quantity of items from the test set
    and using the model to first compress them into a meaningful representation and subsequently reconstruct them;
    the evaluation process focuses exclusively on the translator SAE. """
    # Clear the default graph
    tf.reset_default_graph()
    # Declare the batch size, if left unspecified in test options
    if test_opt.batch_size is None:
        test_opt.batch_size = batch_size
    # Load data
    source_test_data = load_pickle(pickle_paths[2])
    # Build system graph
    seq_gan = IDGAN(opts, vocab, 'IDGAN')
    # Declare saver object for restoring learned IDGAN parameters
    test_saver = tf.train.Saver()

    # Initiate testing session
    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, test_opt.save_dir, target_epoch)
        # Initialize system interface containing inference methods
        interface = IDGANInterface(seq_gan, vocab, test_sess, test_opt)
        # Sample candidate sentences from the test set
        samples = np.random.choice(source_test_data,
                                   test_opt.num_samples).tolist()
        while max([len(sample.split()) for sample in samples]) > 10:
            samples = np.random.choice(source_test_data,
                                       test_opt.num_samples).tolist()
        # Initialize a loader object to pre-process the sampled sentences
        sample_loader = DataServer(samples, vocab, test_opt)
        samples_read = 0

        print('Sampled sentences:')
        for s_id, s in enumerate(samples):
            print('{:d}: {:s}'.format(s_id, s))
        print('-' * 10 + '\n')

        if not beam_decoding:
            # Perform greedy ID-reduction on the sampled sentences
            print('Greedy decoding:')
            for _, sample_data in enumerate(sample_loader):
                enc_labels, enc_inputs, dec_inputs = sample_data
                generated = interface.greedy_generation(
                    enc_labels, enc_inputs, dec_inputs)
                for j in range(test_opt.batch_size):
                    print('Encoded: {:s}\nDecoded: {:s}\n'.format(
                        samples[samples_read + j], generated[j]))
                samples_read += test_opt.batch_size
        else:
            # Perform greedy ID-reduction with beam-search on the sampled sentences
            assert (
                test_opt.batch_size == 1
            ), 'Beam search not defined for batches with more than one element.'
            print('Beam search decoding:')
            for s_id, sample_data in enumerate(sample_loader):
                enc_labels, enc_input, _ = sample_data
                print('Encoded: {:s}'.format(samples[s_id]))
                interface.beam_generation(enc_labels,
                                          enc_input,
                                          print_results=True)

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('IDGAN evaluation completed!')
Example no. 9
def train_session():
    """ Executes a training session on the IDGAN system. """
    # Clear the default graph within which the model graph is constructed
    tf.reset_default_graph()

    # Load data
    source_train_data = load_pickle(pickle_paths[0])
    source_valid_data = load_pickle(pickle_paths[1])
    target_train_data = load_pickle(pickle_paths[3])
    target_valid_data = load_pickle(pickle_paths[4])

    # Construct the system graph (component-specific graphs are constructed within the IDGAN graph)
    seq_gan = IDGAN(opts, vocab, 'IDGAN')

    # Initialize IDGAN's component models with pre-trained parameters
    # Declare paths pointing to checkpoints containing desired parameter values
    component_ckpt_dir = os.path.join(train_opt.local_dir,
                                      'checkpoints/components')
    lm_dir = os.path.join(component_ckpt_dir, 'lm')
    source_encoder_dir = os.path.join(component_ckpt_dir, 'source')
    if train_opt.cross_dec:
        source_decoder_dir = os.path.join(component_ckpt_dir, 'source_decoder')
    else:
        source_decoder_dir = os.path.join(component_ckpt_dir,
                                          'source')  # NO crossing
    target_dir = os.path.join(component_ckpt_dir, 'target')
    chosen_epoch = 'best'

    # Isolate parameters to be loaded into the IDGAN system
    # Excludes optimization variables as well as variables connected to the training of 'frozen' IDGAN components
    # Get lists of variables contained within component checkpoint files
    lm_vars_plus_optimization = get_ckpt_vars(lm_dir)
    source_encoder_vars_plus_optimization = get_ckpt_vars(source_encoder_dir)
    source_decoder_vars_plus_optimization = get_ckpt_vars(source_decoder_dir)
    target_vars_plus_optimization = get_ckpt_vars(target_dir)
    # Exclude training-specific variables from IDGAN initialization
    lm_vars = [
        var_name for var_name in lm_vars_plus_optimization
        if 'optimization' not in var_name
    ]
    # To enable the 'crossed decoder' training condition, separate the encoder and decoder variables
    # of the SAE pre-trained on the high-ID corpus ('translator SAE' within IDGAN)
    source_encoder_vars = \
        [var_name for var_name in source_encoder_vars_plus_optimization if 'optimization' not in var_name]
    source_encoder_vars = [
        var_name for var_name in source_encoder_vars if 'encoder' in var_name
    ]
    source_decoder_vars = \
        [var_name for var_name in source_decoder_vars_plus_optimization if 'optimization' not in var_name]
    source_decoder_vars = [
        var_name for var_name in source_decoder_vars if 'decoder' in var_name
    ]
    target_vars = [
        var_name for var_name in target_vars_plus_optimization
        if 'optimization' not in var_name
    ]
    # Obtain list of all variables which have to be initialized (either randomly or from pre-trained values)
    # within the IDGAN system
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

    # Check for matches between variables found within the pre-trained checkpoints and IDGAN variables
    lm_parameters = [
        var for var in all_vars if var.name.split(':')[0] in lm_vars
    ]
    source_encoder_parameters = [
        var for var in all_vars
        if var.name.split(':')[0] in source_encoder_vars
    ]
    source_decoder_parameters = [
        var for var in all_vars
        if var.name.split(':')[0] in source_decoder_vars
    ]
    target_parameters = [
        var for var in all_vars if var.name.split(':')[0] in target_vars
    ]
    # Load matching variables from corresponding checkpoints
    loaded_parameters = lm_parameters + source_encoder_parameters + source_decoder_parameters + target_parameters
    # Rest is initialized randomly
    initialized_parameters = [
        var for var in all_vars if var not in loaded_parameters
    ]

    # Initialize saver objects tasked with loading in the pre-trained parameters
    lm_saver = tf.train.Saver(lm_parameters)
    source_encoder_saver = tf.train.Saver(source_encoder_parameters)
    source_decoder_saver = tf.train.Saver(source_decoder_parameters)
    target_saver = tf.train.Saver(target_parameters)
    # Declare random initialization OP
    init_op = tf.variables_initializer(initialized_parameters)

    # Time training duration
    starting_time = time.time()

    with tf.Session(config=config) as train_sess:
        # Load pre-trained parameters into the IDGAN graph
        load_model(train_sess, lm_saver, lm_dir, chosen_epoch)
        load_model(train_sess, source_encoder_saver, source_encoder_dir,
                   chosen_epoch)
        load_model(train_sess, source_decoder_saver, source_decoder_dir,
                   chosen_epoch)
        load_model(train_sess, target_saver, target_dir, chosen_epoch)
        # Initialize the rest
        train_sess.run(init_op)

        # Initialize IDGAN interface and trainer, used for inference and training steps, respectively
        interface = IDGANInterface(seq_gan, vocab, train_sess, test_opt)
        trainer = IDGANTrainer(vocab,
                               train_opt,
                               seq_gan,
                               train_sess,
                               source_train_data,
                               source_valid_data,
                               target_train_data,
                               target_valid_data,
                               test_opt,
                               interface,
                               verbose=True)
        # Train system (either for a predefined number of epochs or until early stopping)
        print('+++TRAINING+++')
        trainer.train_gan()

    # Report training duration
    elapsed = time.time() - starting_time
    logging.info(
        'Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
            int(elapsed // 3600),
            int((elapsed % 3600)) // 60, elapsed % 60))
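
The get_ckpt_vars helper used above is not shown in this listing; a plausible sketch built on TF's checkpoint inspection utilities (the real helper may differ):

import tensorflow as tf


def get_ckpt_vars(ckpt_dir):
    """ Returns the names of all variables stored in the directory's
    latest checkpoint. """
    ckpt_path = tf.train.latest_checkpoint(ckpt_dir)
    return [name for name, _ in tf.train.list_variables(ckpt_path)]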
Example no. 10
                    corpus_names[i],
                    source_paths[i],
                    pickle_paths[i],
                    vocab_path=None,
                    is_train='train' in corpus_names[i],
                    is_valid='valid' in corpus_names[i],
                    is_test='test' in corpus_names[i])

# Load the vocabulary pickle shared among all of IDGAN's components
if train_opt.use_toy:
    vocab_pickle = os.path.join(train_opt.root_dir,
                                'data/toy/toy_train_vocab.pkl')
else:
    vocab_pickle = os.path.join(train_opt.root_dir,
                                'data/europarl/europarl_v7_train_vocab.pkl')
vocab = load_pickle(vocab_pickle)

# Write vocabulary contents to file for manual inspection
vocab_log_path = os.path.join(train_opt.out_dir,
                              '{:s}_vocab_log.txt'.format('shared'))
with codecs.open(vocab_log_path, 'w', encoding='utf8') as out_file:
    for key, value in vocab.index_to_word.items():
        out_file.write('{:s}, {:s}\n'.format(str(key), value))
print('Vocab log written.')

# Define TensorFlow session configuration - active during all calls to session.py (uncomment desired settings)
config = tf.ConfigProto(allow_soft_placement=True)

# config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
Example no. 11
def analogy_tests(opt, dict_pkl):
    """ Performs the semantic and syntactic analogy tests in accordance with arxiv.org/pdf/1301.3781.pdf
    on a selection of embeddings learned by the LM; as the tests had to be manually constructed, their scope is
    limited to questions referencing the ten most frequent word tokens and their paired counterparts
    per test question. """
    def _find_nearest(lookup_dict, source_pair, target_pair):
        """ Performs the analogy tests by iterating over the specified pairs. """
        # Input format
        # input pairs: ('france', 'paris'), ('germany', 'berlin')
        # corresponding variables: source[0], source[1], target[0], nearest

        # Keep track of the word vector closest to the predicted location of the analogy question answer and
        # the associated cosine distance score
        nearest = None
        max_similarity = -1.0  # lower bound of cosine similarity, so a nearest item is always found

        # Identify the vector value denoting the relationship represented by the source pair
        relationship = lookup_dict[source_pair[1]] - lookup_dict[
            source_pair[0]]
        # Attempt to predict the second item in the target pair by applying the relationship vector to the first item
        predicted = relationship + lookup_dict[target_pair[0]]

        # Check which of the learned embeddings is closest to the predicted location within the embedding space
        for item in lookup_dict.items():
            similarity = cosine_similarity(predicted, item[1])[0][0]
            if similarity >= max_similarity:
                max_similarity = similarity
                nearest = item[0]
        return nearest

    # Load embedding dict
    embed_dict = load_pickle(dict_pkl)

    # Declare destination path for the test evaluation file
    out_path = os.path.join(opt.local_dir, 'out/embedding_tests.txt')

    # Manually define semantic tests
    capital = [('paris', 'france'), ('berlin', 'germany'),
               ('brussels', 'belgium'), ('vienna', 'austria'),
               ('copenhagen', 'denmark'), ('london', 'england'),
               ('athens', 'greece'), ('dublin', 'ireland'),
               ('amsterdam', 'netherlands'), ('lisbon', 'portugal')]
    currency = [('denmark', 'krone'), ('england', 'pound'), ('usa', 'dollar'),
                ('japan', 'yen'), ('germany', 'euro')]
    gender = [('mr', 'mrs'), ('sir', 'madam'), ('man', 'woman'), ('he', 'she'),
              ('king', 'queen'), ('father', 'mother'), ('boy', 'girl'),
              ('son', 'daughter')]

    # Manually define syntactic tests
    adverb = [('particular', 'particularly'), ('clear', 'clearly'),
              ('extreme', 'extremely'), ('final', 'finally'),
              ('absolute', 'absolutely'), ('simple', 'simply'),
              ('full', 'fully'), ('current', 'currently'),
              ('complete', 'completely'), ('quick', 'quickly')]
    opposite = [('possible', 'impossible'), ('necessary', 'unnecessary'),
                ('legal', 'illegal'), ('important', 'unimportant'),
                ('likely', 'unlikely'), ('clear', 'unclear'),
                ('realistic', 'unrealistic'), ('able', 'unable'),
                ('responsible', 'irresponsible')]
    comparative = [('great', 'greater'), ('long', 'longer'),
                   ('early', 'earlier'),
                   ('late', 'later'), ('close', 'closer'), ('high', 'higher'),
                   ('small', 'smaller'), ('few', 'fewer'), ('large', 'larger'),
                   ('broad', 'broader')]
    superlative = [('great', 'greatest'), ('long', 'longest'),
                   ('early', 'earliest'), ('late', 'latest'),
                   ('close', 'closest'), ('high', 'highest'),
                   ('small', 'smallest'), ('large', 'largest'),
                   ('broad', 'broadest'), ('poor', 'poorest')]
    participle = [('work', 'working'), ('make', 'making'), ('take', 'taking'),
                  ('vote', 'voting'), ('monitor', 'monitoring'),
                  ('develop', 'developing'), ('read', 'reading'),
                  ('say', 'saying'), ('talk', 'talking'), ('sit', 'sitting')]
    nationality = [('france', 'french'), ('germany', 'german'),
                   ('belgium', 'belgian'), ('austria', 'austrian'),
                   ('denmark', 'danish'), ('england', 'english'),
                   ('greece', 'greek'), ('ireland', 'irish'),
                   ('netherlands', 'dutch'), ('portugal', 'portuguese')]
    past = [('working', 'worked'), ('making', 'made'), ('taking', 'took'),
            ('voting', 'voted'), ('monitoring', 'monitored'),
            ('developing', 'developed'), ('saying', 'said'),
            ('talking', 'talked'), ('sitting', 'sat'), ('wanting', 'wanted')]
    plurals = [('democracy', 'democracies'), ('nationality', 'nationalities'),
               ('president', 'presidents'), ('nation', 'nations'),
               ('country', 'countries'), ('committee', 'committees'),
               ('year', 'years'), ('citizen', 'citizens'),
               ('agenda', 'agendas'), ('month', 'months')]
    third = [('work', 'works'), ('make', 'makes'), ('take', 'takes'),
             ('vote', 'votes'), ('monitor', 'monitors'), ('say', 'says'),
             ('talk', 'talks'), ('want', 'wants'), ('cover', 'covers'),
             ('offer', 'offers')]

    # Iterate over all test items and write results to file
    all_tests = [
        capital, currency, gender, adverb, opposite, comparative, superlative,
        participle, nationality, past, plurals, third
    ]
    # Track fraction of correct predictions for intrinsic embedding quality estimation
    correct_predictions_total = list()
    questions_total = list()

    # Write output
    with open(out_path, 'w') as out_file:
        for test_set in all_tests:
            correct_set_predictions = 0
            for pair_a in test_set:
                source = pair_a
                # Start at -1 to discount the trivial identity question (pair_b == pair_a)
                correct_pair_predictions = -1
                for pair_b in test_set:
                    target = pair_b
                    prediction = _find_nearest(embed_dict, source, target)
                    if prediction == target[1]:
                        correct_pair_predictions += 1
                    log_entry = '{:s} is similar to {:s} as [{:s}] is similar to {:s}\n' \
                        .format(source[1], source[0], prediction, target[0])
                    out_file.write(log_entry)
                correct_set_predictions += correct_pair_predictions
                out_file.write('-' * 10 + '\n')

            # Compile output statistics per test set
            num_questions = len(test_set)**2 - len(test_set)
            questions_total.append(num_questions)
            correct_predictions_total.append(correct_set_predictions)
            out_file.write('\n')
            out_file.write(
                'Number of non-identity questions asked {:d} | Correct answers: {:d} | Accuracy: {:.4f}\n'
                .format(num_questions, correct_set_predictions,
                        correct_set_predictions / num_questions))
            out_file.write('\n')
            out_file.write('=' * 10 + '\n')
            out_file.write('\n')
            print('Completed one of the test sets!')

        # Compile output statistics for the full collection of tests
        out_file.write('\n')
        out_file.write('Asked count: {}\n'.format(questions_total))
        out_file.write(
            'Answered correctly count: {}\n'.format(correct_predictions_total))
        out_file.write('Semantic accuracy: {:.4f}\n'.format(
            sum(correct_predictions_total[:3]) / sum(questions_total[:3])))
        out_file.write('Syntactic accuracy: {:.4f}\n'.format(
            sum(correct_predictions_total[3:]) / sum(questions_total[3:])))
        out_file.write('Overall accuracy: {:.4f}\n'.format(
            sum(correct_predictions_total) / sum(questions_total)))
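
The test logic above is the standard word2vec analogy arithmetic: the offset between the two items of a source pair is added to the first item of a target pair, and the embedding nearest to the result should be the target pair's second item. A self-contained toy illustration with invented vectors:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy embeddings, invented purely for demonstration
toy_dict = {
    'paris':   np.array([[0.9, 0.1, 0.0]]),
    'france':  np.array([[0.8, 0.0, 0.1]]),
    'berlin':  np.array([[0.1, 0.9, 0.0]]),
    'germany': np.array([[0.0, 0.8, 0.1]]),
}
# ('paris', 'france') -> ('berlin', ?): france - paris + berlin should land near germany
relationship = toy_dict['france'] - toy_dict['paris']
predicted = relationship + toy_dict['berlin']
nearest = max(toy_dict,
              key=lambda w: cosine_similarity(predicted, toy_dict[w])[0][0])
print(nearest)  # 'germany'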
Example no. 12
def test_session(target_epoch='best',
                 calculate_er=False,
                 generate=True,
                 gen_cycles=1):
    """ Executes a quick test session on the language model by sampling a small quantity of items from the test set
    and scoring them along various metrics. """
    # Tests are defined for a batch size of 1
    assert (
        test_opt.batch_size == 1), 'Model tests require batch size to equal 1.'
    # Clear the default graph
    tf.reset_default_graph()
    # Load data
    test_data = load_pickle(test_pickle)
    # Build model graph
    cog_lm = CogLM(vocab, test_opt, 'cog_lm')
    # Declare saver object for restoring learned model parameters
    test_saver = tf.train.Saver()

    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, test_opt.save_dir, target_epoch)
        # Initialize LM interface
        interface = CogLMInterface(cog_lm, vocab, test_sess, test_opt)
        # Sample sentences to be forwarded to the model
        samples = np.random.choice(test_data, test_opt.num_samples).tolist()

        print('Sampled sentences:')
        for i, s in enumerate(samples):
            print('{:d}: {:s}'.format(i, s))
        print('-' * 10 + '\n')

        # Get sentence probabilities
        print('Probabilities:')
        for i, s in enumerate(samples):
            total, prob_array, _ = interface.get_probability(s)
            # Mask <EOS> and <PAD> tag values
            cut_off = len(s.split())
            print('{:d}: {:s} | Total probability: {:.10f}'.format(
                i, s, total[0][0]))
            print('Per-word probabilities:')
            print('\t'.join(s.split()))
            print('\t'.join(
                ['{:.4}'.format(score) for score in prob_array[0][:cut_off]]))
        print('-' * 10 + '\n')

        # Get sentence log-probabilities
        print('Log-probabilities:')
        for i, s in enumerate(samples):
            total, prob_array, _ = interface.get_log_probability(s)
            cut_off = len(s.split())
            print('{:d}: {:s} | Total log-probability: {:.4f}'.format(
                i, s, total[0][0]))
            print('Per-word log-probabilities:')
            print('\t'.join(s.split()))
            print('\t'.join(['{:.4}'.format(score)
                             for score in prob_array[0]][:cut_off]))
        print('-' * 10 + '\n')

        # Get surprisal
        print('Surprisal and UID:')
        for i, s in enumerate(samples):
            total_s, s_array, norm_s, total_ud, ud_array, norm_ud = interface.get_surprisal(
                s)
            cut_off = len(s.split())
            tabbed_sent = '\t'.join(s.split())
            print(
                '{:d}: {:s} | Total surprisal: {: .4f} | Normalized surprisal: {: .4f}'
                .format(i, s, total_s[0][0], norm_s[0][0]))
            print('Per-word surprisal:')
            print(tabbed_sent)
            print('\t'.join(
                ['{: .4}'.format(score) for score in s_array[0][:cut_off]]))
            print(
                '{:d}: {:s} | Absolute UID: {:.4f} | Normalized UID: {: .4f}'.
                format(i, s, total_ud[0][0], norm_ud[0][0]))
            print('Per-word UID:')
            print(tabbed_sent)
            print('\t'.join(
                ['{: .4}'.format(score) for score in ud_array[0][:cut_off]]))
        print('-' * 10 + '\n')

        # Get approximate entropy reduction (computationally expensive!)
        if calculate_er:
            print('Approximate entropy reduction:')
            for i, s in enumerate(samples):
                total, array, norm = interface.get_entropy_reduction(s)
                cut_off = len(s.split())
                print(
                    '{:d}: {:s} | Total ER: {: .4f} | Normalized ER: {: .4f}'.
                    format(i, s, total[0][0], norm[0][0]))
                print('Per-word ER:')
                print('\t'.join(s.split()))
                print('\t'.join(
                    ['{: .4}'.format(score) for score in array[0][:cut_off]]))
            print('-' * 10 + '\n')

            # Get cognitive load score (weighted sum of normalized surprisal and entropy reduction scores)
            print('Combined cognitive load:')
            for i, s in enumerate(samples):
                total, array, norm = interface.get_cognitive_load(s)
                cut_off = len(s.split())
                print(
                    '{: d}: {:s} | Total CL: {: .4f} | Normalized CL: {: .4f}'.
                    format(i, s, total[0][0], norm[0][0]))
                print('Per-word CL:')
                print('\t'.join(s.split()))
                print('\t'.join(
                    ['{: .4}'.format(score) for score in array[0][:cut_off]]))
            print('-' * 10 + '\n')

        # Get model perplexity for the entire test set
        print('Model perplexity: {: .4f}'.format(
            interface.get_model_perplexity(test_data)[0][0]))
        print('-' * 10 + '\n')

        # Evaluate generative capability of the trained LM
        if generate:
            # Generate greedily from scratch
            print('Sentences generated from scratch:')
            for _ in range(gen_cycles):
                interface.generate(prefix=None, print_results=True)
            print('-' * 10 + '\n')

            # Generate greedily from some sentence prefix (i.e. a sentence completion test)
            print('Sentences generated from prefix:')
            for i, s in enumerate(samples):
                sent_list = s.split(' ')
                # Generate a sentence prefix of random length (at most 1/2 of the source sentence)
                # Guard against very short sentences (randint requires high > low)
                cut_off = np.random.randint(1, max(2, len(sent_list) // 2))
                prefix = ' '.join(sent_list[:cut_off])
                print('Prefix: {:s} | Source: {:s}'.format(prefix, s))
                generated = interface.generate(prefix=prefix,
                                               print_results=False)
                for tpl in generated:
                    print('{:s} | Probability: {:.10f}'.format(tpl[0], tpl[1]))
                print('\n')

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('Language model evaluation completed!')
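
For orientation, per-word surprisal is the negative log-probability of each word given its preceding context, and UID divergence measures how unevenly that surprisal is distributed across the sentence; a sketch of one common formulation (CogLMInterface's exact definitions are not shown in this listing):

import numpy as np


def surprisal_and_uid(word_probs):
    """ Sketch: surprisal as negative log2-probability per word; UID
    divergence as squared deviation from the sentence-mean surprisal. """
    surprisal = -np.log2(np.asarray(word_probs, dtype=np.float64))
    uid_divergence = np.square(surprisal - surprisal.mean())
    return surprisal, uid_divergence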
Example no. 13
def annotate_corpus():
    """ Executes a session during which the shrunk source corpus is annotated with ID-relevant measures. """
    # Clear the default graph
    tf.reset_default_graph()
    # Declare path leading to corpus to be annotated
    annotated_path = os.path.join(
        data_dir, '{:s}_annotated.txt'.format(train_name.split('_')[0]))
    # Values assigned per sentence are tracked for subsequent computation of corpus-wide statistics
    corpus_stats = {
        'Total_surprisal': list(),
        'Per_word_surprisal': list(),
        'Normalized_surprisal': list(),
        'Total_UID_divergence': list(),
        'Per_word_UID_divergence': list(),
        'Normalized_UID_divergence': list()
    }

    # Load data
    full_data = load_pickle(full_pickle)
    # Build model graph
    cog_lm = CogLM(vocab, test_opt, 'cog_lm')
    # Declare saver object for restoring learned model parameters
    annotate_saver = tf.train.Saver()
    # Time annotation duration
    starting_time = time.time()

    with tf.Session(config=config) as annotate_sess:
        # Load learned model parameters
        load_model(annotate_sess, annotate_saver, test_opt.save_dir, 'best')
        # Initialize LM interface
        interface = CogLMInterface(cog_lm, vocab, annotate_sess, test_opt)
        # Run the annotation loop
        pos = 0
        with codecs.open(annotated_path, 'w') as out_file:
            while pos < len(full_data):
                # Fill a single batch of sentences to be annotated;
                # slicing yields a shorter final batch automatically
                batch = full_data[pos:pos + test_opt.batch_size]
                pos += len(batch)
                # Obtain ID-values via LM's interface
                total_surp, per_word_surp, norm_surp, total_uiddiv, per_word_uiddiv, norm_uiddiv = \
                    interface.get_surprisal(batch)
                # Write annotated sentences to file
                for i in range(len(batch)):
                    # For per-word annotations, exclude values associated with <EOS> and <PAD> tags
                    cut_off = len(batch[i].split())
                    item_ts = total_surp[i, :].tolist()[0]
                    # Surprisal
                    item_pws_floats = per_word_surp[i, :].tolist()[:cut_off]
                    item_pws = ';'.join(
                        ['{:.4f}'.format(pws) for pws in item_pws_floats])
                    item_ns = norm_surp[i, :].tolist()[0]
                    item_tu = total_uiddiv[i, :].tolist()[0]
                    # UID divergence
                    item_pwu_floats = per_word_uiddiv[i, :].tolist()[:cut_off]
                    item_pwu = ';'.join(
                        ['{:.4f}'.format(pwu) for pwu in item_pwu_floats])
                    item_nu = norm_uiddiv[i, :].tolist()[0]
                    # Construct annotated sample
                    scored_sent = '{:s}\t{:.4f}\t{:s}\t{:.4f}\t{:.4f}\t{:s}\t{:.4f}\n'. \
                        format(batch[i], item_ts, item_pws, item_ns, item_tu, item_pwu, item_nu)
                    # Write to file
                    out_file.write(scored_sent)
                    # Update corpus stats dictionary
                    corpus_stats['Total_surprisal'].append(item_ts)
                    corpus_stats['Per_word_surprisal'].extend(item_pws_floats)
                    corpus_stats['Normalized_surprisal'].append(item_ns)
                    corpus_stats['Total_UID_divergence'].append(item_tu)
                    corpus_stats['Per_word_UID_divergence'].extend(
                        item_pwu_floats)
                    corpus_stats['Normalized_UID_divergence'].append(item_nu)

    # Archive corpus statistics
    with open(lm_notes, 'a') as notes_file:
        notes_file.write('\n')
        notes_file.write(
            '------------ Annotated Corpus Statistics -------------\n')
        notes_file.write('Metric\tMean\tMedian\tLowest\tHighest\n')
        for k, v in corpus_stats.items():
            notes_file.write('{:s}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\n'.format(
                k, np.mean(v), np.median(v), np.min(v), np.max(v)))

    # Report scoring duration
    elapsed = time.time() - starting_time
    print(
        'Annotation took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
            int(elapsed // 3600),
            int((elapsed % 3600)) // 60, elapsed % 60))
Example no. 14
def train_session():
    """ Executes a training session on the SAE model. """
    # Clear the default graph within which the model graph is constructed
    tf.reset_default_graph()
    # Load data
    train_data = load_pickle(train_pickle)
    valid_data = load_pickle(valid_pickle)
    # Construct the model graph
    sae_name = 'seq_ae' + '_{:s}'.format(train_opt.train_id)
    autoencoder = SeqAE(vocab, train_opt, sae_name)
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    all_init_op = tf.global_variables_initializer()

    # Extract pre-trained word embeddings from the IDGAN-internal LM and use them to initialize the SAE
    initialized_vars = [
        var for var in all_vars
        if 'embedding_table' not in var.name or 'optimization' in var.name
    ]
    embeddings_sae_keys = [
        var.name for var in all_vars if var not in initialized_vars
    ]
    embedding_lm_keys = list()
    # Handle scoping discrepancies between SAE graph and LM checkpoints,
    # to make LM variables compatible with the instantiated graph
    for k in embeddings_sae_keys:
        k = k.replace(sae_name, 'cog_lm')
        k = k.split(':')[0]
        embedding_lm_keys.append(k)
    embeddings_dir = os.path.join(train_opt.root_dir,
                                  'cognitive_language_model/src/checkpoints/')
    embeddings_epoch = 'best'
    # Map SAE embedding variables to LM embedding variables,
    # so that the former may be initialized with values extracted from the latter
    embeddings_dict = {
        embedding_lm_keys[i]:
        [v for v in tf.global_variables()
         if v.name == embeddings_sae_keys[i]][0]
        for i in range(len(embedding_lm_keys))
    }
    # Declare saver object for initializing SAE's embedding table with embeddings learned by IDGAN's LM
    embeddings_saver = tf.train.Saver(embeddings_dict)
    # Declare OP for initializing other SAE parameters randomly
    no_embeds_init_op = tf.variables_initializer(initialized_vars)
    # Time training duration
    starting_time = time.time()

    with tf.Session(config=config) as train_sess:
        # Initialize variables
        if train_opt.is_local:
            # No pre-trained embeddings are loaded for experiments on the toy set
            train_sess.run(all_init_op)
        else:
            load_model(train_sess, embeddings_saver, embeddings_dir,
                       embeddings_epoch)
            train_sess.run(no_embeds_init_op)

        # Initialize SAE interface and trainer, used for inference and training steps, respectively
        interface = SeqAEInterface(autoencoder, vocab, train_sess, test_opt)
        trainer = SeqAETrainer(vocab, train_opt, autoencoder, train_sess,
                               train_data, valid_data, test_opt, interface)
        # Train model (either for a predefined number of epochs or until early stopping)
        print('+++TRAINING+++')
        trainer.train_model()

    # Report training duration
    elapsed = time.time() - starting_time
    logging.info(
        'Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
            int(elapsed // 3600),
            int((elapsed % 3600)) // 60, elapsed % 60))
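
The initialization split used above (restore a subset of variables from a checkpoint, randomly initialize the rest) is a common TF1 pattern; a condensed, self-contained sketch with illustrative variable names:

import tensorflow as tf

tf.reset_default_graph()
# Two illustrative variables: one to restore, one to initialize freshly
table = tf.Variable(tf.zeros([100, 32]), name='embeddings/embedding_table')
kernel = tf.Variable(tf.zeros([32, 32]), name='decoder/kernel')

all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
restored = [v for v in all_vars if 'embedding_table' in v.name]
fresh = [v for v in all_vars if v not in restored]
saver = tf.train.Saver(restored)           # restores only the embedding table
init_op = tf.variables_initializer(fresh)  # initializes everything else
# with tf.Session() as sess:
#     saver.restore(sess, '/path/to/checkpoint.ckpt')  # path illustrative
#     sess.run(init_op)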