def train_session(): """ Executes a training session on the LM. """ # Clear the default graph within which the model graph is constructed tf.reset_default_graph() # Load data train_data = load_pickle(train_pickle) valid_data = load_pickle(valid_pickle) # Construct the model graph and cog_lm = CogLM(vocab, train_opt, 'cog_lm') # Declare OP for initializing of model variables init_op = tf.global_variables_initializer() # Time training duration starting_time = time.time() with tf.Session(config=config) as train_sess: # Initialize variables train_sess.run(init_op) # Initialize LM trainer trainer = CogLMTrainer(vocab, train_opt, cog_lm, train_sess, train_data, valid_data) # Train model (either for a predefined number of epochs or until early stopping) print('+++TRAINING+++') trainer.train_model() # Report training duration elapsed = time.time() - starting_time logging.info( 'Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format( int(elapsed // 3600), int((elapsed % 3600)) // 60, elapsed % 60))
def train_session(): """ Executes a training session on the SAE model. """ # Clear the default graph within which the model graph is constructed tf.reset_default_graph() # Load data train_data = load_pickle(train_pickle) valid_data = load_pickle(valid_pickle) # Construct the model graph sent_sim_class = SentSimClassifier(vocab, train_opt, 'ssc') all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) # Declare OP for initializing of model variables init_op = tf.global_variables_initializer() # During domain adoptation, restore learned SSC parameters with the exception of the embeddings, # which are extracted from the pre-trained IDGAN-internal LM restored_vars = [var for var in all_vars if 'embedding_table' not in var.name] pre_train_saver = tf.train.Saver(restored_vars) embeddings_ssc_keys = [var.name for var in all_vars if var not in restored_vars and 'optimization' not in var.name] embedding_lm_keys = list() # Handle scoping discrepancies between SSC and LM checkpoints, to make LM variables compatible with the SSC graph for k in embeddings_ssc_keys: k = k.replace('ssc', 'cog_lm') k = k.replace('encoder/embeddings', 'embeddings') k = k_replace('Adam', 'optimizer') k = k.split(':')[0] embedding_lm_keys.append(k) embeddings_dir = os.path.join(train_opt.root_dir, 'cognitive_language_model/src/checkpoints/') embeddings_epoch = 'best' # Map SSC embedding variables to LM embedding variables, # so that the former may be initialized with values extracted from the latter embeddings_dict = {embedding_lm_keys[i]: [v for v in tf.global_variables() if v.name == embeddings_ssc_keys[i]][0] for i in range(len(embedding_lm_keys))} # Declare saver object for initializing SAE's embedding table with embeddings learned by IDGAN's LM embeddings_saver = tf.train.Saver(embeddings_dict) # Time training duration starting_time = time.time() with tf.Session(config=config) as train_sess: if train_opt.pre_train: # Initialize variables train_sess.run(init_op) else: # Restore pre-trained model parameters for domain adaptation (sans embedding table) load_model(train_sess, pre_train_saver, os.path.join(train_opt.save_dir, 'pre_training'), 'best') # Restore embedding parameters from the specified LM checkpoint load_model(train_sess, embeddings_saver, embeddings_dir, embeddings_epoch) # Initialize SSC trainer trainer = SentSimClassTrainer(vocab, train_opt, sent_sim_class, train_sess, train_data, valid_data) # Train model (either for a predefined number of epochs or until early stopping) print('+++TRAINING+++') trainer.train_model() # Report training duration elapsed = time.time() - starting_time logging.info('Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format( int(elapsed // 3600), int((elapsed % 3600)) // 60, elapsed % 60))
def test_session(batch_size=1, target_epoch='best', beam_decoding=False): """ Executes a quick test session on the SAE model by sampling a small quantity of items from the test set and using the model to first compress them into a meaningful representation and subsequently reconstruct them. """ # Clear the default graph tf.reset_default_graph() # Declare the batch size, if left unspecified in test options if train_opt.batch_size is None: train_opt.batch_size = batch_size # Load test data test_data = load_pickle(test_pickle) # Build model graph autoencoder = SeqAE(vocab, test_opt, 'seq_ae' + '_{:s}'.format(train_opt.train_id)) # Declare saver object for restoring learned model parameters test_saver = tf.train.Saver() # Initiate testing session with tf.Session(config=config) as test_sess: # Load learned model parameters load_model(test_sess, test_saver, test_opt.save_dir, target_epoch) # Initialize model interface containing inference methods interface = SeqAEInterface(autoencoder, vocab, test_sess, test_opt) # Sample candidate sentences from the test set samples = np.random.choice(test_data, test_opt.num_samples).tolist() while max([len(sample.split()) for sample in samples]) > 100: samples = np.random.choice(test_data, test_opt.num_samples).tolist() # Initialize a loader object to pre-process the sampled sentences sample_loader = DataServer(samples, vocab, test_opt) samples_read = 0 print('Sampled sentences:') for i, s in enumerate(samples): print('{:d}: {:s}'.format(i, s)) print('-' * 10 + '\n') if not beam_decoding: # Perform greedy encoding-decoding print('Greedy decoding:') for i, sample_data in enumerate(sample_loader): _, enc_input, dec_input = sample_data generated = interface.greedy_generation(enc_input, dec_input) for j in range(test_opt.batch_size): print('Encoded: {:s}\nDecoded: {:s}\n'.format( samples[samples_read + j], generated[j])) samples_read += test_opt.batch_size else: # Perform encoding-decoding with beam-search (limited use for reconstruction) assert ( test_opt.batch_size == 1 ), 'Beam search not defined for batches with more than one element.' print('Beam search decoding:') for i, sample_data in enumerate(sample_loader): _, enc_input, _ = sample_data print('Encoded: {:s}'.format(samples[i])) interface.beam_generation(enc_input, print_results=True) print('-' * 10 + '\n') print('=' * 10 + '\n') print('Auto-encoder evaluation completed!')
def score_corpus():
    """ Executes a session during which the source corpus is annotated with sentence-wise model perplexity scores. """
    # Clear the default graph
    tf.reset_default_graph()

    # Declare path leading to corpus to be scored
    scored_path = os.path.join(data_dir, '{:s}.txt'.format(scored_name))
    ppx_scores = list()

    # Load data
    full_data = load_pickle(full_pickle)
    # Build model graph
    cog_lm = CogLM(vocab, test_opt, 'cog_lm')
    # Declare saver object for restoring learned model parameters
    sort_saver = tf.train.Saver()

    # Time the duration of the scoring process
    starting_time = time.time()
    with tf.Session(config=config) as sort_sess:
        # Load learned model parameters
        load_model(sort_sess, sort_saver, test_opt.save_dir, 'best')
        # Initialize LM interface
        interface = CogLMInterface(cog_lm, vocab, sort_sess, test_opt)

        # Run the scoring loop
        pos = 0
        with codecs.open(scored_path, 'w') as in_file:
            while pos < len(full_data) - 1:
                # Fill a single batch of sentences to be scored
                try:
                    batch = full_data[pos:pos + test_opt.batch_size]
                    pos += test_opt.batch_size
                except IndexError:
                    batch = full_data[pos:len(full_data) - 1]
                    pos = len(full_data) - 1
                # Get sentence-wise model perplexity scores
                batch_ppx = interface.get_sequence_perplexity(batch)
                # Write the scored sentences to file
                for i in range(len(batch)):
                    sentence_ppx = batch_ppx[i, :].tolist()[0]
                    scored_sent = '{:s}\t{:.4f}\n'.format(batch[i], sentence_ppx)
                    in_file.write(scored_sent)
                    # Keep track of corpus-wide statistics
                    ppx_scores.append(sentence_ppx)

    # Archive corpus statistics
    with open(lm_notes, 'w') as notes_file:
        notes_file.write('------------ Scored Corpus Statistics -------------\n')
        notes_file.write('Metric\tMean\tMedian\n')
        notes_file.write('Sentence Perplexity\t{:.4f}\t{:.4f}\n'.format(np.mean(ppx_scores), np.median(ppx_scores)))

    # Report scoring duration
    elapsed = time.time() - starting_time
    print('Scoring took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
        int(elapsed // 3600), int((elapsed % 3600)) // 60, elapsed % 60))
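# For reference, the sentence perplexity written above is conventionally the exponentiated mean negative
# log-probability of the sentence's tokens. A minimal numpy sketch under that assumption (the actual
# computation lives in CogLMInterface.get_sequence_perplexity and is not reproduced in this file):
def sentence_perplexity_sketch(token_log_probs):
    """ token_log_probs: natural-log probabilities of each token in one sentence. """
    return np.exp(-np.mean(token_log_probs))

# e.g. sentence_perplexity_sketch(np.log([0.25, 0.5, 0.1])) is roughly 4.31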
def test_session(target_epoch='best'): """ Evaluates the accuracy of the learned SSC model by using it to predict the similarity score of sentence pairs contained within the specified test set. """ # Clear the default graph tf.reset_default_graph() # Load data test_data = load_pickle(test_pickle) # Build model graph sent_sim_class = SentSimClassifier(vocab, test_opt, 'ssc') # Declare saver test_saver = tf.train.Saver() save_dir = train_opt.save_dir # Initiate testing session with tf.Session(config=config) as test_sess: # Load learned model parameters load_model(test_sess, test_saver, save_dir, target_epoch) # Initialize model interface interface = SentSimClassInterface(sent_sim_class, vocab, test_sess, test_opt) # Initialize a loader object to pre-process and serve items drawn from the source corpus sample_loader = DataServer(test_data, vocab, test_opt) # Evaluate model's performance on a withheld test corpus to estimate its capacity for generalization beyond # seen data # Track prediction accuracy and the divergence of predicted similarity scores from target values total_error = 0.0 total_differential = 0.0 total_items = 0 for i, test_batch in enumerate(sample_loader): # Obtain model predictions for the current test batch predictions, prediction_error = interface.infer_step(test_batch) total_error += np.sum(np.abs(prediction_error)) try: for j in range(test_opt.batch_size): cj = total_items + j differential = np.abs(np.subtract(float(test_data[1][cj]), predictions[j][0])) total_differential += differential # Report model prediction and error print('Sentence 1: {:s}\nSentence 2: {:s}\n' 'True score: {:.4f} | Model Prediction: {:.4f} | Differential: {:.4f}' .format(test_data[0][cj][0], test_data[0][cj][1], float(test_data[1][cj]), predictions[j][0], differential)) print('-' * 10) total_items += test_opt.batch_size except IndexError: break # Report test corpus statistics print('Total model error: {:.4f} | Average model error: {:.4f} | Average prediction error: {:.4f}'.format( total_error, total_error / total_items, total_differential / total_items)) print('-' * 10 + '\n') print('=' * 10 + '\n') print('Sentence similarity classifier evaluation completed!')
def make_embedding_dict(opt, dict_pkl):
    """ Creates a dictionary with vocabulary items designated as keys and their embeddings as learned by an LM
    as values. """
    # Declare the source vocabulary path
    vocab_pkl = os.path.join(opt.root_dir, 'data/europarl/europarl_v7_train_vocab.pkl')
    # Declare the path to an LM checkpoint containing the learned embeddings
    embeddings_ckpt = os.path.join(opt.local_dir, 'checkpoints/best_cog_lm.ckpt')

    # Extract embeddings from the checkpoint file
    vocab = load_pickle(vocab_pkl)
    reader = pywrap_tensorflow.NewCheckpointReader(embeddings_ckpt)
    embedding_table = reader.get_tensor('embeddings/embedding_table')

    # Construct the embedding dictionary and pickle it for future access
    embedding_dict = {vocab.index_to_word[idx]: embedding_table[[idx], :] for idx in range(vocab.n_words)}
    with open(dict_pkl, 'wb') as in_file:
        pickle.dump(embedding_dict, in_file)
    print('Embedding dictionary created and pickled!')
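# Illustrative helper (not part of the original pipeline) showing how the pickled dictionary is meant to
# be consumed: each value is a (1, embedding_dim) array, so pairs of entries can be passed directly to
# sklearn's cosine_similarity, exactly as the analogy tests below do.
def embedding_similarity(dict_pkl, word_a, word_b):
    """ Returns the cosine similarity between two word embeddings from a pickled embedding dictionary. """
    embed_dict = load_pickle(dict_pkl)
    return cosine_similarity(embed_dict[word_a], embed_dict[word_b])[0][0]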
def test_to_file(batch_size=1, target_epoch='best', beam_decoding=True): """ Executes a comprehensive test session on the entire test corpus; output is written to file for the calculation of the achieved corpus-wide ID reduction and the BLEU score between source sentences and their ID-reduced translations. """ def _reconstruct_input(input_array): """ Reconstructs input sentences from numpy arrays; used to derive an accurate representation of the pre-processed, encoded sequences. """ # Convert input array to list of lists of word indices input_idx = [ np.squeeze(array).tolist() for array in np.split(input_array, input_array.shape[0], axis=0) ] # Translate indices into corresponding word tokens; truncated after sentence-final <EOS> input_boundaries = [ idx_list.index(vocab.eos_id) if vocab.eos_id in idx_list else len(idx_list) for idx_list in input_idx ] input_sentences = [[ vocab.index_to_word[idx] for idx in input_idx[j][:input_boundaries[j]] ] for j in range(len(input_idx))] input_sentences = [ ' '.join(word_list) + '.' for word_list in input_sentences ] return input_sentences assert (test_opt.batch_size == 1), \ 'Function is defined for a batch size of 1 due to the nature of beam search implementation.' # Clear the default graph tf.reset_default_graph() # Declare the batch size if train_opt.batch_size is None: train_opt.batch_size = batch_size # Load test data from the high-ID corpus source_test_data = load_pickle(pickle_paths[2]) # Build model graph seq_gan = IDGAN(opts, vocab, 'IDGAN') # Declare saver object for restoring learned model parameters test_saver = tf.train.Saver() # Declare paths pointing to locations of output files (i.e. reference and translations sets for BLEU) encoded_path = os.path.join( test_opt.out_dir, 'source_encoded_test_corpus_beam_{:s}.txt'.format(str(beam_decoding))) decoded_path = os.path.join( test_opt.out_dir, 'source_decoded_test_corpus_beam_{:s}.txt'.format(str(beam_decoding))) # Initiate testing session with tf.Session(config=config) as test_sess: # Load learned model parameters load_model(test_sess, test_saver, test_opt.save_dir, target_epoch) # Initialize the model interface containing inference methods interface = IDGANInterface(seq_gan, vocab, test_sess, test_opt) # Initialize a loader object to pre-process the test corpus test_loader = DataServer(source_test_data, vocab, test_opt) with open(encoded_path, 'w') as enc_file: with open(decoded_path, 'w') as dec_file: if not beam_decoding: # Perform greedy ID-reduction on the sampled sentences print('Greedy decoding:') for s_id, test_items in enumerate(test_loader): enc_labels, enc_inputs, dec_inputs = test_items generated = interface.greedy_generation( enc_labels, enc_inputs, dec_inputs) enc_file.write( _reconstruct_input(enc_labels)[0] + '\n') dec_file.write(generated[0] + '\n') if s_id % 10 == 0 and s_id > 0: print( '{:d} sentences written to file.'.format(s_id)) else: # Perform greedy ID-reduction with beam-search on the sampled sentences assert ( test_opt.batch_size == 1 ), 'Beam search not defined for batches with more than one element.' 
print('Beam search decoding:') for s_id, test_items in enumerate(test_loader): enc_labels, enc_input, _ = test_items generated = interface.beam_generation( enc_labels, enc_input, print_results=False) # Write best beam result only enc_file.write( _reconstruct_input(enc_labels)[0] + '\n') dec_file.write(generated[0][0] + '\n') if s_id % 10 == 0 and s_id > 0: print( '{:d} sentences written to file.'.format(s_id)) print('-' * 10 + '\n') print('=' * 10 + '\n') print('IDGAN documented evaluation completed!')
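# The two files written above form the reference / hypothesis pair for the BLEU evaluation mentioned in
# the docstring. A minimal scoring sketch using NLTK's corpus_bleu (illustration only; the evaluation
# tooling actually used by the project is not part of this file and may differ):
def corpus_bleu_from_files(reference_path, hypothesis_path):
    """ Computes corpus-level BLEU between the encoded (reference) and decoded (hypothesis) files. """
    from nltk.translate.bleu_score import corpus_bleu
    with open(reference_path, 'r') as ref_file, open(hypothesis_path, 'r') as hyp_file:
        references = [[line.strip().split()] for line in ref_file]  # one reference per hypothesis
        hypotheses = [line.strip().split() for line in hyp_file]
    return corpus_bleu(references, hypotheses)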
def test_session(batch_size=1, target_epoch='best', beam_decoding=False): """ Executes a quick test session on the IDGAN system by sampling a small quantity of items from the test set and using the model to first compress them into a meaningful representation and subsequently reconstruct them; the evaluation process focuses exclusively on the translator SAE. """ # Clear the default graph tf.reset_default_graph() # Declare the batch size, if left unspecified in test options if train_opt.batch_size is None: train_opt.batch_size = batch_size # Load data source_test_data = load_pickle(pickle_paths[2]) # Build system graph seq_gan = IDGAN(opts, vocab, 'IDGAN') # Declare saver object for restoring learned IDGAN parameters test_saver = tf.train.Saver() # Initiate testing session with tf.Session(config=config) as test_sess: # Load learned model parameters load_model(test_sess, test_saver, test_opt.save_dir, target_epoch) # Initialize system interface containing inference methods interface = IDGANInterface(seq_gan, vocab, test_sess, test_opt) # Sample candidate sentences from the test set samples = np.random.choice(source_test_data, test_opt.num_samples).tolist() while max([len(sample.split()) for sample in samples]) > 10: samples = np.random.choice(source_test_data, test_opt.num_samples).tolist() # Initialize a loader object to pre-process the sampled sentences sample_loader = DataServer(samples, vocab, test_opt) samples_read = 0 print('Sampled sentences:') for s_id, s in enumerate(samples): print('{:d}: {:s}'.format(s_id, s)) print('-' * 10 + '\n') if not beam_decoding: # Perform greedy ID-reduction on the sampled sentences print('Greedy decoding:') for _, sample_data in enumerate(sample_loader): enc_labels, enc_inputs, dec_inputs = sample_data generated = interface.greedy_generation( enc_labels, enc_inputs, dec_inputs) for j in range(test_opt.batch_size): print('Encoded: {:s}\nDecoded: {:s}\n'.format( samples[samples_read + j], generated[j])) samples_read += test_opt.batch_size else: # Perform greedy ID-reduction with beam-search on the sampled sentences assert ( test_opt.batch_size == 1 ), 'Beam search not defined for batches with more than one element.' print('Beam search decoding:') for _, sample_data in enumerate(sample_loader): enc_labels, enc_input, _ = sample_data print('Encoded: {:s}'.format(samples[i])) interface.beam_generation(enc_labels, enc_input, print_results=True) print('-' * 10 + '\n') print('=' * 10 + '\n') print('IDGAN evaluation completed!')
def train_session(): """ Executes a training session on the IDGAN system. """ # Clear the default graph within which the model graph is constructed tf.reset_default_graph() # Load data source_train_data = load_pickle(pickle_paths[0]) source_valid_data = load_pickle(pickle_paths[1]) target_train_data = load_pickle(pickle_paths[3]) target_valid_data = load_pickle(pickle_paths[4]) # Construct the system graph (component-specific graphs are constructed within the IDGAN graph) seq_gan = IDGAN(opts, vocab, 'IDGAN') # Initialize IDGAN's component models with pre-trained parameters # Declare paths pointing to checkpoints containing desired parameter values component_ckpt_dir = os.path.join(train_opt.local_dir, 'checkpoints/components') lm_dir = os.path.join(component_ckpt_dir, 'lm') source_encoder_dir = os.path.join(component_ckpt_dir, 'source') if train_opt.cross_dec: source_decoder_dir = os.path.join(component_ckpt_dir, 'source_decoder') else: source_decoder_dir = os.path.join(component_ckpt_dir, 'source') # NO crossing target_dir = os.path.join(component_ckpt_dir, 'target') chosen_epoch = 'best' # Isolate parameters to be loaded into the IDGAN's system # Excludes optimization variables as well as variables connected to the training of 'frozen' IDGAN components # Get lists of variables contained within component checkpoint files lm_vars_plus_optimization = get_ckpt_vars(lm_dir) source_encoder_vars_plus_optimization = get_ckpt_vars(source_encoder_dir) source_decoder_vars_plus_optimization = get_ckpt_vars(source_decoder_dir) target_vars_plus_optimization = get_ckpt_vars(target_dir) # Exclude training-specific variables from IDGAN initialization lm_vars = [ var_name for var_name in lm_vars_plus_optimization if 'optimization' not in var_name ] # To enable the 'crossed decoder' training condition, separate the encoder and decoder variables # of the SAE pre-trained on the high-ID corpus ('translator SAE' within IDGAN) source_encoder_vars = \ [var_name for var_name in source_encoder_vars_plus_optimization if 'optimization' not in var_name] source_encoder_vars = [ var_name for var_name in source_encoder_vars if 'encoder' in var_name ] source_decoder_vars = \ [var_name for var_name in source_decoder_vars_plus_optimization if 'optimization' not in var_name] source_decoder_vars = [ var_name for var_name in source_decoder_vars if 'decoder' in var_name ] target_vars = [ var_name for var_name in target_vars_plus_optimization if 'optimization' not in var_name ] # Obtain list of all variables which have to be initialized (either randomly or from pre-trained values) # within the IDGAN system all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) # Check for matches between variables found within the pre-trained checkpoints and IDGAN variables lm_parameters = [ var for var in all_vars if var.name.split(':')[0] in lm_vars ] source_encoder_parameters = [ var for var in all_vars if var.name.split(':')[0] in source_encoder_vars ] source_decoder_parameters = [ var for var in all_vars if var.name.split(':')[0] in source_decoder_vars ] target_parameters = [ var for var in all_vars if var.name.split(':')[0] in target_vars ] # Load matching variables from corresponding checkpoints loaded_parameters = lm_parameters + source_encoder_parameters + source_decoder_parameters + target_parameters # Rest is initialized randomly initialized_parameters = [ var for var in all_vars if var not in loaded_parameters ] # Initialize saver objects tasked with loading in the pre-trained parameters lm_saver = 
tf.train.Saver(lm_parameters) source_encoder_saver = tf.train.Saver(source_encoder_parameters) source_decoder_saver = tf.train.Saver(source_decoder_parameters) target_saver = tf.train.Saver(target_parameters) # Declare random initialization OP init_op = tf.variables_initializer(initialized_parameters) # Time training duration starting_time = time.time() with tf.Session(config=config) as train_sess: # Load pre-trained parameters into the IDGAN graph load_model(train_sess, lm_saver, lm_dir, chosen_epoch) load_model(train_sess, source_encoder_saver, source_encoder_dir, chosen_epoch) load_model(train_sess, source_decoder_saver, source_decoder_dir, chosen_epoch) load_model(train_sess, target_saver, target_dir, chosen_epoch) # Initialize the rest train_sess.run(init_op) # Initialize IDGAN interface and trainer, used for inference and training steps, respectively interface = IDGANInterface(seq_gan, vocab, train_sess, test_opt) trainer = IDGANTrainer(vocab, train_opt, seq_gan, train_sess, source_train_data, source_valid_data, target_train_data, target_valid_data, test_opt, interface, verbose=True) # Train system (either for a predefined number of epochs or until early stopping) print('+++TRAINING+++') trainer.train_gan() # Report training duration elapsed = time.time() - starting_time logging.info( 'Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format( int(elapsed // 3600), int((elapsed % 3600)) // 60, elapsed % 60))
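# get_ckpt_vars is called above but defined elsewhere in the repository. One plausible implementation
# (a sketch under the assumption that each component directory holds a standard TF 1.x checkpoint with
# an up-to-date index file; the actual helper may resolve the 'best' checkpoint differently):
def get_ckpt_vars_sketch(ckpt_dir):
    """ Lists the variable names stored in the directory's most recent checkpoint. """
    ckpt_path = tf.train.latest_checkpoint(ckpt_dir)
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
    return list(reader.get_variable_to_shape_map().keys())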
corpus_names[i], source_paths[i], pickle_paths[i], vocab_path=None, is_train='train' in corpus_names[i], is_valid='valid' in corpus_names[i], is_test='test' in corpus_names[i]) # Load the vocabulary pickle shared among all of IDGAN's components if train_opt.use_toy: vocab_pickle = os.path.join(train_opt.root_dir, 'data/toy/toy_train_vocab.pkl') else: vocab_pickle = os.path.join(train_opt.root_dir, 'data/europarl/europarl_v7_train_vocab.pkl') vocab = load_pickle(vocab_pickle) # Write vocabulary contents to file for manual inspection vocab_log_path = os.path.join(train_opt.out_dir, '{:s}_vocab_log.txt'.format('shared')) with codecs.open(vocab_log_path, 'w', encoding='utf8') as in_file: for key, value in vocab.index_to_word.items(): in_file.write('{:s}, {:s}\n'.format(str(key), value)) print('Vocab log written.') # Define TensorFlow session configuration - active during all calls to session.py (uncomment desired settings) config = tf.ConfigProto(allow_soft_placement=True) # config.gpu_options.allow_growth = True # config.gpu_options.per_process_gpu_memory_fraction = 0.5
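# load_pickle and load_model are used throughout these session scripts but defined elsewhere in the
# repository. A plausible sketch of the former is given below for readability; the latter presumably
# resolves the checkpoint path for the requested epoch (following the '{epoch}_{model}.ckpt' naming
# visible in make_embedding_dict) and calls saver.restore on it.
def load_pickle_sketch(pickle_path):
    """ Restores an arbitrary pickled object from disk. """
    with open(pickle_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)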
def analogy_tests(opt, dict_pkl): """ Performs the semantic and syntactic analogy tests in accordance with arxiv.org/pdf/1301.3781.pdf on a selection of embeddings learned by the LM; as the tests had to be manually constructed, their scope is limited to questions referencing the ten most frequent word tokens and their paired counterparts per test question. """ def _find_nearest(lookup_dict, source_pair, target_pair): """ Performs the analogy tests by iterating over the specified pairs. """ # Input format # input pairs: ('france', 'paris'), ('germany', 'berlin') # corresponding variables: source[0], source[1], target[0], nearest # Keep track of the word vector closest to the predicted location of the analogy question answer and # the associated cosine distance score nearest = None max_similarity = 0.0 # Identify the vector value denoting the relationship represented by the source pair relationship = lookup_dict[source_pair[1]] - lookup_dict[ source_pair[0]] # Attempt to predict the second item in the target pair by applying the relationship vector to the first item predicted = relationship + lookup_dict[target_pair[0]] # Check which of the learned embeddings is closest to the predicted location within the embedding space for item in lookup_dict.items(): similarity = cosine_similarity(predicted, item[1])[0][0] if similarity >= max_similarity: max_similarity = similarity nearest = item[0] return nearest # Load embedding dict embed_dict = load_pickle(dict_pkl) # Declare destination path for the test evaluation file out_path = os.path.join(opt.local_dir, 'out/embedding_tests.txt') # Manually define semantic tests capital = [('paris', 'france'), ('berlin', 'germany'), ('brussels', 'belgium'), ('vienna', 'austria'), ('copenhagen', 'denmark'), ('london', 'england'), ('athens', 'greece'), ('dublin', 'ireland'), ('amsterdam', 'netherlands'), ('lisbon', 'portugal')] currency = [('denmark', 'krone'), ('england', 'pound'), ('usa', 'dollar'), ('japan', 'yen'), ('germany', 'euro')] gender = [('mr', 'mrs'), ('sir', 'madam'), ('man', 'woman'), ('he', 'she'), ('king', 'queen'), ('father', 'mother'), ('boy', 'girl'), ('son', 'daughter')] # Manually define syntactic tests adverb = [('particular', 'particularly'), ('clear', 'clearly'), ('extreme', 'extremely'), ('final', 'finally'), ('absolute', 'absolutely'), ('simple', 'simply'), ('full', 'fully'), ('current', 'currently'), ('complete', 'completely'), ('quick', 'quickly')] opposite = [('possible', 'impossible'), ('necessary', 'unnecessary'), ('legal', 'illegal'), ('important', 'unimportant'), ('likely', 'unlikely'), ('clear', 'unclear'), ('realistic', 'unrealistic'), ('able', 'unable'), ('responsible', 'irresponsible')] comparative = [('great', 'greater'), ('long', 'longer'), ('early', 'earlier'), ('late', 'later'), ('close', 'closer'), ('high', 'higher'), ('small', 'smaller'), ('few', 'fewer'), ('large', 'larger'), ('broad', 'broader')] superlative = [('great', 'greatest'), ('long', 'longest'), ('early', 'earliest'), ('late', 'latest'), ('close', 'closest'), ('high', 'highest'), ('small', 'smallest'), ('large', 'largest'), ('broad', 'broadest'), ('poor', 'poorest')] participle = [('work', 'working'), ('make', 'making'), ('take', 'taking'), ('vote', 'voting'), ('monitor', 'monitoring'), ('develop', 'developing'), ('read', 'reading'), ('say', 'saying'), ('talk', 'talking'), ('sit', 'sitting')] nationality = [('france', 'french'), ('germany', 'german'), ('belgium', 'belgian'), ('austria', 'austrian'), ('denmark', 'danish'), ('england', 'english'), ('greece', 
                   'greek'), ('ireland', 'irish'), ('netherlands', 'dutch'), ('portugal', 'portuguese')]
    past = [('working', 'worked'), ('making', 'made'), ('taking', 'took'), ('voting', 'voted'),
            ('monitoring', 'monitored'), ('developing', 'developed'), ('saying', 'said'), ('talking', 'talked'),
            ('sitting', 'sat'), ('wanting', 'wanted')]
    plurals = [('democracy', 'democracies'), ('nationality', 'nationalities'), ('president', 'presidents'),
               ('nation', 'nations'), ('country', 'countries'), ('committee', 'committees'), ('year', 'years'),
               ('citizen', 'citizens'), ('agenda', 'agendas'), ('month', 'months')]
    third = [('work', 'works'), ('make', 'makes'), ('take', 'takes'), ('vote', 'votes'), ('monitor', 'monitors'),
             ('say', 'says'), ('talk', 'talks'), ('want', 'wants'), ('cover', 'covers'), ('offer', 'offers')]

    # Iterate over all test items and write results to file
    all_tests = [capital, currency, gender, adverb, opposite, comparative, superlative, participle, nationality,
                 past, plurals, third]
    # Track fraction of correct predictions for intrinsic embedding quality estimation
    correct_predictions_total = list()
    questions_total = list()

    # Write output
    with open(out_path, 'w') as out_file:
        for test_set in all_tests:
            correct_set_predictions = 0
            for pair_a in test_set:
                source = pair_a
                # Start at -1 to offset the trivially correct identity question (source pair == target pair)
                correct_pair_predictions = -1
                for pair_b in test_set:
                    target = pair_b
                    prediction = _find_nearest(embed_dict, source, target)
                    if prediction == target[1]:
                        correct_pair_predictions += 1
                    log_entry = '{:s} is similar to {:s} as [{:s}] is similar to {:s}\n' \
                        .format(source[1], source[0], prediction, target[0])
                    out_file.write(log_entry)
                correct_set_predictions += correct_pair_predictions
                out_file.write('-' * 10 + '\n')
            # Compile output statistics per test set
            num_questions = len(test_set) ** 2 - len(test_set)
            questions_total.append(num_questions)
            correct_predictions_total.append(correct_set_predictions)
            out_file.write('\n')
            out_file.write('Number of non-identity questions asked: {:d} | Correct answers: {:d} | Accuracy: {:.4f}\n'
                           .format(num_questions, correct_set_predictions, correct_set_predictions / num_questions))
            out_file.write('\n')
            out_file.write('=' * 10 + '\n')
            out_file.write('\n')
            print('Completed one of the test sets!')

        # Compile output statistics for the full collection of tests
        out_file.write('\n')
        out_file.write('Asked count: {}\n'.format(questions_total))
        out_file.write('Answered correctly count: {}\n'.format(correct_predictions_total))
        out_file.write('Semantic accuracy: {:.4f}\n'.format(
            sum(correct_predictions_total[:3]) / sum(questions_total[:3])))
        out_file.write('Syntactic accuracy: {:.4f}\n'.format(
            sum(correct_predictions_total[3:]) / sum(questions_total[3:])))
        out_file.write('Overall accuracy: {:.4f}\n'.format(
            sum(correct_predictions_total) / sum(questions_total)))
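# For clarity, the arithmetic inside _find_nearest is the standard vector-offset analogy method: with
# source ('paris', 'france') and target ('berlin', 'germany'), the test asks whether
# vec('france') - vec('paris') + vec('berlin') lies closer (by cosine similarity) to vec('germany') than
# to any other vocabulary embedding. A compact restatement of that single check, reusing the pickled
# embedding dictionary (illustration only):
def analogy_prediction_sketch(embed_dict, a, b, c):
    """ Returns the vocabulary item whose embedding is most similar to vec(b) - vec(a) + vec(c). """
    predicted = embed_dict[b] - embed_dict[a] + embed_dict[c]
    return max(embed_dict.items(), key=lambda item: cosine_similarity(predicted, item[1])[0][0])[0]

# e.g. analogy_prediction_sketch(embed_dict, 'paris', 'france', 'berlin') should ideally return 'germany'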
def test_session(target_epoch='best', calculate_er=False, generate=True, gen_cycles=1): """ Executes a quick test session on the language model by sampling a small quantity of items from the test set and scoring them along various metrics. """ # Tests are defined for a batch size of 1 assert ( test_opt.batch_size == 1), 'Model tests require batch size to equal 1.' # Clear the default graph tf.reset_default_graph() # Load data test_data = load_pickle(test_pickle) # Build model graph cog_lm = CogLM(vocab, test_opt, 'cog_lm') # Declare saver object for restoring learned model parameters test_saver = tf.train.Saver() with tf.Session(config=config) as test_sess: # Load learned model parameters load_model(test_sess, test_saver, test_opt.save_dir, target_epoch) # Initialize LM interface interface = CogLMInterface(cog_lm, vocab, test_sess, test_opt) # Sample sentences to be forwarded to the model samples = np.random.choice(test_data, test_opt.num_samples).tolist() print('Sampled sentences:') for i, s in enumerate(samples): print('{:d}: {:s}'.format(i, s)) print('-' * 10 + '\n') # Get sentence probabilities print('Probabilities:') for i, s in enumerate(samples): total, prob_array, _ = interface.get_probability(s) # Mask <EOS> and <PAD> tag values cut_off = len(s.split()) print('{:d}: {:s} | Total probability: {:.10f}'.format( i, s, total[0][0])) print('Per-word probabilities:') print('\t'.join(s.split())) print('\t'.join( ['{:.4}'.format(score) for score in prob_array[0][:cut_off]])) print('-' * 10 + '\n') # Get sentence log-probabilities print('Log-probabilities:') for i, s in enumerate(samples): total, prob_array, _ = interface.get_log_probability(s) cut_off = len(s.split()) print('{:d}: {:s} | Total log-probability: {:.4f}'.format( i, s, total[0][0])) print('Per-word log-probabilities:') print('\t'.join(s.split())) print('\t'.join(['{:.4}'.format(score) for score in prob_array[0]][:cut_off])) print('-' * 10 + '\n') # Get surprisal print('Surprisal and UID:') for i, s in enumerate(samples): total_s, s_array, norm_s, total_ud, ud_array, norm_ud = interface.get_surprisal( s) cut_off = len(s.split()) tabbed_sent = '\t'.join(s.split()) print( '{:d}: {:s} | Total surprisal: {: .4f} | Normalized surprisal: {: .4f}' .format(i, s, total_s[0][0], norm_s[0][0])) print('Per-word surprisal:') print(tabbed_sent) print('\t'.join( ['{: .4}'.format(score) for score in s_array[0][:cut_off]])) print( '{:d}: {:s} | Absolute UID: {:.4f} | Normalized UID: {: .4f}'. format(i, s, total_ud[0][0], norm_ud[0][0])) print('Per-word UID:') print(tabbed_sent) print('\t'.join( ['{: .4}'.format(score) for score in ud_array[0][:cut_off]])) print('-' * 10 + '\n') # Get approximate entropy reduction (computationally expensive!) if calculate_er: print('Approximate entropy reduction:') for i, s in enumerate(samples): total, array, norm = interface.get_entropy_reduction(samples) cut_off = len(s.split()) print( '{:d}: {:s} | Total ER: {: .4f} | Normalized ER: {: .4f}'. format(i, s, total[0][0], norm[0][0])) print('Per-word ER:') print('\t'.join(s.split())) print('\t'.join( ['{: .4}'.format(score) for score in array[0][:cut_off]])) print('-' * 10 + '\n') # Get cognitive load score (weighted sum of normalized surprisal and entropy reduction scores) print('Combined cognitive load:') for i, s in enumerate(samples): total, array, norm = interface.get_cognitive_load(samples) cut_off = len(s.split()) print( '{: d}: {:s} | Total CL: {: .4f} | Normalized CL: {: .4f}'. 
              format(i, s, total[0][0], norm[0][0]))
            print('Per-word CL:')
            print('\t'.join(s.split()))
            print('\t'.join(['{: .4}'.format(score) for score in array[0][:cut_off]]))
            print('-' * 10 + '\n')

        # Get model perplexity for the entire test set
        print('Model perplexity: {: .4f}'.format(interface.get_model_perplexity(test_data)[0][0]))
        print('-' * 10 + '\n')

        # Evaluate generative capability of the trained LM
        if generate:
            # Generate greedily from scratch
            print('Sentences generated from scratch:')
            for c in range(gen_cycles):
                interface.generate(prefix=None, print_results=True)
            print('-' * 10 + '\n')

            # Generate greedily from some sentence prefix (i.e. a sentence completion test)
            print('Sentences generated from prefix:')
            for i, s in enumerate(samples):
                sent_list = s.split(' ')
                # Generate a sentence prefix of random length (at most 1/2 of the source sentence)
                cut_off = np.random.randint(1, len(sent_list) // 2)
                prefix = ' '.join(sent_list[:cut_off])
                print('Prefix: {:s} | Source: {:s}'.format(prefix, s))
                generated = interface.generate(prefix=prefix, print_results=False)
                for tpl in generated:
                    print('{:s} | Probability: {:.10f}'.format(tpl[0], tpl[1]))
                print('\n')
            print('-' * 10 + '\n')

    print('=' * 10 + '\n')
    print('Language model evaluation completed!')
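# The surprisal and UID scores printed above come from CogLMInterface. For reference, per-word surprisal
# is the negative log-probability of a token given its preceding context, and UID divergence measures how
# unevenly that surprisal is spread across the sentence. The sketch below assumes UID divergence is the
# mean squared deviation of per-word surprisal from the sentence mean; the repository's exact definition
# lives in the interface code and may differ:
def surprisal_and_uid_sketch(token_probs):
    """ token_probs: conditional probabilities p(w_t | w_<t) for the words of one sentence. """
    surprisal = -np.log2(np.asarray(token_probs, dtype=np.float64))
    uid_divergence = np.mean((surprisal - np.mean(surprisal)) ** 2)
    return surprisal, np.sum(surprisal), uid_divergence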
def annotate_corpus(): """ Executes a session during which the shrunk source corpus is annotated with ID-relevant measures. """ # Clear the default graph tf.reset_default_graph() # Declare path leading to corpus to be annotated annotated_path = os.path.join( data_dir, '{:s}_annotated.txt'.format(train_name.split('_')[0])) # Values assigned per sentence are tracked for subsequent computation of corpus-wide statistics corpus_stats = { 'Total_surprisal': list(), 'Per_word_surprisal': list(), 'Normalized_surprisal': list(), 'Total_UID_divergence': list(), 'Per_word_UID_divergence': list(), 'Normalized_UID_divergence': list() } # Load data full_data = load_pickle(full_pickle) # Build model graph cog_lm = CogLM(vocab, test_opt, 'cog_lm') # Declare saver object for restoring learned model parameters annotate_saver = tf.train.Saver() # Time annotation duration starting_time = time.time() with tf.Session(config=config) as annotate_sess: # Load learned model parameters load_model(annotate_sess, annotate_saver, test_opt.save_dir, 'best') # Initialize LM interface interface = CogLMInterface(cog_lm, vocab, annotate_sess, test_opt) # Run the annotation loop pos = 0 with codecs.open(annotated_path, 'w') as in_file: while pos < len(full_data) - 1: # Fill a single batch of sentences to be annotated try: batch = full_data[pos:pos + test_opt.batch_size] pos += test_opt.batch_size except IndexError: batch = full_data[pos:len(full_data) - 1] pos = len(full_data) - 1 # Obtain ID-values via LM's interface total_surp, per_word_surp, norm_surp, total_uiddiv, per_word_uiddiv, norm_uiddiv = \ interface.get_surprisal(batch) # Write annotated sentences to file for i in range(len(batch)): # For per-word annotations, exclude values associated with <EOS> and <PAD> tags cut_off = len(batch[i].split()) item_ts = total_surp[i, :].tolist()[0] # Surprisal item_pws_floats = per_word_surp[i, :].tolist()[:cut_off] item_pws = ';'.join( ['{:.4f}'.format(pws) for pws in item_pws_floats]) item_ns = norm_surp[i, :].tolist()[0] item_tu = total_uiddiv[i, :].tolist()[0] # UID divergence item_pwu_floats = per_word_uiddiv[i, :].tolist()[:cut_off] item_pwu = ';'.join( ['{:.4f}'.format(pwu) for pwu in item_pwu_floats]) item_nu = norm_uiddiv[i, :].tolist()[0] # Construct annotated sample scored_sent = '{:s}\t{:.4f}\t{:s}\t{:.4f}\t{:.4f}\t{:s}\t{:4f}\n'. \ format(batch[i], item_ts, item_pws, item_ns, item_tu, item_pwu, item_nu) # Write to file in_file.write(scored_sent) # Update corpus stats dictionary corpus_stats['Total_surprisal'].append(item_ts) corpus_stats['Per_word_surprisal'].extend(item_pws_floats) corpus_stats['Normalized_surprisal'].append(item_ns) corpus_stats['Total_UID_divergence'].append(item_tu) corpus_stats['Per_word_UID_divergence'].extend( item_pwu_floats) corpus_stats['Normalized_UID_divergence'].append(item_nu) # Archive corpus statistics with open(lm_notes, 'a') as notes_file: notes_file.write('\n') notes_file.write( '------------ Annotated Corpus Statistics -------------\n') notes_file.write('Metric\tMean\tMedian\tLowest\tHighest\n') for k, v in corpus_stats.items(): notes_file.write('{:s}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\n'.format( k, np.mean(v), np.median(v), np.min(v), np.max(v))) # Report scoring duration elapsed = time.time() - starting_time print( 'Annotation took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format( int(elapsed // 3600), int((elapsed % 3600)) // 60, elapsed % 60))
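# The annotation written above is a seven-column TSV per sentence: the sentence itself, total surprisal,
# per-word surprisal (';'-joined), normalized surprisal, total UID divergence, per-word UID divergence
# (';'-joined), and normalized UID divergence. A small reader sketch for downstream consumers of that
# file (illustrative, not part of the original pipeline):
def read_annotated_line(line):
    """ Parses one line of the annotated corpus back into a dictionary of ID measures. """
    sent, ts, pws, ns, tu, pwu, nu = line.rstrip('\n').split('\t')
    return {'sentence': sent,
            'total_surprisal': float(ts),
            'per_word_surprisal': [float(v) for v in pws.split(';')] if pws else [],
            'normalized_surprisal': float(ns),
            'total_UID_divergence': float(tu),
            'per_word_UID_divergence': [float(v) for v in pwu.split(';')] if pwu else [],
            'normalized_UID_divergence': float(nu)}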
def train_session(): """ Executes a training session on the SAE model. """ # Clear the default graph within which the model graph is constructed tf.reset_default_graph() # Load data train_data = load_pickle(train_pickle) valid_data = load_pickle(valid_pickle) # Construct the model graph sae_name = 'seq_ae' + '_{:s}'.format(train_opt.train_id) autoencoder = SeqAE(vocab, train_opt, sae_name) all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) all_init_op = tf.global_variables_initializer() # Extract pre-trained word embeddings from the IDGAN-internal LM and use them to initialize the SAE initialized_vars = [ var for var in all_vars if 'embedding_table' not in var.name or 'optimization' in var.name ] embeddings_sae_keys = [ var.name for var in all_vars if var not in initialized_vars ] embedding_lm_keys = list() # Handle scoping discrepancies between SAE graph and LM checkpoints, # to make LM variables compatible with the instantiated graph for k in embeddings_sae_keys: k = k.replace(sae_name, 'cog_lm') k = k.split(':')[0] embedding_lm_keys.append(k) embeddings_dir = os.path.join(train_opt.root_dir, 'cognitive_language_model/src/checkpoints/') embeddings_epoch = 'best' # Map SAE embedding variables to LM embedding variables, # so that the former may be initialized with values extracted from the latter embeddings_dict = { embedding_lm_keys[i]: [v for v in tf.global_variables() if v.name == embeddings_sae_keys[i]][0] for i in range(len(embedding_lm_keys)) } # Declare saver object for initializing SAE's embedding table with embeddings learned by IDGAN's LM embeddings_saver = tf.train.Saver(embeddings_dict) # Declare OP for initializing other SAE parameters randomly no_embeds_init_op = tf.variables_initializer(initialized_vars) # Time training duration starting_time = time.time() with tf.Session(config=config) as train_sess: # Initialize variables if train_opt.is_local: # No pre-trained embeddings are loaded for experiments on the toy set train_sess.run(all_init_op) else: load_model(train_sess, embeddings_saver, embeddings_dir, embeddings_epoch) train_sess.run(no_embeds_init_op) # Initialize SAE interface and trainer, used for inference and training steps, respectively interface = SeqAEInterface(autoencoder, vocab, train_sess, test_opt) trainer = SeqAETrainer(vocab, train_opt, autoencoder, train_sess, train_data, valid_data, test_opt, interface) # Train model (either for a predefined number of epochs or until early stopping) print('+++TRAINING+++') trainer.train_model() # Report training duration elapsed = time.time() - starting_time logging.info( 'Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format( int(elapsed // 3600), int((elapsed % 3600)) // 60, elapsed % 60))