def test_LMGeneralizedVocabulary__generalized_id_to_extended_vector(
        special_units_all, vocab_type, generalized_id, expected_output):
    vocab = vocab_type()
    generalized_vocab = LMGeneralizedVocabulary(vocab)
    t_vector = generalized_vocab.generalized_id_to_extended_vector()(
        generalized_id)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        r_vector = sess.run(t_vector)
    # Evaluate the op and compare against the parametrized expectation.
    assert (r_vector == expected_output).all()
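# The vocabulary conversions used throughout these tests are op factories:
# calling generalized_id_to_extended_vector() returns a callable that maps an
# id tensor to its extended vector. A minimal, self-contained sketch of that
# factory pattern in plain TF 1.x (make_lookup_op and its matrix are
# illustrative placeholders, not project code):
def make_lookup_op(embedding_matrix):
    """Return a callable mapping integer ids to rows of embedding_matrix."""
    table = tf.constant(embedding_matrix, dtype=tf.float32)

    def lookup(ids):
        return tf.gather(table, ids)

    return lookup

# e.g. make_lookup_op([[1.0, 0.0], [0.0, 1.0]])([1, 0]) yields the swapped rows.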
def test_LMGeneralizedVocabulary__token_to_id_glove(tokens, expected_id):
    glove = get_vocabulary("glove300")
    generalized = LMGeneralizedVocabulary(glove)
    vocab_id = glove.word_to_id_op()(tokens)
    generalized_id = generalized.vocab_id_to_generalized_id()(vocab_id)
    with tf.Session() as sess:
        glove.after_session_created_hook_fn(sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        r_generalized_id, r_vocab_id = sess.run([generalized_id, vocab_id])
    assert (r_generalized_id == expected_id).all()
def test_LMGeneralizedVocabulary__generalized_id_to_vector_features_glove(
        generalized_id, expected_features):
    glove = get_vocabulary("glove300")
    generalized = LMGeneralizedVocabulary(glove)
    generalized_vector = generalized.generalized_id_to_extended_vector()(
        generalized_id)
    with tf.Session() as sess:
        glove.after_session_created_hook_fn(sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        r_generalized_vector = sess.run(generalized_vector)
    assert (r_generalized_vector[:, :3] == expected_features).all()
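# The glove tests above repeat the same session bootstrap (vocabulary hook,
# variable initializer, table initializer). A small helper such as this one,
# a suggested refactoring rather than part of the original module, would
# remove the duplication:
def bootstrap_glove_session(sess, glove):
    """Run the one-time setup a vocabulary needs in a fresh session."""
    glove.after_session_created_hook_fn(sess)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())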
def eval_lm_on_cached_simple_examples_with_glove_check(
        data_dir, model_dir, subset, hparams, take_first_n=20):
    vocabulary = get_vocabulary(hparams.vocabulary_name)
    data = LanguageModelTrainingData(
        vocabulary_name=hparams.vocabulary_name,
        corpus_name=hparams.corpus_name,
        cached_data_dir=data_dir,
        batch_size=hparams.batch_size,
        shuffle_examples_buffer_size=None,
        hparams=hparams)

    def filter_too_long(features, labels):
        # Keep only examples whose input length does not exceed the limit.
        return tf.less_equal(
            tf.shape(features["inputs"])[1], hparams.max_input_length)

    def create_input():
        input_dataset = data.load_training_data()
        if hparams.max_input_length > 0:
            input_dataset = input_dataset.filter(filter_too_long)
        return input_dataset

    generalized = LMGeneralizedVocabulary(vocabulary)
    # To pin the model graph to CPU, wrap the lines below in
    # `with tf.device("/device:CPU:0"):`.
    model = LanguageModelCallable(generalized, hparams)
    config = tf.estimator.RunConfig()
    estimator = tf.estimator.Estimator(model, model_dir=model_dir, config=config)
    predictions = estimator.predict(create_input)
    predictions = islice(predictions, take_first_n)
    return predictions
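# A self-contained illustration of the length-filtering pattern used in
# create_input above, on a toy dataset (the data and names here are
# illustrative, not taken from the project):
def _toy_length_filter_demo():
    rows = [[1, 2], [3, 4, 5, 6], [7]]
    dataset = tf.data.Dataset.from_generator(
        lambda: (({"inputs": [row]}, 0) for row in rows),
        output_types=({"inputs": tf.int32}, tf.int32))

    def short_enough(features, labels):
        return tf.less_equal(tf.shape(features["inputs"])[1], 3)

    next_item = dataset.filter(short_enough).make_one_shot_iterator().get_next()
    with tf.Session() as sess:
        try:
            while True:
                print(sess.run(next_item))  # prints the 2- and 1-token rows
        except tf.errors.OutOfRangeError:
            pass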
def train_and_eval(data_dir, model_dir, hparams, warm_start=None):
    vocabulary = get_vocabulary(hparams.vocabulary_name)
    data = LanguageModelTrainingData(
        vocabulary_name=hparams.vocabulary_name,
        corpus_name=hparams.corpus_name,
        cached_data_dir=data_dir,
        batch_size=hparams.batch_size,
        shuffle_examples_buffer_size=hparams.shuffle_examples_buffer_size,
        hparams=hparams)

    def create_input():
        return data.load_training_data()

    generalized = LMGeneralizedVocabulary(vocabulary)
    config = tf.estimator.RunConfig(
        save_summary_steps=hparams.save_summary_steps,
        save_checkpoints_secs=hparams.save_checkpoints_secs,
        # save_checkpoints_steps=2,
        session_config=tf.ConfigProto(log_device_placement=False),
        keep_checkpoint_max=hparams.keep_checkpoint_max,
        keep_checkpoint_every_n_hours=hparams.keep_checkpoint_every_n_hours,
        log_step_count_steps=hparams.log_step_count_steps,
    )
    model = LanguageModelCallable(generalized, hparams)
    estimator = tf.estimator.Estimator(
        model, model_dir=model_dir, config=config, warm_start_from=warm_start)

    debug_hook = tf_debug.LocalCLIDebugHook()
    debug_hook.add_tensor_filter("negative_count", any_negative_filter_callable)
    hooks = []
    if hparams.cli_debug:
        hooks.append(debug_hook)

    t1 = datetime.datetime.now()
    estimator.train(create_input, max_steps=hparams.max_training_steps, hooks=hooks)
    t2 = datetime.datetime.now()
    logger.info("start: {}".format(t1))
    logger.info("stop: {}".format(t2))
    logger.info("duration: {}".format(t2 - t1))
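# any_negative_filter_callable is referenced above but not defined in this
# section. A plausible sketch, assuming the standard tfdbg tensor-filter
# signature (the same one tf_debug.has_inf_or_nan uses); this is an
# assumption, not the project's actual implementation:
import numpy as np

def any_negative_filter_callable(datum, tensor):
    """Flag debug-dumped tensors that contain at least one negative value."""
    del datum  # unused; present to satisfy the tfdbg filter signature
    if not isinstance(tensor, np.ndarray):
        return False  # uninitialized or non-numeric dumps cannot be checked
    return np.issubdtype(tensor.dtype, np.number) and bool((tensor < 0).any())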
def __init__(self, vocab, batch_size=20):
    """Creates a preprocessing pipeline that converts a token-based dataset
    into a dataset suitable for LanguageModel training.

    Input dataset examples should be 1-D string tensors representing
    sentences (each element of such a tensor is one word/token).

    Args:
        vocab (Vocabulary): vocabulary that will be used to convert tokens
            into ids.
        batch_size (int or None): size of the batch created by load_data,
            or None, in which case no batching will be performed.
    """
    super(LmInputDataPipeline, self).__init__()
    self._vocab = vocab
    self.batch_size = batch_size
    self._vocab_generalized = vocab_generalized = LMGeneralizedVocabulary(vocab)
    self.add_unit_transformation(vocab.word_to_id_op())
    self.add_unit_transformation(
        vocab_generalized.vocab_id_to_generalized_id())
    self.add_structural_transformation(self.make_input_target_example)
    self.add_unit_transformation(
        vocab_generalized.generalized_id_to_extended_vector(), 0, "inputs")
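# A hypothetical construction of the pipeline above; get_vocabulary appears
# elsewhere in this module, and everything beyond construction is described
# in comments because the load/apply entry point (the docstring mentions
# load_data) is not shown in this section:
vocab = get_vocabulary("glove300")
pipeline = LmInputDataPipeline(vocab, batch_size=20)
# The registered transformations then run in order on each example:
# tokens -> vocabulary ids -> generalized ids -> (input, target) pair,
# with the "inputs" side expanded to extended embedding vectors.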