Example #1
def test_LMGeneralizedVocabulary__generalized_id_to_extended_vector(
        special_units_all, vocab_type, generalized_id, expected_output):
    vocab = vocab_type()
    generalized_vocab = LMGeneralizedVocabulary(vocab)

    t_vector = generalized_vocab.generalized_id_to_extended_vector()(
        generalized_id)

    # Evaluate the extended vector and compare it with the expected output,
    # following the same session pattern as the examples below.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        r_vector = sess.run(t_vector)

    assert (r_vector == expected_output).all()
Example #2
def test_LMGeneralizedVocabulary__token_to_id_glove(tokens, expected_id):
    glove = get_vocabulary("glove300")
    generalized = LMGeneralizedVocabulary(glove)
    vocab_id = glove.word_to_id_op()(tokens)
    generalized_id = generalized.vocab_id_to_generalized_id()(vocab_id)

    with tf.Session() as sess:
        glove.after_session_created_hook_fn(sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        r_generalized_id, r_vocab_id = sess.run([generalized_id, vocab_id])

    assert (r_generalized_id == expected_id).all()
Example #3
def test_LMGeneralizedVocabulary__generalized_id_to_vector_features_glove(
        generalized_id, expected_features):
    glove = get_vocabulary("glove300")
    generalized = LMGeneralizedVocabulary(glove)
    generalized_vector = generalized.generalized_id_to_extended_vector()(
        generalized_id)

    with tf.Session() as sess:
        glove.after_session_created_hook_fn(sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        r_generalized_vector = sess.run(generalized_vector)

    assert (r_generalized_vector[:, :3] == expected_features).all()
Example #4
def eval_lm_on_cached_simple_examples_with_glove_check(data_dir, model_dir, subset, hparams, take_first_n=20):
    vocabulary = get_vocabulary(hparams.vocabulary_name)
    
    data = LanguageModelTrainingData(
        vocabulary_name=hparams.vocabulary_name, 
        corpus_name=hparams.corpus_name, 
        cached_data_dir=data_dir, 
        batch_size=hparams.batch_size, 
        shuffle_examples_buffer_size=None, 
        hparams=hparams)
    
    def filter_too_long(features, labels):
        # Keep only examples whose input sequence fits within max_input_length.
        return tf.less_equal(tf.shape(features["inputs"])[1], hparams.max_input_length)

    def create_input():
        input_dataset = data.load_training_data()
        if hparams.max_input_length > 0:
            input_dataset = input_dataset.filter(filter_too_long)
        return input_dataset

    generalized = LMGeneralizedVocabulary(vocabulary)
    
    with tf.device("/device:CPU:0"):
        model = LanguageModelCallable(generalized, hparams)
        
        config = tf.estimator.RunConfig()

        estimator = tf.estimator.Estimator(
            model, model_dir=model_dir, config=config)
        predictions = estimator.predict(create_input)
    predictions = islice(predictions, take_first_n)
    return predictions
Example #5
def train_and_eval(data_dir, model_dir, hparams, warm_start=None):
    vocabulary = get_vocabulary(hparams.vocabulary_name)

    data = LanguageModelTrainingData(
        vocabulary_name=hparams.vocabulary_name, 
        corpus_name=hparams.corpus_name, 
        cached_data_dir=data_dir, 
        batch_size=hparams.batch_size, 
        shuffle_examples_buffer_size=hparams.shuffle_examples_buffer_size, 
        hparams=hparams)

    def create_input():
        return data.load_training_data()

    generalized = LMGeneralizedVocabulary(vocabulary)

    config = tf.estimator.RunConfig(
            save_summary_steps=hparams.save_summary_steps,
            save_checkpoints_secs=hparams.save_checkpoints_secs,
            #save_checkpoints_steps=2,
            session_config=tf.ConfigProto(log_device_placement=False),
            keep_checkpoint_max=hparams.keep_checkpoint_max,
            keep_checkpoint_every_n_hours=hparams.keep_checkpoint_every_n_hours,
            log_step_count_steps=hparams.log_step_count_steps,
        )
    model = LanguageModelCallable(generalized, hparams)

    estimator = tf.estimator.Estimator(model, model_dir=model_dir, config=config, warm_start_from=warm_start)

    debug_hook = tf_debug.LocalCLIDebugHook()
    debug_hook.add_tensor_filter("negative_count", any_negative_filter_callable)

    hooks = []
    if hparams.cli_debug:
        hooks.append(debug_hook)

    t1 = datetime.datetime.now()

    estimator.train(create_input, max_steps=hparams.max_training_steps, hooks=hooks)
    t2 = datetime.datetime.now()

    logger.info("start: {}".format(t1))
    logger.info("stop: {}".format(t2))
    logger.info("duration: {}".format(t2-t1))
Example #6
    def __init__(self, vocab, batch_size=20):
        """Creates preprocessing pipeline that converts token-based dataset into a dataset suitable for LanguageModel training.
        Input dataset examples should be 1D string tensor representing sentence (each element of such tensor is one word/token).

        Args:
            vocab (Vocabulary): vocabulary that will be used to convert tokens into 
            batch_size (int or None): size of batch created by load_data or None - then no batching will be performed
        """
        super(LmInputDataPipeline, self).__init__()
        self._vocab = vocab
        self.batch_size = batch_size
        self._vocab_generalized = vocab_generalized = LMGeneralizedVocabulary(
            vocab)
        self.add_unit_transformation(vocab.word_to_id_op())
        self.add_unit_transformation(
            vocab_generalized.vocab_id_to_generalized_id())
        self.add_structural_transformation(self.make_input_target_example)
        self.add_unit_transformation(
            vocab_generalized.generalized_id_to_extended_vector(), 0, "inputs")
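
A minimal usage sketch of the pipeline above, shown for context. get_vocabulary and LmInputDataPipeline are the project names used in the earlier examples; the toy token dataset and the assumption that load_data accepts such a dataset are inferred from the docstring and are not part of the original snippet.

import tensorflow as tf

# Hypothetical usage sketch: the load_data signature and the toy dataset are
# assumptions; only the constructor and docstring above come from the source.
def sentences():
    # Each example is a 1D string tensor representing one sentence,
    # as the pipeline's docstring requires.
    yield ["the", "cat", "sat", "on", "the", "mat"]
    yield ["a", "dog", "barked"]

token_dataset = tf.data.Dataset.from_generator(
    sentences, output_types=tf.string, output_shapes=[None])

vocab = get_vocabulary("glove300")
pipeline = LmInputDataPipeline(vocab, batch_size=2)
# The pipeline maps tokens -> vocab ids -> generalized ids, splits each
# sentence into (inputs, targets), and embeds inputs as extended vectors.
training_dataset = pipeline.load_data(token_dataset)  # signature assumed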